In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sn
import scipy.stats as stats

In [2]:
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Imported the libraries

In [3]:
columns = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label", "last_flag"]

Defined the columns

In [4]:
train = pd.read_csv('C:/Users/HP/Downloads/3. Network Intrusion Detection System/NSL_Dataset/Train.txt' , 
                      sep = "," , names = columns)
train = train.iloc[:,:-1]

In [5]:
test = pd.read_csv('C:/Users/HP/Downloads/3. Network Intrusion Detection System/NSL_Dataset/Test.txt' , 
                      sep = "," , names = columns)
test = test.iloc[:,:-1]

imported the datasets

In [6]:
train.shape

(125973, 42)

In [7]:
test.shape

(22544, 42)

In [8]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,125973.0,287.14465,2604.515,0.0,0.0,0.0,0.0,42908.0
src_bytes,125973.0,45566.743,5870331.0,0.0,0.0,44.0,276.0,1379964000.0
dst_bytes,125973.0,19779.114421,4021269.0,0.0,0.0,0.0,516.0,1309937000.0
land,125973.0,0.000198,0.01408607,0.0,0.0,0.0,0.0,1.0
wrong_fragment,125973.0,0.022687,0.25353,0.0,0.0,0.0,0.0,3.0
urgent,125973.0,0.000111,0.01436603,0.0,0.0,0.0,0.0,3.0
hot,125973.0,0.204409,2.149968,0.0,0.0,0.0,0.0,77.0
num_failed_logins,125973.0,0.001222,0.04523914,0.0,0.0,0.0,0.0,5.0
logged_in,125973.0,0.395736,0.4890101,0.0,0.0,0.0,1.0,1.0
num_compromised,125973.0,0.27925,23.94204,0.0,0.0,0.0,0.0,7479.0


In [9]:
test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,22544.0,218.859076,1407.176612,0.0,0.0,0.0,0.0,57715.0
src_bytes,22544.0,10395.450231,472786.431088,0.0,0.0,54.0,287.0,62825648.0
dst_bytes,22544.0,2056.018808,21219.297609,0.0,0.0,46.0,601.0,1345927.0
land,22544.0,0.000311,0.017619,0.0,0.0,0.0,0.0,1.0
wrong_fragment,22544.0,0.008428,0.142599,0.0,0.0,0.0,0.0,3.0
urgent,22544.0,0.00071,0.036473,0.0,0.0,0.0,0.0,3.0
hot,22544.0,0.105394,0.928428,0.0,0.0,0.0,0.0,101.0
num_failed_logins,22544.0,0.021647,0.150328,0.0,0.0,0.0,0.0,4.0
logged_in,22544.0,0.442202,0.496659,0.0,0.0,0.0,1.0,1.0
num_compromised,22544.0,0.119899,7.269597,0.0,0.0,0.0,0.0,796.0


checked the characterstics of the data

In [10]:
cat_var = train[['protocol_type', 'service', 'flag']]

In [11]:
cat_var.columns

Index(['protocol_type', 'service', 'flag'], dtype='object')

separated the categorical columns

In [12]:
def create_dummies(df, colname):
    col_dummies = pd.get_dummies(df[colname], prefix = colname, drop_first = True)
    df = pd.concat([df, col_dummies], axis = 1)
    df.drop(colname, axis = 1, inplace = True )
    return df

In [13]:
for c_feature in ['protocol_type', 'service', 'flag']:
    cat_var[c_feature] = cat_var[c_feature].astype('category')
    cat_var = create_dummies(cat_var, c_feature)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
cat_var.columns

Index(['protocol_type_tcp', 'protocol_type_udp', 'service_X11',
       'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp',
       'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime',
       'service_discard', 'service_domain', 'service_domain_u', 'service_echo',
       'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec',
       'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher',
       'service_harvest', 'service_hostnames', 'service_http',
       'service_http_2784', 'service_http_443', 'service_http_8001',
       'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell',
       'service_ldap', 'service_link', 'service_login', 'service_mtp',
       'service_name', 'service_netbios_dgm', 'service_netbios_ns',
       'service_netbios_ssn', 'service_netstat', 'service_nnsp',
       'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump',
       'service_pop_2', 'service_pop_3', 'service_printer', 'se

created dummies for the categorical values

In [15]:
cat_var_test = test[['protocol_type', 'service', 'flag']]

In [16]:
for c_feature in ['protocol_type', 'service', 'flag']:
    cat_var_test[c_feature] =cat_var_test[c_feature].astype('category')
    cat_var_test = create_dummies(cat_var_test, c_feature)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [17]:
cat_var_test.columns

Index(['protocol_type_tcp', 'protocol_type_udp', 'service_X11',
       'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp',
       'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime',
       'service_discard', 'service_domain', 'service_domain_u', 'service_echo',
       'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec',
       'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher',
       'service_harvest', 'service_hostnames', 'service_http',
       'service_http_2784', 'service_http_443', 'service_http_8001',
       'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell',
       'service_ldap', 'service_link', 'service_login', 'service_mtp',
       'service_name', 'service_netbios_dgm', 'service_netbios_ns',
       'service_netbios_ssn', 'service_netstat', 'service_nnsp',
       'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump',
       'service_pop_2', 'service_pop_3', 'service_printer', 'se

repeated the same for test dataset

In [18]:
trainservice = train['service'].tolist()

testservice = test['service'].tolist()

difference = list(set(trainservice) - set(testservice))

string = 'service_'

difference = [string + x for x in difference]

difference

['service_http_8001',
 'service_http_2784',
 'service_harvest',
 'service_urh_i',
 'service_red_i',
 'service_aol']

In [19]:
for x in difference:
    cat_var_test[x] = 0

cat_var_test.shape

(125973, 81)

In [20]:
new_train = train.join(cat_var)
new_train.drop('flag', axis=1, inplace=True)
new_train.drop('protocol_type', axis=1, inplace=True)
new_train.drop('service', axis=1, inplace=True)


new_test=test.join(cat_var_test)
new_test.drop('flag', axis=1, inplace=True)
new_test.drop('protocol_type', axis=1, inplace=True)
new_test.drop('service', axis=1, inplace=True)

print(new_train.shape)
print(new_test.shape)

(125973, 120)
(22544, 120)


merged the categprical columns with original train data

dropped flag , protocol_type , service columns because already made dummies for them

repeated for test data as well

In [21]:
label_train = new_train['label']
label_test = new_test['label']

In [22]:
newlabel_train = label_train.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 
                                      'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                                       'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2,
                                      'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,
                                      'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,
                                      'httptunnel': 3,'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

In [23]:
newlabel_test = label_test.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 
                                      'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                                       'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2,
                                      'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,
                                      'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,
                                      'httptunnel': 3,'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

labelled the Y column in both train & test data

here, 0 = normal

1 = DoS

2 = Probe

3 = R2L

4 = U2R

In [24]:
new_train['label'] = newlabel_train
new_test['label'] = newlabel_test

In [25]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [26]:
feature_columns = new_train.columns.difference(['label'])

#defining the feature columns

In [27]:
logreg = LogisticRegression(solver='lbfgs')
rfe = RFE(logreg, n_features_to_select = 20, verbose = 10)

In [28]:
new_train['label'] = new_train['label'].astype('int')

In [29]:
rfe = rfe.fit(new_train[feature_columns], new_train.label)

Fitting estimator with 119 features.




Fitting estimator with 118 features.




Fitting estimator with 117 features.




Fitting estimator with 116 features.




Fitting estimator with 115 features.




Fitting estimator with 114 features.




Fitting estimator with 113 features.




Fitting estimator with 112 features.




Fitting estimator with 111 features.




Fitting estimator with 110 features.




Fitting estimator with 109 features.




Fitting estimator with 108 features.




Fitting estimator with 107 features.




Fitting estimator with 106 features.




Fitting estimator with 105 features.




Fitting estimator with 104 features.




Fitting estimator with 103 features.




Fitting estimator with 102 features.




Fitting estimator with 101 features.




Fitting estimator with 100 features.




Fitting estimator with 99 features.




Fitting estimator with 98 features.




Fitting estimator with 97 features.




Fitting estimator with 96 features.




Fitting estimator with 95 features.




Fitting estimator with 94 features.




Fitting estimator with 93 features.




Fitting estimator with 92 features.




Fitting estimator with 91 features.




Fitting estimator with 90 features.




Fitting estimator with 89 features.




Fitting estimator with 88 features.




Fitting estimator with 87 features.




Fitting estimator with 86 features.




Fitting estimator with 85 features.




Fitting estimator with 84 features.




Fitting estimator with 83 features.




Fitting estimator with 82 features.




Fitting estimator with 81 features.




Fitting estimator with 80 features.




Fitting estimator with 79 features.




Fitting estimator with 78 features.




Fitting estimator with 77 features.




Fitting estimator with 76 features.




Fitting estimator with 75 features.




Fitting estimator with 74 features.




Fitting estimator with 73 features.




Fitting estimator with 72 features.




Fitting estimator with 71 features.




Fitting estimator with 70 features.




Fitting estimator with 69 features.




Fitting estimator with 68 features.




Fitting estimator with 67 features.




Fitting estimator with 66 features.




Fitting estimator with 65 features.




Fitting estimator with 64 features.




Fitting estimator with 63 features.




Fitting estimator with 62 features.




Fitting estimator with 61 features.




Fitting estimator with 60 features.




Fitting estimator with 59 features.




Fitting estimator with 58 features.




Fitting estimator with 57 features.




Fitting estimator with 56 features.




Fitting estimator with 55 features.




Fitting estimator with 54 features.




Fitting estimator with 53 features.




Fitting estimator with 52 features.




Fitting estimator with 51 features.




Fitting estimator with 50 features.




Fitting estimator with 49 features.




Fitting estimator with 48 features.




Fitting estimator with 47 features.




Fitting estimator with 46 features.




Fitting estimator with 45 features.




Fitting estimator with 44 features.




Fitting estimator with 43 features.




Fitting estimator with 42 features.




Fitting estimator with 41 features.




Fitting estimator with 40 features.




Fitting estimator with 39 features.




Fitting estimator with 38 features.




Fitting estimator with 37 features.




Fitting estimator with 36 features.




Fitting estimator with 35 features.




Fitting estimator with 34 features.




Fitting estimator with 33 features.




Fitting estimator with 32 features.




Fitting estimator with 31 features.




Fitting estimator with 30 features.




Fitting estimator with 29 features.




Fitting estimator with 28 features.




Fitting estimator with 27 features.




Fitting estimator with 26 features.




Fitting estimator with 25 features.




Fitting estimator with 24 features.




Fitting estimator with 23 features.




Fitting estimator with 22 features.




Fitting estimator with 21 features.





Selected important feature columns with the help of RFE

In [30]:
print (rfe.support_)
print (rfe.ranking_)

[False  True False False  True False  True  True  True False  True False
 False False False False False  True  True False False False False False
 False False False False  True False False False False False False False
  True False False False  True False False False False False False False
 False False False False False False False  True  True False False False
 False False False False False  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False  True False False False False  True
 False False False False False  True False False False False False False
 False False False False False False  True  True False False  True]
[ 18   1  93  78   1   3   1   1   1  74   1  17   6  20  22   4  29   1
   1  25  76  81   2  16  21  24  99  77   1  73   9  52  26 100   8  27
   1   5  13   7   1  12  72  31  95  49  35  32  43  51  65  48  80  11
  64   1   1  37  38  41  30  15  54  97  50   1  98  34

In [31]:
X_final = new_train[feature_columns[rfe.support_]] #X = columns found by RFE
y_final = new_train['label'] #Y = Label

In [32]:
#splitting into train & test

from sklearn.model_selection import train_test_split


train_X, test_X, train_y, test_y = train_test_split(X_final, y_final, test_size = 0.3, random_state = 12345)

In [33]:
from sklearn.tree import DecisionTreeClassifier
clf1 = DecisionTreeClassifier(random_state=0)

#trying decision tree model first

In [34]:
clf1.fit(train_X, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [35]:
acc_dt = round( clf1.score(train_X, train_y) * 100, 2)
print (str(acc_dt) + ' percent')

99.17 percent


Decision Tree model seems to be over fitting; trying random forest next

In [36]:
from sklearn.ensemble import RandomForestRegressor

clf2 = RandomForestRegressor(n_estimators=100)
clf2.fit(train_X, train_y)

acc_rf= round(clf2.score(train_X, train_y) * 100, 2)
print ("Accuracy: %i %% \n"%acc_rf)

Accuracy: 96 % 



Accuracy of random forest is appropriate, predicting for test data with the same

In [37]:
new_test1 = new_test[['diff_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_same_srv_rate',
       'dst_host_serror_rate', 'dst_host_srv_diff_host_rate', 'flag_RSTR',
       'flag_S0', 'logged_in', 'protocol_type_tcp', 'same_srv_rate',
       'service_eco_i', 'service_ecr_i', 'service_http', 'service_private',
       'service_smtp', 'service_telnet', 'srv_rerror_rate', 'srv_serror_rate',
       'wrong_fragment']]

In [38]:
clf2.predict(new_test1)

array([1.        , 0.08      , 1.02      , ..., 0.01271776, 0.        ,
       1.84      ])

In [42]:
Y_pred=clf2.predict(new_test1)

In [43]:
# Creating confusion matrix

pd.crosstab(new_test.label , Y_pred, rownames=['Actual attacks'], colnames=['Predicted attacks']).T

Actual attacks,0,1,2,3,4
Predicted attacks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.000000,6075,519,234,1749,46
0.000222,0,1,0,0,0
0.002056,2,0,0,3,0
0.002591,0,2,0,0,0
0.003529,0,0,1,0,0
0.003617,1,0,0,0,0
0.006550,1,0,0,0,1
0.007500,1,0,0,0,0
0.007833,0,1,0,0,0
0.008571,0,1,0,0,0
