In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle

In [2]:
train_url = 'NSL_KDD_Train.csv'
test_url = 'NSL_KDD_Test.csv'

In [3]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]


df = pd.read_csv(train_url,header=None, names = col_names)

df_test = pd.read_csv(test_url, header=None, names = col_names)

print('Dimensions of the Training set:',df.shape)
print('Dimensions of the Test set:',df_test.shape)

Dimensions of the Training set: (125973, 42)
Dimensions of the Test set: (22544, 42)


In [4]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [5]:
print('Label distribution Training set:')
print(df['label'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['label'].value_counts())

Label distribution Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64

Label distribution Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178


**Step 1: Data preprocessing:**

In [6]:
labeldf=df['label']
labeldf_test=df_test['label']


# change the label column
newlabeldf=labeldf.replace({ 'normal' : 'normal', 'neptune' : 'DoS' ,'back': 'DoS', 'land': 'DoS', 'pod': 'DoS',
                            'smurf':'DoS',  'teardrop': 'DoS','mailbomb': 'DoS', 'apache2': 'DoS', 'processtable': 'DoS', 'udpstorm': 'DoS', 'worm': 'DoS',
                           'ipsweep' : 'Probe','nmap' : 'Probe','portsweep' : 'Probe','satan' : 'Probe','mscan' : 'Probe','saint' : 'Probe'
                           ,'ftp_write': 'R2L','guess_passwd': 'R2L','imap': 'R2L','multihop': 'R2L','phf': 'R2L','spy': 'R2L','warezclient': 'R2L','warezmaster': 'R2L','sendmail': 'R2L','named': 'R2L','snmpgetattack': 'R2L','snmpguess': 'R2L','xlock': 'R2L','xsnoop': 'R2L','httptunnel': 'R2L',
                           'buffer_overflow': 'U2R','loadmodule': 'U2R','perl': 'U2R','rootkit': 'U2R','ps': 'U2R','sqlattack': 'U2R','xterm': 'U2R'})
newlabeldf_test=labeldf_test.replace({ 'normal' : 'normal', 'neptune' : 'DoS' ,'back': 'DoS', 'land': 'DoS', 'pod': 'DoS', 'smurf': 'DoS', 'teardrop': 'DoS','mailbomb': 'DoS', 'apache2': 'DoS', 'processtable': 'DoS', 'udpstorm': 'DoS', 'worm': 'DoS',
                           'ipsweep' : 'Probe','nmap' : 'Probe','portsweep' : 'Probe','satan' : 'Probe','mscan' : 'Probe','saint' : 'Probe'
                           ,'ftp_write': 'R2L','guess_passwd': 'R2L','imap': 'R2L','multihop': 'R2L','phf': 'R2L','spy': 'R2L','warezclient': 'R2L','warezmaster': 'R2L','sendmail': 'R2L','named': 'R2L','snmpgetattack': 'R2L','snmpguess': 'R2L','xlock': 'R2L','xsnoop': 'R2L','httptunnel': 'R2L',
                           'buffer_overflow': 'U2R','loadmodule': 'U2R','perl': 'U2R','rootkit': 'U2R','ps': 'U2R','sqlattack': 'U2R','xterm': 'U2R'})


In [7]:
#0:Normal ; 1:DoS 2:Probe 3:R2L 4:U2R
# put the new label column back
df['label'] = newlabeldf
df_test['label'] = newlabeldf_test
print('Label distribution Training set:')
print(df['label'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['label'].value_counts())

Label distribution Training set:
normal    67343
DoS       45927
Probe     11656
R2L         995
U2R          52
Name: label, dtype: int64

Label distribution Test set:
normal    9711
DoS       7460
R2L       2885
Probe     2421
U2R         67
Name: label, dtype: int64


In [8]:
df['label']=np.where(df['label']=='normal',0,1)
print(df['label'].value_counts())
df_test['label']=np.where(df_test['label']=='normal',0,1)
df_test['label'].value_counts()

0    67343
1    58630
Name: label, dtype: int64


1    12833
0     9711
Name: label, dtype: int64

In [9]:
# protocol_type (column 2), service (column 3), flag (column 4).

print('Training set:')
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

print()
print('Distribution of categories in service:')
print(df['service'].value_counts().sort_values(ascending=False).head())

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories

Distribution of categories in service:
http        40338
private     21853
domain_u     9043
smtp         7313
ftp_data     6860
Name: service, dtype: int64


In [10]:
# Test set
print('Test set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))


Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories


In [11]:
newdf = pd.get_dummies(df)
newdf_test = pd.get_dummies(df_test)
print(newdf.shape)
print(newdf_test.shape)

(125973, 123)
(22544, 117)


In [12]:
X_train = newdf.drop('label',1)
y_train = newdf['label']
print(X_train.shape)
print(y_train.shape)
X_test = newdf_test.drop('label',1)
y_test = newdf_test['label']
print(X_test.shape)
print(y_test.shape)
X_train.head()

(125973, 122)
(125973,)
(22544, 116)
(22544,)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [13]:
from sklearn.preprocessing import normalize
X_train_norm = normalize(X_train, norm='l2', axis=1, copy=True, return_norm=False)
df_norm = pd.DataFrame(X_train_norm, columns= [list(X_train)])
X_test_norm = normalize(X_test, norm='l2', axis=1, copy=True, return_norm=False)
df_norm_test = pd.DataFrame(X_test_norm, columns= [list(X_test)])
df_norm.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.03,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.34,0.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
colNames=list(df_norm)
#colNames_test=list(X_test)

In [15]:
from sklearn.feature_selection import SelectFpr, chi2
chi = SelectFpr(chi2, alpha = 0.08)
X_chi = chi.fit_transform(df_norm, y_train)

true=chi.get_support(indices = True)
chi_col=[i for i, x in enumerate(true) if x]
chi_col=list(colNames[i] for i in chi_col)
print(chi_col)

X_chi.shape

[('src_bytes',), ('dst_bytes',), ('land',), ('wrong_fragment',), ('urgent',), ('hot',), ('num_failed_logins',), ('logged_in',), ('num_compromised',), ('root_shell',), ('su_attempted',), ('num_root',), ('num_file_creations',), ('num_shells',), ('num_access_files',), ('num_outbound_cmds',), ('is_host_login',), ('is_guest_login',), ('count',), ('srv_count',), ('serror_rate',), ('srv_serror_rate',), ('rerror_rate',), ('srv_rerror_rate',), ('same_srv_rate',), ('diff_srv_rate',), ('srv_diff_host_rate',), ('dst_host_count',), ('dst_host_srv_count',), ('dst_host_same_srv_rate',), ('dst_host_diff_srv_rate',), ('dst_host_same_src_port_rate',), ('dst_host_srv_diff_host_rate',), ('dst_host_serror_rate',), ('dst_host_srv_serror_rate',)]


(125973, 36)

#### feature selection result - authors
src_bytes, dst_bytes, protocol_type (icmp, tcp, udp), Service (domain_u, eco_i, http,
imap4, pop3, private, telnet), flag (REJ, RSTO, RSTR, S0, SH), wrong_fragment,
logged-in, count, srv_count, serror_rate, srv_serror_rate, rerror_rate,
srv_rerror_rate, diff-srv-rate, srv-diff-host-rate, dst_host_rerror_rate,
dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate,
dst_host_serror_rate, dst_host_srv_count, dst_host_count,
dst_host_srv_rerror_rate, dst_host_srv_serror_rate

In [16]:
df_train_sel = df_norm[['src_bytes', 'dst_bytes', 'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp',
'service_domain_u', 'service_eco_i', 'service_http', 'service_imap4', 'service_pop_3', 'service_private', 'service_telnet',
'flag_REJ', 'flag_RSTO', 'flag_RSTR', 'flag_S0', 'flag_SH', 'wrong_fragment', 'logged_in', 'count', 
'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
'dst_host_rerror_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
'dst_host_serror_rate', 'dst_host_srv_count', 'dst_host_count', 'dst_host_srv_rerror_rate', 'dst_host_srv_serror_rate']]

df_test_sel = df_norm_test[['src_bytes', 'dst_bytes', 'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp',
'service_domain_u', 'service_eco_i', 'service_http', 'service_imap4', 'service_pop_3', 'service_private', 'service_telnet',
'flag_REJ', 'flag_RSTO', 'flag_RSTR', 'flag_S0', 'flag_SH', 'wrong_fragment', 'logged_in', 'count', 
'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
'dst_host_rerror_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
'dst_host_serror_rate', 'dst_host_srv_count', 'dst_host_count', 'dst_host_srv_rerror_rate', 'dst_host_srv_serror_rate']]

df_train_sel.head()

Unnamed: 0,src_bytes,dst_bytes,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_domain_u,service_eco_i,service_http,service_imap4,service_pop_3,...,srv_diff_host_rate,dst_host_rerror_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_count,dst_host_count,dst_host_srv_rerror_rate,dst_host_srv_serror_rate
0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.29,0.0,0.0
1,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.87,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.9,0.0,0.0
3,0.03,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0
4,0.34,0.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.43,0.43,0.0,0.0


In [17]:
from statistics import mean
#Creating a dict of the models
model_dict = {'QDA' : QuadraticDiscriminantAnalysis(),
              'RF': RandomForestClassifier(),
              'ADB': AdaBoostClassifier(),
              'GNB': GaussianNB(),
              'KNN': KNeighborsClassifier()}

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, acc_score, fpr_res, auc_res = [], [], [], []
    for k,v in model_dict.items():   
        model_name.append(k)
        v.fit(df_train_sel, y_train)
        y_pred = v.predict(df_test_sel)
        acc_score.append(accuracy_score(y_test, y_pred))
        c_fpr, c_tpr, thresholds = metrics.roc_curve(y_test, y_pred)
        auc_res.append(metrics.auc(c_fpr, c_tpr))
        
        tnr = recall_score(y_test, y_pred, pos_label = 0) 
        fpr = 1 - tnr
        fpr_res.append(fpr)
        
        model_comparison_df = pd.DataFrame([model_name, acc_score,fpr_res, auc_res]).T
        model_comparison_df.columns = ['model_name', 'accuracy', 'fpr', 'auc']
        model_comparison_df = model_comparison_df.sort_values(by='accuracy', ascending=False)
    return model_comparison_df

model_score_df(model_dict)

Unnamed: 0,model_name,accuracy,fpr,auc
1,RF,0.77,0.03,0.8
4,KNN,0.77,0.03,0.79
2,ADB,0.75,0.07,0.77
0,QDA,0.69,0.02,0.72
3,GNB,0.66,0.02,0.7


In [18]:
clf1 = QuadraticDiscriminantAnalysis()
clf2 = RandomForestClassifier()
clf3 = AdaBoostClassifier()
clf4 = GaussianNB()
clf5 = KNeighborsClassifier()
eclf1 = VotingClassifier(estimators=[('rf', clf2), ('adb', clf3), ('gnb', clf4)], voting='soft')
eclf1 = eclf1.fit(df_train_sel, y_train)
clf_vect = eclf1.predict(df_test_sel)
print(accuracy_score(y_test, clf_vect))

0.7461852377572746


In [19]:
prob_pred = eclf1.predict_proba(df_test_sel)

threshold = [0.5, 0.6, 0.7, 0.8] # define threshold here
for t in threshold:
    preds = [1 if prob_pred[i][1]> t else 0 for i in range(len(prob_pred))]
    print(accuracy_score(y_test, preds))
    
prob_pred.shape

0.7461852377572746
0.6532114975159687
0.6343151171043293
0.6059705464868701


(22544, 2)

In [20]:
eclf2 = VotingClassifier(estimators=[('rf', clf2), ('adb', clf3), ('qda', clf1)], voting='soft')
eclf2 = eclf2.fit(df_train_sel, y_train)
clf_vect1 = eclf2.predict(df_test_sel)

print(accuracy_score(y_test, clf_vect1))

0.7246273953158269


In [21]:
prob_preds = eclf2.predict_proba(df_test_sel)
confidence_vec = np.concatenate((prob_pred, prob_preds),axis=0)
confidence_vec

array([[0.15318093, 0.84681907],
       [0.46843379, 0.53156621],
       [0.83738812, 0.16261188],
       ...,
       [0.49802286, 0.50197714],
       [0.86248308, 0.13751692],
       [0.65966536, 0.34033464]])

In [22]:
threshold = [0.5, 0.6, 0.7, 0.8] # define threshold here
for t in threshold:
    preds = [1 if prob_preds[i][1]> t else 0 for i in range(len(prob_preds))]
    print(accuracy_score(y_test, preds))

0.7246273953158269
0.6875
0.6766323633782825
0.6664300922640171


In [23]:
threshold = 0.7
classification_vec = [1 if confidence_vec[i][1]> threshold else 0 for i in range(len(confidence_vec))]

In [24]:
len(classification_vec)

45088