In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB 

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [None]:
# The NSL-KDD text files carry no header row (column labels are assigned by hand
# in the next cell), so read with header=None; the default would consume the
# first record of each file as column names and silently drop it from the data.
train_df = pd.read_csv('../input/nslkdd/KDDTrain+.txt', header=None)
test_df = pd.read_csv('../input/nslkdd/KDDTest+.txt', header=None)

In [None]:
# Column labels, in file order: 41 feature names followed by the attack label
# and the 'level' column.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'attack', 'level',
]

# Apply the same labels to both frames.
train_df.columns, test_df.columns = columns, columns

In [None]:
# Preview the first rows to check that the column labels line up with the data.
train_df.head()

In [None]:
# Binary target: 'no' for rows labelled 'normal', 'yes' for every other label.
flag_by_is_normal = {True: 'no', False: 'yes'}

is_attack = train_df['attack'].eq('normal').map(flag_by_is_normal)
test_flag = test_df['attack'].eq('normal').map(flag_by_is_normal)

train_df['attack_flag'] = is_attack
test_df['attack_flag'] = test_flag

train_df.head()

In [None]:
# Class balance of the binary target ('no' = normal traffic, 'yes' = attack).
train_df['attack_flag'].value_counts()

In [None]:
# Distinct values of the raw 'attack' label in the training set
# (this includes 'normal' alongside the individual attack names).
set(train_df['attack'])

Next, we'll classify each of the attacks according to attack type for a more granular prediction model.

Denial of Service attacks: 
apache2
back
land
neptune
mailbomb
pod
processtable
smurf
teardrop
udpstorm
worm

Probe attacks: 
ipsweep
mscan
nmap
portsweep
saint
satan

Privilege escalation attacks: 
buffer_overflow
loadmodule
perl
ps
rootkit
sqlattack
xterm

Remote access attacks: 
ftp_write
guess_passwd
http_tunnel
imap
multihop
named
phf
sendmail
snmpgetattack
snmpguess
spy
warezclient
warezmaster
xclock
xsnoop

In [None]:
# Attack classification: raw attack names grouped into four coarse categories.
# 'loadmodule' was previously misspelled 'loadmdoule', so those rows fell through
# to 'Normal'.
privilege_attacks = ['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm']
# NOTE(review): NSL-KDD appears to spell these labels 'httptunnel' and 'xlock';
# both spellings are listed so either form is classified as 'Access'.
# TODO confirm against set(test_df['attack']) and drop the unused spelling.
access_attacks = ['ftp_write','guess_passwd','http_tunnel','httptunnel','imap','multihop','named','phf','sendmail',
                  'snmpgetattack','snmpguess','spy','warezclient','warezmaster','xclock','xlock','xsnoop']
dos_attacks = ['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm']
probe_attacks = ['ipsweep','mscan','nmap','portsweep','saint','satan']

attack_labels = ['Normal','DoS','Probe','Privilege','Access']

def attack_mapping(attack):
    """Map a raw attack name to its coarse attack class.

    Parameters
    ----------
    attack : str
        Value from the 'attack' column (e.g. 'neptune', 'normal').

    Returns
    -------
    str
        One of the entries of ``attack_labels``: 'DoS', 'Probe', 'Privilege'
        or 'Access' when the name is in the matching list, otherwise 'Normal'.
        Any name that is not listed (not only 'normal') falls back to 'Normal'.
    """
    if attack in dos_attacks:
        return 'DoS'
    if attack in probe_attacks:
        return 'Probe'
    if attack in privilege_attacks:
        # Capitalised to agree with attack_labels and the other class names.
        return 'Privilege'
    if attack in access_attacks:
        return 'Access'
    return 'Normal'


# Derive the coarse attack class for every row of both frames.
attack_map = train_df['attack'].map(attack_mapping)
test_attack_map = test_df['attack'].map(attack_mapping)

train_df['attack_class'] = attack_map
test_df['attack_class'] = test_attack_map


In [None]:
# Distinct attack classes produced by attack_mapping on the training set.
set(train_df['attack_class'])

In [None]:
# Missing values: count of nulls per column in the training set.
train_df.isnull().sum()

# NSL KDD DATASET ANALYSIS 

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train_df.info()

In [None]:
# Summary statistics of the numeric columns.
train_df.describe()

In [None]:
# Number of training rows per raw attack label.
train_df['attack'].value_counts()

In [None]:
# Number of training rows per coarse attack class.
train_df['attack_class'].value_counts()

In [None]:
# Horizontal bar chart of row counts per attack class in the training set.
barplot = sns.countplot(y='attack_class', data=train_df)
barplot.set_title('Attack types')

 In network traffic analysis, protocol is used as a simple tool for creating some initial buckets to categorize our data. 'normal' is left in the set at this point as a benchmark.

In [None]:
# Distinct protocol types present in the training set.
set(train_df['protocol_type'])

In [None]:
# Contingency table: one row per raw attack label, one column per protocol type.
attack_vs_protocol = pd.crosstab(
    index=train_df['attack'],
    columns=train_df['protocol_type'],
)
attack_vs_protocol

Most attacks target a specific protocol. Several (satan, nmap, ipsweep) are cross-protocol attacks.

In [None]:
# Same contingency table at the coarser level: attack class vs protocol type.
# This rebinds attack_vs_protocol, which the pie-chart cell below reads.
attack_vs_protocol = pd.crosstab(
    index=train_df['attack_class'],
    columns=train_df['protocol_type'],
)
attack_vs_protocol



icmp data is less frequently found in normal traffic than other protocols. 

Access and privilege attacks come through the tcp protocol. 

In [None]:
def plots(data_list, labels, legend_title="Flags"):
    """Draw one pie chart per series, side by side, with consistent colours.

    Parameters
    ----------
    data_list : list of pandas.Series
        Each series supplies the wedge sizes; its index supplies the wedge names.
    labels : list of str
        Title for each pie, in the same order as ``data_list``.
    legend_title : str, optional
        Title shown above every legend. Defaults to "Flags" (the previously
        hard-coded value) so existing calls are unchanged; pass e.g. "Services"
        when the series are not flag counts.

    Returns
    -------
    numpy.ndarray of matplotlib Axes
        One Axes per entry of ``data_list``.

    Raises
    ------
    ValueError
        If ``data_list`` is empty.
    """
    list_length = len(data_list)
    if list_length == 0:
        raise ValueError("data_list must contain at least one series")

    color_cycle = itertools.cycle(sns.color_palette("Set2"))

    # Index value -> colour, shared by all pies so the same category keeps the
    # same colour in every chart.
    cdict = {}

    # squeeze=False always yields a 2-D array of Axes, so a single data set
    # works too (a bare Axes object cannot be indexed).
    fig, axs = plt.subplots(1, list_length, figsize=(15, 10), tight_layout=False, squeeze=False)
    axs = axs[0]
    plt.subplots_adjust(wspace=1/list_length)

    for count, data_set in enumerate(data_list):
        for value in np.unique(data_set.index):
            if value not in cdict:
                cdict[value] = next(color_cycle)
        wedges, texts = axs[count].pie(data_set, colors=[cdict[v] for v in data_set.index])
        axs[count].legend(wedges, data_set.index, title=legend_title, loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
        axs[count].set_title(labels[count])

    return axs

In [None]:
# One pie per protocol, built from the columns of the current attack_vs_protocol table.
icmp_attacks = attack_vs_protocol['icmp']
tcp_attacks = attack_vs_protocol['tcp']
udp_attacks = attack_vs_protocol['udp']

plots(data_list=[icmp_attacks, tcp_attacks, udp_attacks], labels=['icmp', 'tcp', 'udp'])
plt.show()

The thing to notice here is the difference in each protocol type. Protocol may be useful in being able to identify the type of traffic we are observing.

In [None]:
# Sorted list of the distinct services seen in normal (attack_flag == 'no') traffic.
print('Services appearing in normal class :')
print(sorted(train_df.loc[train_df['attack_flag'] == 'no', 'service'].dropna().unique()))

In [None]:
# Sorted list of the distinct services seen in attack (attack_flag == 'yes') traffic.
print('Services appearing in attack class :')
print(sorted(train_df.loc[train_df['attack_flag'] == 'yes', 'service'].dropna().unique()))

In [None]:
# Row counts per service, computed separately for normal and attack traffic,
# then drawn as two side-by-side pies.
normal_services = train_df.loc[train_df['attack_flag'] == 'no', 'service'].value_counts()
attack_services = train_df.loc[train_df['attack_flag'] == 'yes', 'service'].value_counts()
service_axs = plots(data_list=[normal_services, attack_services], labels=['normal', 'attack'])
plt.show()

In [None]:
# Sorted list of the distinct connection flags seen in normal traffic.
print('Flags appearing in normal class :')
print(sorted(train_df.loc[train_df['attack_flag'] == 'no', 'flag'].dropna().unique()))

In [None]:
# Sorted list of the distinct connection flags seen in attack traffic.
print('Flags appearing in attack class :')
print(sorted(train_df.loc[train_df['attack_flag'] == 'yes', 'flag'].dropna().unique()))

In [None]:
# Row counts per connection flag, computed separately for normal and attack
# traffic, then drawn as two side-by-side pies.
normal_flags = train_df.loc[train_df['attack_flag'] == 'no', 'flag'].value_counts()
attack_flags = train_df.loc[train_df['attack_flag'] == 'yes', 'flag'].value_counts()

flag_axs = plots(data_list=[normal_flags, attack_flags], labels=['normal', 'attack'])
plt.show()

Many services are in the attack set! Whereas a huge amount of normal traffic is http, our attack traffic is all over the place. This suggests that attackers are searching for many different paths into systems.

If we think about this through the eyes of a network administrator, the combination of protocol, flag and service seems like it should tell us a lot about the nature of our traffic. Coupling them with the duration of a connection and the amount of data in that connection can be helpful. 

In this data set, attack traffic uses a wider variety of protocols and services than legitimate traffic. Protocols and services that do not appear in legitimate traffic are suspicious.

In [None]:
# Remove the columns that are not used as model features.
# The frame is rebound (no inplace=True) and errors='ignore' is set so that
# re-running this cell does not raise KeyError on already-dropped columns.
train_df = train_df.drop(columns=['land','wrong_fragment',
            'urgent','hot','num_failed_logins','logged_in','num_compromised'
            ,'root_shell','su_attempted','num_root','num_file_creations','num_shells'
            ,'num_access_files','num_outbound_cmds','is_host_login','is_guest_login'
            ,'serror_rate','srv_serror_rate','rerror_rate'
            ,'srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate'
            ,'dst_host_serror_rate'
            ,'dst_host_srv_serror_rate'
            ,'dst_host_srv_rerror_rate','level'], errors='ignore')

In [None]:
# Remove the same unused columns from the test frame so it keeps the same
# feature set as the training frame.
# The frame is rebound (no inplace=True) and errors='ignore' is set so that
# re-running this cell does not raise KeyError on already-dropped columns.
test_df = test_df.drop(columns=['land','wrong_fragment',
            'urgent','hot','num_failed_logins','logged_in','num_compromised'
            ,'root_shell','su_attempted','num_root','num_file_creations','num_shells'
            ,'num_access_files','num_outbound_cmds','is_host_login','is_guest_login'
            ,'serror_rate','srv_serror_rate','rerror_rate'
            ,'srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate'
            ,'dst_host_serror_rate'
            ,'dst_host_srv_serror_rate'
            ,'dst_host_srv_rerror_rate','level'], errors='ignore')



In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Numeric feature columns, taken from the training frame so that both frames
# are scaled over exactly the same columns in the same order.
cols = train_df.select_dtypes(include=['float64','int64']).columns

# Fit the scaler on the training data only (zero mean, unit variance) ...
sc_train = scaler.fit_transform(train_df[cols])
# ... and reuse those training statistics for the test data. Re-fitting on the
# test set would put the two sets on different scales and leak test statistics.
sc_test = scaler.transform(test_df[cols])

# turn the result back to a dataframe
sc_train_df = pd.DataFrame(sc_train, columns = cols)
sc_test_df = pd.DataFrame(sc_test, columns = cols)

# Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# extract categorical attributes from both training and test sets
cattrain = train_df.select_dtypes(include=['object']).copy()
cattest = test_df.select_dtypes(include=['object']).copy()

# Encode each categorical column with ONE encoder fitted on the values of both
# sets. Fitting separately on train and test assigns different integer codes to
# the same category whenever the two sets do not contain identical values.
traincat = cattrain.copy()
testcat = cattest.copy()
for col in cattrain.columns:
    encoder = LabelEncoder().fit(pd.concat([cattrain[col], cattest[col]]))
    traincat[col] = encoder.transform(cattrain[col])
    testcat[col] = encoder.transform(cattest[col])

# Working copy of the encoded training columns; the label columns
# ('attack', 'attack_flag', 'attack_class') are still included here.
enctrain = traincat.copy()
enctrain.head()

In [None]:
# Combine the scaled numeric features with the label-encoded categorical columns
# (which still include the encoded attack / attack_flag / attack_class labels).
# NOTE(review): concat(axis=1) aligns rows on the index — presumably both frames
# carry the same default index; verify if rows are ever filtered upstream.
train_x = pd.concat([sc_train_df,enctrain],axis=1)
train_x.head()

In [None]:
# Rebuild the test frame from its scaled numeric columns plus the label-encoded
# categorical columns.
# NOTE(review): this rebinds test_df, so from here on it no longer holds the raw
# string columns that earlier cells read.
test_df = pd.concat([sc_test_df,testcat],axis=1)
test_df.head()

TRAIN-TEST SPLIT

In [None]:
# Feature matrix: every scaled / encoded column except the three label columns.
# (The former `train_df.append(test_df)` line was removed: DataFrame.append no
# longer exists in pandas 2.x and its result was overwritten immediately.)
to_fit = train_x.drop(['attack','attack_flag', 'attack_class'], axis=1)

#target classifications : binary
binary_y = train_x['attack_flag']
test_binary_y = test_df['attack_flag']

#target classifications : multi
multi_y = train_x['attack_class']
test_multi_y = test_df['attack_class']

# build the training / validation sets (60% / 40%).
# random_state makes the split, and every metric computed from it, reproducible;
# stratify keeps the class proportions equal in both parts.
binary_train_X, binary_val_X, binary_train_y, binary_val_y = train_test_split(
    to_fit, binary_y, test_size=0.4, random_state=42, stratify=binary_y)
multi_train_X, multi_val_X, multi_train_y, multi_val_y = train_test_split(
    to_fit, multi_y, test_size=0.4, random_state=42, stratify=multi_y)

BINARY CLASSIFICATION : 

In [None]:
# Bernoulli Naive Bayes model (binary: normal vs attack)
BNB_Classifier = BernoulliNB()
BNB_Classifier.fit(binary_train_X, binary_train_y)
pred_y = BNB_Classifier.predict(binary_val_X)

accuracy = metrics.accuracy_score(binary_val_y, pred_y)
print("\nModel Accuracy:\n", accuracy)

print("\nCONFUSION MATRIX")
# Stored under its own name: rebinding `confusion_matrix` would shadow the
# function imported from sklearn.metrics at the top of the notebook.
binary_cm = metrics.confusion_matrix(binary_val_y, pred_y)
sns.heatmap(binary_cm, square=True, annot=True, fmt='d', cbar=False,
            xticklabels = ['Predicted Normal','Predicted Attack'],
            yticklabels = ['Actual Normal','Actual Attack'])
plt.show()

classification = metrics.classification_report(binary_val_y, pred_y)
print("\nClassification report:\n", classification)

# Recall, then precision, for the positive class (label 1).
print(metrics.recall_score(binary_val_y, pred_y))
print(metrics.precision_score(binary_val_y, pred_y))

In [None]:
# KNeighborsClassifier model (binary: normal vs attack), default neighbours,
# using all CPU cores for the neighbour search.
print("KNeighborsClassifier Model")
KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
KNN_Classifier.fit(binary_train_X, binary_train_y)
pred_y = KNN_Classifier.predict(binary_val_X)
accuracy = metrics.accuracy_score(binary_val_y, pred_y)
print("\nModel Accuracy:\n", accuracy)

print("\nCONFUSION MATRIX")
# Stored under its own name: rebinding `confusion_matrix` would shadow the
# function imported from sklearn.metrics at the top of the notebook.
binary_cm = metrics.confusion_matrix(binary_val_y, pred_y)
sns.heatmap(binary_cm, square=True, annot=True, fmt='d', cbar=False,
            xticklabels = ['Predicted Normal','Predicted Attack'],
            yticklabels = ['Actual Normal','Actual Attack'])

plt.show()

classification = metrics.classification_report(binary_val_y, pred_y)
print("\nClassification report:\n", classification)

In [None]:
# LogisticRegression model (binary: normal vs attack), seeded for reproducibility.
LGR_Classifier = LogisticRegression(n_jobs=-1, random_state=0)
LGR_Classifier.fit(binary_train_X, binary_train_y)
pred_y = LGR_Classifier.predict(binary_val_X)
accuracy = metrics.accuracy_score(binary_val_y, pred_y)
print("\nModel Accuracy:\n", accuracy)

print("\nCONFUSION MATRIX")
# Stored under its own name: rebinding `confusion_matrix` would shadow the
# function imported from sklearn.metrics at the top of the notebook.
binary_cm = metrics.confusion_matrix(binary_val_y, pred_y)
sns.heatmap(binary_cm, square=True, annot=True, fmt='d', cbar=False,
            xticklabels = ['Predicted Normal','Predicted Attack'],
            yticklabels = ['Actual Normal','Actual Attack'])

plt.show()

classification = metrics.classification_report(binary_val_y, pred_y)
print("\nClassification report:\n", classification)

In [None]:
# Grouped bar chart comparing accuracy / recall / precision of the three binary
# models. The scores are computed from the fitted classifiers rather than typed
# in by hand, so the chart cannot drift away from the results above.
def _binary_val_scores(classifier):
    """Return [accuracy, recall, precision] of a fitted classifier on the binary validation split."""
    predictions = classifier.predict(binary_val_X)
    return [metrics.accuracy_score(binary_val_y, predictions),
            metrics.recall_score(binary_val_y, predictions),
            metrics.precision_score(binary_val_y, predictions)]

barWidth = 0.15
# plt.subplots returns a (figure, axes) pair; unpack it instead of binding the tuple to `fig`.
fig, ax = plt.subplots(figsize=(8, 8))
NB = _binary_val_scores(BNB_Classifier)
KN = _binary_val_scores(KNN_Classifier)
LR = _binary_val_scores(LGR_Classifier)

# x positions of the three bars inside each metric group
br1 = np.arange(len(NB))
br2 = [x + barWidth for x in br1]
br3 = [x + barWidth for x in br2]
ax.bar(br1, KN, color='y', width=barWidth, edgecolor='grey', label='K NEIGHBOURS')
ax.bar(br2, LR, color='orange', width=barWidth, edgecolor='grey', label='LOGISTIC REGRESSION')
ax.bar(br3, NB, color='r', width=barWidth, edgecolor='grey', label='NAIVE BAYES')
ax.set_xlabel('Performance Metrics', fontweight='bold', fontsize=15)
ax.set_ylabel('Value', fontweight='bold', fontsize=15)
# Zoom in on the top of the scale, but never cut off a bar that scores below 0.80.
ax.set_ylim(min(0.80, min(NB + KN + LR) - 0.02), 1)
# Centre each tick under the middle bar of its group.
ax.set_xticks([r + barWidth for r in range(len(NB))])
ax.set_xticklabels(['Accuracy', 'Recall', 'Precision'])
ax.legend()
ax.set_title("Comparison of performance metrics")
plt.show()

# MULTI CLASSIFICATION

In [None]:
# Random forest model for the multi-class (attack class) prediction.
# random_state is fixed so the reported accuracy is reproducible across runs.
RFC = RandomForestClassifier(random_state=0)
RFC.fit(multi_train_X, multi_train_y)
multi_predictions = RFC.predict(multi_val_X)
# scikit-learn metrics take (y_true, y_pred) in that order.
print("ACCURACY ", accuracy_score(multi_val_y, multi_predictions))

In [None]:
# Multi-class confusion matrix: rows are the actual classes, columns the predictions.
# The targets are LabelEncoder codes, which number the classes in sorted order of
# their names, so the tick labels are rebuilt in that same order from the
# original string column instead of relying on a hand-written list.
class_names = sorted(train_df['attack_class'].unique())
class_codes = list(range(len(class_names)))

# (y_true, y_pred) order; labels= pins one row/column per class even if a class
# is absent from the validation split. Kept under its own name so the imported
# `confusion_matrix` function is not shadowed.
multi_cm = metrics.confusion_matrix(multi_val_y, multi_predictions, labels=class_codes)
sns.heatmap(multi_cm,
            xticklabels = ['Predicted ' + x for x in class_names],
            yticklabels = ['Actual ' + x for x in class_names],
            cmap="YlGnBu",
            fmt='d',
            annot=True)
plt.show()

In [None]:
# Per-class precision / recall / F1 for the multi-class model.
# Display names are looked up case-insensitively by class name, and ordered like
# the LabelEncoder codes (sorted class names), so each row of the report is
# labelled with the class it actually describes.
pretty_names = {'normal': 'NORMAL TRAFFIC',
                'dos': 'DoS ATTACK',
                'probe': 'PROBE ATTACK',
                'privilege': 'PRIVILEGE ATTACK',
                'access': 'ACCESS CONTROL ATTACK'}
class_names = sorted(train_df['attack_class'].unique())
target_names = [pretty_names.get(name.lower(), name) for name in class_names]

# (y_true, y_pred) order: swapping them exchanges precision and recall.
classification = metrics.classification_report(multi_val_y, multi_predictions,
                                               labels=list(range(len(target_names))),
                                               target_names=target_names)
print("\nClassification report:\n", classification)