1. IMPORTING MODULES

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn import preprocessing

2. READING DATA FROM CSV FILE


In [None]:
# Load dataset_sdn.csv (expected in the working directory) into a DataFrame
df = pd.read_csv("dataset_sdn.csv")
df

3. DATA PREPROCESSING AND DATA VISUALIZATION


In [None]:
# Print column dtypes, non-null counts and memory usage of the freshly loaded frame
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Keep the column index around and display it
column_names = df.columns
column_names

In [None]:
# Count the missing values in every column and show the counts as a bar chart
missing_per_column = df.isna().sum()
missing_per_column.plot(kind="bar")
plt.title("NULL Values for each column ")
plt.xlabel("Column names")
plt.ylabel("Count")

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna(axis="index", how="any")

In [None]:
# Re-inspect dtypes and non-null counts now that rows with nulls have been dropped
df.info()

In [None]:
# Distinct values of the 'dst' column and how many of them there are
uniq_dest = df['dst'].unique()
total_dst = len(uniq_dest)
print("Total destination : ", total_dst)
print("Different destination : ", uniq_dest)

In [None]:
# Number of rows per traffic label, drawn as one bar per label value
gp = df.groupby('label')['label'].count()
label_values = gp.index.tolist()
label_counts = gp.values.tolist()
plt.bar(label_values, label_counts, color=['g', 'r'])
plt.xticks(label_values)
plt.title("Traffic for normal and Malicious traffic")
plt.xlabel("Traffic label")
plt.ylabel("Count")

In [None]:
# Row counts per destination, split by label (rows: dst, columns: label 0 / 1).
# The table is built once so the y positions and both bar series share the
# same index; previously ip_addr only listed destinations that had label-0
# rows, which breaks barh as soon as a destination has attack rows only.
# reindex keeps both label columns present even if one class is missing.
traffic_by_dst = (
    df.groupby(['dst', 'label'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)
ip_addr = traffic_by_dst.index
normal_traffic = traffic_by_dst[0]
attack_traffic = traffic_by_dst[1]

# Stack the attack bar after the normal bar (left=...) instead of drawing it
# on top, so neither count hides the other.
plt.barh(ip_addr, normal_traffic, color='g', label='Normal Traffic')
plt.barh(ip_addr, attack_traffic, left=normal_traffic, color='r', label='Attack Traffic')
plt.legend()
plt.xlabel("Count")
plt.ylabel("Destination IP Adresses")
plt.title("Attack and Normal traffic ")


In [None]:
# Collect the object (string) typed columns; port_no is added to the list
# because it is considered uninformative. All of them are removed from the
# frame used for modelling.
object_col = df.select_dtypes(include=['object']).columns.tolist()
object_col.append('port_no')
print(object_col)
data = df.drop(columns=object_col)


4. FEATURE SELECTION


In [None]:
# Target vector and feature matrix
y = data['label']
x = data.drop(columns=['label'])

# 80/20 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=104
)

# Re-attach the label to the training features so the correlations below are
# computed from training rows only
train_data = pd.concat([x_train, y_train], axis=1, join='inner')

# Correlation of each numeric feature with the label, in ascending order,
# without the label's correlation with itself
correl = train_data.corr(numeric_only=True)['label'].sort_values()
correl = correl.drop('label')

In [None]:
# Keep the features whose absolute correlation with the label exceeds 0.08
selected = correl[correl.abs() > 0.08]
attr_selected = selected.index.tolist()
print(attr_selected)

5. NORMALIZATION


In [None]:
# Standardise the selected features to zero mean / unit variance.
# The scaler is fitted on the training split ONLY and the same mean/std are
# then applied to the test split. Scaling the test split with its own
# statistics (as before) puts train and test features on different scales and
# leaks test-set information into preprocessing.
# The earlier "subtract the column minimum" step is dropped: standardisation
# removes any constant offset, so it had no effect on the result.
scaler = preprocessing.StandardScaler()
norm_x_train_data = scaler.fit_transform(x_train[attr_selected])
norm_x_test_data = scaler.transform(x_test[attr_selected])

# Show both shapes (a bare expression that is not last in a cell is discarded)
norm_x_train_data.shape, y_train.shape

6. TRAINING INDIVIDUAL MODELS

In [None]:
# Hold out 20% of the scaled training rows as a label-stratified validation set
train_x, val_x, train_y, val_y = train_test_split(
    norm_x_train_data,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=0,
)

In [None]:
# Gaussian Naive Bayes: fit on the scaled training split, then show the
# mean accuracy on the scaled test split
nb = GaussianNB().fit(norm_x_train_data, y_train)
nb.score(norm_x_test_data, y_test)


In [None]:
# k-nearest neighbours with default settings: fit on the scaled training
# split, then show the mean accuracy on the scaled test split
knn = KNeighborsClassifier().fit(norm_x_train_data, y_train)
knn.score(norm_x_test_data, y_test)

In [None]:
# Support vector classifier with default settings: fit on the scaled training
# split, then show the mean accuracy on the scaled test split
svc = SVC().fit(norm_x_train_data, y_train)
svc.score(norm_x_test_data, y_test)

7. HYBRID MODEL


In [None]:
train_x,val_x,train_y,val_y=train_test_split(norm_x_train_data,y_train,stratify=y_train,test_size=0.2,random_state=42)

In [None]:
model1 = GaussianNB()
model1.fit(norm_x_train_data, y_train)


In [None]:
model2 = KNeighborsClassifier()
model2.fit(norm_x_train_data, y_train)


In [None]:
model3 = SVC()
model3.fit(norm_x_train_data, y_train)

In [None]:
y_pred1 = model1.predict(norm_x_test_data)
y_pred2 = model2.predict(norm_x_test_data)
y_pred3 = model3.predict(norm_x_test_data)



In [None]:
ensemble_X = pd.DataFrame({'NB': y_pred1, 'KNN': y_pred2, 'SVM': y_pred3})
ensemble_y = y_test


In [None]:
ensemble = RandomForestClassifier()
ensemble.fit(ensemble_X, ensemble_y)

In [None]:
y_pred = ensemble.predict(ensemble_X)

In [None]:
accuracy = accuracy_score(ensemble_y, y_pred)

print('Accuracy:', accuracy)