In [2]:
cd C:\\Users\\Sharuka Thirimanne\\Desktop\\FYP-ML-IDS

C:\Users\Sharuka Thirimanne\Desktop\FYP-ML-IDS


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [4]:
def load_train_dataset():
    df_train = pd.read_csv('Datasets/UNSW_NB15_training-set.csv')
    df_test = pd.read_csv('Datasets/UNSW_NB15_testing-set.csv')
    df = pd.concat([df_train,df_test], axis=0,sort=False)
    df.drop(['id'], axis = 'columns' , inplace = True)
    
    #Below categories in the state feature are available in the test set but not in the training set
    df.drop(df[df['state'] == 'CLO'].index, inplace = True)
    df.drop(df[df['state'] == 'ACC'].index, inplace = True)
    limit = df.shape[1]-1
    
    X = df.iloc[:,0:limit]
    Y = df.iloc[:,limit]
    
    return  X, Y, df

X, Y, df = load_train_dataset()

In [5]:
ohe = OneHotEncoder()
categorical_cols = ['proto','service','state','attack_cat']
array_hot_encoded = ohe.fit_transform(X[categorical_cols]).toarray()
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=X.index)
X = X.drop(columns=categorical_cols)
X = pd.concat([X,data_hot_encoded], axis=1)

In [6]:
X_train, X_test , Y_train , Y_test = train_test_split(X, Y, test_size=0.319,shuffle=False)

print('Training X :',X_train.shape)
print('Test_X :',X_test.shape)
print('Training_Y :',Y_train.shape)
print('Test_Y :',Y_test.shape)

Training X : (175468, 204)
Test_X : (82195, 204)
Training_Y : (175468,)
Test_Y : (82195,)


In [3]:
print('X_train shape:',X_train.shape)
print('X_test shape:',X_test.shape)
print('Y_train shape:',Y_train.shape)
print('Y_test shape:',Y_test.shape)

X_train shape: (175341, 43)
X_test shape: (82332, 43)
Y_train shape: (175341,)
Y_test shape: (82332,)


In [4]:
print('X_train shape:',X_train.shape)
print('X_test shape:',X_test.shape)
print('Y_train shape:',Y_train.shape)
print('Y_test shape:',Y_test.shape)

X_train shape: (175341, 43)
X_test shape: (82332, 43)
Y_train shape: (175341,)
Y_test shape: (82332,)


In [None]:
model = SVC(C= 50, gamma= 0.05,verbose=True).fit(X_train,Y_train)

[LibSVM]

In [5]:
model_poly = SVC(C= 15, kernel= 'poly',degree=4,gamma=0.5,verbose=True).fit(X_train,Y_train)

[LibSVM]

In [None]:
model_linear = SVC(C= 10, kernel= 'linear',verbose=True).fit(X_train,Y_train)

[LibSVM]

In [None]:
print('RBF Kernel')
test_predictions = model.predict(X_test)
print("Train set accuracy           : {:.6f} %".format(model.score(X_train,Y_train)*100))
print("Cross-Validation set accuracy: {:.6f} %".format((cross_val_score(model, X_train, Y_train, cv=3,scoring='accuracy').mean())*100))
print("Test set accuracy            : {:.6f} %".format(accuracy_score(Y_test, test_predictions)*100))

In [None]:
print('Poly Kernel')
test_predictions_poly = model_poly.predict(X_test)
print("Train set accuracy           : {:.6f} %".format(model_poly.score(X_train,Y_train)*100))
print("Cross-Validation set accuracy: {:.6f} %".format((cross_val_score(model_poly, X_train, Y_train, cv=3,scoring='accuracy').mean())*100))
print("Test set accuracy            : {:.6f} %".format(accuracy_score(Y_test, test_predictions_poly)*100))

In [None]:
print('Linear Kernel')
test_predictions_linear = model_linear.predict(X_test)
print("Train set accuracy           : {:.6f} %".format(model_linear.score(X_train,Y_train)*100))
print("Cross-Validation set accuracy: {:.6f} %".format((cross_val_score(model_linear, X_train, Y_train, cv=3,scoring='accuracy').mean())*100))
print("Test set accuracy            : {:.6f} %".format(accuracy_score(Y_test, test_predictions_linear)*100))

In [None]:
predictions = [test_predictions,test_predictions_poly,test_predictions_linear]
kernel_names = ['RBF','Poly','Linear']

def plot_cm(labels, predictions,kernel_names, p=0.85):
    fig, axs = plt.subplots(nrows = 1,ncols = 3, figsize=(20, 5))
    for i in range(0,3): 
        pred = predictions[i]
        k_names = kernel_names[i]
        cm = confusion_matrix(labels, pred > p)
        sns.heatmap(cm, annot=True, fmt="d",ax=axs[i])
        axs[i].set(xlabel='Predicted label',ylabel='Actual label',title=str('Confusion matrix @{:.2f}'.format(p))+' '+ str(k_names))
        
        axs[i].text(0.3,2.6,'True Negatives  : '+ str(cm[0][0]),size=18)
        axs[i].text(0.3,2.8,'False Positives   : '+ str(cm[0][1]),size=18)
        axs[i].text(0.3,3,'False Negatives : '+ str(cm[1][0]),size=18)
        axs[i].text(0.3,3.2,'True Positives    : '+ str(cm[1][1]),size=18)

In [None]:
plot_cm(test_Y, predictions,kernel_names)

In [None]:
print('RBF Kernel Classification Report')
print(classification_report(Y_test, test_predictions))
print('\n')
print('Poly Kernel Classification Report')
print(classification_report(Y_test, test_predictions_poly))
print('\n')
print('Linear Kernel Classification Report')
print(classification_report(Y_test, test_predictions_linear))