In [None]:
# import relevant modules
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import imblearn

# Silence every warning for the remainder of the session
import warnings
warnings.filterwarnings('ignore')

# Display options: show all DataFrame columns; print NumPy arrays in full
# with three decimals
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=np.inf, precision=3)

# Plot styling: the seaborn theme is applied first and the label sizes are
# set afterwards, in the same order as before
sns.set(style="darkgrid")
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Mount Google Drive at /content/drive (only available inside Colab)
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Location of the CSV on the mounted Drive
DATA_CSV_PATH = "/content/drive/MyDrive/intrusion-data/data.csv"

# Read the whole file into a DataFrame
data = pd.read_csv(DATA_CSV_PATH)

In [None]:
# Keep only the rows labelled 'smurf', 'neptune' or 'normal'; every other
# class is dropped.
KEPT_CLASSES = ['smurf', 'neptune', 'normal']
keep_mask = data['class'].isin(KEPT_CLASSES)
data = data.loc[keep_mask]

# Row count and a preview of the filtered frame
print(data.shape[0])
print(data.head())

# Labels that remain after filtering (shown as the cell output)
data['class'].unique()

485269
   duration protocol_type service flag  src_bytes  dst_bytes  land  \
0         0           tcp    http   SF        181       5450     0   
1         0           tcp    http   SF        239        486     0   
2         0           tcp    http   SF        235       1337     0   
3         0           tcp    http   SF        219       1337     0   
4         0           tcp    http   SF        217       2032     0   

   wrong_fragment  urgent  hot  num_failed_logins  logged_in  num_compromised  \
0               0       0    0                  0          1                0   
1               0       0    0                  0          1                0   
2               0       0    0                  0          1                0   
3               0       0    0                  0          1                0   
4               0       0    0                  0          1                0   

   root_shell  su_attempted  num_root  num_file_creations  num_shells  \
0           

array(['normal', 'neptune', 'smurf'], dtype=object)

# SCALING

In [None]:
from sklearn.preprocessing import StandardScaler

# Pull out the int64/float64 columns once and reuse the selection
numeric_part = data.select_dtypes(include=['float64', 'int64'])
cols = numeric_part.columns

# Standardise each numeric column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split that happens later in the notebook - confirm this is intended.
scaler = StandardScaler()
sc_train = scaler.fit_transform(numeric_part)

# Wrap the scaled array in a DataFrame carrying the original column names
# (the new frame gets a default 0..n-1 index)
sc_traindf = pd.DataFrame(data=sc_train, columns=cols)

# ENCODING

In [None]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Extract the categorical (object-dtype) columns, including the 'class' target
cattrain = data.select_dtypes(include=['object']).copy()
feature_cols = cattrain.columns.drop('class')

# Encode the categorical *features* with a single fitted OrdinalEncoder.
# Previously one LabelEncoder was re-fitted on each column in turn, so only the
# mapping of the last column survived and the feature encodings could not be
# re-applied or inverted. OrdinalEncoder keeps one sorted category list per
# column (feature_encoder.categories_); dtype=np.int64 keeps integer codes.
feature_encoder = OrdinalEncoder(dtype=np.int64)
enctrain = pd.DataFrame(
    feature_encoder.fit_transform(cattrain[feature_cols]),
    columns=feature_cols,
    index=cattrain.index,
)

# Encode the target with its own LabelEncoder; `encoder` therefore always
# holds the class mapping (encoder.classes_), whatever the column order.
encoder = LabelEncoder()
cat_Ytrain = pd.DataFrame(
    {'class': encoder.fit_transform(cattrain['class'])},
    index=cattrain.index,
)

# Full encoded frame (features + target) in the original column order
traincat = pd.concat([enctrain, cat_Ytrain], axis=1)[cattrain.columns]

In [None]:
# Give both feature frames a plain 0..n-1 index so the column-wise concat lines
# rows up by position. Reassigning (instead of inplace=True) keeps the cell
# safe to re-run and avoids silently mutating frames built in earlier cells.
sc_traindf = sc_traindf.reset_index(drop=True)
enctrain = enctrain.reset_index(drop=True)

# Scaled numeric features followed by the encoded categorical features
train_x = pd.concat([sc_traindf, enctrain], axis=1)
print(len(train_x))
print(len(enctrain))
print(len(sc_traindf))

# Target labels. `data` still carries the index left over from the class
# filter, so reset it as well: train_x and train_y then share the same index
# instead of agreeing only by position.
train_y = data['class'].reset_index(drop=True)
assert len(train_x) == len(train_y), "feature/target row counts differ"

print(train_x.head())

485269
485269
485269
   duration  src_bytes  dst_bytes      land  wrong_fragment    urgent  \
0 -0.070654  -0.038642   0.282684 -0.001436             0.0 -0.001436   
1 -0.070654  -0.034859  -0.011404 -0.001436             0.0 -0.001436   
2 -0.070654  -0.035120   0.039013 -0.001436             0.0 -0.001436   
3 -0.070654  -0.036164   0.039013 -0.001436             0.0 -0.001436   
4 -0.070654  -0.036294   0.080187 -0.001436             0.0 -0.001436   

       hot  num_failed_logins  logged_in  num_compromised  root_shell  \
0 -0.02319          -0.003987   2.436896        -0.003157   -0.006885   
1 -0.02319          -0.003987   2.436896        -0.003157   -0.006885   
2 -0.02319          -0.003987   2.436896        -0.003157   -0.006885   
3 -0.02319          -0.003987   2.436896        -0.003157   -0.006885   
4 -0.02319          -0.003987   2.436896        -0.003157   -0.006885   

   su_attempted  num_root  num_file_creations  num_shells  num_access_files  \
0     -0.004532 -0.005

# DATASET PARTITION

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    train_x,
    train_y,
    train_size=0.70,
    random_state=2,
)

K-Nearest Neighbors model to classify by attack type.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build the classifier (n_jobs=-1) and fit it on the training split in one step;
# fit() hands back the estimator itself.
KNN_Classifier = KNeighborsClassifier(n_jobs=-1).fit(X_train, Y_train)

# Predict the label of every row in the held-out split
Y_Predict = KNN_Classifier.predict(X_test)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the KNN estimator on the training split
scores = cross_val_score(estimator=KNN_Classifier, X=X_train, y=Y_train, cv=10)

# Report the average score across the folds
mean_cv_score = scores.mean()
print("Cross Validation Mean Score:\n", mean_cv_score)

Cross Validation Mean Score:
 0.9999440661290094


In [None]:
from sklearn import metrics

# Score the held-out predictions against the true labels
accuracy = metrics.accuracy_score(Y_test, Y_Predict)
precision = metrics.precision_score(Y_test, Y_Predict, average='weighted')
confusion_matrix = metrics.confusion_matrix(Y_test, Y_Predict)
classification = metrics.classification_report(Y_test, Y_Predict)

# Each section is printed as "<heading>\n <value>" followed by a blank line
report_sections = [
    ("Model Accuracy:\n", accuracy),
    ("Model Precision:\n", precision),
    ("Confusion matrix:\n", confusion_matrix),
    ("Classification report:\n", classification),
]

print()
print('============================== Model Evaluation ==============================')
print()
for heading, value in report_sections:
    print(heading, value)
    print()



Model Accuracy:
 0.9999656548588072

Model Precision:
 0.9999656548767031

Confusion matrix:
 [[32361     1     0]
 [    1 28832     3]
 [    0     0 84383]]

Classification report:
               precision    recall  f1-score   support

     neptune       1.00      1.00      1.00     32362
      normal       1.00      1.00      1.00     28836
       smurf       1.00      1.00      1.00     84383

    accuracy                           1.00    145581
   macro avg       1.00      1.00      1.00    145581
weighted avg       1.00      1.00      1.00    145581


