# INTRUSION DETECTION MODEL

## Import Modules

In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

## Load Dataset

In [2]:
train_df = pd.read_csv("Data/KDDSelectedTrain.csv")
test_df = pd.read_csv("Data/KDDSelectedTest.csv")

In [3]:
train_df.head()

Unnamed: 0,is_intrusion,flag_SF,dst_host_same_srv_rate,logged_in,serror_rate,count,service_http,service_private,dst_host_count,service_domain_u,srv_rerror_rate
0,0,1,0.0,0,0.0,13,0,0,255,0,0.0
1,1,0,0.1,0,1.0,123,0,1,255,0,0.0
2,0,1,1.0,1,0.2,5,1,0,30,0,0.0
3,0,1,1.0,1,0.0,30,1,0,255,0,0.0
4,1,0,0.07,0,0.0,121,0,1,255,0,1.0


In [4]:
test_df.head()

Unnamed: 0,is_intrusion,flag_SF,dst_host_same_srv_rate,logged_in,serror_rate,count,service_http,service_private,dst_host_count,service_domain_u,srv_rerror_rate
0,1,0,0.0,0,0.0,136,0,1,255,0,1.0
1,0,1,0.61,0,0.0,1,0,0,134,0,0.0
2,1,1,1.0,0,0.0,1,0,0,3,0,0.0
3,1,0,0.31,0,0.0,1,0,0,29,0,0.5
4,0,1,1.0,1,0.0,4,1,0,155,0,0.0


## Create Feature Set (Data Matrix X) and Target (1D Vector y)

In [13]:
y = train_df['is_intrusion']
X = train_df.drop(columns = 'is_intrusion')

## Standardize Data

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

## Train-Test split

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Benchmarking Classifiers

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

classifier_list = ['Logistic Regression', 'K Nearest Neighbors', 'Decision Tree']

classifiers = [
    LogisticRegression(n_jobs=-1, random_state=42),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42)
]

In [28]:
test_scores = {}
train_scores = {}
y_pred = {}

for name, clfr in zip(classifier_list, classifiers):
    clfr.fit(X_train, y_train)
    
    train_score = clfr.score(X_train, y_train)
    test_score = clfr.score(X_test, y_test)
    pred = clfr.predict(X_test)
    
    train_scores[name] = train_score
    test_scores[name] = test_score
    y_pred[name] = pred

  " = {}.".format(effective_n_jobs(self.n_jobs)))


In [29]:
print('Train Accuracies:')
train_scores

Train Accuracies:


{'Logistic Regression': 0.9588100459430227,
 'K Nearest Neighbors': 0.9821784732627484,
 'Decision Tree': 0.9861178641952033}

In [30]:
print('Test Accuracies:')
test_scores

Test Accuracies:


{'Logistic Regression': 0.9594760865251042,
 'K Nearest Neighbors': 0.9771780115102203,
 'Decision Tree': 0.9746775153800358}

## Create Classifier Model

In [33]:
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [35]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Training accuracy is ", train_score)
print("Testing accuracy is ", test_score)

Training accuracy is  0.9861178641952033
Testing accuracy is  0.9746775153800358


## Performance Measures

In [40]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_score)
print('F1 Score:',f1_score(y_test, y_pred))
print('Precision Score:',precision_score(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

Performance measures for test:
--------
Accuracy: 0.9746775153800358
F1 Score: 0.9727117194183063
Precision Score: 0.9827154092126869
Recall Score: 0.9629096451858752
Confusion Matrix:
 [[13186   200]
 [  438 11371]]


## Model Evaluation

Now we run our model against some data that it has not seen before ie. test_df. We can think of this data as new network traffic.

In [46]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns = 'is_intrusion')