# INTRUSION DETECTION MODEL

## Import Modules

In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

## Load Dataset

In [2]:
train_df = pd.read_csv("Data/KDDSelectedTrain.csv")
test_df = pd.read_csv("Data/KDDSelectedTest.csv")

In [3]:
train_df.head()

Unnamed: 0,is_intrusion,flag_SF,same_srv_rate,dst_host_srv_count,logged_in,serror_rate,count,service_http,service_private,dst_host_count,service_domain_u,srv_rerror_rate
0,0,1,0.08,1,0,0.0,13,0,0,255,0,0.0
1,1,0,0.05,26,0,1.0,123,0,1,255,0,0.0
2,0,1,1.0,255,1,0.2,5,1,0,30,0,0.0
3,0,1,1.0,255,1,0.0,30,1,0,255,0,0.0
4,1,0,0.16,19,0,0.0,121,0,1,255,0,1.0


In [4]:
test_df.head()

Unnamed: 0,is_intrusion,flag_SF,same_srv_rate,dst_host_srv_count,logged_in,serror_rate,count,service_http,service_private,dst_host_count,service_domain_u,srv_rerror_rate
0,1,0,0.01,1,0,0.0,136,0,1,255,0,1.0
1,0,1,1.0,86,0,0.0,1,0,0,134,0,0.0
2,1,1,1.0,57,0,0.0,1,0,0,3,0,0.0
3,1,0,1.0,86,0,0.0,1,0,0,29,0,0.5
4,0,1,1.0,255,1,0.0,4,1,0,155,0,0.0


## Create Feature Set (Data Matrix X) and Target (1D Vector y)

In [5]:
y = train_df['is_intrusion']
X = train_df.drop(columns = 'is_intrusion')

## Standardize Data

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

## Train-Test split

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict

## Benchmarking Classifiers

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

classifier_list = ['Logistic Regression', 'K Nearest Neighbors', 'Decision Tree']

classifiers = [
    LogisticRegression(n_jobs=-1, random_state=42),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42)
]

In [10]:
test_scores = {}
train_scores = {}
y_pred = {}

for name, clfr in zip(classifier_list, classifiers):
    clfr.fit(X_train, y_train)
    
    train_score = clfr.score(X_train, y_train)
    test_score = clfr.score(X_test, y_test)
    pred = clfr.predict(X_test)
    
    train_scores[name] = train_score
    test_scores[name] = test_score
    y_pred[name] = pred

  " = {}.".format(effective_n_jobs(self.n_jobs)))


In [11]:
print('Train Accuracies:')
train_scores

Train Accuracies:


{'Logistic Regression': 0.9410976710955873,
 'K Nearest Neighbors': 0.9866239320479872,
 'Decision Tree': 0.9926471317860226}

In [12]:
print('Test Accuracies:')
test_scores

Test Accuracies:


{'Logistic Regression': 0.94129787656281,
 'K Nearest Neighbors': 0.9824568366739432,
 'Decision Tree': 0.9792022226632269}

## Create Classifier Model

## Hyperparameter Tuning

In [13]:
%%time
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
param_grid = {'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],
              'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

dt_clf = DecisionTreeClassifier()

dt_clf_cv = GridSearchCV(dt_clf, param_grid, scoring='f1', cv=10, verbose=0, n_jobs=-1)
dt_clf_cv.fit(X_train, y_train)

params_optimal = dt_clf_cv.best_params_

print("Best Score (accuracy): %f" % dt_clf_cv.best_score_)
print("Optimal Hyperparameter Values: ", params_optimal)
print("\n")

Best Score (accuracy): 0.977468
Optimal Hyperparameter Values:  {'max_depth': 10, 'min_samples_leaf': 2}


Wall time: 45.7 s


In [14]:
model = DecisionTreeClassifier(**params_optimal)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [15]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Training accuracy is ", train_score)
print("Testing accuracy is ", test_score)

Training accuracy is  0.9819403236849678
Testing accuracy is  0.9797578884699345


## Performance Measures

In [16]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_score)
print('F1 Score:',f1_score(y_test, y_pred))
print('Precision Score:',precision_score(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

Performance measures for test:
--------
Accuracy: 0.9797578884699345
F1 Score: 0.9782793867120955
Precision Score: 0.984063062291149
Recall Score: 0.9725632991785926
Confusion Matrix:
 [[13200   186]
 [  324 11485]]


## Model Evaluation

Now we run our model against some data that it has not seen before ie. test_df. We can think of this data as new network traffic.

In [17]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns = 'is_intrusion')

In [18]:
# Predict on the new unseen test data
y_evalpred = model.predict(X_eval)

In [20]:
# Performance measures
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', model.score(X_eval, y_eval))
print('F1 Score:',f1_score(y_eval, y_evalpred))
print('Precision Score:',precision_score(y_eval, y_evalpred))
print('Recall Score:', recall_score(y_eval, y_evalpred))
print('Confusion Matrix:\n', confusion_matrix(y_eval, y_evalpred))

Performance measures for test:
--------
Accuracy: 0.7745641662600363
F1 Score: 0.7969149616368287
Precision Score: 0.8178313648293963
Recall Score: 0.777041770573566
Confusion Matrix:
 [[7490 2221]
 [2861 9971]]


It is clear that our model is overfitting. 