In [353]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from imblearn.over_sampling import SMOTE
from collections import Counter

### Read data

In [354]:
# Load the intrusion-detection dataset from CSV into a DataFrame
DATA_FILE = 'Intrusion Detection.csv'
df = pd.read_csv(DATA_FILE)

In [355]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Class
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0


### Understand dataset

In [356]:
# Number of rows (connection records) in the dataset
df.shape[0]

97308

In [357]:
# List the column names: the feature columns plus the 'Class' label
df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'Class'],
      dtype='object')

In [358]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Class
count,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0,...,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0,97308.0
mean,216.618798,1157.123,3385.562,1e-05,0.0,3.1e-05,0.045135,0.000185,0.719047,0.02899,...,202.006084,0.845039,0.056462,0.134121,0.024139,0.002121,0.001068,0.057706,0.055819,0.000308
std,1359.006741,34220.86,37573.05,0.003206,0.0,0.009617,0.859471,0.020775,0.449467,4.046376,...,86.965239,0.305171,0.180003,0.280997,0.049664,0.029417,0.015721,0.224963,0.218861,0.017556
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,147.0,136.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,170.0,0.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,231.0,421.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,1.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
75%,0.0,313.0,2124.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,1.0,0.01,0.07,0.03,0.0,0.0,0.0,0.0,0.0
max,58329.0,2194619.0,5134218.0,1.0,0.0,3.0,30.0,4.0,1.0,884.0,...,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [359]:
# Count the rows in each class (bad vs. normal connections) to gauge the imbalance
df['Class'].value_counts()

0    97278
1       30
Name: Class, dtype: int64

In [360]:
# Inspect dtypes and non-null counts to separate numeric columns from
# non-numeric (object) ones that need encoding before modelling
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97308 entries, 0 to 97307
Data columns (total 42 columns):
duration                       97308 non-null int64
protocol_type                  97308 non-null object
service                        97308 non-null object
flag                           97308 non-null object
src_bytes                      97308 non-null int64
dst_bytes                      97308 non-null int64
land                           97308 non-null int64
wrong_fragment                 97308 non-null int64
urgent                         97308 non-null int64
hot                            97308 non-null int64
num_failed_logins              97308 non-null int64
logged_in                      97308 non-null int64
num_compromised                97308 non-null int64
root_shell                     97308 non-null int64
su_attempted                   97308 non-null int64
num_root                       97308 non-null int64
num_file_creations             97308 non-null int64
num_

In [361]:
#df = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'])

### Converting Categorical Features

In [362]:
# One-hot encode each categorical column; drop_first=True removes the first
# level of each, yielding k-1 indicator columns for k levels
P_type, Service, Flag = (
    pd.get_dummies(df[column], drop_first=True)
    for column in ('protocol_type', 'service', 'flag')
)

In [363]:
# Remove the raw categorical columns now that their encoded versions exist
df = df.drop(columns=['protocol_type', 'service', 'flag'])

In [364]:
# Append the one-hot encoded columns to the remaining features.
# The encoded frames are named P_type, Service and Flag where they are created;
# referring to them by any other name raises a NameError on a fresh kernel.
df = pd.concat([df, P_type, Service, Flag], axis=1)

### Test Train Split

In [365]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
features = df.drop('Class', axis=1)
target = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=101
)

In [366]:
# Check the class counts in the training labels after the split
y_train.value_counts()

0    68092
1       23
Name: Class, dtype: int64

### Training and Predicting

In [367]:
# Initialize a logistic regression classifier with the library's default settings
lr = LogisticRegression()

In [368]:
# Fit the model on the original (not resampled) training split
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [369]:
# Predict class labels for the held-out test set
y_pred = lr.predict(X_test)

### Evaluate the model

In [370]:
# Confusion matrix: rows are the true classes, columns the predicted classes
confusion_matrix(y_test, y_pred)

array([[29184,     2],
       [    7,     0]])

In [371]:
# Overall accuracy; with classes this imbalanced it mostly reflects the
# majority class, so read it together with recall
accuracy_score(y_test, y_pred)

0.9996917069160415

In [372]:
# Recall for the positive class (label 1): the share of actual positives
# that the model correctly predicted
recall_score(y_test, y_pred)

0.0

### Address class imbalance by over-sampling with SMOTE

In [373]:
# Oversample the minority class in the TRAINING split only; X_test / y_test are
# not touched, so evaluation still runs on real, unsynthesised records.
# - `fit_resample` is the supported method name; `fit_sample` was a deprecated
#   alias that newer imbalanced-learn releases no longer provide.
# - A fixed `random_state` makes the synthetic samples reproducible across runs.
smt = SMOTE(random_state=101)
X_train, y_train = smt.fit_resample(X_train, y_train)
print('Resampled dataset shape {}'.format(Counter(y_train)))

Resampled dataset shape Counter({0: 68092, 1: 68092})


In [374]:
# Re-initialize the model and fit it on the SMOTE-resampled training data
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [375]:
# Predict on the same held-out test set (which was not resampled)
y_pred = lr.predict(X_test)

In [376]:
# Confusion matrix for the model trained on resampled data
# (rows are the true classes, columns the predicted classes)
confusion_matrix(y_test, y_pred)

array([[28891,   295],
       [    0,     7]])

In [377]:
# Overall accuracy of the model trained on resampled data
accuracy_score(y_test, y_pred)

0.989894837803583

In [378]:
# Recall for the positive class (label 1) of the model trained on resampled data
recall_score(y_test, y_pred)

1.0

### Findings

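By applying SMOTE to the training split, I was able to create a balanced training set (0: 68092, 1: 68092). This slightly decreased overall accuracy on the test set (from 0.9997 to 0.9899) but increased recall from 0.0 to 1.0. The trade-off is more false positives: 295 class-0 test records were predicted as class 1, compared with 2 before resampling.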