#                   Intrusion detection system

An Intrusion Detection System (IDS) is a software application that detects network intrusions using various machine learning algorithms. An IDS monitors a network or system for malicious activity and protects a computer network from unauthorized access by users, possibly including insiders. The intrusion detector learning task is to build a predictive model (i.e. a classifier) capable of distinguishing between ‘bad’ connections (intrusions/attacks) and ‘good’ (normal) connections.

#####   --->>The task is to build a network intrusion detector, a predictive model capable of distinguishing between bad connections, called intrusions or attacks, and good normal connections.

#### Attacks fall into four main categories:
     DOS: denial-of-service, e.g. syn flood;
     R2L: unauthorized access from a remote machine, e.g. guessing password;
     U2R: unauthorized access to local superuser (root) privileges, e.g. “buffer overflow” attacks;
     probing: surveillance and other probing, e.g. port scanning.

#### Dataset Used : KDD Cup 1999 dataset
    https://www.kaggle.com/datasets/galaxyh/kdd-cup-1999-data

![title](photo1.jpeg)

![title](photo2.jpeg)

![title](photo3.jpeg)

##### Importing packages

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time 

##### Reading data files and preparing dataset for evaluation

In [2]:
# Reading features list
# The path is assembled with os.path.join: the previous literal "KDD-data\kddcup.names"
# depended on the invalid escape sequence "\k" and on a Windows path separator.
names_path = os.path.join("KDD-data", "kddcup.names")
with open(names_path, 'r') as f:
    print(f.read())

back,buffer_overflow,ftp_write,guess_passwd,imap,ipsweep,land,loadmodule,multihop,neptune,nmap,normal,perl,phf,pod,portsweep,rootkit,satan,smurf,spy,teardrop,warezclient,warezmaster.
duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: con

In [3]:
# Column names for the dataset: the feature names listed below, followed by 'target'.
cols ="""duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins, 
logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,
is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,
srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,
dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate"""

# Strip the whitespace/newlines left around each name by the multi-line literal,
# discard empty pieces, then add the label column at the end.
columns = [*filter(None, map(str.strip, cols.split(','))), 'target']
print(len(columns))

42


In [4]:
# Read the 'training_attack_types' file
# os.path.join keeps the path portable; the previous literal hard-coded a
# Windows backslash separator.
attack_types_path = os.path.join("KDD-data", "training_attack_types")
with open(attack_types_path, 'r') as f:
    print(f.read())

back dos
buffer_overflow u2r
ftp_write r2l
guess_passwd r2l
imap r2l
ipsweep probe
land dos
loadmodule u2r
multihop r2l
neptune dos
nmap probe
perl u2r
phf r2l
pod dos
portsweep probe
rootkit u2r
satan probe
smurf dos
spy r2l
teardrop dos
warezclient r2l
warezmaster r2l




In [5]:
# creating dictionary of training_attack_types
# Each row is "<label> <category>"; 'normal' maps to itself so that
# non-attack records can be looked up in the same table.
_ATTACK_TYPE_ROWS = """\
normal normal
back dos
buffer_overflow u2r
ftp_write r2l
guess_passwd r2l
imap r2l
ipsweep probe
land dos
loadmodule u2r
multihop r2l
neptune dos
nmap probe
perl u2r
phf r2l
pod dos
portsweep probe
rootkit u2r
satan probe
smurf dos
spy r2l
teardrop dos
warezclient r2l
warezmaster r2l
"""
attacks_types = dict(row.split() for row in _ATTACK_TYPE_ROWS.splitlines())

In [6]:
# Load the 10% KDD Cup 1999 subset; the file has no header row, so the
# column names collected earlier are supplied explicitly.
# os.path.join replaces the literal "KDD-data\kddcup..." whose "\k" is an
# invalid escape sequence and whose separator only works on Windows.
path = os.path.join("KDD-data", "kddcup.data_10_percent_corrected")
kdd_df = pd.read_csv(path, names=columns)

In [8]:
# Preview the first rows, transposed so that each feature appears as a row.
kdd_df.head().T

Unnamed: 0,0,1,2,3,4
duration,0,0,0,0,0
protocol_type,tcp,tcp,tcp,tcp,tcp
service,http,http,http,http,http
flag,SF,SF,SF,SF,SF
src_bytes,181,239,235,219,217
dst_bytes,5450,486,1337,1337,2032
land,0,0,0,0,0
wrong_fragment,0,0,0,0,0
urgent,0,0,0,0,0
hot,0,0,0,0,0


In [10]:
# Add Attack Type column to DataFrame
# The last character of each raw label is removed before the dictionary lookup
# (labels in this file carry a trailing '.', e.g. 'normal.').
kdd_df['Attack_Type'] = [attacks_types[label[:-1]] for label in kdd_df['target']]
kdd_df.tail(20)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target,Attack_Type
494001,0,tcp,http,S0,0,0,0,0,0,0,...,1.0,0.0,0.15,0.04,0.08,0.0,0.08,0.0,normal.,normal
494002,0,tcp,http,SF,215,2649,0,0,0,0,...,1.0,0.0,0.04,0.04,0.04,0.0,0.04,0.0,normal.,normal
494003,0,tcp,http,SF,341,326,0,0,0,0,...,1.0,0.0,1.0,0.05,0.0,0.01,0.0,0.0,normal.,normal
494004,0,tcp,http,SF,341,1943,0,0,0,0,...,1.0,0.0,0.09,0.05,0.0,0.01,0.0,0.0,normal.,normal
494005,0,tcp,http,SF,341,1663,0,0,0,0,...,1.0,0.0,0.05,0.05,0.0,0.01,0.0,0.0,normal.,normal
494006,0,tcp,http,SF,235,501,0,0,0,0,...,1.0,0.0,0.5,0.05,0.0,0.01,0.0,0.0,normal.,normal
494007,0,tcp,http,SF,320,13828,0,0,0,0,...,1.0,0.0,0.1,0.05,0.0,0.01,0.0,0.0,normal.,normal
494008,0,tcp,http,SF,319,1435,0,0,0,0,...,1.0,0.0,0.17,0.07,0.0,0.01,0.0,0.0,normal.,normal
494009,0,tcp,http,SF,335,3435,0,0,0,0,...,1.0,0.0,0.06,0.07,0.0,0.01,0.0,0.0,normal.,normal
494010,0,tcp,http,SF,291,236,0,0,0,0,...,1.0,0.0,0.04,0.06,0.0,0.01,0.0,0.0,normal.,normal


### Exploratory data analysis

In [11]:
# Print the column names, non-null counts and dtypes of the dataset.
kdd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494021 entries, 0 to 494020
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     494021 non-null  int64  
 1   protocol_type                494021 non-null  object 
 2   service                      494021 non-null  object 
 3   flag                         494021 non-null  object 
 4   src_bytes                    494021 non-null  int64  
 5   dst_bytes                    494021 non-null  int64  
 6   land                         494021 non-null  int64  
 7   wrong_fragment               494021 non-null  int64  
 8   urgent                       494021 non-null  int64  
 9   hot                          494021 non-null  int64  
 10  num_failed_logins            494021 non-null  int64  
 11  logged_in                    494021 non-null  int64  
 12  num_compromised              494021 non-null  int64  
 13 

In [12]:
# Number of missing values in each column.
kdd_df.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [13]:
# Boolean Series marking rows that repeat an earlier row.
kdd_df.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
494016    False
494017    False
494018    False
494019    False
494020    False
Length: 494021, dtype: bool

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
kdd_df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,...,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0
mean,47.979302,3025.61,868.5324,4.5e-05,0.006433,1.4e-05,0.034519,0.000152,0.148247,0.010212,...,232.470778,188.66567,0.75378,0.030906,0.601935,0.006684,0.176754,0.176443,0.058118,0.057412
std,707.746472,988218.1,33040.0,0.006673,0.134805,0.00551,0.782103,0.01552,0.355345,1.798326,...,64.74538,106.040437,0.410781,0.109259,0.481309,0.042133,0.380593,0.380919,0.23059,0.23014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,46.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,693375600.0,5155468.0,1.0,3.0,3.0,30.0,5.0,1.0,884.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# List every column now present, including 'target' and 'Attack_Type'.
kdd_df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'target', 'Attack_Type'],
      dtype='object')

In [16]:
# Finding categorical features
# select_dtypes is the public replacement for the private
# DataFrame._get_numeric_data(); bool is included to keep the same notion of
# "numeric" (the dataset shown here has no bool columns either way).
numerical_cols = kdd_df.select_dtypes(include=['number', 'bool']).columns

# Everything that is neither numeric nor one of the two label columns is a
# categorical feature. Walking kdd_df.columns keeps the DataFrame's column
# order; the previous set difference produced an order that could change
# from run to run.
label_cols = ('target', 'Attack_Type')
categorical_cols = [
    col for col in kdd_df.columns
    if col not in numerical_cols and col not in label_cols
]

In [18]:
# Display the categorical feature names found above.
categorical_cols

['flag', 'protocol_type', 'service']

In [17]:
# Display the numeric feature names found above.
numerical_cols

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')

In [19]:
# Distinct values of the 'service' feature.
kdd_df['service'].unique()

array(['http', 'smtp', 'finger', 'domain_u', 'auth', 'telnet', 'ftp',
       'eco_i', 'ntp_u', 'ecr_i', 'other', 'private', 'pop_3', 'ftp_data',
       'rje', 'time', 'mtp', 'link', 'remote_job', 'gopher', 'ssh',
       'name', 'whois', 'domain', 'login', 'imap4', 'daytime', 'ctf',
       'nntp', 'shell', 'IRC', 'nnsp', 'http_443', 'exec', 'printer',
       'efs', 'courier', 'uucp', 'klogin', 'kshell', 'echo', 'discard',
       'systat', 'supdup', 'iso_tsap', 'hostnames', 'csnet_ns', 'pop_2',
       'sunrpc', 'uucp_path', 'netbios_ns', 'netbios_ssn', 'netbios_dgm',
       'sql_net', 'vmnet', 'bgp', 'Z39_50', 'ldap', 'netstat', 'urh_i',
       'X11', 'urp_i', 'pm_dump', 'tftp_u', 'tim_i', 'red_i'],
      dtype=object)

In [20]:
# Distinct values of the 'flag' feature.
kdd_df['flag'].unique()

array(['SF', 'S1', 'REJ', 'S2', 'S0', 'S3', 'RSTO', 'RSTR', 'RSTOS0',
       'OTH', 'SH'], dtype=object)

In [21]:
# Non-null count of every column, grouped by protocol type.
kdd_df.groupby(['protocol_type']).count()

Unnamed: 0_level_0,duration,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target,Attack_Type
protocol_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
icmp,283602,283602,283602,283602,283602,283602,283602,283602,283602,283602,...,283602,283602,283602,283602,283602,283602,283602,283602,283602,283602
tcp,190065,190065,190065,190065,190065,190065,190065,190065,190065,190065,...,190065,190065,190065,190065,190065,190065,190065,190065,190065,190065
udp,20354,20354,20354,20354,20354,20354,20354,20354,20354,20354,...,20354,20354,20354,20354,20354,20354,20354,20354,20354,20354


##### Statistical analysis

In [None]:
# Records per protocol, most frequent first.
# value_counts() tallies every protocol in one pass and returns the result
# already sorted in descending order, replacing one full boolean-mask scan of
# kdd_df per distinct value.
protocol_counts = kdd_df['protocol_type'].value_counts()
df2 = protocol_counts.rename_axis('Protocol').reset_index(name='Count')

# Bar chart of (at most) the 50 most frequent protocols.
plt.figure(figsize=(25,15))
sns.barplot(x=df2['Protocol'].head(50), y=df2['Count'].head(50))
plt.xticks(rotation= 90)
plt.xlabel('Protocol')
plt.ylabel('Count')
plt.show()

In [None]:
# Display the per-protocol record counts computed above.
df2['Count']

In [None]:
# Records per service, most frequent first.
# A single value_counts() pass replaces one full boolean-mask scan of kdd_df
# per distinct service; the result is already sorted in descending order.
service_counts = kdd_df['service'].value_counts()
df3 = service_counts.rename_axis('Service').reset_index(name='Count')

# Bar chart of the 50 most frequent services.
plt.figure(figsize=(25,15))
sns.barplot(x=df3['Service'].head(50), y=df3['Count'].head(50))
plt.xticks(rotation= 90)
plt.xlabel('Service')
plt.ylabel('Count')
plt.show()

In [None]:
# Records per connection flag, most frequent first.
# A single value_counts() pass replaces one full boolean-mask scan of kdd_df
# per distinct flag; the result is already sorted in descending order.
flag_counts = kdd_df['flag'].value_counts()
df4 = flag_counts.rename_axis('Flag').reset_index(name='Count')

# Bar chart of (at most) the 50 most frequent flags.
plt.figure(figsize=(25,15))
sns.barplot(x=df4['Flag'].head(50), y=df4['Count'].head(50))
plt.xticks(rotation= 90)
plt.xlabel('Flag')
plt.ylabel('Count')
plt.show()

In [None]:
# Records per value of 'logged_in', most frequent first.
# A single value_counts() pass replaces one full boolean-mask scan of kdd_df
# per distinct value; the result is already sorted in descending order.
logged_in_counts = kdd_df['logged_in'].value_counts()
df5 = logged_in_counts.rename_axis('Logged_In').reset_index(name='Count')

# Bar chart of the counts.
plt.figure(figsize=(25,15))
sns.barplot(x=df5['Logged_In'].head(50), y=df5['Count'].head(50))
plt.xticks(rotation= 90)
plt.xlabel('Logged In')
plt.ylabel('Count')
plt.show()

In [None]:
# Records per raw target label, most frequent first.
# A single value_counts() pass replaces one full boolean-mask scan of kdd_df
# per distinct label; the result is already sorted in descending order.
target_counts = kdd_df['target'].value_counts()
df6 = target_counts.rename_axis('Target').reset_index(name='Count')

# Bar chart of (at most) the 50 most frequent labels.
plt.figure(figsize=(25,15))
sns.barplot(x=df6['Target'].head(50), y=df6['Count'].head(50))
plt.xticks(rotation= 90)
plt.xlabel('Target')
plt.ylabel('Count')
plt.show()

In [None]:
# Records per attack category, most frequent first.
# A single value_counts() pass replaces one full boolean-mask scan of kdd_df
# per distinct category; the result is already sorted in descending order.
attack_type_counts = kdd_df['Attack_Type'].value_counts()
df7 = attack_type_counts.rename_axis('Attack_Type').reset_index(name='Count')

# Bar chart of the counts.
plt.figure(figsize=(25,15))
sns.barplot(x=df7['Attack_Type'].head(50), y=df7['Count'].head(50))
plt.xticks(rotation= 90)
plt.xlabel('Attack Type')
plt.ylabel('Count')
plt.show()

In [None]:
# Heat map
# Remove columns that contain missing values. The axis is passed by keyword
# because pandas 2.x no longer accepts it positionally.
kdd_df = kdd_df.dropna(axis='columns')
# Keep only columns with more than one distinct value; a constant column has
# no defined correlation with anything.
kdd_df = kdd_df[[col for col in kdd_df if kdd_df[col].nunique() > 1]]
plt.figure(figsize=(15,12))
# Correlate the numeric columns only: on pandas 2.x DataFrame.corr() raises
# when the frame still holds string columns (protocol_type, service, flag, ...).
corr = kdd_df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr)
plt.show()

In [None]:
# The cells below each display the correlation coefficient of one pair of
# features (Series.corr with its default method).
# num_root vs num_compromised
kdd_df['num_root'].corr(kdd_df['num_compromised'])

In [None]:
# srv_serror_rate vs serror_rate
kdd_df['srv_serror_rate'].corr(kdd_df['serror_rate'])

In [None]:
# srv_count vs count
kdd_df['srv_count'].corr(kdd_df['count'])

In [None]:
# srv_rerror_rate vs rerror_rate
kdd_df['srv_rerror_rate'].corr(kdd_df['rerror_rate'])

In [None]:
# dst_host_same_srv_rate vs dst_host_srv_count
kdd_df['dst_host_same_srv_rate'].corr(kdd_df['dst_host_srv_count'])

In [None]:
# dst_host_srv_serror_rate vs dst_host_serror_rate
kdd_df['dst_host_srv_serror_rate'].corr(kdd_df['dst_host_serror_rate'])

In [None]:
# dst_host_srv_rerror_rate vs dst_host_rerror_rate
kdd_df['dst_host_srv_rerror_rate'].corr(kdd_df['dst_host_rerror_rate'])

In [None]:
# dst_host_same_srv_rate vs same_srv_rate
kdd_df['dst_host_same_srv_rate'].corr(kdd_df['same_srv_rate'])

In [None]:
# dst_host_srv_count vs same_srv_rate
kdd_df['dst_host_srv_count'].corr(kdd_df['same_srv_rate'])

In [None]:
# dst_host_same_src_port_rate vs srv_count
kdd_df['dst_host_same_src_port_rate'].corr(kdd_df['srv_count'])

In [None]:
# dst_host_serror_rate vs serror_rate
kdd_df['dst_host_serror_rate'].corr(kdd_df['serror_rate'])

In [None]:
# dst_host_serror_rate vs srv_serror_rate
kdd_df['dst_host_serror_rate'].corr(kdd_df['srv_serror_rate'])

In [None]:
# dst_host_srv_serror_rate vs serror_rate
kdd_df['dst_host_srv_serror_rate'].corr(kdd_df['serror_rate'])

In [None]:
# dst_host_srv_serror_rate vs srv_serror_rate
kdd_df['dst_host_srv_serror_rate'].corr(kdd_df['srv_serror_rate'])

In [None]:
# dst_host_rerror_rate vs rerror_rate
kdd_df['dst_host_rerror_rate'].corr(kdd_df['rerror_rate'])

In [None]:
# dst_host_rerror_rate vs srv_rerror_rate
kdd_df['dst_host_rerror_rate'].corr(kdd_df['srv_rerror_rate'])

In [None]:
# dst_host_srv_rerror_rate vs rerror_rate
kdd_df['dst_host_srv_rerror_rate'].corr(kdd_df['rerror_rate'])

In [None]:
# dst_host_srv_rerror_rate vs srv_rerror_rate
kdd_df['dst_host_srv_rerror_rate'].corr(kdd_df['srv_rerror_rate'])

##### Data cleaning

In [None]:
# Features removed before learning: the first eight are treated as highly
# correlated with features that are kept, and 'service' is considered to
# provide no useful information for learning.
features_to_drop = (
    'num_root',
    'srv_serror_rate',
    'srv_rerror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'dst_host_same_srv_rate',
    'service',
)

# Dropped one at a time, in place, in the listed order.
for feature in features_to_drop:
    kdd_df.drop(feature, axis=1, inplace=True)

In [None]:
# Re-check the remaining columns and their dtypes after the drops.
kdd_df.info()

In [None]:
# Preview the first rows of the reduced dataset.
kdd_df.head()

#### Feature Engineering

In [None]:
# Feature Mapping
# Integer codes are assigned by position in each tuple: protocol_type gets
# 0..2 and flag gets 0..10.
pmap = {name: code for code, name in enumerate(('icmp', 'tcp', 'udp'))}
fmap = {
    name: code
    for code, name in enumerate(
        ('SF', 'S0', 'REJ', 'RSTR', 'RSTO', 'SH', 'S1', 'S2', 'RSTOS0', 'S3', 'OTH')
    )
}

# Replace the string values of both categorical features by their codes.
for feature, codes in (('protocol_type', pmap), ('flag', fmap)):
    kdd_df[feature] = kdd_df[feature].map(codes)
kdd_df.head()

In [None]:
# Inspect the column dtypes after the categorical features were mapped to integers.
kdd_df.info()

In [None]:
# Columns that remain available for modelling.
kdd_df.columns

##### Feature scaling (min-max) and splitting of data into train and test sets

In [None]:
# Import sklearn modelling tools 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [None]:
# Separate the label from the features.
# The raw 'target' label is discarded; the coarser 'Attack_Type' is what gets predicted.
kdd_df = kdd_df.drop(columns=['target'])
print(kdd_df.shape)
# y is kept as a one-column DataFrame, X holds every remaining column.
y = kdd_df[['Attack_Type']]
X = kdd_df.drop(columns=['Attack_Type'])

In [None]:
min_max_sc = MinMaxScaler() 
X = min_max_sc.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

### Modelling

In [None]:
from sklearn.naive_bayes import BernoulliNB 
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# y_train is a one-column DataFrame; scikit-learn classifiers expect a 1-D
# label array and emit a DataConversionWarning for a column vector.
y_train_labels = np.ravel(y_train)

# Train LogisticRegression Model
LGR_Classifier = LogisticRegression(n_jobs=-1, random_state=0)
LGR_Classifier.fit(X_train, y_train_labels);

# Train Bernoulli Naive Bayes Model
BNB_Classifier = BernoulliNB()
BNB_Classifier.fit(X_train, y_train_labels)

# Train Decision Tree Model
DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
DTC_Classifier.fit(X_train, y_train_labels)

#### Training analysis

In [None]:
from sklearn import metrics

# (display name, fitted classifier) pairs, evaluated in this order.
models = [
    ('Naive Baye Classifier', BNB_Classifier),
    ('Decision Tree Classifier', DTC_Classifier),
    ('LogisticRegression', LGR_Classifier),
]

# Report accuracy, confusion matrix and per-class metrics on the TRAINING set.
for model_name, model in models:
    # Predict once and reuse the result for all three metrics instead of
    # re-running the model over the whole training set for each one.
    train_predictions = model.predict(X_train)
    accuracy = metrics.accuracy_score(y_train, train_predictions)
    confusion_matrix = metrics.confusion_matrix(y_train, train_predictions)
    classification = metrics.classification_report(y_train, train_predictions)
    print()
    print('============================== {} Model Evaluation =============================='.format(model_name))
    print()
    print("Model Accuracy:\n", accuracy)
    print()
    print("Confusion matrix:\n", confusion_matrix)
    print()
    print("Classification report:\n", classification)
    print()

### Testing analysis

In [None]:
# Report accuracy, confusion matrix and per-class metrics on the held-out TEST set.
for model_name, model in models:
    # Predict once and reuse the result for all three metrics instead of
    # re-running the model over the test set for each one.
    test_predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, test_predictions)
    confusion_matrix = metrics.confusion_matrix(y_test, test_predictions)
    classification = metrics.classification_report(y_test, test_predictions)
    print()
    print('============================== {} Model Test Results =============================='.format(model_name))
    print()
    print("Model Accuracy:\n", accuracy)
    print()
    print("Confusion matrix:\n", confusion_matrix)
    print()
    print("Classification report:\n", classification)
    print()

In [None]:
# Bar chart comparing one score per model.
# NOTE(review): the scores are hard-coded — presumably accuracy percentages
# copied from an earlier run; confirm they still match the printed results.
score_by_model = {'NB': 92.198, 'DT': 99.944, 'LR': 99.361}
names = list(score_by_model)
values = list(score_by_model.values())

f = plt.figure(num=5, figsize=(20, 10))
# First panel of a 1x3 grid.
plt.subplot(1, 3, 1)
plt.bar(names, values)

In [None]:
def _preview_predictions(classifier, n_rows=10):
    """Predict on X_test, print the first ``n_rows`` samples with their predicted label, and return all predictions."""
    predictions = classifier.predict(X_test)
    for row in range(n_rows):
        print(X_test[row], predictions[row])
    return predictions


# Same preview for each model, in the order NB, LR, DT, with a separator line in between.
pred_NB = _preview_predictions(BNB_Classifier)
print('===========================================================================================,')
pred_log = _preview_predictions(LGR_Classifier)
print('==============================================================================================')
pred_dt = _preview_predictions(DTC_Classifier)

### The above analysis of the different models shows that, in terms of accuracy, the Decision Tree model fits our data best