<a href="https://colab.research.google.com/github/shubhamrastoginew1/youtube/blob/main/Network_Intrusion_Detection_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intrusion Detection System

Download Dataset : https://raw.githubusercontent.com/shubhamrastoginew1/dataset/main/networkintrusion%20(1).csv

In [4]:
import numpy as np
import pandas as pd
import time
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

# Loading the dataset

In [5]:
df = pd.read_csv("https://raw.githubusercontent.com/shubhamrastoginew1/dataset/main/networkintrusion%20(1).csv")

# Summarizing the Dataset

In [6]:
df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'num_root', 'num_shells', 'num_access_files', 'num_outbound_cmds',
       'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class'],
      dtype='object')

In [None]:
df.nunique()

duration                        624
protocol_type                     3
service                          64
flag                             11
src_bytes                      1149
dst_bytes                      3649
land                              2
wrong_fragment                    3
urgent                            4
hot                              16
num_failed_logins                 5
logged_in                         2
num_compromised                  23
root_shell                        2
num_root                         20
num_shells                        4
num_access_files                  5
num_outbound_cmds                 1
is_host_login                     2
is_guest_login                    2
count                           495
srv_count                       457
serror_rate                      88
srv_serror_rate                  82
rerror_rate                      90
srv_rerror_rate                  93
same_srv_rate                    75
diff_srv_rate               

In [None]:
df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,22544.0,22538.0,22538.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,...,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0,22544.0
mean,218.859076,10398.12,2055.584,0.000311,0.008428,0.00071,0.105394,0.021647,0.442202,0.119899,...,193.869411,140.750532,0.608722,0.09054,0.132261,0.019638,0.097814,0.099426,0.233385,0.226683
std,1407.176612,472849.3,21221.9,0.017619,0.142599,0.036473,0.928428,0.150328,0.496659,7.269597,...,94.035663,111.783972,0.435688,0.220717,0.306268,0.085394,0.273139,0.281866,0.387229,0.400875
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,121.0,15.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,54.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,168.0,0.92,0.01,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,287.0,601.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,255.0,1.0,0.06,0.03,0.01,0.0,0.0,0.36,0.17
max,57715.0,62825650.0,1345927.0,1.0,3.0,3.0,101.0,4.0,1.0,796.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Data Pre-processing and Cleaning

#### We will convert the string variables to categorical variables wherever needed - in our case, protocol_type, service, flag and the output variable, class

In [None]:
df['protocol_type'] = df['protocol_type'].map({'tcp':0, 'udp':1, 'icmp':2})

service_map = dict()
services = tuple(df['service'].unique())
for i in range(len(services)):
    service_map[services[i]] = i
df['service'] = df['service'].map(service_map)

flag_map = dict()
flags = tuple(df['flag'].unique())
for i in range(len(flags)):
    flag_map[flags[i]] = i
df['flag'] = df['flag'].map(flag_map)

df['class'] = df['class'].map({'normal':0, 'anomaly':1})

In [None]:
df.shape

(22544, 40)

#### Remove duplicate rows from the loaded dataset

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

(22541, 40)

In [None]:
print(df.isnull().sum())

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      6
dst_bytes                      6
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
num_root                       0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          9
srv_count                      9
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_host_rate             0
dst_host_count                 0
dst_host_s

In [None]:
df.dropna(how='any', inplace=True)

In [None]:
print(df.isnull().sum())

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
num_root                       0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_host_rate             0
dst_host_count                 0
dst_host_s

In [None]:
df.shape

(22526, 40)

#### Remove columns/features zero or close to zero variance entirely (removing quasi-constant features)

In [None]:
filter_cols = VarianceThreshold(threshold=0.01)
filter_cols.fit(df)

VarianceThreshold(threshold=0.01)

In [None]:
col_ids = np.where(filter_cols.variances_ <= 0.01)[0]

In [None]:
dropped_cols = [df.columns[i] for i in col_ids]

In [None]:
df.drop(columns=dropped_cols, axis=1, inplace=True)

In [None]:
df.shape

(22526, 32)

#### Remove columns/features that are highly correlated

In [None]:
corr_cols = set()
corr_mat = df.corr()

In [None]:
for i in range(len(corr_mat.columns)):
    for j in range(i):
        if abs(corr_mat.iloc[i, j]) > 0.9:
            col = corr_mat.columns[i]
            corr_cols.add(col)

In [None]:
corr_cols

{'dst_host_same_srv_rate',
 'dst_host_serror_rate',
 'dst_host_srv_rerror_rate',
 'dst_host_srv_serror_rate',
 'num_root',
 'srv_rerror_rate',
 'srv_serror_rate'}

In [None]:
df.drop(columns=list(corr_cols), axis=1, inplace=True)

In [None]:
df.shape

(22526, 25)

#### Creating the model using different algorithms

##### Splitting the dataset to train and test sets

In [None]:
y = df[['class']]
X = df.drop(columns=['class',], axis=1)

In [None]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [None]:
print("Input Training Set Shape:", X_train.shape)
print("Input Testing Set Shape:", X_test.shape)
print("Output Training Set Shape:", y_train.shape)
print("Output Testing Set Shape:", y_test.shape)

Input Training Set Shape: (15092, 24)
Input Testing Set Shape: (7434, 24)
Output Training Set Shape: (15092, 1)
Output Testing Set Shape: (7434, 1)


##### Using decision tree algorithm to train and test the model. Also check the scores/accuracy

In [None]:
classifier = DecisionTreeClassifier(criterion ="entropy", max_depth = 4)
start_time = time.time()
classifier.fit(X_train, y_train.values.ravel())
end_time = time.time()
print("Total training time: ", end_time - start_time)
dec_tree_train_time = end_time - start_time

Total training time:  0.037574052810668945


In [None]:
start_time = time.time()
y_test_pred = classifier.predict(X_train)
end_time = time.time()
print("Total testing time: ", end_time - start_time)
dec_tree_test_time = end_time - start_time

Total testing time:  0.005797147750854492


In [None]:
print("Training accuracy is:", str(round(100 * classifier.score(X_train, y_train), 2)) + str("%"))
print("Testing accuracy is:", str(round(100 * classifier.score(X_test, y_test), 2)) + str("%"))
dec_tree_train_accuracy = 100 * classifier.score(X_train, y_train)
dec_tree_test_accuracy = 100 * classifier.score(X_test, y_test)

Training accuracy is: 91.27%
Testing accuracy is: 90.66%


##### Using xGBoost Classifer instead to try and see the accuracies

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier

In [None]:
model = XGBClassifier()

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores_train = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
n_scores_test = cross_val_score(model, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
print('Training accuracy:', str(round(100 * np.mean(n_scores_train), 2)) + str("%"))
print('Testing accuracy:', str(round(100 * np.mean(n_scores_test), 2)) + str("%"))

Training accuracy: 97.59%
Testing accuracy: 97.52%
