# **Detection of Cyber Attacks using ML**

In [None]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
#importing basic packages

import os
import timeit
import warnings
from collections import defaultdict

import catboost as cb
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
#plot_confusion_matrix
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from termcolor import colored

# **Loading Data**

In [None]:
# warnings.filterwarnings('ignore')  # kept disabled so library warnings remain visible

# Seed NumPy's global random generator.
np.random.seed(100)

# Directory holding the KDDTrain+/KDDTest+ text files, and the two file paths.
dataset_root = r'/content/drive/MyDrive/archive'
train_file, test_file = (
    os.path.join(dataset_root, file_name)
    for file_name in ('KDDTrain+.txt', 'KDDTest+.txt')
)

In [None]:
# Column names applied to the header-less data files, in file order.
# Written as one whitespace-separated string and split into a list of 43 names;
# the final two entries are 'attack_type' and 'success_pred'.
header_names = """
    duration protocol_type service flag src_bytes dst_bytes land wrong_fragment
    urgent hot num_failed_logins logged_in num_compromised root_shell su_attempted
    num_root num_file_creations num_shells num_access_files num_outbound_cmds
    is_host_login is_guest_login count srv_count serror_rate srv_serror_rate
    rerror_rate srv_rerror_rate same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate attack_type
    success_pred
""".split()

In [None]:
# Number of column names in header_names.
len(header_names)

43

# **Data Preprocessing**

In [None]:
# Column names as a NumPy array so groups of columns can be picked by position.
col_names = np.array(header_names)

# Positions of the nominal (categorical) columns.
nominal_idx = [1, 2, 3]

# Positions of the columns treated as binary flags.
binary_idx = [6, 11, 13, 14, 20, 21]

# Every other feature position is treated as numeric. The last two header
# entries ('attack_type', 'success_pred') are not features, hence the "- 2".
# sorted() pins the column order instead of relying on set iteration order,
# which the language does not guarantee.
n_feature_cols = len(header_names) - 2
numeric_idx = sorted(set(range(n_feature_cols)) - set(nominal_idx) - set(binary_idx))

# The same three groups expressed as lists of column names.
nominal_cols = col_names[nominal_idx].tolist()
binary_cols = col_names[binary_idx].tolist()
numeric_cols = col_names[numeric_idx].tolist()

In [None]:
# Show the names of the nominal columns.
print(nominal_cols)

['protocol_type', 'service', 'flag']


In [None]:
# Show the names of the columns treated as binary flags.
print(binary_cols)

['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']


In [None]:
# Show the names of the remaining (numeric) columns.
print(numeric_cols)

['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


In [None]:
# Map each coarse attack category to the list of attack names that belong to it.
# The entry for normal traffic is seeded by hand before the mapping file is read.
# NOTE(review): the key was spelled 'begin'; it is corrected to 'benign' here,
# which changes the label value that ends up in 'attack_category'.
category = defaultdict(list)
category['benign'].append('normal')

# Each non-empty line of the mapping file is "<attack_name> <category>".
attack_types_file = os.path.join(dataset_root, 'training_attack_types.txt')
with open(attack_types_file, 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        attack, cat = fields
        category[cat].append(attack)

# Invert the mapping: attack name -> category.
attack_mapping = {attack: cat for cat, attacks in category.items() for attack in attacks}


def _load_split(path):
    """Read one data file and attach the coarse attack category.

    Args:
        path: Path of a header-less CSV file whose columns follow header_names.

    Returns:
        DataFrame with an added 'attack_category' column and without the
        'success_pred' column.

    Raises:
        KeyError: If the file contains attack types that have no entry in
            attack_mapping; the message lists the offending names.
    """
    frame = pd.read_csv(path, names=header_names)
    frame['attack_category'] = frame['attack_type'].map(attack_mapping)
    unmapped = frame.loc[frame['attack_category'].isna(), 'attack_type'].unique()
    if len(unmapped):
        raise KeyError(
            "attack types missing from {}: {}".format(attack_types_file, list(unmapped))
        )
    return frame.drop(columns=['success_pred'])


train_df = _load_split(train_file)
test_df = _load_split(test_file)

# Frequency of each attack type / category in both splits.
train_attack_types = train_df['attack_type'].value_counts()
train_attack_cats = train_df['attack_category'].value_counts()

test_attack_types = test_df['attack_type'].value_counts()
test_attack_cats = test_df['attack_category'].value_counts()

# One figure per distribution. Calling Series.plot() repeatedly in a cell
# without an explicit `ax` draws onto the current axes, which would overlay
# all four bar charts on top of each other.
for counts, title, fontsize in (
    (train_attack_types, 'Training set: attack types', 20),
    (train_attack_cats, 'Training set: attack categories', 30),
    (test_attack_types, 'Test set: attack types', 15),
    (test_attack_cats, 'Test set: attack categories', 30),
):
    fig, ax = plt.subplots(figsize=(20, 10))
    counts.plot(kind='barh', ax=ax, fontsize=fontsize)
    ax.set_title(title, fontsize=fontsize)

plt.show()


FileNotFoundError: ignored

In [None]:
# Show the category -> attack names mapping built from the mapping file.
print(category)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Summary statistics of the binary columns, transposed so each column is a row.
train_df[binary_cols].describe().transpose()

In [None]:
# Number of training rows per distinct 'su_attempted' value.
train_df.groupby(['su_attempted']).size()

In [None]:
# 'su_attempted' is handled as a binary column, so any value 2 is folded into 0.
# The result is assigned back to the column: an in-place replace on a column
# selection is a chained operation that does not update the frame under
# pandas Copy-on-Write.
train_df['su_attempted'] = train_df['su_attempted'].replace(2, 0)
test_df['su_attempted'] = test_df['su_attempted'].replace(2, 0)

# Re-check the value counts after the replacement.
train_df.groupby(['su_attempted']).size()

In [None]:
# Number of training rows per distinct 'num_outbound_cmds' value.
train_df.groupby(['num_outbound_cmds']).size()

In [None]:
# Drop 'num_outbound_cmds' from both splits. Re-assigning with errors='ignore'
# keeps the cell re-runnable; an in-place drop raises KeyError the second time.
train_df = train_df.drop(columns='num_outbound_cmds', errors='ignore')
test_df = test_df.drop(columns='num_outbound_cmds', errors='ignore')

# Keep the numeric column list in step with the frames (safe to repeat).
numeric_cols = [col for col in numeric_cols if col != 'num_outbound_cmds']

# Data preparation: separate the target (attack category) from the features.
train_Y = train_df['attack_category']
train_x_raw = train_df.drop(columns=['attack_category', 'attack_type'])
test_Y = test_df['attack_category']
test_x_raw = test_df.drop(columns=['attack_category', 'attack_type'])

# One-hot encode the nominal columns on train and test together so that both
# splits end up with the same set of dummy columns.
combined_df_raw = pd.concat([train_x_raw, test_x_raw])
combined_df = pd.get_dummies(combined_df_raw, columns=nominal_cols, drop_first=True)

# Split back by position: the concatenated frame has repeated index labels,
# so the slices are taken explicitly with .iloc.
n_train_rows = len(train_x_raw)
train_x = combined_df.iloc[:n_train_rows]
test_x = combined_df.iloc[n_train_rows:]

assert len(train_x) == len(train_Y) and len(test_x) == len(test_Y)

In [None]:
# Lower-case aliases for the raw (not one-hot encoded) feature frames.
x_train, x_test = train_x_raw, test_x_raw

# **Training**

In [None]:
# Holders for the model performance results; the three lists stay index-aligned.
ML_Model = []
acc_train = []
acc_test = []


def storeResults(model, a, b):
    """Record the train/test accuracy of a model for the comparison table.

    Accuracies are rounded to 3 decimals. If `model` has already been stored,
    its scores are overwritten instead of appended, so re-running a cell does
    not create duplicate rows.

    Args:
        model: Display name of the model.
        a: Accuracy on the training data.
        b: Accuracy on the test data.
    """
    train_score, test_score = round(a, 3), round(b, 3)
    if model in ML_Model:
        pos = ML_Model.index(model)
        acc_train[pos] = train_score
        acc_test[pos] = test_score
    else:
        ML_Model.append(model)
        acc_train.append(train_score)
        acc_test.append(test_score)

In [None]:
# Names used by the model cells: one-hot encoded features and category labels.
# (accuracy_score is already provided by the notebook's import cell.)
X_train, y_train = train_x, train_Y
X_test, y_test = test_x, test_Y

# **1. Decision Tree Classifier**

In [None]:
# Decision tree model, limited to depth 5 and fitted on the training split.
# (DecisionTreeClassifier is provided by the notebook's import cell.)
tree = DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)
tree

In [None]:
# Predict the target value for the samples of both splits.
y_train_tree, y_test_tree = (
    tree.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the decision tree on each split.
acc_train_tree, acc_test_tree = (
    accuracy_score(y_train, y_train_tree),
    accuracy_score(y_test, y_test_tree),
)
print(
    "Decision Tree: Accuracy on training Data: {:.3f}".format(acc_train_tree),
    "Decision Tree: Accuracy on test Data: {:.3f}".format(acc_test_tree),
    sep="\n",
)

In [None]:
# Horizontal bar chart of the decision tree's feature importances,
# one bar per column of X_train.
n_features = X_train.shape[1]
bar_positions = np.arange(n_features)

fig, ax = plt.subplots(figsize=(43, 43))
ax.barh(bar_positions, tree.feature_importances_, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(X_train.columns)
ax.set_xlabel("Feature importance")
ax.set_ylabel("Feature")

plt.show()

In [None]:
# Store the results. The order in which the parameters are passed is important.
# Caution: execute only once to avoid duplications.
storeResults('Decision Tree', acc_train_tree, acc_test_tree)

# **2. Random Forest Classifier**

In [None]:
# Random forest model with trees limited to depth 5, fitted on the training split.
# (RandomForestClassifier is provided by the notebook's import cell.)
forest = RandomForestClassifier(max_depth=5).fit(X_train, y_train)
forest

In [None]:
# Predict the target value for the samples of both splits.
y_train_forest, y_test_forest = (
    forest.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the random forest on each split.
acc_train_forest = accuracy_score(y_train, y_train_forest)
acc_test_forest = accuracy_score(y_test, y_test_forest)

print("Random forest: Accuracy on training Data: {:.3f}".format(acc_train_forest))
print("Random forest: Accuracy on test Data: {:.3f}".format(acc_test_forest))

# Record the scores so the random forest shows up in the model comparison table.
storeResults('Random Forest', acc_train_forest, acc_test_forest)

In [None]:
# Horizontal bar chart of the random forest's feature importances,
# one bar per column of X_train.
n_features = X_train.shape[1]
bar_positions = np.arange(n_features)

fig, ax = plt.subplots(figsize=(45, 45))
ax.barh(bar_positions, forest.feature_importances_, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(X_train.columns)
ax.set_xlabel("Feature importance")
ax.set_ylabel("Feature")

plt.show()

# **3. Multi Layer Perceptron**

In [None]:
# Multi-layer perceptron: three hidden layers of 10 units each, ReLU activation,
# Adam solver, at most 1500 iterations. (MLPClassifier is provided by the
# notebook's import cell.)
# NOTE(review): the features are passed in unscaled - confirm that is intended.
mlp = MLPClassifier(
    hidden_layer_sizes=(10, 10, 10),
    activation='relu',
    solver='adam',
    max_iter=1500,
).fit(X_train, y_train)

# Predictions for both splits.
y_train_mlp, y_test_mlp = (
    mlp.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the multi-layer perceptron on each split.
acc_train_mlp = accuracy_score(y_train, y_train_mlp)
acc_test_mlp = accuracy_score(y_test, y_test_mlp)

print("MLP: Accuracy on training Data: {:.5f}".format(acc_train_mlp))
print("MLP: Accuracy on test Data: {:.5f}".format(acc_test_mlp))

# Record the scores so the MLP shows up in the model comparison table.
storeResults('Multi Layer Perceptron', acc_train_mlp, acc_test_mlp)

# **4. Logistic Regression**

In [None]:
# Logistic regression with the lbfgs solver, capped at 100 iterations.
# (LogisticRegression is provided by the notebook's import cell.)
# NOTE(review): the features are passed in unscaled - confirm that is intended.
regressor = LogisticRegression(solver='lbfgs', max_iter=100).fit(X_train, y_train)
regressor

In [None]:
# Predict the target value for the samples of both splits.
y_train_regressor, y_test_regressor = (
    regressor.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the logistic regression model on each split.
from sklearn.metrics import accuracy_score

acc_train_regressor = accuracy_score(y_train, y_train_regressor)
acc_test_regressor = accuracy_score(y_test, y_test_regressor)

# The messages name this model; they previously said "Random forest".
print("Logistic Regression: Accuracy on training Data: {:.5f}".format(acc_train_regressor))
print("Logistic Regression: Accuracy on test Data: {:.5f}".format(acc_test_regressor))

In [None]:
# Record the logistic regression scores (stored under the name 'Regression').
storeResults('Regression', acc_train_regressor, acc_test_regressor)

# **5. Naive Bayes**

In [None]:
# Gaussian naive Bayes model fitted on the training split.
from sklearn.naive_bayes import GaussianNB

naive = GaussianNB().fit(X_train, y_train)
naive

In [None]:
# Predict the target value for the samples of both splits.
y_train_nb, y_test_nb = (
    naive.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the naive Bayes model on each split.
acc_train_nb, acc_test_nb = (
    accuracy_score(y_train, y_train_nb),
    accuracy_score(y_test, y_test_nb),
)
print(
    "Naive Bayes: Accuracy on training Data: {:.5f}".format(acc_train_nb),
    "Naive Bayes: Accuracy on test Data: {:.5f}".format(acc_test_nb),
    sep="\n",
)

In [None]:
# Record the naive Bayes scores for the comparison table.
storeResults('NaiveBayes', acc_train_nb, acc_test_nb)

# **Comparison of Models**

In [None]:
# Assemble the recorded scores into one table, one row per stored model.
results = pd.DataFrame(
    dict(
        zip(
            ('ML Model', 'Train Accuracy', 'Test Accuracy'),
            (ML_Model, acc_train, acc_test),
        )
    )
)

results

In [None]:
# Show the table sorted by test accuracy, then train accuracy, highest first.
# The sorted frame is only displayed; `results` itself is left unchanged.
results.sort_values(by=[ 'Test Accuracy', 'Train Accuracy'], ascending=False)