In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

In [2]:
!gdown 'https://drive.google.com/uc?id=1AjWeHutvCSHCfB_si95rgF3904cuymxq'

Downloading...
From (original): https://drive.google.com/uc?id=1AjWeHutvCSHCfB_si95rgF3904cuymxq
From (redirected): https://drive.google.com/uc?id=1AjWeHutvCSHCfB_si95rgF3904cuymxq&confirm=t&uuid=37982b8c-a282-4f7f-8f19-9590b033f140
To: /content/out.csv
100% 110M/110M [00:02<00:00, 40.0MB/s]


In [3]:
df1 = pd.read_csv('out.csv')

In [37]:
df1['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,100000
3,80000
5,75000
2,65000
1,13835
6,2180
4,2013


In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df1['Label'] = le.fit_transform(df1['Label'])

In [5]:
from sklearn.preprocessing import StandardScaler
#Scaling every column in the dataframe
scaler = StandardScaler()
df1.iloc[:, :-1] = pd.DataFrame(scaler.fit_transform(df1.iloc[:, :-1]), columns=df1.iloc[:, :-1].columns)

In [6]:
correlation = df1.select_dtypes(include='float64').corr().abs()

In [31]:
upper_triangle = correlation.where(np.triu(np.ones(correlation.shape), k=1).astype(bool))

to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.95)]

to_drop.append('Destination Port')

df = df1.drop(columns=to_drop)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338028 entries, 0 to 338027
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Flow Duration                338028 non-null  float64
 1   Total Fwd Packets            338028 non-null  float64
 2   Total Length of Fwd Packets  338028 non-null  float64
 3   Fwd Packet Length Max        338028 non-null  float64
 4   Fwd Packet Length Min        338028 non-null  float64
 5   Fwd Packet Length Mean       338028 non-null  float64
 6   Bwd Packet Length Max        338028 non-null  float64
 7   Bwd Packet Length Min        338028 non-null  float64
 8   Flow Bytes/s                 338028 non-null  float64
 9   Flow Packets/s               338028 non-null  float64
 10  Flow IAT Mean                338028 non-null  float64
 11  Flow IAT Std                 338028 non-null  float64
 12  Flow IAT Min                 338028 non-null  float64
 13 

In [32]:
from sklearn.model_selection import train_test_split

In [38]:
X = df.iloc[:, :-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=2000000).fit(X_train, y_train)

In [39]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softmax',
    'num_class': 7,
    'max_depth': 5,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

num_round = 1000  # Number of boosting iterations
bst = xgb.train(params, dtrain, num_round)

y_pred = bst.predict(dtest)


In [35]:
import joblib

joblib.dump(model, 'regression_model.joblib')

['regression_model.joblib']

In [40]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

#y_pred = model.predict(X_test)

rec = recall_score(y_test, y_pred, average=None)
pre = precision_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Recall: {rec}')
print(f'Precision: {pre}')
print(f'F1 Score: {f1}')

Accuracy: 0.9990829216341744
Recall: [0.99786027 1.         0.99984687 0.99981157 0.99257426 0.99966586
 0.98607889]
Precision: [0.99935214 0.9996337  0.99961727 0.99893317 0.9616307  0.99939872
 0.99765258]
F1 Score: [0.99860565 0.99981682 0.99973206 0.99937217 0.97685749 0.99953227
 0.99183197]


# XGBoost

In [None]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softmax',
    'num_class': 7,
    'max_depth': 5,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

num_round = 1000  # Number of boosting iterations
bst = xgb.train(params, dtrain, num_round)

y_pred = bst.predict(dtest)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

rec = recall_score(y_test, y_pred, average=None)
pre = precision_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Recall: {rec}')
print(f'Precision: {pre}')
print(f'F1 Score: {f1}')

In [None]:
joblib.dump(bst, 'xgboost_model.joblib')

# Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
model = tf.keras.Sequential(
    [layers.InputLayer(shape=(41,)),
     layers.Dense(128, activation='relu'),
     layers.Dense(128,activation='relu'),
     layers.Dense(7, activation='softmax')]
)

In [None]:
model.summary()

In [None]:
model.compile(optimizer = 'adam', loss='categorical_crossentropy', metrics=['accuracy', 'precision', 'recall', 'f1_score'])

In [None]:
from sklearn.preprocessing import OneHotEncoder

y_onehot_train = OneHotEncoder().fit_transform(y_train.values.reshape(-1,1)).toarray()

In [None]:
history = model.fit(X_train, y_onehot_train, epochs=30, validation_split=0.2, batch_size=64)

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

In [None]:
def return_max(y):
  dim = y.shape[0]
  y_ret = np.zeros(shape=(dim, 7))
  for i in range(y.shape[0]):
    max = np.max(y[i])
    y_ret[i] = np.where(y[i] == max, 1, 0)
  return y_ret

In [None]:
y_onehot_test = OneHotEncoder().fit_transform(y_test.values.reshape(-1,1)).toarray()

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

y_pred = model.predict(X_test)
y_pred = return_max(y_pred)
rec = recall_score(y_onehot_test, y_pred, average=None)
pre = precision_score(y_onehot_test, y_pred, average=None)
f1 = f1_score(y_onehot_test, y_pred, average=None)
print(f'Accuracy: {accuracy_score(y_onehot_test, y_pred)}')
print(f'Recall: {rec}')
print(f'Precision: {pre}')
print(f'F1 Score: {f1}')