In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report

### Load Data

In [None]:
train = pd.read_csv('./data/train_unscaled.csv')
test = pd.read_csv('./data/test_unscaled.csv')

print(test['Label'].value_counts())

Label
0    419297
1     85176
Name: count, dtype: int64


In [3]:
test = pd.concat([test[test['Label'] == 0].sample(170352), test[test['Label'] == 1].sample(85176)])
print(test['Label'].value_counts())

Label
0    170352
1     85176
Name: count, dtype: int64


In [4]:
X_train = train.drop(['Label'], axis=1)
y_train = train['Label']

X_test = test.drop(['Label'], axis=1)
y_test = test['Label']

del train
del test

In [5]:
print("Value counts in train set")
print(y_train.value_counts())
print("Value counts in test set")
print(y_test.value_counts())

Value counts in train set
Label
0    1677187
1    1677187
Name: count, dtype: int64
Value counts in test set
Label
0    170352
1     85176
Name: count, dtype: int64


In [6]:
dtypes = {
    # Network Identification
    'Destination Port': 'uint16',
    'Flow Duration': 'uint32',
    
    # Packet Count Metrics
    'Total Fwd Packets': 'uint32',
    'Total Backward Packets': 'uint32',
    'Total Length of Fwd Packets': 'uint32',
    'Total Length of Bwd Packets': 'uint32',
    
    # Packet Length Statistics
    'Fwd Packet Length Max': 'uint16',
    'Fwd Packet Length Min': 'uint8',
    'Fwd Packet Length Mean': 'float32',
    'Fwd Packet Length Std': 'float32',
    'Bwd Packet Length Max': 'uint16',
    'Bwd Packet Length Min': 'uint8',
    'Bwd Packet Length Mean': 'float32',
    'Bwd Packet Length Std': 'float32',
    
    # Flow Characteristics
    'Flow Bytes/s': 'float32',
    'Flow Packets/s': 'float32',
    
    # Inter-Arrival Times
    'Flow IAT Mean': 'float32',
    'Flow IAT Std': 'float32',
    'Flow IAT Max': 'uint32',
    'Flow IAT Min': 'uint32',
    'Fwd IAT Total': 'uint32',
    'Fwd IAT Mean': 'float32',
    'Fwd IAT Std': 'float32',
    'Fwd IAT Max': 'uint32',
    'Fwd IAT Min': 'uint32',
    'Bwd IAT Total': 'uint32',
    'Bwd IAT Mean': 'float32',
    'Bwd IAT Std': 'float32',
    'Bwd IAT Max': 'uint32',
    'Bwd IAT Min': 'uint32',
    
    # Header Information
    'Fwd Header Length': 'uint16',
    'Bwd Header Length': 'uint16',
    
    # Rate Metrics
    'Fwd Packets/s': 'float32',
    'Bwd Packets/s': 'float32',
    
    # Packet Size Characteristics
    'Min Packet Length': 'uint8',
    'Max Packet Length': 'uint16',
    'Packet Length Mean': 'float32',
    'Packet Length Std': 'float32',
    'Packet Length Variance': 'float32',
    
    # Protocol Flags
    'FIN Flag Count': 'uint8',
    'SYN Flag Count': 'uint8',
    'RST Flag Count': 'uint8',
    'PSH Flag Count': 'uint8',
    'ACK Flag Count': 'uint8',
    'URG Flag Count': 'uint8',
    'CWE Flag Count': 'uint8',
    'ECE Flag Count': 'uint8',
    
    # Traffic Ratios
    'Down/Up Ratio': 'uint8',
    
    # Size Metrics
    'Average Packet Size': 'float32',
    
    # Window Sizes
    'Init_Win_bytes_forward': 'uint16',
    'Init_Win_bytes_backward': 'uint16',
    
    # Activity Metrics
    'act_data_pkt_fwd': 'uint16',
    'min_seg_size_forward': 'uint8',
    
    # Timing Statistics
    'Active Mean': 'float32',
    'Active Std': 'float32',
    'Active Max': 'uint32',
    'Active Min': 'uint32',
    'Idle Mean': 'float32',
    'Idle Std': 'float32',
    'Idle Max': 'uint32',
    'Idle Min': 'uint32'
}

In [7]:
# Optimize numeric columns to reduce memory usage
def optimize_dtypes(df):
    for col in df.select_dtypes(include='integer').columns:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')
    for col in df.select_dtypes(include='float').columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df

X_train = optimize_dtypes(X_train)

## Model Training

### Logistic Regression

In [8]:
lr = LogisticRegression(max_iter=100)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Logistic Regression")
print(classification_report(y_test, y_pred))

Logistic Regression
              precision    recall  f1-score   support

           0       0.86      0.86      0.86    170352
           1       0.72      0.72      0.72     85176

    accuracy                           0.81    255528
   macro avg       0.79      0.79      0.79    255528
weighted avg       0.81      0.81      0.81    255528



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [9]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print("Decision Tree")
print(classification_report(y_test, y_pred))

Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    170352
           1       1.00      1.00      1.00     85176

    accuracy                           1.00    255528
   macro avg       1.00      1.00      1.00    255528
weighted avg       1.00      1.00      1.00    255528



### Random Forest

In [10]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Random Forest")
print(classification_report(y_test, y_pred))

Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    170352
           1       1.00      1.00      1.00     85176

    accuracy                           1.00    255528
   macro avg       1.00      1.00      1.00    255528
weighted avg       1.00      1.00      1.00    255528



### XGBoost

In [11]:
xgb = XGBClassifier(n_jobs=-1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

print("XGBoost")
print(classification_report(y_test, y_pred))

XGBoost
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    170352
           1       1.00      1.00      1.00     85176

    accuracy                           1.00    255528
   macro avg       1.00      1.00      1.00    255528
weighted avg       1.00      1.00      1.00    255528



### Naive Bayes

In [12]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print("Naive Bayes")
print(classification_report(y_test, y_pred))

Naive Bayes
              precision    recall  f1-score   support

           0       0.94      0.20      0.33    170352
           1       0.38      0.97      0.54     85176

    accuracy                           0.46    255528
   macro avg       0.66      0.59      0.44    255528
weighted avg       0.75      0.46      0.40    255528



### K-Nearest Neighbors

In [13]:
knn = KNeighborsClassifier(n_jobs=-1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("KNN")
print(classification_report(y_test, y_pred))

KNN
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    170352
           1       0.99      0.99      0.99     85176

    accuracy                           0.99    255528
   macro avg       0.99      0.99      0.99    255528
weighted avg       0.99      0.99      0.99    255528



### Support Vector Machine

In [14]:
svm = SVC(max_iter=100)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

print("SVM")
print(classification_report(y_test, y_pred))



SVM
              precision    recall  f1-score   support

           0       0.36      0.11      0.17    170352
           1       0.25      0.60      0.35     85176

    accuracy                           0.28    255528
   macro avg       0.31      0.36      0.26    255528
weighted avg       0.33      0.28      0.23    255528

