In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures # for feature lifting
from sklearn.feature_selection import SequentialFeatureSelector

In [5]:
# Load flagged dataset #
dataset_flagged = pd.read_csv("../data/flagged.csv")

# 'cls' is the target; every column other than 'cls' and 'Unnamed: 0' is kept as a feature.
# NOTE(review): 'Unnamed: 0' is presumably a row index written out with the CSV - confirm.
Y = dataset_flagged['cls']
X = dataset_flagged.drop(columns=['cls', 'Unnamed: 0'])

# Define column types: the five named columns are numerical,
# and every remaining feature column is treated as binary.
numerical_cols = ['duration', 'pps', 'bps', 'max_flowiat', 'mean_flowiat']
binary_cols = [name for name in X.columns if name not in numerical_cols]

# 80/20 split, stratified on the label, with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    stratify=Y,
    random_state=1,
)

We will use `StandardScaler` to standardize every feature (zero mean, unit variance) before training any of the models.

We will proceed in three steps:
 - Feature lifting for the 5 numerical features we have
 - Feature selection
 - Trying out different values of K

In [6]:
# Value range of every feature: one row per column of X, showing its minimum and maximum.
summary_flagged = (
    X.agg(['min', 'max'])  # two rows ('min', 'max'), one column per feature
    .T                     # transpose so that each feature becomes a row
    .rename(columns={'min': 'Min Value', 'max': 'Max Value'})
)

display(summary_flagged)

Unnamed: 0,Min Value,Max Value
duration,2.0,601404954.0
pps,0.019762,1000000.0
bps,3.557943,617000000.0
max_flowiat,2.0,600109654.0
mean_flowiat,2.0,60700000.0
has_active,0.0,1.0
has_std_active,0.0,1.0
has_fiat,0.0,1.0
has_biat,0.0,1.0
has_min_flowiat,0.0,1.0


In [7]:
# Feature Lifting #
# Expand the numerical columns into degree-2 polynomial features. No bias column is
# added, and interaction_only=False keeps the pure powers as well as the cross terms.
polynomial_features = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)

# Fit the expansion on the training split only, then apply the fitted transformer to the test split.
lifted_numerical_train = polynomial_features.fit_transform(X_train[numerical_cols])
lifted_numerical_test = polynomial_features.transform(X_test[numerical_cols])

# Get new feature names #
poly_feature_names = polynomial_features.get_feature_names_out(numerical_cols)

# Convert the new arrays back to DataFrames, reusing each split's original index so the
# rows line up with the binary columns when they are concatenated below #
X_train_lifted = pd.DataFrame(lifted_numerical_train, columns=poly_feature_names, index=X_train.index)
X_test_lifted = pd.DataFrame(lifted_numerical_test, columns=poly_feature_names, index=X_test.index)

# Combine the new numerical features with the original binary features #
X_train_final = pd.concat([X_train_lifted, X_train[binary_cols]], axis=1)
X_test_final = pd.concat([X_test_lifted, X_test[binary_cols]], axis=1)

# Both splits must expose the same columns in the same order before scaling.
assert list(X_train_final.columns) == list(X_test_final.columns)

print(f"Original feature count: {X_train.shape[1]}")
print(f"Final feature count after lifting: {X_train_final.shape[1]}")
display(X_train_final.shape)

# Standardize using statistics learned from the training split only.
# The scaled results are NumPy arrays; column names stay available on X_train_final.columns.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

Original feature count: 13
Final feature count after lifting: 28


(14580, 28)

After feature lifting, we end up with 15 additional columns (5 squared terms and 10 pairwise products of the numerical features), giving 28 columns in total to select features from.

In [8]:
# Feature Selection #
# Sequential feature selection wrapped around a KNN classifier and scored by accuracy.
# Search direction, inner CV and tolerance are left at the library defaults.
neighbor = 5
knn = KNeighborsClassifier(n_neighbors=neighbor)

selector = SequentialFeatureSelector(
    knn,
    n_features_to_select='auto',
    scoring='accuracy',
).fit(X_train_scaled, Y_train)

# Boolean mask with one entry per column of the scaled matrix: True means the feature was kept.
selected_features_mask = selector.get_support()

# The scaled matrix has the same column order as X_train_final, so the mask maps onto its names.
selected_feature_names = X_train_final.columns[selected_features_mask]

print("Selected feature names:")
print(list(selected_feature_names))

Selected feature names:
['bps', 'max_flowiat', 'mean_flowiat', 'duration^2', 'duration pps', 'duration mean_flowiat', 'pps^2', 'pps max_flowiat', 'pps mean_flowiat', 'bps max_flowiat', 'bps mean_flowiat', 'has_std_active', 'has_fiat', 'has_mean_biat']


In [None]:
# KNN with the original and the lifted + feature-selected dataset. Display Results. #

# Each classifier sits behind a StandardScaler inside a pipeline, so the scaler is
# re-fit on the training part of every CV fold. Both models are therefore evaluated
# on standardized features, with no scaling statistics leaking from the held-out fold.
knn_original = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=neighbor))
knn_changes = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=neighbor))

# Lifted training features restricted to the columns kept by the feature selector.
X_train_selected = X_train_final[selected_feature_names]

# Run 10-fold cross-validation
# NOTE: the feature subset was chosen using this same training split, so the
# lifted + selected score is likely optimistic compared with the held-out test set.
score_original = cross_val_score(knn_original, X_train, Y_train, cv=10, n_jobs=-1).mean()
score_changes = cross_val_score(knn_changes, X_train_selected, Y_train, cv=10, n_jobs=-1).mean()

# Store the results (created here because no earlier cell defines scores_flagged)
scores_flagged = {}
scores_flagged['KNN_Original'] = score_original
scores_flagged['KNN_Lifted_Selected'] = score_changes

print(f"Original Dataset CV Score: {score_original:.4f}")
print(f"Lifted + Selected Dataset CV Score: {score_changes:.4f}")