# Milestone 3

Link to our [README](README.md)

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Read main.csv only when no `df` is bound in this kernel yet, so that
# re-running the cell does not reload the file.
try:
    df
except NameError:
    import pandas as pd
    df = pd.read_csv('main.csv')

## Data Exploration

In [44]:
# Structural overview of the raw frame: shape, columns, a preview,
# dtype counts, and how rows are spread over the label and the 'Device' column.
print("Dataset shape:", df.shape)

# Each entry is (heading, zero-argument callable); the callable is only
# evaluated right before its result is printed.
overview_sections = (
    ("\nColumn names:", lambda: df.columns.tolist()),
    ("\nFirst few rows:", lambda: df.head()),
    ("\nData types:", lambda: df.dtypes.value_counts()),
    ("\nTarget distribution:", lambda: df['is_attack'].value_counts()),
    ("\nDevice distribution:", lambda: df['Device'].value_counts()),
)
for heading, compute in overview_sections:
    print(heading)
    print(compute())

Dataset shape: (7062606, 120)

Column names:
['MI_dir_L5_weight', 'MI_dir_L5_mean', 'MI_dir_L5_variance', 'MI_dir_L3_weight', 'MI_dir_L3_mean', 'MI_dir_L3_variance', 'MI_dir_L1_weight', 'MI_dir_L1_mean', 'MI_dir_L1_variance', 'MI_dir_L0.1_weight', 'MI_dir_L0.1_mean', 'MI_dir_L0.1_variance', 'MI_dir_L0.01_weight', 'MI_dir_L0.01_mean', 'MI_dir_L0.01_variance', 'H_L5_weight', 'H_L5_mean', 'H_L5_variance', 'H_L3_weight', 'H_L3_mean', 'H_L3_variance', 'H_L1_weight', 'H_L1_mean', 'H_L1_variance', 'H_L0.1_weight', 'H_L0.1_mean', 'H_L0.1_variance', 'H_L0.01_weight', 'H_L0.01_mean', 'H_L0.01_variance', 'HH_L5_weight', 'HH_L5_mean', 'HH_L5_std', 'HH_L5_magnitude', 'HH_L5_radius', 'HH_L5_covariance', 'HH_L5_pcc', 'HH_L3_weight', 'HH_L3_mean', 'HH_L3_std', 'HH_L3_magnitude', 'HH_L3_radius', 'HH_L3_covariance', 'HH_L3_pcc', 'HH_L1_weight', 'HH_L1_mean', 'HH_L1_std', 'HH_L1_magnitude', 'HH_L1_radius', 'HH_L1_covariance', 'HH_L1_pcc', 'HH_L0.1_weight', 'HH_L0.1_mean', 'HH_L0.1_std', 'HH_L0.1_magnitud

## Data Preprocessing

Following the preprocessing steps outlined in the README.

In [3]:
# Data-quality checks on the raw frame: missing cells, duplicate rows,
# and rows whose weight features are (almost) all zero.

# `isnull().sum()` gives one count per column; the second `.sum()` collapses
# those into a single grand total, so the label says "total", not "per column".
print("Total missing values:")
print(df.isnull().sum().sum())
print("\nNumber of duplicate rows:")
print(df.duplicated().sum())

# Every column whose name contains the substring 'weight'
weight_cols = [col for col in df.columns if 'weight' in col]
print(f"\nFound {len(weight_cols)} weight columns")

# Check for rows where all weights are very small
if weight_cols:
    # NOTE: despite its name, `min_weights` is the row-wise SUM over all weight columns.
    min_weights = df[weight_cols].sum(axis=1)
    print(f"Rows with total weight < 0.001: {(min_weights < 0.001).sum()}")
    print(f"Min total weight: {min_weights.min()}")
    print(f"Max total weight: {min_weights.max()}")

Missing values per column:
0

Number of duplicate rows:
157779

Found 25 weight columns
Rows with total weight < 0.001: 0
Min total weight: 25.0
Max total weight: 252403.3509269681


In [4]:
# Step 1: Remove duplicates
# Keep the first occurrence of every row and discard later exact repeats.
print(f"Original shape: {df.shape}")
df_clean = df[~df.duplicated(keep='first')]
print(f"After removing duplicates: {df_clean.shape}")
print(f"Removed {df.shape[0] - df_clean.shape[0]} duplicate rows")

Original shape: (7062606, 120)
After removing duplicates: (6904827, 120)
Removed 157779 duplicate rows


In [5]:
# Step 4: Balanced sampling approach
# Down-sample both classes to the size of the smaller one, but only when the
# larger class outnumbers the smaller by more than 3:1.
print("\nClass distribution before balancing:")
class_counts = df_clean['is_attack'].value_counts()
print(class_counts)

# Imbalance ratio = larger class count / smaller class count
attack_count, benign_count = class_counts[1], class_counts[0]
imbalance_ratio = max(attack_count, benign_count) / min(attack_count, benign_count)

print(f"\nImbalance ratio: {imbalance_ratio:.2f}:1")

if imbalance_ratio <= 3.0:
    # Mild imbalance: keep every de-duplicated row
    df_balanced = df_clean.copy()
    print("Data is reasonably balanced - no sampling needed")
else:
    # The minority class size is the per-class target
    target_size = min(attack_count, benign_count)

    # Draw `target_size` rows from each class (label 0 first, then label 1),
    # each draw seeded with random_state=42
    benign_data, attack_data = (
        df_clean[df_clean['is_attack'] == class_label].sample(n=target_size, random_state=42)
        for class_label in (0, 1)
    )

    # Stack benign rows on top of attack rows with a fresh 0..N-1 index
    df_balanced = pd.concat([benign_data, attack_data], ignore_index=True)

    print(f"\nBalanced to {target_size} samples per class")
    print(f"Balanced dataset shape: {df_balanced.shape}")
    print("Class distribution after balancing:")
    print(df_balanced['is_attack'].value_counts())


Class distribution before balancing:
is_attack
1    6391327
0     513500
Name: count, dtype: int64

Imbalance ratio: 12.45:1

Balanced to 513500 samples per class
Balanced dataset shape: (1027000, 120)
Class distribution after balancing:
is_attack
0    513500
1    513500
Name: count, dtype: int64


In [6]:
# Step 5: Simple normalization (min-max scaling)
# Rescale every float64/int64 feature column of the balanced frame to [0, 1];
# 'Device' and the 'is_attack' label are never touched.
#
# NOTE(review): the min/max used here come from the whole balanced frame, which is
# only split into train/test in a later cell, so test rows influence the scaling.
numeric_feature_cols = [
    name for name in df_balanced.columns
    if name not in ['Device', 'is_attack'] and df_balanced[name].dtype in ['float64', 'int64']
]

print(f"Normalizing {len(numeric_feature_cols)} numeric columns...")

# Work on a copy so the scaling does not write into df_balanced
df_processed = df_balanced.copy()
for col in numeric_feature_cols:
    col_min, col_max = df_processed[col].min(), df_processed[col].max()

    if col_max == col_min:
        # Constant column: there is no range to divide by, so store zeros
        df_processed[col] = 0
    else:
        df_processed[col] = (df_processed[col] - col_min) / (col_max - col_min)

for report_line in (
    f"\nFinal preprocessed dataset shape: {df_processed.shape}",
    "\nPreprocessing complete!",
    "\nFinal class distribution:",
):
    print(report_line)
print(df_processed['is_attack'].value_counts())

# Save memory by deleting intermediate dataframes.
# NOTE: once these names are gone, this cell and the balancing cell cannot be
# re-run without first re-running the cells that create df_clean / df_balanced.
del df_clean, df_balanced

Normalizing 116 numeric columns...

Final preprocessed dataset shape: (1027000, 120)

Preprocessing complete!

Final class distribution:
is_attack
0    513500
1    513500
Name: count, dtype: int64

Final preprocessed dataset shape: (1027000, 120)

Preprocessing complete!

Final class distribution:
is_attack
0    513500
1    513500
Name: count, dtype: int64


In [49]:
# Step 5: Simple normalization (min-max scaling)
# Same per-column min-max rescaling, applied to the complete raw frame `df`
# (duplicates included, no class balancing); the result is `df_processed_full`.
#
# NOTE(review): the min/max here are computed from `df`, whereas `df_processed`
# (used to fit the models) was scaled with the balanced sample's own min/max.
# Unless both sets of extremes coincide, the same raw value maps to different
# scaled values in the two frames - confirm this is acceptable for evaluation.
numeric_feature_cols = [
    name for name in df.columns
    if name not in ['Device', 'is_attack'] and df[name].dtype in ['float64', 'int64']
]

print(f"Normalizing {len(numeric_feature_cols)} numeric columns...")

# Work on a copy so the raw frame stays unscaled
df_processed_full = df.copy()
for col in numeric_feature_cols:
    col_min, col_max = df_processed_full[col].min(), df_processed_full[col].max()

    if col_max == col_min:
        # Constant column: there is no range to divide by, so store zeros
        df_processed_full[col] = 0
    else:
        df_processed_full[col] = (df_processed_full[col] - col_min) / (col_max - col_min)

for report_line in (
    f"\nFinal preprocessed dataset shape: {df_processed_full.shape}",
    "\nPreprocessing complete!",
    "\nFinal class distribution:",
):
    print(report_line)
print(df_processed_full['is_attack'].value_counts())

Normalizing 116 numeric columns...

Final preprocessed dataset shape: (7062606, 120)

Preprocessing complete!

Final class distribution:
is_attack
1    6506674
0     555932
Name: count, dtype: int64

Final preprocessed dataset shape: (7062606, 120)

Preprocessing complete!

Final class distribution:
is_attack
1    6506674
0     555932
Name: count, dtype: int64


In [7]:
## Decision Tree Learning

# Target vector and feature matrix. The features are whatever numeric columns
# remain once the label and the 'Device' column have been dropped.
y = df_processed['is_attack']
X = df_processed.drop(columns=['is_attack', 'Device']).select_dtypes(include=[np.number])

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# One 80/20 split, stratified on the label so both parts keep the same class mix
from sklearn.model_selection import train_test_split
X_train_sklearn, X_test_sklearn, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Proper train/test split completed:")
for split_line in (
    f"Train shape: {X_train_sklearn.shape}",
    f"Test shape: {X_test_sklearn.shape}",
    f"Train class distribution: {y_train.value_counts(normalize=True)}",
    f"Test class distribution: {y_test.value_counts(normalize=True)}",
):
    print(split_line)

Feature matrix shape: (1027000, 116)
Target vector shape: (1027000,)
Proper train/test split completed:
Train shape: (821600, 116)
Test shape: (205400, 116)
Train class distribution: is_attack
1    0.5
0    0.5
Name: proportion, dtype: float64
Test class distribution: is_attack
1    0.5
0    0.5
Name: proportion, dtype: float64
Proper train/test split completed:
Train shape: (821600, 116)
Test shape: (205400, 116)
Train class distribution: is_attack
1    0.5
0    0.5
Name: proportion, dtype: float64
Test class distribution: is_attack
1    0.5
0    0.5
Name: proportion, dtype: float64


# Training our First Model

In [51]:
## Scikit-Learn Decision Tree 

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Data is already properly split and numeric from the previous cell
print(f"Training set shape: {X_train_sklearn.shape}")
print(f"Test set shape: {X_test_sklearn.shape}")

# Model 1: a shallow, regularised tree
tree_params = dict(
    max_depth=6,               # Tree depth
    min_samples_split=1000,    # Samples a node needs before it may be split
    min_samples_leaf=500,      # Minimum samples per leaf
    max_features='sqrt',       # Feature randomization
    random_state=42,
    class_weight='balanced',
)
dt_sklearn = DecisionTreeClassifier(**tree_params)
dt_sklearn.fit(X_train_sklearn, y_train)

# Predict on the held-out split and on the training split (to compare the two)
y_pred_sklearn = dt_sklearn.predict(X_test_sklearn)
y_pred_train_sklearn = dt_sklearn.predict(X_train_sklearn)

# Same four metrics for both splits, in the order accuracy / precision / recall / F1
metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
test_accuracy, test_precision, test_recall, test_f1 = [
    metric_fn(y_test, y_pred_sklearn) for metric_fn in metric_fns
]
train_accuracy, train_precision, train_recall, train_f1 = [
    metric_fn(y_train, y_pred_train_sklearn) for metric_fn in metric_fns
]

score_captions = ("model accuracy: ", "model precision: ", "model recall: ", "model f1 score: ")

print("TESTING SCORES")
for caption, score in zip(score_captions, (test_accuracy, test_precision, test_recall, test_f1)):
    print(caption, score)

print("\n TRAINING SCORES")
for caption, score in zip(score_captions, (train_accuracy, train_precision, train_recall, train_f1)):
    print(caption, score)

# Confusion matrix layout: rows are true labels, columns are predicted labels
print("\nConfusion Matrix for TESTING:")
cm = confusion_matrix(y_test, y_pred_sklearn)
print(f"True Negatives:  {cm[0,0]:,}")
print(f"False Positives: {cm[0,1]:,}")
print(f"False Negatives: {cm[1,0]:,}")
print(f"True Positives:  {cm[1,1]:,}")

Training set shape: (821600, 116)
Test set shape: (205400, 116)
TESTING SCORES
model accuracy:  0.9994888023369036
model precision:  0.9995228496586914
model recall:  0.9994547224926972
model f1 score:  0.9994887849147732

 TRAINING SCORES
model accuracy:  0.9994559396299902
model precision:  0.9994109586040206
model recall:  0.9995009737098345
model f1 score:  0.9994559641301451

Confusion Matrix for TESTING:
True Negatives:  102,651
False Positives: 49
False Negatives: 56
True Positives:  102,644
TESTING SCORES
model accuracy:  0.9994888023369036
model precision:  0.9995228496586914
model recall:  0.9994547224926972
model f1 score:  0.9994887849147732

 TRAINING SCORES
model accuracy:  0.9994559396299902
model precision:  0.9994109586040206
model recall:  0.9995009737098345
model f1 score:  0.9994559641301451

Confusion Matrix for TESTING:
True Negatives:  102,651
False Positives: 49
False Negatives: 56
True Positives:  102,644


### Testing the first model on our 7 million instance dataset

In [52]:

# Score the currently fitted `dt_sklearn` on the complete 7M-row frame.
#
# NOTE(review): `df_processed_full` is built from the same raw `df` that the
# balanced training sample was drawn from, so it contains the training rows;
# these figures are not a pure hold-out estimate. Its min-max scaling was also
# computed separately from the scaling of the training frame - confirm both.
numeric_cols = df_processed_full.select_dtypes(include=[np.number]).columns
Full_sklearn = df_processed_full[numeric_cols]

# Select exactly the columns the tree was trained on, in the same order,
# instead of relying on "all numeric columns minus the label" happening to match.
Full_sklearn_dropped = df_processed_full[X_train_sklearn.columns]
y_full = Full_sklearn['is_attack']

# This is the full evaluation frame (numeric features plus the label), not a training set
print(f"Full dataset shape: {Full_sklearn.shape}")

# Make predictions
y_pred_sklearn = dt_sklearn.predict(Full_sklearn_dropped)

# Calculate comprehensive metrics
accuracy = accuracy_score(y_full, y_pred_sklearn)
precision = precision_score(y_full, y_pred_sklearn)
recall = recall_score(y_full, y_pred_sklearn)
f1 = f1_score(y_full, y_pred_sklearn)

print("TESTING SCORES ON 7 MILLION")
print("model accuracy: ", accuracy)
print("model precision: ", precision)
print("model recall: ", recall)
print("model f1 score: ", f1)

# Confusion matrix layout: rows are true labels, columns are predicted labels
print("\nConfusion Matrix:")
cm = confusion_matrix(y_full, y_pred_sklearn)
print(f"True Negatives:  {cm[0,0]:,}")
print(f"False Positives: {cm[0,1]:,}")
print(f"False Negatives: {cm[1,0]:,}")
print(f"True Positives:  {cm[1,1]:,}")


Training set shape: (7062606, 117)
TESTING SCORES ON 7 MILLION
model accuracy:  0.9993775668641292
model precision:  0.9998805319694605
model recall:  0.9994438018563708
model f1 score:  0.9996621192135033

Confusion Matrix:
TESTING SCORES ON 7 MILLION
model accuracy:  0.9993775668641292
model precision:  0.9998805319694605
model recall:  0.9994438018563708
model f1 score:  0.9996621192135033

Confusion Matrix:
True Negatives:  555,155
False Positives: 777
False Negatives: 3,619
True Positives:  6,503,055
True Negatives:  555,155
False Positives: 777
False Negatives: 3,619
True Positives:  6,503,055


## Training a second DTL model with new hyperparameters
### (Lower depth, higher minimums)

In [53]:
# Model 2: shallower tree with much larger split/leaf minimums.
# Rebinding `dt_sklearn` replaces the previously fitted tree.
tree_params = dict(
    max_depth=3,                # Tree depth
    min_samples_split=10000,    # Samples a node needs before it may be split
    min_samples_leaf=5000,      # Minimum samples per leaf
    max_features='sqrt',        # Feature randomization
    random_state=42,
    class_weight='balanced',
)
dt_sklearn = DecisionTreeClassifier(**tree_params)
dt_sklearn.fit(X_train_sklearn, y_train)

# Predict on the held-out split and on the training split (to compare the two)
y_pred_sklearn = dt_sklearn.predict(X_test_sklearn)
y_pred_train_sklearn = dt_sklearn.predict(X_train_sklearn)

# Same four metrics for both splits, in the order accuracy / precision / recall / F1
metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
test_accuracy, test_precision, test_recall, test_f1 = [
    metric_fn(y_test, y_pred_sklearn) for metric_fn in metric_fns
]
train_accuracy, train_precision, train_recall, train_f1 = [
    metric_fn(y_train, y_pred_train_sklearn) for metric_fn in metric_fns
]

score_captions = ("model accuracy: ", "model precision: ", "model recall: ", "model f1 score: ")

print("TESTING SCORES")
for caption, score in zip(score_captions, (test_accuracy, test_precision, test_recall, test_f1)):
    print(caption, score)

print("\n TRAINING SCORES")
for caption, score in zip(score_captions, (train_accuracy, train_precision, train_recall, train_f1)):
    print(caption, score)

# Confusion matrix layout: rows are true labels, columns are predicted labels
print("\nConfusion Matrix for TESTING:")
cm = confusion_matrix(y_test, y_pred_sklearn)
print(f"True Negatives:  {cm[0,0]:,}")
print(f"False Positives: {cm[0,1]:,}")
print(f"False Negatives: {cm[1,0]:,}")
print(f"True Positives:  {cm[1,1]:,}")


TESTING SCORES
model accuracy:  0.9855598831548199
model precision:  0.9861184223353024
model recall:  0.984985394352483
model f1 score:  0.98555158270087

 TRAINING SCORES
model accuracy:  0.9851813534566699
model precision:  0.9850172410017058
model recall:  0.985350535540409
model f1 score:  0.9851838600820942

Confusion Matrix for TESTING:
True Negatives:  101,276
False Positives: 1,424
False Negatives: 1,542
True Positives:  101,158


## Training a third DTL model with new hyperparameters
### Hyperparameters recommended by GPT
Prompt : reccomend hyperparameters for a 7 million x 116 matrix for a DTL

In [54]:
# Model 3: deeper tree with very large split/leaf minimums.
# Rebinding `dt_sklearn` replaces the previously fitted tree.
#
# NOTE(review): min_samples_split (12000) is smaller than min_samples_leaf (16000).
# Presumably the leaf minimum is then the only binding constraint, since a split
# must leave at least 16000 samples on each side - verify against the sklearn docs.
tree_params = dict(
    max_depth=16,               # Tree depth
    min_samples_split=12000,    # Samples a node needs before it may be split
    min_samples_leaf=16000,     # Minimum samples per leaf
    max_features='sqrt',        # Feature randomization
    random_state=42,
    class_weight='balanced',
)
dt_sklearn = DecisionTreeClassifier(**tree_params)
dt_sklearn.fit(X_train_sklearn, y_train)

# Predict on the held-out split and on the training split (to compare the two)
y_pred_sklearn = dt_sklearn.predict(X_test_sklearn)
y_pred_train_sklearn = dt_sklearn.predict(X_train_sklearn)

# Same four metrics for both splits, in the order accuracy / precision / recall / F1
metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
test_accuracy, test_precision, test_recall, test_f1 = [
    metric_fn(y_test, y_pred_sklearn) for metric_fn in metric_fns
]
train_accuracy, train_precision, train_recall, train_f1 = [
    metric_fn(y_train, y_pred_train_sklearn) for metric_fn in metric_fns
]

score_captions = ("model accuracy: ", "model precision: ", "model recall: ", "model f1 score: ")

print("TESTING SCORES")
for caption, score in zip(score_captions, (test_accuracy, test_precision, test_recall, test_f1)):
    print(caption, score)

print("\n TRAINING SCORES")
for caption, score in zip(score_captions, (train_accuracy, train_precision, train_recall, train_f1)):
    print(caption, score)

# Confusion matrix layout: rows are true labels, columns are predicted labels
print("\nConfusion Matrix for TESTING:")
cm = confusion_matrix(y_test, y_pred_sklearn)
print(f"True Negatives:  {cm[0,0]:,}")
print(f"False Positives: {cm[0,1]:,}")
print(f"False Negatives: {cm[1,0]:,}")
print(f"True Positives:  {cm[1,1]:,}")


TESTING SCORES
model accuracy:  0.9890408958130477
model precision:  0.9815619157198332
model recall:  0.9968062317429406
model f1 score:  0.9891253411918164

 TRAINING SCORES
model accuracy:  0.9887268743914314
model precision:  0.9806075549384785
model recall:  0.9971738072054528
model f1 score:  0.9888213001182804

Confusion Matrix for TESTING:
True Negatives:  100,777
False Positives: 1,923
False Negatives: 328
True Positives:  102,372


## Model 2 (Unsupervised)
Create a small stratified sample



In [8]:
# Small stratified sample for unsupervised model. This keeps things fast as we start analyzing our outputs
from sklearn.model_selection import train_test_split
MAX_SAMPLE = 50_000
n = len(X_train_sklearn)
if n <= MAX_SAMPLE:
    # Training split already fits within the cap: use all of it
    X_unsup_small, y_unsup_small = X_train_sklearn.copy(), y_train.copy()
else:
    # Send everything beyond MAX_SAMPLE rows to the discarded "test" half of a
    # stratified split; the "train" half is the class-proportional sample we keep.
    test_size = n - MAX_SAMPLE
    X_unsup_small, _, y_unsup_small, _ = train_test_split(
        X_train_sklearn, y_train, test_size=test_size, stratify=y_train, random_state=42
    )
for sample_line in (
    f"Total training rows: {n:,}",
    f"Sampled rows:       {len(X_unsup_small):,}",
    "Class proportion in sample:",
):
    print(sample_line)
print(y_unsup_small.value_counts(normalize=True))

Total training rows: 821,600
Sampled rows:       50,000
Class proportion in sample:
is_attack
0    0.5
1    0.5
Name: proportion, dtype: float64


### Fit PCA on just this small sample.

In [9]:
from sklearn.decomposition import PCA

# n_components=0.95 -> keep 95% of variance.
# Fit on the small sample only and project that sample onto the kept components.
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_unsup_small)

for pca_line in (
    f"Original dimensions: {X_unsup_small.shape}",
    f"PCA dimensions: {X_pca.shape}",
    f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}",
    f"Number of components needed for 95% variance: {pca.n_components_}",
):
    print(pca_line)

Original dimensions: (50000, 116)
PCA dimensions: (50000, 7)
Explained variance ratio: 0.9567
Number of components needed for 95% variance: 7


## KMeans Clustering (k=2) on our sample

In [10]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# Cluster the PCA-projected sample into two groups
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_pca)

# Cluster id of every sample row
train_clusters = kmeans.predict(X_pca)

# Name each cluster after the most frequent true label among its sample rows.
# The labels are used only for this naming step and for scoring, not for fitting.
mapping = {
    cluster_id: y_unsup_small[train_clusters == cluster_id].value_counts().idxmax()
    for cluster_id in np.unique(train_clusters)
}
train_pred = np.array([mapping[cluster_id] for cluster_id in train_clusters])

# Held-out split: reuse the fitted PCA projection, the fitted clusters and the mapping
X_test_pca = pca.transform(X_test_sklearn)
test_clusters = kmeans.predict(X_test_pca)
test_pred = np.array([mapping[cluster_id] for cluster_id in test_clusters])

# Accuracy / precision / recall / F1 for the sample and for the held-out split.
# Note: train_precision, test_precision, etc. reuse names set by the decision-tree cells.
metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
train_acc, train_precision, train_recall, train_f1 = [
    metric_fn(y_unsup_small, train_pred) for metric_fn in metric_fns
]
test_acc, test_precision, test_recall, test_f1 = [
    metric_fn(y_test, test_pred) for metric_fn in metric_fns
]

# Print results
for result_line in (
    f"Training Accuracy: {train_acc:.4f}",
    f"Training Precision: {train_precision:.4f}",
    f"Training Recall: {train_recall:.4f}",
    f"Training F1 Score: {train_f1:.4f}",
    "---",
    f"Test Accuracy: {test_acc:.4f}",
    f"Test Precision: {test_precision:.4f}",
    f"Test Recall: {test_recall:.4f}",
    f"Test F1 Score: {test_f1:.4f}",
):
    print(result_line)

# False positives / false negatives on the held-out split
# (confusion matrix rows are true labels, columns are predicted labels)
cm = confusion_matrix(y_test, test_pred)
fp, fn = cm[0,1], cm[1,0]
print(f"Test False Positives: {fp}")
print(f"Test False Negatives: {fn}")

Training Accuracy: 0.7193
Training Precision: 0.9965
Training Recall: 0.4401
Training F1 Score: 0.6105
---
Test Accuracy: 0.7211
Test Precision: 0.9968
Test Recall: 0.4437
Test F1 Score: 0.6140
Test False Positives: 146
Test False Negatives: 57134
