# Milestone 3

Link to our [ReadME](README.md)

## Imports

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
if 'df' not in globals():
    import pandas as pd
    df = pd.read_csv('main.csv')

## Data Exploration

In [9]:
# Examine the dataset structure
print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
print("\nData types:")
print(df.dtypes.value_counts())
print("\nTarget distribution:")
print(df['is_attack'].value_counts())
print("\nDevice distribution:")
print(df['Device'].value_counts())

Dataset shape: (7062606, 120)

Column names:
['MI_dir_L5_weight', 'MI_dir_L5_mean', 'MI_dir_L5_variance', 'MI_dir_L3_weight', 'MI_dir_L3_mean', 'MI_dir_L3_variance', 'MI_dir_L1_weight', 'MI_dir_L1_mean', 'MI_dir_L1_variance', 'MI_dir_L0.1_weight', 'MI_dir_L0.1_mean', 'MI_dir_L0.1_variance', 'MI_dir_L0.01_weight', 'MI_dir_L0.01_mean', 'MI_dir_L0.01_variance', 'H_L5_weight', 'H_L5_mean', 'H_L5_variance', 'H_L3_weight', 'H_L3_mean', 'H_L3_variance', 'H_L1_weight', 'H_L1_mean', 'H_L1_variance', 'H_L0.1_weight', 'H_L0.1_mean', 'H_L0.1_variance', 'H_L0.01_weight', 'H_L0.01_mean', 'H_L0.01_variance', 'HH_L5_weight', 'HH_L5_mean', 'HH_L5_std', 'HH_L5_magnitude', 'HH_L5_radius', 'HH_L5_covariance', 'HH_L5_pcc', 'HH_L3_weight', 'HH_L3_mean', 'HH_L3_std', 'HH_L3_magnitude', 'HH_L3_radius', 'HH_L3_covariance', 'HH_L3_pcc', 'HH_L1_weight', 'HH_L1_mean', 'HH_L1_std', 'HH_L1_magnitude', 'HH_L1_radius', 'HH_L1_covariance', 'HH_L1_pcc', 'HH_L0.1_weight', 'HH_L0.1_mean', 'HH_L0.1_std', 'HH_L0.1_magnitud

## Data Preprocessing

Following the preprocessing steps outlined in the README.

In [10]:
# Check for missing values and duplicates
print("Missing values per column:")
print(df.isnull().sum().sum())
print("\nNumber of duplicate rows:")
print(df.duplicated().sum())

# Check for any rows with minimal weight (close to 0)
weight_cols = [col for col in df.columns if 'weight' in col]
print(f"\nFound {len(weight_cols)} weight columns")

# Check for rows where all weights are very small
if weight_cols:
    min_weights = df[weight_cols].sum(axis=1)
    print(f"Rows with total weight < 0.001: {(min_weights < 0.001).sum()}")
    print(f"Min total weight: {min_weights.min()}")
    print(f"Max total weight: {min_weights.max()}")

Missing values per column:
0

Number of duplicate rows:
157779

Found 25 weight columns
Rows with total weight < 0.001: 0
Min total weight: 25.0
Max total weight: 252403.3509269681


In [11]:
# Step 1: Remove duplicates
print(f"Original shape: {df.shape}")
df_clean = df.drop_duplicates()
print(f"After removing duplicates: {df_clean.shape}")
print(f"Removed {df.shape[0] - df_clean.shape[0]} duplicate rows")

Original shape: (7062606, 120)
After removing duplicates: (6904827, 120)
Removed 157779 duplicate rows


In [12]:
# Step 4: Balanced sampling approach
print("\nClass distribution before balancing:")
class_counts = df_clean['is_attack'].value_counts()
print(class_counts)

# Calculate class imbalance ratio
attack_count = class_counts[1]
benign_count = class_counts[0]
imbalance_ratio = max(attack_count, benign_count) / min(attack_count, benign_count)

print(f"\nImbalance ratio: {imbalance_ratio:.2f}:1")

# Only balance if significantly imbalanced (>3:1 ratio)
if imbalance_ratio > 3.0:
    # Use the minority class size as target for both classes
    target_size = min(attack_count, benign_count)
    
    # Sample equal amounts from both classes
    benign_data = df_clean[df_clean['is_attack'] == 0].sample(n=target_size, random_state=42)
    attack_data = df_clean[df_clean['is_attack'] == 1].sample(n=target_size, random_state=42)
    
    df_balanced = pd.concat([benign_data, attack_data], ignore_index=True)
    
    print(f"\nBalanced to {target_size} samples per class")
    print(f"Balanced dataset shape: {df_balanced.shape}")
    print("Class distribution after balancing:")
    print(df_balanced['is_attack'].value_counts())
else:
    df_balanced = df_clean.copy()
    print("Data is reasonably balanced - no sampling needed")


Class distribution before balancing:
is_attack
1    6391327
0     513500
Name: count, dtype: int64

Imbalance ratio: 12.45:1

Balanced to 513500 samples per class
Balanced dataset shape: (1027000, 120)
Class distribution after balancing:
is_attack
0    513500
1    513500
Name: count, dtype: int64


In [13]:
# Step 5: Simple normalization (min-max scaling)
# Only normalize numeric feature columns, preserve categorical
numeric_feature_cols = [col for col in df_balanced.columns 
                       if col not in ['Device', 'is_attack'] and df_balanced[col].dtype in ['float64', 'int64']]

print(f"Normalizing {len(numeric_feature_cols)} numeric columns...")

# Simple min-max scaling (vectorized operation)
df_processed = df_balanced.copy()
for col in numeric_feature_cols:
    col_min = df_processed[col].min()
    col_max = df_processed[col].max()
    
    # Avoid division by zero
    if col_max != col_min:
        df_processed[col] = (df_processed[col] - col_min) / (col_max - col_min)
    else:
        df_processed[col] = 0  # All values are the same

print(f"\nFinal preprocessed dataset shape: {df_processed.shape}")
print("\nPreprocessing complete!")
print("\nFinal class distribution:")
print(df_processed['is_attack'].value_counts())

# Save memory by deleting intermediate dataframes
del df_clean, df_balanced

Normalizing 116 numeric columns...

Final preprocessed dataset shape: (1027000, 120)

Preprocessing complete!

Final class distribution:
is_attack
0    513500
1    513500
Name: count, dtype: int64


In [14]:
# Step 5: Simple normalization (min-max scaling)
# Only normalize numeric feature columns, preserve categorical
numeric_feature_cols = [col for col in df.columns 
                       if col not in ['Device', 'is_attack'] and df[col].dtype in ['float64', 'int64']]

print(f"Normalizing {len(numeric_feature_cols)} numeric columns...")

# Simple min-max scaling (vectorized operation)
df_processed_full = df.copy()
for col in numeric_feature_cols:
    col_min = df_processed_full[col].min()
    col_max = df_processed_full[col].max()
    
    # Avoid division by zero
    if col_max != col_min:
        df_processed_full[col] = (df_processed_full[col] - col_min) / (col_max - col_min)
    else:
        df_processed_full[col] = 0  # All values are the same

print(f"\nFinal preprocessed dataset shape: {df_processed_full.shape}")
print("\nPreprocessing complete!")
print("\nFinal class distribution:")
print(df_processed_full['is_attack'].value_counts())

Normalizing 116 numeric columns...

Final preprocessed dataset shape: (7062606, 120)

Preprocessing complete!

Final class distribution:
is_attack
1    6506674
0     555932
Name: count, dtype: int64


In [15]:
## Decision Tree Learning

# Prepare features and target
X = df_processed.drop(['is_attack', 'Device'], axis=1)  # Features only
y = df_processed['is_attack']  # Target

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Manual train/test split (80/20)
np.random.seed(42)
n_samples = len(df_processed)
indices = np.random.permutation(n_samples)
split_idx = int(0.8 * n_samples)

train_idx = indices[:split_idx]
test_idx = indices[split_idx:]

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Train class distribution:\n{y_train.value_counts()}")
print(f"Test class distribution:\n{y_test.value_counts()}")

Feature matrix shape: (1027000, 118)
Target vector shape: (1027000,)

Train set: 821600 samples
Test set: 205400 samples
Train class distribution:
is_attack
0    410917
1    410683
Name: count, dtype: int64
Test class distribution:
is_attack
1    102817
0    102583
Name: count, dtype: int64


In [16]:
## Scikit-Learn Decision Tree 

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Prepare features (only numeric)
print("Preparing features for sklearn...")
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
X_train_sklearn = X_train[numeric_cols]
X_test_sklearn = X_test[numeric_cols]

print(f"Using {len(numeric_cols)} numeric features")
print(f"Training set shape: {X_train_sklearn.shape}")

# Create regularized Decision Tree to prevent overfitting
dt_sklearn = DecisionTreeClassifier(
    max_depth=6,                    # Limit tree depth
    min_samples_split=1000,         # Require many samples to split
    min_samples_leaf=500,           # Large leaf nodes
    max_features='sqrt',            # Feature randomization
    random_state=42,
    class_weight='balanced'         # Handle class imbalance
)

print("Training scikit-learn Decision Tree...")
dt_sklearn.fit(X_train_sklearn, y_train)

# Make predictions
y_pred_sklearn = dt_sklearn.predict(X_test_sklearn)

# Calculate comprehensive metrics
accuracy = accuracy_score(y_test, y_pred_sklearn)
precision = precision_score(y_test, y_pred_sklearn)
recall = recall_score(y_test, y_pred_sklearn)
f1 = f1_score(y_test, y_pred_sklearn)

print("\n" + "="*60)
print("SCIKIT-LEARN DECISION TREE PERFORMANCE")
print("="*60)
print(f"Accuracy:     {accuracy:.4f}")
print(f"Error Rate:   {1-accuracy:.4f}")
print(f"Precision:    {precision:.4f}")
print(f"Recall:       {recall:.4f}")
print(f"F1 Score:     {f1:.4f}")

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_sklearn, target_names=['Benign', 'Attack']))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred_sklearn)
print(f"True Negatives:  {cm[0,0]:,}")
print(f"False Positives: {cm[0,1]:,}")
print(f"False Negatives: {cm[1,0]:,}")
print(f"True Positives:  {cm[1,1]:,}")
print("="*60)

Preparing features for sklearn...
Using 116 numeric features
Training set shape: (821600, 116)
Training scikit-learn Decision Tree...

SCIKIT-LEARN DECISION TREE PERFORMANCE
Accuracy:     0.9996
Error Rate:   0.0004
Precision:    0.9996
Recall:       0.9996
F1 Score:     0.9996

Detailed Classification Report:
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00    102583
      Attack       1.00      1.00      1.00    102817

    accuracy                           1.00    205400
   macro avg       1.00      1.00      1.00    205400
weighted avg       1.00      1.00      1.00    205400


Confusion Matrix:
True Negatives:  102,537
False Positives: 46
False Negatives: 40
True Positives:  102,777


In [17]:
print("Preparing features for sklearn...")
numeric_cols = df_processed_full.select_dtypes(include=[np.number]).columns
Full_sklearn = df_processed_full[numeric_cols]

print(f"Using {len(numeric_cols)} numeric features")
print(f"Training set shape: {Full_sklearn.shape}")

# Make predictions
y_pred_sklearn = dt_sklearn.predict(Full_sklearn)

# Calculate comprehensive metrics
accuracy = accuracy_score(Full_sklearn['is_attack'], y_pred_sklearn)
precision = precision_score(Full_sklearn['is_attack'], y_pred_sklearn)
recall = recall_score(Full_sklearn['is_attack'], y_pred_sklearn)
f1 = f1_score(Full_sklearn['is_attack'], y_pred_sklearn)

print("SCIKIT-LEARN DECISION TREE PERFORMANCE")
print(f"Accuracy:     {accuracy:.4f}")
print(f"Error Rate:   {1-accuracy:.4f}")
print(f"Precision:    {precision:.4f}")
print(f"Recall:       {recall:.4f}")
print(f"F1 Score:     {f1:.4f}")

print("\nDetailed Classification Report:")
print(classification_report(Full_sklearn['is_attack'], y_pred_sklearn, target_names=['Benign', 'Attack']))

print("\nConfusion Matrix:")
cm = confusion_matrix(Full_sklearn['is_attack'], y_pred_sklearn)
print(f"True Negatives:  {cm[0,0]:,}")
print(f"False Positives: {cm[0,1]:,}")
print(f"False Negatives: {cm[1,0]:,}")
print(f"True Positives:  {cm[1,1]:,}")
print("="*60)

Preparing features for sklearn...
Using 117 numeric features
Training set shape: (7062606, 117)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- is_attack


In [None]:
# Cross-Validation with scikit-learn
print("\nPerforming 5-Fold Cross-Validation...")
cv_scores = cross_val_score(dt_sklearn, X_train_sklearn, y_train, cv=5, scoring='f1')

print(f"Cross-Validation F1 Scores: {[f'{score:.4f}' for score in cv_scores]}")
print(f"Mean CV F1 Score: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Check for overfitting
train_accuracy = dt_sklearn.score(X_train_sklearn, y_train)
test_accuracy = accuracy

print(f"\nOverfitting Analysis:")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy:     {test_accuracy:.4f}")
print(f"Gap:               {train_accuracy - test_accuracy:.4f}")

if train_accuracy - test_accuracy > 0.05:
    print("⚠️  Model shows signs of overfitting")
else:
    print("✅ Model shows good generalization")

# Feature importance (top 10)
feature_importance = pd.DataFrame({
    'feature': X_train_sklearn.columns,
    'importance': dt_sklearn.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nTop 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))


Performing 5-Fold Cross-Validation...
Cross-Validation F1 Scores: ['0.9997', '0.9997', '0.9997', '0.9997', '0.9997']
Mean CV F1 Score: 0.9997 ± 0.0000

Overfitting Analysis:
Training Accuracy: 0.9996
Test Accuracy:     0.9996
Gap:               -0.0000
✅ Model shows good generalization

Top 10 Most Important Features:
           feature  importance
    HpHp_L1_radius    0.564556
 HpHp_L0.01_weight    0.144786
    H_L0.01_weight    0.118939
    HH_jit_L3_mean    0.073166
       HH_L0.1_pcc    0.049786
     H_L0.1_weight    0.040599
  MI_dir_L3_weight    0.007969
MI_dir_L0.1_weight    0.000131
    HpHp_L3_radius    0.000024
 HpHp_L3_magnitude    0.000014
