# Milestone 3

Link to our [README](README.md)

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset into `df`; 'main.csv' is a relative path, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv('main.csv')

## Data Exploration

In [3]:
# Structural overview of the raw frame: shape first, then a series of
# (heading, summary) pairs printed one after another.
print("Dataset shape:", df.shape)

for heading, summary in (
    ("\nColumn names:", df.columns.tolist()),
    ("\nFirst few rows:", df.head()),
    ("\nData types:", df.dtypes.value_counts()),
    ("\nTarget distribution:", df['is_attack'].value_counts()),
    ("\nDevice distribution:", df['Device'].value_counts()),
):
    # Heading on its own line, followed by the summary's text representation
    print(heading, summary, sep="\n")

Dataset shape: (7062606, 120)

Column names:
['MI_dir_L5_weight', 'MI_dir_L5_mean', 'MI_dir_L5_variance', 'MI_dir_L3_weight', 'MI_dir_L3_mean', 'MI_dir_L3_variance', 'MI_dir_L1_weight', 'MI_dir_L1_mean', 'MI_dir_L1_variance', 'MI_dir_L0.1_weight', 'MI_dir_L0.1_mean', 'MI_dir_L0.1_variance', 'MI_dir_L0.01_weight', 'MI_dir_L0.01_mean', 'MI_dir_L0.01_variance', 'H_L5_weight', 'H_L5_mean', 'H_L5_variance', 'H_L3_weight', 'H_L3_mean', 'H_L3_variance', 'H_L1_weight', 'H_L1_mean', 'H_L1_variance', 'H_L0.1_weight', 'H_L0.1_mean', 'H_L0.1_variance', 'H_L0.01_weight', 'H_L0.01_mean', 'H_L0.01_variance', 'HH_L5_weight', 'HH_L5_mean', 'HH_L5_std', 'HH_L5_magnitude', 'HH_L5_radius', 'HH_L5_covariance', 'HH_L5_pcc', 'HH_L3_weight', 'HH_L3_mean', 'HH_L3_std', 'HH_L3_magnitude', 'HH_L3_radius', 'HH_L3_covariance', 'HH_L3_pcc', 'HH_L1_weight', 'HH_L1_mean', 'HH_L1_std', 'HH_L1_magnitude', 'HH_L1_radius', 'HH_L1_covariance', 'HH_L1_pcc', 'HH_L0.1_weight', 'HH_L0.1_mean', 'HH_L0.1_std', 'HH_L0.1_magnitud

## Data Preprocessing

This section follows the preprocessing steps outlined in the README.

In [4]:
# Data-quality checks on the raw frame: missing values, duplicate rows,
# and rows whose combined weight is close to zero.

# `.isnull().sum().sum()` collapses the per-column counts into one grand
# total, so the label says "total" rather than "per column".
print("Total missing values:")
print(df.isnull().sum().sum())
print("\nNumber of duplicate rows:")
print(df.duplicated().sum())

# Collect every column whose name contains the substring 'weight'
weight_cols = [col for col in df.columns if 'weight' in col]
print(f"\nFound {len(weight_cols)} weight columns")

# Sum the weight columns row-wise and report rows whose total falls below
# 0.001, together with the smallest and largest row totals.
if weight_cols:
    min_weights = df[weight_cols].sum(axis=1)  # per-row total (name kept for later cells)
    print(f"Rows with total weight < 0.001: {(min_weights < 0.001).sum()}")
    print(f"Min total weight: {min_weights.min()}")
    print(f"Max total weight: {min_weights.max()}")

Missing values per column:
0

Number of duplicate rows:
157779

Found 25 weight columns
Rows with total weight < 0.001: 0
Min total weight: 25.0
Max total weight: 252403.3509269681


In [5]:
# Step 1: Drop exact duplicate rows. The result goes into a new frame
# (`df_clean`); the raw `df` is not modified.
df_clean = df.drop_duplicates()
duplicates_removed = len(df) - len(df_clean)

print(f"Original shape: {df.shape}")
print(f"After removing duplicates: {df_clean.shape}")
print(f"Removed {duplicates_removed} duplicate rows")

Original shape: (7062606, 120)
After removing duplicates: (6904827, 120)
Removed 157779 duplicate rows


In [None]:
# Step 2: Drop rows whose summed weight is negligible (below 0.001)
weight_cols = [name for name in df_clean.columns if 'weight' in name]

if not weight_cols:
    print("No weight columns found - skipping weight filtering")
else:
    # Row-wise sum over every weight column
    total_weight = df_clean[weight_cols].sum(axis=1)

    # True for the rows that reach the threshold and are therefore kept
    meaningful_weight_mask = total_weight >= 0.001
    df_clean = df_clean.loc[meaningful_weight_mask]

    print(f"Removed {(~meaningful_weight_mask).sum()} rows with minimal weight")
    print(f"Shape after weight filtering: {df_clean.shape}")

In [None]:
# Step 4: Simple data balancing (cheap sampling approach)
#
# Whichever class is larger is downsampled to at most 2x the size of the
# smaller class. The imbalance check is symmetric, so a dataset dominated
# by attack rows is balanced too instead of being reported as "reasonably
# balanced". The benign-majority case still yields exactly the same sample
# (same n, same random_state).
print("\nClass distribution before balancing:")
print(df_clean['is_attack'].value_counts())

# Get class counts (cast to plain ints so they can be passed to .sample)
attack_count = int((df_clean['is_attack'] == 1).sum())
benign_count = int((df_clean['is_attack'] == 0).sum())

minority_count = min(attack_count, benign_count)
majority_count = max(attack_count, benign_count)

if minority_count == 0:
    # With one class absent, sampling "2x the minority" would keep zero rows
    # and discard the whole dataset, so leave the data as it is.
    df_balanced = df_clean.copy()
    print("Only one class present - skipping balancing")
elif majority_count > minority_count * 2:  # Only balance if very imbalanced
    # Downsample the majority to 2x the minority (still manageable for computation)
    target_majority = minority_count * 2

    benign_data = df_clean[df_clean['is_attack'] == 0]
    attack_data = df_clean[df_clean['is_attack'] == 1]
    if benign_count > attack_count:
        benign_data = benign_data.sample(n=target_majority, random_state=42)
    else:
        attack_data = attack_data.sample(n=target_majority, random_state=42)

    # Benign rows first, then attack rows; the index is rebuilt from 0
    df_balanced = pd.concat([benign_data, attack_data], ignore_index=True)

    print(f"\nBalanced dataset shape: {df_balanced.shape}")
    print("Class distribution after balancing:")
    print(df_balanced['is_attack'].value_counts())
else:
    df_balanced = df_clean.copy()
    print("Data is reasonably balanced - no sampling needed")

In [None]:
# Step 5: Min-max scale the numeric feature columns.
# 'Device' and the 'is_attack' label are excluded, as is any column whose
# dtype is not float64 or int64.
numeric_feature_cols = [
    name
    for name, dtype in df_balanced.dtypes.items()
    if name not in ['Device', 'is_attack'] and dtype in ['float64', 'int64']
]

print(f"Normalizing {len(numeric_feature_cols)} numeric columns...")

# Work on a copy so the balanced frame itself is not rescaled
df_processed = df_balanced.copy()
for col in numeric_feature_cols:
    values = df_processed[col]
    lo, hi = values.min(), values.max()

    if hi == lo:
        # Constant column: the range is zero, so write 0 instead of dividing
        df_processed[col] = 0
        continue

    df_processed[col] = (values - lo) / (hi - lo)

print(f"\nFinal preprocessed dataset shape: {df_processed.shape}")
print("\nPreprocessing complete!")
print("\nFinal class distribution:")
print(df_processed['is_attack'].value_counts())

# Release the intermediate frames; only `df_processed` is needed from here on
del df_clean, df_balanced