# CIC IoMT 2024 Dataset - Data Preprocessing

This notebook handles data preprocessing steps:
- **Duplicate Removal**: Clean duplicate rows from datasets
- **Label Encoding**: Convert categorical labels to numerical format
- **Feature Scaling**: Normalize/standardize features for model training
- **Train/Validation Split**: Split training data into train and validation sets
- **Data Saving**: Save preprocessed datasets for model training


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
from pathlib import Path
import pickle
import joblib
from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

# Locate the project root: if the kernel was started inside notebooks/,
# the root is one directory up; otherwise the working directory is the root.
_cwd = Path.cwd()
BASE_DIR = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Put <root>/src at the front of the import path so local modules resolve first
_src_dir = BASE_DIR / 'src'
sys.path.insert(0, str(_src_dir))

# Import common utilities
from utils import get_project_paths, load_datasets, remove_exact_duplicates, comprehensive_outlier_check

# Pandas display: never truncate columns, but show at most 10 rows per frame
display_options = {
    'display.max_columns': None,
    'display.max_rows': 10,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Suppress every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: dark-grid matplotlib style plus the "husl" seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")


## 1. Load Dataset and Setup Paths


In [None]:
# Look up the project directories provided by the shared utilities
paths = get_project_paths()
DATA_DIR, OUTPUT_DIR, MODELS_DIR = (
    paths[key] for key in ('DATA_DIR', 'OUTPUT_DIR', 'MODELS_DIR')
)

# Load the train and test frames and report their shapes
train_df, test_df = load_datasets()
print(f"✓ Training: {train_df.shape} | Test: {test_df.shape}")


## 2. Duplicate Removal

**Note**: From EDA, we found 2.6M+ duplicate rows. We'll apply the duplicate removal function here.


In [None]:
# Function is imported from utils.py


In [None]:
# De-duplicate both splits with identical settings; each call returns the
# cleaned frame together with a statistics object that is saved later on.
print("Removing duplicates...")
dedup_options = {'keep': 'first', 'return_stats': True}
train_df, train_stats = remove_exact_duplicates(train_df, **dedup_options)
test_df, test_stats = remove_exact_duplicates(test_df, **dedup_options)
print(f"✓ Training: {len(train_df):,} rows | Test: {len(test_df):,} rows")


## 3. Separate Features and Labels


In [None]:
# Every column other than the target column 'label' is a model input
all_columns = train_df.columns
feature_cols = all_columns[all_columns != 'label'].tolist()

# Use the training column list for both splits so the feature order matches
X_train, y_train = train_df[feature_cols], train_df['label']
X_test, y_test = test_df[feature_cols], test_df['label']

print(f"✓ {len(feature_cols)} features | Train: {X_train.shape[0]:,} | Test: {X_test.shape[0]:,}")


## 4. Label Encoding

Convert categorical labels to numerical format for model training.


In [None]:
# Fit the label -> integer mapping on the training labels and encode them in one step
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Encode the test labels with the mapping learned from the training labels
y_test_encoded = label_encoder.transform(y_test)

# classes_ holds the original label values; position i corresponds to encoded class i
class_names = label_encoder.classes_
n_classes = class_names.shape[0]

print(f"✓ {n_classes} classes encoded")


## 4.1. Outlier Detection

Detect outliers to determine the best scaling method.

**Note**: The `comprehensive_outlier_check()` function is imported from `src/utils.py` (see imports at the top of the notebook).


In [None]:
# comprehensive_outlier_check is imported from utils.py

# Run the IQR and z-score checks over every feature column
print("Detecting outliers...")
outlier_results = comprehensive_outlier_check(X_train, feature_cols, methods=['iqr', 'zscore'])

# One (feature, IQR %, z-score %) record per feature, in feature order
outlier_summary = [
    (
        col,
        outlier_results[col]['iqr']['percentage'],
        outlier_results[col]['zscore']['percentage'],
    )
    for col in feature_cols
]

# Per-method percentage lists and their averages (used later to pick a scaler)
iqr_outlier_percentages = [record[1] for record in outlier_summary]
zscore_outlier_percentages = [record[2] for record in outlier_summary]
avg_iqr_outliers = np.mean(iqr_outlier_percentages)
avg_zscore_outliers = np.mean(zscore_outlier_percentages)

print(f"✓ Outlier detection completed")
print(f"  Average IQR outliers: {avg_iqr_outliers:.2f}%")
print(f"  Average Z-score outliers: {avg_zscore_outliers:.2f}%")

# Rank features by IQR outlier share (highest first) and list the ten worst
outlier_summary.sort(key=lambda record: record[1], reverse=True)
print(f"\nTop 10 features with most outliers (IQR method):")
for col, iqr_pct, zscore_pct in outlier_summary[:10]:
    print(f"  {col:<30s}: IQR={iqr_pct:6.2f}% | Z-score={zscore_pct:6.2f}%")


In [None]:
# Number of training samples per encoded class, ordered by class index
class_counts = pd.Series(y_train_encoded).value_counts().sort_index()

# Bar chart drawn through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(range(len(class_counts)), class_counts.values, color='steelblue', alpha=0.7)
ax.set_xlabel('Class Index', fontsize=12)
ax.set_ylabel('Number of Samples', fontsize=12)
ax.set_title('Class Distribution After Encoding', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

# Headline numbers that show how imbalanced the classes are
print(f"Class stats: Min={class_counts.min():,} | Max={class_counts.max():,} | Mean={class_counts.mean():.0f}")


## 5. Feature Scaling (Auto-selected based on Outliers)

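Scaling method is automatically selected based on the outlier detection results from Section 4.1:
- **RobustScaler**: If the average IQR *or* Z-score outlier percentage across features is > 5% (robust to outliers)
- **StandardScaler**: If both averages are ≤ 5% (good for roughly normal distributions)
- **MinMaxScaler**: Alternative option (scales to [0,1] range); it is imported but never chosen by the automatic selection below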


In [None]:
# Auto-select scaling method based on outlier detection
OUTLIER_THRESHOLD = 5.0  # Percentage threshold

# Parameters of the train/validation split performed in Section 6.
# They MUST stay identical to the values used by that cell (test_size=0.2,
# random_state=42, stratify=y_train_encoded) so that the rows used to fit the
# scaler are exactly the rows that end up in the final training split.
SCALER_FIT_VAL_FRACTION = 0.2
SCALER_FIT_RANDOM_STATE = 42

# RobustScaler when either outlier measure is high, StandardScaler otherwise
if avg_iqr_outliers > OUTLIER_THRESHOLD:
    SCALING_METHOD = 'robust'
    scaler = RobustScaler()
    reason = f"High outliers detected ({avg_iqr_outliers:.2f}%)"
elif avg_zscore_outliers > OUTLIER_THRESHOLD:
    SCALING_METHOD = 'robust'
    scaler = RobustScaler()
    reason = f"High outliers detected ({avg_zscore_outliers:.2f}%)"
else:
    SCALING_METHOD = 'standard'
    scaler = StandardScaler()
    reason = f"Low outliers ({avg_iqr_outliers:.2f}%)"

print(f"Selected scaling method: {SCALING_METHOD.upper()}")
print(f"Reason: {reason}")

# Fit the scaler ONLY on the rows that will form the final training split.
# Fitting on all of X_train would let validation rows influence the scaling
# statistics (data leakage), making validation scores slightly optimistic.
# Splitting row positions with the same size/seed/stratification as the later
# train_test_split call yields the same partition.
fit_positions, _ = train_test_split(
    np.arange(len(X_train)),
    test_size=SCALER_FIT_VAL_FRACTION,
    random_state=SCALER_FIT_RANDOM_STATE,
    stratify=y_train_encoded
)
scaler.fit(X_train.iloc[fit_positions])

# Apply the fitted transform to every row; the original row index and the
# column names are preserved so later cells can keep using the frames as-is.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=feature_cols,
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=feature_cols,
    index=X_test.index
)

print(f"✓ Scaling completed")


## 6. Train/Validation Split

Split the training data into train and validation sets for model evaluation during training.


In [None]:
# Hold out 20% of the scaled training rows for validation.
# Stratifying on the encoded labels keeps class proportions equal in both parts,
# and the fixed seed makes the partition reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y_train_encoded)
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_scaled, y_train_encoded, **split_options
)

print(f"✓ Split: Train={len(X_train_final):,} | Val={len(X_val):,} | Test={len(X_test_scaled):,}")


## 7. Save Preprocessed Data

Save the preprocessed datasets and preprocessing objects for later use.


In [None]:
# Save preprocessed datasets
# Each split is written twice:
#   * .npy     - plain numpy arrays (for TensorFlow/Keras)
#   * .parquet - pandas frames (for pandas/analysis)

# Create the output directory if it does not exist yet, so the first run on a
# fresh checkout does not fail with FileNotFoundError.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# (features, labels) per split; the labels are numpy arrays in row order
splits_to_save = {
    'train': (X_train_final, y_train_final),
    'val': (X_val, y_val),
    'test': (X_test_scaled, y_test_encoded),
}

for split_name, (features, labels) in splits_to_save.items():
    # Guard against features and labels drifting out of sync
    assert len(features) == len(labels), f"{split_name}: feature/label length mismatch"

    # Numpy arrays
    np.save(OUTPUT_DIR / f'X_{split_name}.npy', features.values)
    np.save(OUTPUT_DIR / f'y_{split_name}.npy', labels)

    # Parquet files. The label frame reuses the feature frame's index so that
    # X_*.parquet and y_*.parquet can be joined by index as well as by position
    # (previously the labels got a fresh 0..n-1 index while the features kept
    # the original, shuffled row index).
    features.to_parquet(OUTPUT_DIR / f'X_{split_name}.parquet')
    label_frame = pd.Series(labels, index=features.index).to_frame('label')
    label_frame.to_parquet(OUTPUT_DIR / f'y_{split_name}.parquet')

print("✓ Data files saved")


In [None]:
# Save preprocessing objects
# Create the models directory if it does not exist yet, so the dumps below do
# not fail with FileNotFoundError on a fresh checkout.
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

# Fitted transformers, needed to preprocess new data exactly like the training data
joblib.dump(scaler, MODELS_DIR / 'scaler.pkl')
joblib.dump(label_encoder, MODELS_DIR / 'label_encoder.pkl')

# Plain-Python copy of the class labels: .tolist() converts numpy scalars to
# built-in types, so both pickles below store the labels in the same form.
class_name_list = class_names.tolist()

# Encoded class index -> original label
with open(MODELS_DIR / 'class_mapping.pkl', 'wb') as f:
    pickle.dump(dict(enumerate(class_name_list)), f)

# Metadata describing this preprocessing run
with open(MODELS_DIR / 'preprocessing_info.pkl', 'wb') as f:
    pickle.dump({
        'scaling_method': SCALING_METHOD,
        'n_features': len(feature_cols),
        'n_classes': n_classes,
        'feature_names': feature_cols,
        'class_names': class_name_list,
        'train_size': len(X_train_final),
        'val_size': len(X_val),
        'test_size': len(X_test_scaled),
        'train_stats': train_stats,
        'test_stats': test_stats
    }, f)

print("✓ Preprocessing objects saved")


## 8. Summary and Next Steps


In [None]:
# Final recap: split sizes, feature/class counts, chosen scaler and output locations
banner = "=" * 80
summary_lines = [
    banner,
    "PREPROCESSING SUMMARY",
    banner,
    f"Train: {len(X_train_final):,} | Val: {len(X_val):,} | Test: {len(X_test_scaled):,}",
    f"Features: {len(feature_cols)} | Classes: {n_classes} | Scaling: {SCALING_METHOD}",
    f"✓ All files saved to {OUTPUT_DIR} and {MODELS_DIR}",
    banner,
]
print("\n".join(summary_lines))
