---
## 1. Import Libraries & Load Data

In [None]:
# Setup cell: imports the libraries for this preprocessing notebook.

# Data manipulation
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer

# Utilities
import glob
import joblib
import warnings
# NOTE: 'ignore' with no category or module argument silences every warning
# raised from this point on, including deprecation warnings from pandas and
# scikit-learn. It is installed after the imports above, so it does not hide
# their import-time warnings, but it does cover matplotlib and seaborn below.
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Visible confirmation that the cell ran to completion.
print("✅ Libraries imported successfully!")

In [None]:
# TODO: Load the dataset
# data_files = glob.glob('../data/UNSW-NB15_*.csv')
# df = pd.concat([pd.read_csv(file) for file in sorted(data_files)], ignore_index=True)
# print(f"Loaded {len(df):,} records with {df.shape[1]} features")

---
## 2. Handle Missing Values

In [None]:
# TODO: Check missing values again
# missing_summary = df.isnull().sum()
# print("Missing values per column:")
# print(missing_summary[missing_summary > 0])

In [None]:
# TODO: Strategy for handling missing values
# Option 1: Drop columns with >50% missing
# threshold = 0.5
# cols_to_drop = missing_summary[missing_summary > len(df) * threshold].index
# df = df.drop(columns=cols_to_drop)

# Option 2: Impute numerical features with median
# numerical_cols = df.select_dtypes(include=[np.number]).columns
# imputer = SimpleImputer(strategy='median')
# df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

# Option 3: Impute categorical features with mode
# categorical_cols = df.select_dtypes(include=['object']).columns
# for col in categorical_cols:
#     df[col] = df[col].fillna(df[col].mode()[0])  # assign back; in-place fillna on a column selection is deprecated

# print("✅ Missing values handled")

---
## 3. Outlier Treatment

In [None]:
# TODO: Define outlier treatment strategy
# Option 1: Capping (Winsorization) - clip to percentiles
# def cap_outliers(data, column, lower_percentile=0.01, upper_percentile=0.99):
#     lower_cap = data[column].quantile(lower_percentile)
#     upper_cap = data[column].quantile(upper_percentile)
#     data[column] = data[column].clip(lower=lower_cap, upper=upper_cap)
#     return data

# # Apply to selected numerical features
# features_to_cap = ['dur', 'sbytes', 'dbytes', 'spkts', 'dpkts']  # example
# for col in features_to_cap:
#     if col in df.columns:
#         df = cap_outliers(df, col)

# Option 2: Log transformation for skewed features
# skewed_features = ['sbytes', 'dbytes']  # example
# for col in skewed_features:
#     if col in df.columns:
#         df[f'{col}_log'] = np.log1p(df[col])  # log1p handles zeros

# print("✅ Outliers treated")

---
## 4. Feature Engineering

Create new features that might improve model performance.

In [None]:
# TODO: Engineer new features

# Example 1: Packet rate
# if {'spkts', 'dpkts', 'dur'}.issubset(df.columns):
#     df['spkts_rate'] = df['spkts'] / (df['dur'] + 1e-6)  # avoid division by zero
#     df['dpkts_rate'] = df['dpkts'] / (df['dur'] + 1e-6)

# Example 2: Byte ratios
# if 'sbytes' in df.columns and 'dbytes' in df.columns:
#     df['byte_ratio'] = df['sbytes'] / (df['dbytes'] + 1)

# Example 3: Total traffic volume
# df['total_bytes'] = df['sbytes'] + df['dbytes']
# df['total_pkts'] = df['spkts'] + df['dpkts']

# Example 4: Average packet size
# df['avg_pkt_size'] = df['total_bytes'] / (df['total_pkts'] + 1)

# Example 5: Time-based features
# if 'stime' in df.columns:
#     df['hour'] = pd.to_datetime(df['stime'], unit='s').dt.hour
#     df['is_night'] = df['hour'].apply(lambda x: 1 if x >= 22 or x <= 6 else 0)

# print("✅ Feature engineering completed")
# print(f"New shape: {df.shape}")

---
## 5. Encode Categorical Variables

In [None]:
# TODO: Identify categorical columns
# categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
# # Exclude target-related columns if present
# categorical_cols = [col for col in categorical_cols if col not in ['attack_cat']]
# print(f"Categorical columns to encode: {categorical_cols}")

In [None]:
# TODO: Encode categorical variables

# Option 1: Label Encoding for low-cardinality features (codes follow sorted label order, not a real ordinal ranking)
# label_encoders = {}
# for col in categorical_cols:
#     le = LabelEncoder()
#     df[col] = le.fit_transform(df[col].astype(str))
#     label_encoders[col] = le

# Option 2: One-Hot Encoding for nominal features
# df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# print("✅ Categorical variables encoded")
# print(f"Final shape: {df.shape}")

---
## 6. Feature Scaling

Normalize features for algorithms sensitive to scale (SVC, ANN, Logistic Regression).

In [None]:
# TODO: Separate features and target
# X = df.drop(columns=['label', 'attack_cat'], errors='ignore')
# y = df['label']

# # Remove id column if exists
# if 'id' in X.columns:
#     X = X.drop(columns=['id'])

# print(f"Feature matrix shape: {X.shape}")
# print(f"Target vector shape: {y.shape}")

In [None]:
# TODO: Note - Scaling must be done AFTER the train-test split to prevent data leakage
# The scaler is fitted on the training set in Section 7, once the split has been made
# For now, just verify data is ready

# print("Feature names:")
# print(X.columns.tolist())

---
## 7. Train-Test Split (70-30 Stratified)

Split data while maintaining class distribution.

In [None]:
# TODO: Stratified train-test split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, 
#     test_size=0.30, 
#     random_state=42, 
#     stratify=y
# )

# print("Train set size:", X_train.shape)
# print("Test set size:", X_test.shape)
# print("\nTrain set class distribution:")
# print(y_train.value_counts(normalize=True))
# print("\nTest set class distribution:")
# print(y_test.value_counts(normalize=True))

In [None]:
# TODO: Fit the scaler on the training set only, then apply it to both the train and test sets
# scaler = StandardScaler()
# # OR use RobustScaler if data has outliers:
# # scaler = RobustScaler()

# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)  # Use fitted scaler, don't fit again!

# # Convert back to DataFrame for easier handling
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# print("✅ Feature scaling completed")

---
## 8. Save Processed Data

In [None]:
# TODO: Save processed datasets for modeling
# X_train_scaled.to_csv('../data/X_train.csv', index=False)
# X_test_scaled.to_csv('../data/X_test.csv', index=False)
# y_train.to_csv('../data/y_train.csv', index=False)
# y_test.to_csv('../data/y_test.csv', index=False)

# # Save scaler for future use
# joblib.dump(scaler, '../models/scaler.pkl')

# print("✅ Processed data saved to ../data directory")
# print("✅ Scaler saved to ../models directory")

---
## Summary

### Preprocessing Steps Completed:
1. ✅ Handled missing values
2. ✅ Treated outliers
3. ✅ Engineered new features
4. ✅ Encoded categorical variables
5. ✅ Scaled numerical features (after the split; scaler fitted on the training set only)
6. ✅ Split data (70-30 stratified)
7. ✅ Saved processed datasets

### Ready for Modeling!
- Train set: **X_train_scaled**, **y_train**
- Test set: **X_test_scaled**, **y_test**
- Feature count: [Fill in after processing]

---
**Proceed to:** `03_model_training_evaluation.ipynb`