### TODO: Data Preprocessing

#### Primary tasks (must-have)
- [ ] Load & Clean the NF-UNSW-NB15-v3 dataset (or skip if confirmed pre-cleaned) and log basic stats via `df.shape` / `df.info()`.
- [ ] Load the synthetic data for checking 
- [ ] Define target label column (e.g., `Label` / `attack_cat`) and check class imbalance via `value_counts(normalize=True)`.
- [ ] Double-check for nulls, duplicates, and obvious inconsistencies; visualize anomalies (e.g., `df.isna().sum().plot(kind="bar")`).
- [ ] Validate column types: convert numeric columns, standardize categorical dtypes, fix malformed values (ensure pandas/models interpret data correctly).
- [ ] Split data using **Stratified Sampling** to maintain the same percentage of rare attack classes in train/validation/test sets.
- [ ] Check feature correlations; remove or merge highly correlated columns to reduce redundancy and potential leakage.
- [ ] Save train/validation/test splits as CSV files in a structured folder (separate for real and synthetic datasets) — e.g., in Google Drive for reproducibility.
- [ ] Data split percentages:
  - **Real Data:** Training 80%, Validation 10%, Test 10%
  - **Synthetic Data:** Training 10–20%, Validation 5%, Test 0%

#### Secondary tasks (second essentials)
- [ ] Produce a quick EDA snapshot (pairplots, histograms) to highlight feature distributions and potential distribution shifts.
- [ ] Ensure deterministic splits: set a shared `RANDOM_STATE` (e.g., 42) in all splitters (`train_test_split`, `StratifiedKFold`) to guarantee reproducibility across runs.

### SE2: DATA PREP FINAL

#### colab : https://colab.research.google.com/drive/1YJh5SvbtX8iQJc6XtLddaj1fEOeMeBlX?usp=sharing

#### drive : https://drive.google.com/drive/folders/1i3ZL1jffdgfY4O6AHmDrZRKY8WewUWTB?usp=sharing 
 
 üì¢üì¢üì¢ (pls read):
- How the synthetic data was split: Count the Real Data > Calculate the Ratio > Apply Ratio to Synthetic Data. The train/validation ratio of the real data was applied to the synthetic split in order to maximize the number of training rows.
- The splitting is stratified by 'Attack' instead of just 'Label' to ensure that rare attack categories keep the same proportion in every split.
- For the synthetic splits, the columns were aligned to the 36 columns of the real data: a filter was applied to the synthetic splits to make sure they match exactly. This ensures that the model won't crash due to a shape mismatch during training.

In [None]:
#1. CLEAN THE DATA

# Removes identifier/timestamp columns from the real dataset, shrinks the
# numeric dtypes, and prints a report of missing values, dtypes and whether
# the IP columns are gone.
import pandas as pd
import numpy as np

# Work on a copy so the already-loaded `data` frame is left untouched.
data_clean = data.copy()

# Columns treated as noise/leakage: IPs, ports, and exact timestamps.
cols_to_drop = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT', 'L4_DST_PORT',
                'FLOW_START_MILLISECONDS', 'FLOW_END_MILLISECONDS']
# Only names that actually exist are passed on, so drop() cannot raise.
droppable = [name for name in cols_to_drop if name in data_clean.columns]
data_clean = data_clean.drop(columns=droppable)
print("‚úÖ Identification noise and timestamps removed.")

# Downcast 64-bit integers first, then 64-bit floats, to reduce memory use.
for wide_dtype, downcast_kind in (('int64', 'integer'), ('float64', 'float')):
    for name in data_clean.select_dtypes(include=[wide_dtype]).columns:
        data_clean[name] = pd.to_numeric(data_clean[name], downcast=downcast_kind)
print("‚úÖ Memory optimized.")

# Null counts are computed once and reused by every report line below.
null_counts = data_clean.isnull().sum()
total_missing = null_counts.sum()
banner = "=" * 60

print(f"\nFinal Feature Count: {data_clean.shape[1]}")
print("Missing Values Check:", total_missing)

# Per-column breakdown, limited to columns that have at least one null.
print("\n" + banner)
print("Missing values per column:")
print(banner)
missing_values = null_counts[null_counts > 0].sort_values(ascending=False)

if missing_values.empty:
    print("‚úÖ No missing values found!")
else:
    total_cells = data_clean.shape[0] * data_clean.shape[1]
    print(missing_values)
    print(f"\nTotal missing values: {total_missing}")
    print(f"Percentage of data with missing values: {(total_missing / total_cells * 100):.2f}%")

# How many columns ended up with each dtype after downcasting.
print("\n" + banner)
print("Data types:")
print(banner)
print(data_clean.dtypes.value_counts())

# Confirm that both IP address columns were removed.
print("\n" + banner)
print("IP columns still present?")
print(banner)
ip_columns = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR']
for name in ip_columns:
    still_there = name in data_clean.columns
    print(f"‚ùå {name}: Still present" if still_there else f"‚úÖ {name}: Already removed")

In [None]:
#2. DROPPING NANS INSTEAD OF FILLING IT WITH 0 (it will create noise if filled with 0)

# Rows with a missing SRC_TO_DST_SECOND_BYTES value are removed outright.
nan_column = 'SRC_TO_DST_SECOND_BYTES'
data_clean = data_clean.dropna(subset=[nan_column])

# Re-count nulls over the whole frame to confirm nothing else is missing.
remaining_missing = data_clean.isnull().sum().sum()
print(f"Rows remaining: {len(data_clean)}")
print(f"Total missing values now: {remaining_missing}")

if remaining_missing == 0:
    print("Dataset Cleaned")

In [None]:
#3. CHECKS WHETHER THERE ARE SIMILAR COLUMNS

# Plots a correlation heatmap of the numeric columns and lists the columns
# whose absolute correlation with an earlier column exceeds 0.95.
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

numeric_df = data_clean.select_dtypes(include=[np.number])

# Correlate on a fixed-seed sample to keep the computation cheap.
# DataFrame.sample raises ValueError when asked for more rows than exist,
# so the sample size is capped at the number of available rows.
sample_size = min(100000, len(numeric_df))
corr_matrix = numeric_df.sample(sample_size, random_state=42).corr()

plt.figure(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("ForenXAI: Feature Correlation Heatmap")
plt.show()

# Keep only the strict upper triangle so each pair is examined once and the
# diagonal (self-correlation of 1.0) is ignored.
upper = corr_matrix.abs().where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# A column is flagged when it is highly correlated with any column that
# comes before it in the matrix.
# NOTE(review): any numeric target column (e.g. 'Label') is part of this
# scan too - review the list before dropping anything.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

print("\n" + "="*60)
print("üîé REDUNDANT COLUMNS DETECTED (>95% correlation):")
print("="*60)
print(to_drop if to_drop else "None! All features are unique.")

In [None]:
#4. DROPPED RELATED/SIMILAR COLUMNS

# Drops the hard-coded list of redundant columns from data_clean and writes
# the result to Drive as the cleaned real dataset.
import os

# The list of redundant columns found
redundant_cols = ['OUT_PKTS', 'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS', 'DURATION_IN',
                  'MAX_TTL', 'MAX_IP_PKT_LEN', 'RETRANSMITTED_IN_BYTES',
                  'RETRANSMITTED_IN_PKTS', 'RETRANSMITTED_OUT_BYTES',
                  'RETRANSMITTED_OUT_PKTS', 'SRC_TO_DST_AVG_THROUGHPUT',
                  'DST_TO_SRC_AVG_THROUGHPUT', 'ICMP_IPV4_TYPE']

# Drop only the columns that exist (same guard as the identifier drop in the
# cleaning step) and report any name that was expected but is absent,
# instead of failing with a KeyError.
absent_cols = [c for c in redundant_cols if c not in data_clean.columns]
if absent_cols:
    print(f"Note: columns not found in data_clean, skipped: {absent_cols}")
data_final = data_clean.drop(columns=[c for c in redundant_cols if c in data_clean.columns])

# check
print(f"Original Features: {data_clean.shape[1]}")
print(f"Features after removing redundancy: {data_final.shape[1]}")
print(data_final.columns.tolist())
print("‚úÖ Redundancy removed")

#Save the cleaned file
output_path = '/content/drive/MyDrive/ForenXAI_cleaned/cleaned_real_data_final.csv'

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

print(f"Saving file to: {output_path}...")
data_final.to_csv(output_path, index=False)
print("SAVED")

In [None]:
#Checking which columns to validate

# Reloads the cleaned CSV and lists every column that is not numeric,
# together with a three-row preview of those columns.
import pandas as pd
import numpy as np

file_path = '/content/drive/MyDrive/ForenXAI_cleaned/cleaned_real_data_final.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: Could not find the file at {file_path}")
else:
    rule = "=" * 50
    print(f"File loaded successfully: {len(df):,} rows and {len(df.columns)} columns.")

    # Everything that select_dtypes does not classify as a number.
    non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

    print("\n" + rule)
    print("NON-NUMERIC COLUMNS FOUND:")
    print(rule)
    if not non_numeric_cols:
        print("None! All columns are currently numerical.")
    for name in non_numeric_cols:
        print(f"- {name} (Type: {df[name].dtype})")

    print("\n" + rule)
    print("SAMPLED VALUES FROM NON-NUMERIC COLUMNS:")
    print(rule)
    print(df[non_numeric_cols].head(3) if non_numeric_cols else "Everything is numeric.")

In [None]:
# 5. Validation and Standardization

# Casts the 'Attack' column to a categorical dtype, prints the share of each
# attack category, and charts the per-column null counts.
import matplotlib.pyplot as plt
import seaborn as sns

# Standardize the categorical dtype: 'Attack' becomes 'category'.
df['Attack'] = df['Attack'].astype('category')

# Class imbalance, expressed as a percentage per attack category.
attack_share_pct = df['Attack'].value_counts(normalize=True) * 100
print("CLASS IMBALANCE (ATTACK CATEGORIES):")
print("="*50)
print(attack_share_pct)

# Bar chart of null counts per column as a quick anomaly check.
null_counts = df.isna().sum()
plt.figure(figsize=(10, 6))
ax = null_counts.plot(kind="bar", color='skyblue')
ax.set_title("Anomalies Check: Missing Values per Column")
ax.set_ylabel("Count of Nulls")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("\n‚úÖ Type Validation and Standardization Complete.")
print(f"Final Dtype for 'Attack': {df['Attack'].dtype}")

In [None]:
#6. DATA SPLITTING (REAL DATA 80/10/10)

# Splits data_final into train/validation/test (80/10/10), stratified on the
# 'Attack' category, checks the binary attack ratio of the result, and saves
# the splits (plus a copy of the training set as the synthetic seed).
import pandas as pd
from sklearn.model_selection import train_test_split

# One shared seed for both splitters so the split is reproducible.
RANDOM_STATE = 42

print("="*60)
print("STARTING STRATIFIED SPLIT (80/10/10)")
print("="*60)

# Peel off the 10% test set (the "final exam").
# Stratify on 'Attack' here as well, so both stages of the split use the same
# column and every attack category keeps its share in the test set too.
X_temp, test_df = train_test_split(
    data_final,
    test_size=0.10,
    stratify=data_final['Attack'],
    random_state=RANDOM_STATE
)

# Split the remaining 90% into Train (80%) and Val (10%).
# Math: 10% of the total is 1/9th of the remaining 90%.
train_df, val_df = train_test_split(
    X_temp,
    test_size=1/9,
    stratify=X_temp['Attack'],
    random_state=RANDOM_STATE
)

# The synthetic seed is a copy of the real training split.
synthetic_seed_df = train_df.copy()

total_rows = len(data_final)

print(f"\nüìä Split Results:")
print(f"   - Total Data:      {total_rows:,} rows (100%)")
print(f"   - Training Set:    {len(train_df):,} rows ({len(train_df)/total_rows:.1%}) --> Target 80%")
print(f"   - Validation Set:  {len(val_df):,} rows ({len(val_df)/total_rows:.1%}) --> Target 10%")
print(f"   - Test Set:        {len(test_df):,} rows ({len(test_df)/total_rows:.1%}) --> Target 10%")

# Check stratification: the mean of the binary 'Label' column is the share of
# attack rows, which should be nearly equal in train and test.
attack_ratio_train = train_df['Label'].mean()
attack_ratio_test = test_df['Label'].mean()
print(f"\n‚öñÔ∏è Stratification Check (Attack Ratio should be ~0.039):")
print(f"   - Train Attack Ratio: {attack_ratio_train:.5f}")
print(f"   - Test Attack Ratio:  {attack_ratio_test:.5f}")

if abs(attack_ratio_train - attack_ratio_test) < 0.001:
    print("PERFECT STRATIFICATION CONFIRMED")
else:
    print("Warning: Stratification might be off.")

output_dir = '/content/drive/MyDrive/ForenXAI_cleaned/'

print(f"\nüíæ Saving files to {output_dir}...")
train_df.to_csv(f'{output_dir}train_real.csv', index=False)
val_df.to_csv(f'{output_dir}val_real.csv', index=False)
test_df.to_csv(f'{output_dir}test_real.csv', index=False)
synthetic_seed_df.to_csv(f'{output_dir}synthetic_seed.csv', index=False)

print("SAVED")

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split of `df` on the 'Attack' category, written to Drive.

# Stage 1: hold out 10% of all rows as the test set. Stratifying on 'Attack'
# keeps each attack category's share the same on both sides of the split.
X_temp, test_df = train_test_split(
    df, test_size=0.10, stratify=df['Attack'], random_state=42
)

# Stage 2: 1/9 of the remaining 90% becomes validation (10% of the total),
# leaving 80% of the total for training.
train_df, val_df = train_test_split(
    X_temp, test_size=1/9, stratify=X_temp['Attack'], random_state=42
)

# Stage 3: write the three splits next to the cleaned dataset.
output_dir = '/content/drive/MyDrive/ForenXAI_cleaned/'
splits_to_save = (
    ('train_real.csv', train_df),
    ('val_real.csv', val_df),
    ('test_real.csv', test_df),
)
for file_name, frame in splits_to_save:
    frame.to_csv(f'{output_dir}{file_name}', index=False)

rule = "=" * 60
print(rule)
print("FINAL SPLIT COMPLETE (Stratified by Attack Category)")
print(rule)
print(f"Training rows:   {len(train_df):,}")
print(f"Validation rows: {len(val_df):,}")
print(f"Test rows:       {len(test_df):,}")
print(f"Files saved to:  {output_dir}")

In [None]:
# EDA snapshot: per-feature histograms (log scale) and a pairplot of a
# log-transformed sample, both coloured by 'Label' and saved as PNG files.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

file_path = '/content/drive/MyDrive/ForenXAI_cleaned/cleaned_real_data_final.csv'
df = pd.read_csv(file_path)

# Ensure 'Label' is treated as a string for the legend colors
df['Label'] = df['Label'].astype(str)

# Select Key Features
key_features = ['IN_BYTES', 'OUT_BYTES', 'FLOW_DURATION_MILLISECONDS',
                'IN_PKTS', 'TCP_WIN_MAX_IN', 'MIN_TTL']

print("Generating Histograms (handling zeros)...")

plt.figure(figsize=(15, 10))
for i, feature in enumerate(key_features):
    plt.subplot(2, 3, i+1)

    # Add 1 to the data (df[feature] + 1) just for the plot.
    # This turns 0 into 1, so log(1) = 0 and zeros stay on the log axis.
    sns.histplot(data=df, x=df[feature] + 1, hue='Label', bins=30,
                 kde=True, palette='viridis', log_scale=True)

    plt.title(f'Distribution of {feature} (Log Scale)')
    plt.xlabel(f'{feature} (+1 for log)')
    plt.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('/content/drive/MyDrive/ForenXAI_cleaned/eda_histograms.png')
plt.show()

print("Generating Pairplot (using sample)...")

# Features shown in the pairplot; also the ones that get log-transformed.
pair_features = ['IN_BYTES', 'OUT_BYTES', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS']

# Pairplot on a fixed-seed sample. DataFrame.sample raises ValueError when
# asked for more rows than exist, so cap the size at the available rows.
sample_size = min(5000, len(df))
plot_df = df.sample(sample_size, random_state=42).copy()
# log1p (log(1+x)) keeps the visual relationships consistent with the
# histograms above.
for feat in pair_features:
    plot_df[feat] = np.log1p(plot_df[feat])

sns.pairplot(plot_df,
             vars=pair_features,
             hue='Label',
             palette='husl',
             diag_kind='kde',
             plot_kws={'alpha': 0.4})

plt.suptitle("Feature Interactions Snapshot (Log-Transformed)", y=1.02)
# The title sits above the figure area (y=1.02); bbox_inches='tight' expands
# the saved bounding box so the title is not cut off in the PNG.
plt.savefig('/content/drive/MyDrive/ForenXAI_cleaned/eda_pairplot.png', bbox_inches='tight')
plt.show()

print(f"Check: {df['IN_BYTES'].min()}")

In [None]:
#DOUBLE CHECKING CLEANED DATA

# Reloads the cleaned CSV and prints a short verification report: missing
# values, leftover identifier/timestamp columns, column count, and targets.
import pandas as pd
import numpy as np

file_path = '/content/drive/MyDrive/ForenXAI_cleaned/cleaned_real_data_final.csv'

try:
    data_final = pd.read_csv(file_path)
    print("File loaded successfully.")

    print("\n" + "="*40)
    print("--- CLEANING VERIFICATION REPORT ---")
    print("="*40)

    # Check for NaNs
    nan_count = data_final.isnull().sum().sum()
    print(f"Total Missing Values:  {nan_count} {'(Clean)' if nan_count == 0 else '(Action Required)'}")

    # Check for identifiers (should be gone). The list covers every column
    # removed in the cleaning step - including the two flow timestamp
    # columns - plus a generic 'Timestamp' name.
    id_cols = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT', 'L4_DST_PORT',
               'FLOW_START_MILLISECONDS', 'FLOW_END_MILLISECONDS', 'Timestamp']
    present_ids = [col for col in id_cols if col in data_final.columns]
    if not present_ids:
        print("Identifier Columns:    (Removed)")
    else:
        print(f"Identifier Columns:    Found: {present_ids}")

    # Check Feature Count (Started ~55 -> Dropped 6 IDs -> Dropped 13 Redundant -> Goal 36)
    col_count = data_final.shape[1]
    expected = 36
    if col_count == expected:
         print(f"Final Feature Count:   {col_count} (Matches Expected)")
    else:
         print(f"Final Feature Count:   {col_count} (Expected {expected})")

    # Check Target Columns Existence
    if 'Label' in data_final.columns and 'Attack' in data_final.columns:
        print("Target Columns:        Found 'Label' and 'Attack'")
    else:
        print("Target Columns:        Missing Target Columns!")

except FileNotFoundError:
    print(f" Error: Could not find file at {file_path}")

In [None]:
# Class distribution of the cleaned data: counts, proportions, a bar chart,
# a pie chart, and the majority/minority imbalance ratio for 'Label'.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

file_path = '/content/drive/MyDrive/ForenXAI_cleaned/cleaned_real_data_final.csv'
print(f"Loading data from {file_path}...")
data_final = pd.read_csv(file_path)

target_column = 'Label'
current_data = data_final

# Counts and proportions per class, both ordered by class value.
target_values = current_data[target_column]
class_counts = target_values.value_counts().sort_index()
class_shares = target_values.value_counts(normalize=True).sort_index()

print("\n--- CLEANED DATA DISTRIBUTION ---")
print(f"Total samples: {len(current_data):,}")
print("\nClass proportions:")
print(class_shares)

plt.figure(figsize=(12, 5))

# Left panel: bar chart of the raw counts.
plt.subplot(1, 2, 1)
class_counts.plot(kind='bar', color='seagreen', edgecolor='black')
plt.title('Class Distribution (Cleaned Data)', fontsize=14, fontweight='bold')
plt.xlabel('Class (0=Normal, 1=Attack)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=0)
plt.grid(axis='y', alpha=0.3)

# Write each count just above its bar (offset = 1% of the tallest bar).
label_offset = max(class_counts) * 0.01
for position, count in enumerate(class_counts):
    plt.text(position, count + label_offset, f"{count:,}", ha='center', va='bottom')

# Right panel: pie chart of the same counts. The two hard-coded labels follow
# the sorted class order.
plt.subplot(1, 2, 2)
pie_labels = ['Normal (0)', 'Attack (1)']
pie_colors = ['#99ff99', '#ff9999']
plt.pie(class_counts, labels=pie_labels, autopct='%1.2f%%',
        startangle=90, colors=pie_colors)
plt.title('Cleaned Distribution (%)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Imbalance ratio = size of the largest class / size of the smallest class.
max_class, min_class = class_counts.max(), class_counts.min()
imbalance_ratio = max_class / min_class

divider = '=' * 50
print(f"\n{divider}")
print("UPDATED IMBALANCE ANALYSIS")
print(divider)
print(f"New Imbalance Ratio: {imbalance_ratio:.2f}:1")
print(f"Most common class: {class_counts.idxmax()} ({max_class:,} samples)")
print(f"Least common class: {class_counts.idxmin()} ({min_class:,} samples)")

In [None]:
# Compares the column names of the cleaned real data and the CTGAN synthetic
# data and reports any names present in only one of the two.
import pandas as pd

real_path = '/content/drive/MyDrive/ForenXAI_cleaned/cleaned_real_data_final.csv'
syn_path = '/content/drive/MyDrive/ForenXAI_Datasets/Synthetic_Datasets/synthetic_data_ctgan.csv'

try:
    real_df = pd.read_csv(real_path)
    syn_df = pd.read_csv(syn_path)
except FileNotFoundError:
    print("Files not found. Check paths.")
else:
    # Sets make the comparison independent of column order.
    real_cols = set(real_df.columns)
    syn_cols = set(syn_df.columns)

    print(f"Real Data Columns:      {len(real_cols)}")
    print(f"Synthetic Data Columns: {len(syn_cols)}")

    if real_cols != syn_cols:
        only_in_syn = syn_cols - real_cols
        only_in_real = real_cols - syn_cols
        print("\n‚ö†Ô∏è MISMATCH DETECTED")
        print(f"Columns in Synthetic but NOT in Real: {only_in_syn}")
        print(f"Columns in Real but NOT in Synthetic: {only_in_real}")
        print("\nVERDICT: You MUST run the 'alignment' code, or the model training will crash.")
    else:
        print("\ncolumns match perfectly!")

In [None]:
#7. SYNTHETIC SPLITTING

# Aligns the synthetic dataset's columns to the cleaned real data, then
# writes a stratified train/validation split of the synthetic data to Drive.
import pandas as pd
from sklearn.model_selection import train_test_split

syn_path = '/content/drive/MyDrive/ForenXAI_Datasets/Synthetic_Datasets/synthetic_data_ctgan.csv'
real_clean_path = '/content/drive/MyDrive/ForenXAI_cleaned/cleaned_real_data_final.csv'
output_dir = '/content/drive/MyDrive/ForenXAI_cleaned/'

print("Loading Data for Alignment...")
try:
    syn_df = pd.read_csv(syn_path)
    real_df = pd.read_csv(real_clean_path)

    print(f"Original Synthetic Shape: {syn_df.shape}")

    # ALIGNMENT STEP
    # Keep only the columns that also exist in the real data, in the real
    # data's column order. Columns the synthetic data lacks cannot be added
    # here, so they are reported instead of being silently ignored.
    cols_to_keep = [c for c in real_df.columns if c in syn_df.columns]
    missing_in_syn = [c for c in real_df.columns if c not in syn_df.columns]
    # .copy() gives an independent frame, so the dtype assignment below does
    # not write into a slice of the originally loaded data.
    syn_df = syn_df[cols_to_keep].copy()

    if missing_in_syn:
        print(f"Aligned Synthetic Shape:  {syn_df.shape} "
              f"(WARNING: missing real-data columns: {missing_in_syn})")
    else:
        print(f"Aligned Synthetic Shape:  {syn_df.shape} (Matches Real Data)")

    # STANDARDIZE ATTACK COLUMN
    # If the synthetic data has 'Attack', store it as a categorical dtype.
    if 'Attack' in syn_df.columns:
        syn_df['Attack'] = syn_df['Attack'].astype('category')

    # STRATIFIED SPLIT
    # Prefer 'Attack'; fall back to 'Label'; fail with a clear message when
    # neither target column survived the alignment.
    if 'Attack' in syn_df.columns:
        strat_col = syn_df['Attack']
    elif 'Label' in syn_df.columns:
        strat_col = syn_df['Label']
    else:
        raise KeyError("Synthetic data has neither 'Attack' nor 'Label' to stratify on.")

    # 0.8889 is approximately 8/9, i.e. the train share of the real data's
    # train+validation portion (80 : 10).
    train_ratio = 0.8889
    print(f"Splitting... (Stratifying by {strat_col.name})")

    syn_train, syn_val = train_test_split(
        syn_df,
        train_size=train_ratio,
        stratify=strat_col,
        random_state=42
    )

    syn_train.to_csv(f'{output_dir}synthetic_train_split.csv', index=False)
    syn_val.to_csv(f'{output_dir}synthetic_val_split.csv', index=False)

    print(f"SUCCESS. Files saved to: {output_dir}")
    print(f"- synthetic_train_split.csv ({len(syn_train)} rows)")
    print(f"- synthetic_val_split.csv ({len(syn_val)} rows)")

except FileNotFoundError as e:
    print(f"Error: {e}")

