1. Dataset Exploration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import gc

# Notebook-wide display settings: never truncate columns, terminal width, or
# cell contents, but cap printed frames at 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.width = None
pd.options.display.max_colwidth = None

In [2]:
def clean_dataset(df):
    """Clean one raw flow table in place and return it.

    Steps, in order:
      1. Strip surrounding whitespace from the column names.
      2. Drop ``Destination Port`` and the duplicate ``Fwd Header Length.1``
         column when they are present.
      3. Downcast integer and float columns via ``pd.to_numeric``.
      4. Rename the ``BENIGN`` label to ``Benign``.
      5. Turn +/-inf into NaN, then drop every row that contains a NaN.
      6. Drop rows whose values in all columns except ``Label`` repeat an
         earlier row (the first occurrence is kept).
      7. Store ``Label`` as a categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It must contain a ``Label`` column (whitespace around the
        header is tolerated). The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, after cleaning.

    Raises
    ------
    KeyError
        If no ``Label`` column exists once the headers have been stripped.
    """
    # Remove inconsistent column names with extra spaces
    df.columns = df.columns.str.strip()

    if 'Label' not in df.columns:
        raise KeyError("clean_dataset expects a 'Label' column")

    # Columns to be removed
    drop_columns = [
        "Destination Port",  # specific targets in simulation
        'Fwd Header Length.1'  # Duplicate column
    ]
    df.drop(columns=drop_columns, inplace=True, errors="ignore")

    # Downcast numeric columns; skip the assignment when a dtype family is absent
    int_cols = df.select_dtypes(include='integer').columns
    if len(int_cols):
        df[int_cols] = df[int_cols].apply(pd.to_numeric, errors='coerce', downcast='integer')
    float_cols = df.select_dtypes(include='float').columns
    if len(float_cols):
        df[float_cols] = df[float_cols].apply(pd.to_numeric, errors='coerce', downcast='float')

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: that chained form does not update the frame when
    # pandas Copy-on-Write is active.
    df['Label'] = df['Label'].replace({'BENIGN': 'Benign'})

    # Remove NaN and infinite values
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)

    # Remove duplicates (rows are compared on every column except the label)
    df.drop_duplicates(inplace=True, subset=df.columns.difference(['Label']))

    # Cast last so the categories only contain labels that survived the row
    # filtering; casting earlier leaves removed labels behind as zero-count
    # entries in value_counts().
    df['Label'] = df['Label'].astype('category')

    return df

In [3]:
# Get all CSV files
data_path = '/home/sagemaker-user/cybersecurity-tensor-ad/data/raw/cicids2017/'
output_dir = '/home/sagemaker-user/cybersecurity-tensor-ad/data/processed'

# glob returns matches in arbitrary order; sort so every run processes and
# logs the files in the same sequence.
csv_files = sorted(glob.glob(os.path.join(data_path, '*.csv')))

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

print(f"Found {len(csv_files)} CSV files")
print("\nProcessing files...")

for file_path in csv_files:
    filename = os.path.basename(file_path)
    print(f"\nProcessing: {filename}")

    # Bind both names up front so the cleanup in `finally` is valid even when
    # read_csv or clean_dataset raises.
    df = df_clean = None

    try:
        df = pd.read_csv(file_path)
        print(f"Original shape: {df.shape[0]:,} rows × {df.shape[1]} columns")

        df_clean = clean_dataset(df)
        print(f"Cleaned shape: {df_clean.shape[0]:,} rows × {df_clean.shape[1]} columns")
        print(f"Label distribution:\n{df_clean['Label'].value_counts()}")

        # Save cleaned file under the same name as the raw file
        output_path = os.path.join(output_dir, filename)
        df_clean.to_csv(output_path, index=False)
        print(f"Saved to: {output_path}")

    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files
        print(f"Error processing {filename}: {e}")

    finally:
        # Clear memory on both the success and the failure path
        del df, df_clean
        gc.collect()

Found 8 CSV files

Processing files...

Processing: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Original shape: 225,745 rows × 79 columns
Cleaned shape: 221,264 rows × 77 columns
Label distribution:
Label
DDoS      128014
Benign     93250
Name: count, dtype: int64
Saved to: /home/sagemaker-user/cybersecurity-tensor-ad/data/processed/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv

Processing: Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Original shape: 286,467 rows × 79 columns
Cleaned shape: 119,522 rows × 77 columns
Label distribution:
Label
Benign      117566
PortScan      1956
Name: count, dtype: int64
Saved to: /home/sagemaker-user/cybersecurity-tensor-ad/data/processed/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv

Processing: Friday-WorkingHours-Morning.pcap_ISCX.csv
Original shape: 191,033 rows × 79 columns
Cleaned shape: 176,036 rows × 77 columns
Label distribution:
Label
Benign    174600
Bot         1436
Name: count, dtype: int64
Saved to: /home/sagemaker