# Exploratory Data Analysis - CICDDoS2019 Dataset

This notebook explores the CICDDoS2019 dataset to understand its structure, distribution, and characteristics.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Make the project's src/ directory importable.
# The path is resolved to an absolute location so the entry keeps pointing at
# the same directory even if the working directory changes later, and it is
# only inserted once so re-running this cell does not stack duplicate entries.
src_dir = str(Path('../src').resolve())
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

from data_loader import DataLoader

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Dataset

In [None]:
# Location of the raw CSV, relative to this notebook
data_path = "../data/raw/cicddos2019_dataset.csv"

# Fetch dataset metadata from the loader (intended to avoid loading the full file)
loader = DataLoader(data_path)
info = loader.get_data_info()

# Assemble the report first, then emit it in a single print call
report_lines = [
    "Dataset Information:",
    f"Number of columns: {info['num_columns']}",
    f"File size: {info['file_size_mb']:.2f} MB",
    f"\nColumns: {info['columns'][:10]}...",  # preview: first 10 columns only
]
print("\n".join(report_lines))

In [None]:
# Pull a sample of the dataset for interactive exploration.
# Tune sample_size to the memory available on your machine.
df = loader.load_data(sample_size=10_000)

print(f"Loaded {len(df)} samples", f"Shape: {df.shape}", sep="\n")

## 2. Basic Statistics

In [None]:
# Preview the top of the sample (five rows, the pandas default)
df.head(n=5)

In [None]:
# Data types
# Shows the dtype assigned to each column of the loaded sample; as the last
# expression in the cell, the result is rendered as the cell output.
df.dtypes

In [None]:
# Basic statistics
# Summary statistics of the sample using describe() with its default settings;
# as the last expression in the cell, the result is rendered as the cell output.
df.describe()

In [None]:
# Count missing entries per column, then show only the columns that have any
missing = df.isna().sum()
missing.loc[missing.gt(0)]

## 3. Target Variable Analysis

In [None]:
# Assuming the target column is named 'Label' - adjust if different
target_col = 'Label'  # Change this based on your dataset

# If the exact name is absent, fall back to a whitespace- and case-insensitive
# match so a header such as ' Label' or 'label' is still recognised. On a match,
# target_col is rebound to the real column name so later cells can use it as is.
if target_col not in df.columns:
    wanted_name = target_col.strip().lower()
    name_matches = [c for c in df.columns if str(c).strip().lower() == wanted_name]
    if name_matches:
        target_col = name_matches[0]
        print(f"Using column {target_col!r} as the target column.")

if target_col in df.columns:
    # Class distribution (value_counts ignores missing labels by default)
    class_dist = df[target_col].value_counts()
    print("Class Distribution:")
    print(class_dist)
    print("\nClass proportions:")
    # Proportions are relative to all sampled rows, including unlabeled ones
    print(class_dist / len(df))
else:
    print(f"Column '{target_col}' not found. Available columns:")
    print(df.columns.tolist())

In [None]:
# Bar chart of how many samples fall into each class
if target_col in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    class_counts = df[target_col].value_counts()
    class_counts.plot.bar(ax=ax, color='skyblue', edgecolor='black')

    ax.set_title('Class Distribution', fontsize=16)
    ax.set_xlabel('Class', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)

    # Tilt the class names and anchor them on the right
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    fig.tight_layout()
    plt.show()

## 4. Feature Analysis

In [None]:
# Collect the names of all numeric-dtype columns
numeric_cols = df.select_dtypes(include='number').columns.tolist()
print(
    f"Number of numeric features: {len(numeric_cols)}",
    f"Numeric features: {numeric_cols[:10]}...",  # preview: first 10 only
    sep="\n",
)

In [None]:
# Collect the names of all object-dtype columns, treated here as categorical
categorical_cols = df.select_dtypes(include='object').columns.tolist()
print(
    f"Number of categorical features: {len(categorical_cols)}",
    f"Categorical features: {categorical_cols}",
    sep="\n",
)

In [None]:
# Distribution of the first few numeric features (up to six, on a 2x3 grid)
hist_cols = numeric_cols[:6]
if hist_cols:
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.ravel()

    for ax, col in zip(axes, hist_cols):
        # Histogram binning needs a finite value range, so infinite entries are
        # turned into NaN and dropped before plotting instead of raising.
        finite_values = df[col].replace([np.inf, -np.inf], np.nan).dropna()
        finite_values.hist(bins=50, ax=ax, edgecolor='black')
        ax.set_title(col)
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

    # Hide grid slots left empty when fewer than six numeric columns exist
    for ax in axes[len(hist_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation heatmap over a leading subset of the numeric features
if numeric_cols:
    # Use the first 15 numeric columns for the correlation matrix
    subset_cols = numeric_cols[:15]
    corr = df[subset_cols].corr()

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        corr,
        annot=False,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
        ax=ax,
    )
    ax.set_title('Correlation Matrix (First 15 Features)', fontsize=16)
    fig.tight_layout()
    plt.show()

## 6. Summarize Insights

In [None]:
# Build the EDA summary from the sample and the column lists computed above.
# The missing-value total is cast to a plain Python int so the dict can be
# serialised (e.g. with json.dump) without failing on a NumPy scalar type.
summary = {
    'total_samples': len(df),
    'total_features': len(df.columns),
    'numeric_features': len(numeric_cols),
    'categorical_features': len(categorical_cols),
    'missing_values': int(df.isnull().sum().sum()),
}

# Only record the class distribution when the target column is present
if target_col in df.columns:
    summary['class_distribution'] = df[target_col].value_counts().to_dict()

print("\nEDA Summary:")
for key, value in summary.items():
    print(f"{key}: {value}")

## Next Steps

1. Proceed to `02_preprocessing.ipynb` for data cleaning and preparation
2. Handle missing values and outliers
3. Encode categorical features
4. Scale numeric features