In [None]:
# Intrusion Detection - Data Exploration and Preprocessing

This notebook will guide you through:
- Loading and combining the DAPT 2020 dataset (10 CSV files)
- Exploring the data
- Preprocessing for ML
- Building a simple binary classifier (benign vs attack)

The DAPT 2020 dataset contains network flow data from different days and network segments:
- Public and private network traffic
- Different days (Monday-Friday)
- Normal and attack traffic patterns

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
from pathlib import Path

# Set up matplotlib for better plots
# Apply matplotlib's 'default' style first, then select seaborn's "husl" palette.
# NOTE(review): the order looks deliberate — applying the style afterwards would
# presumably reset the colour cycle chosen by set_palette; confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")

# Printed only after every import and setup call above has run without raising.
print("Libraries imported successfully!")

## 1. Loading and Combining the Dataset

The DAPT 2020 dataset consists of multiple CSV files. Let's load and combine them all for comprehensive analysis.

In [None]:
# Define the path to CSV files
data_path = '../data/csv/'

# Get list of all CSV files.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to make the listing reproducible across runs and machines.
csv_files = sorted(glob.glob(os.path.join(data_path, '*.csv')))
print(f"Found {len(csv_files)} CSV files:")
for file in csv_files:
    print(f"  - {os.path.basename(file)}")

if not csv_files:
    # data_path is relative to the kernel's working directory; show where it
    # actually resolved so a wrong working directory is easy to spot.
    print(f"  (none found in {os.path.abspath(data_path)})")

print(f"\nFiles to combine: {len(csv_files)} files")

In [None]:
# Function to load and combine all CSV files

# Ordered (substring, label) pairs used to tag a file from its name.
# The first matching substring wins, mirroring an if/elif chain.
_DAY_KEYWORDS = (
    ('monday', 'Monday'),
    ('tuesday', 'Tuesday'),
    ('wednesday', 'Wednesday'),
    ('thursday', 'Thursday'),
    ('friday', 'Friday'),
)
_NETWORK_KEYWORDS = (
    ('pvt', 'Private'),
    ('public', 'Public'),
)


def _label_from_filename(filename, keywords, default):
    """Return the label of the first keyword contained in filename (case-insensitive)."""
    lowered = filename.lower()
    for keyword, label in keywords:
        if keyword in lowered:
            return label
    return default


def load_dapt_dataset(data_path):
    """
    Load and combine all DAPT 2020 CSV files found directly in data_path.

    Each file is read with pandas and tagged with three extra columns:
      - 'source_file':  the file's base name
      - 'day':          'Monday'..'Friday' if the name contains that weekday,
                        otherwise 'Unknown'
      - 'network_type': 'Private' if the name contains 'pvt', 'Public' if it
                        contains 'public', otherwise 'Mixed'
    Name matching is case-insensitive. Files are processed in sorted order so
    the row order of the result is reproducible.

    Parameters
    ----------
    data_path : str
        Directory that is searched (non-recursively) for '*.csv' files.

    Returns
    -------
    pandas.DataFrame or None
        All successfully loaded files concatenated with a fresh integer index,
        or None when no file could be loaded.
    """
    # glob.glob gives no ordering guarantee; sort for a deterministic result.
    csv_files = sorted(glob.glob(os.path.join(data_path, '*.csv')))
    dataframes = []

    print("Loading CSV files...")
    for i, file in enumerate(csv_files, 1):
        filename = os.path.basename(file)
        print(f"  {i}/{len(csv_files)}: Loading {filename}...")

        try:
            # Load CSV file
            df = pd.read_csv(file)

            # Add source file information
            df['source_file'] = filename

            # Extract day and network type from filename
            df['day'] = _label_from_filename(filename, _DAY_KEYWORDS, 'Unknown')
            df['network_type'] = _label_from_filename(filename, _NETWORK_KEYWORDS, 'Mixed')

            dataframes.append(df)
            print(f"    → Loaded {len(df):,} rows")

        except Exception as e:
            # Deliberate best-effort: report the unreadable file and carry on
            # with the remaining ones instead of aborting the whole load.
            print(f"    ✗ Error loading {filename}: {e}")

    if not dataframes:
        print("No valid CSV files found!")
        return None

    # Combine all dataframes
    print(f"\nCombining {len(dataframes)} dataframes...")
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"✓ Combined dataset: {len(combined_df):,} total rows")
    return combined_df

# Load the combined dataset
df = load_dapt_dataset(data_path)

# load_dapt_dataset returns None when no CSV file could be loaded. Stop here
# with a clear message rather than letting a later cell fail with an
# AttributeError on `df`.
if df is None:
    raise FileNotFoundError(
        f"No DAPT 2020 CSV files could be loaded from {os.path.abspath(data_path)!r}"
    )

## 2. Initial Data Exploration

Let's examine the structure and content of our combined dataset.

In [None]:
# Summarise the combined frame: overall shape, then column and row counts
row_count, column_count = df.shape

print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Columns: {column_count}")
print(f"Rows: {row_count:,}")

# Preview the first rows; as the cell's last expression it is rendered richly
print("\n=== FIRST FEW ROWS ===")
df.head()