## Load Data

In [1]:
import pandas as pd

In [2]:
# Load Data
file_paths = [
    r'C:\Users\Sohail Mohammed\OneDrive\Desktop\Cybersecurity-Threat-Detection\data\raw\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    r'C:\Users\Sohail Mohammed\OneDrive\Desktop\Cybersecurity-Threat-Detection\data\raw\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    r'C:\Users\Sohail Mohammed\OneDrive\Desktop\Cybersecurity-Threat-Detection\data\raw\Friday-WorkingHours-Morning.pcap_ISCX.csv',
    r'C:\Users\Sohail Mohammed\OneDrive\Desktop\Cybersecurity-Threat-Detection\data\raw\Monday-WorkingHours.pcap_ISCX.csv',
    r'C:\Users\Sohail Mohammed\OneDrive\Desktop\Cybersecurity-Threat-Detection\data\raw\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    r'C:\Users\Sohail Mohammed\OneDrive\Desktop\Cybersecurity-Threat-Detection\data\raw\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    r'C:\Users\Sohail Mohammed\OneDrive\Desktop\Cybersecurity-Threat-Detection\data\raw\Tuesday-WorkingHours.pcap_ISCX.csv',
    r'C:\Users\Sohail Mohammed\OneDrive\Desktop\Cybersecurity-Threat-Detection\data\raw\Wednesday-workingHours.pcap_ISCX.csv'
]

dataframes = []

for file_path in file_paths:
    df = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')
    dataframes.append(df)

consolidated_df = pd.concat(dataframes, ignore_index=True)
print(f"Total records: {len(consolidated_df)}")
print(f"Number of features: {len(consolidated_df.columns)}")

# REMOVE EXTRA SPACES FROM COLUMN NAMES.
consolidated_df.columns = consolidated_df.columns.str.strip()

Total records: 2830743
Number of features: 80


In [3]:
consolidated_df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Destination Port.1
0,54865.0,3,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,
1,55054.0,109,1,1,6,6,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,
2,55055.0,52,1,1,6,6,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,
3,46236.0,34,1,1,6,6,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,
4,54863.0,3,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,


## Feature Selection

## Select Most Relevant Features to Avoid Overfitting, Reduce Computational Complexity, and Improve Model Performance

### Using Combination of Domain Knowledge, Statistical Methods, and ML-Based Feature Selection

### 1. Domain Knowledge
Based on cybersecurity expertise, the following features are selected as they are often critical for intrusion detection:

- **Flow Duration**: Longer flows may indicate suspicious activity.
- **Total Fwd/Bwd Packets**: High packet counts may indicate attacks like DDoS.
- **Packet Length Statistics**: Unusual packet sizes may indicate malicious payloads.
- **Flow Bytes/s and Flow Packets/s**: High rates may indicate flooding attacks.
- **TCP Flags**: Flags like SYN, ACK, and RST are often used in attacks.
- **Inter-Arrival Time (IAT)**: Unusual IAT may indicate scanning or probing.
- **Subflow Statistics**: Useful for detecting anomalieseature Selection


### 2. Statistical Methods

### 3. Machine Learning-Based Feature Selection

In [4]:
selected_features = [
    "Flow Duration", "Total Fwd Packets", "Total Backward Packets",
    "Total Length of Fwd Packets", "Total Length of Bwd Packets",
    "Fwd Packet Length Max", "Fwd Packet Length Mean", "Bwd Packet Length Mean",
    "Flow Bytes/s", "Flow Packets/s", "Flow IAT Mean", "Flow IAT Std",
    "Fwd IAT Mean", "Fwd IAT Std", "FIN Flag Count", "SYN Flag Count",
    "ACK Flag Count", "RST Flag Count", "Fwd Packets/s", "Bwd Packets/s",
    "Packet Length Mean", "Init_Win_bytes_forward", "Subflow Fwd Packets",
    "Active Mean", "Idle Mean"
]

In [5]:
df = consolidated_df[selected_features + ["Label"]]

In [6]:
df.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Mean,Flow Bytes/s,Flow Packets/s,...,ACK Flag Count,RST Flag Count,Fwd Packets/s,Bwd Packets/s,Packet Length Mean,Init_Win_bytes_forward,Subflow Fwd Packets,Active Mean,Idle Mean,Label
0,3,2,0,12,0,6,6.0,0.0,4000000.0,666666.6667,...,1,0,666666.6667,0.0,6.0,33,2,0.0,0.0,BENIGN
1,109,1,1,6,6,6,6.0,6.0,110091.7,18348.62385,...,1,0,9174.311927,9174.311927,6.0,29,1,0.0,0.0,BENIGN
2,52,1,1,6,6,6,6.0,6.0,230769.2,38461.53846,...,1,0,19230.76923,19230.76923,6.0,29,1,0.0,0.0,BENIGN
3,34,1,1,6,6,6,6.0,6.0,352941.2,58823.52941,...,1,0,29411.76471,29411.76471,6.0,31,1,0.0,0.0,BENIGN
4,3,2,0,12,0,6,6.0,0.0,4000000.0,666666.6667,...,1,0,666666.6667,0.0,6.0,32,2,0.0,0.0,BENIGN


## Data Preprocessing

#### Replace Infinity Values

In [7]:
import numpy as np
# Replace Infinity Values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace([np.inf, -np.inf], np.nan, inplace=True)


#### Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode Categorical Labels
label_encoder = LabelEncoder()
df["Label"] = label_encoder.fit_transform(df["Label"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Label"] = label_encoder.fit_transform(df["Label"])


#### Missing Values Imputation

In [9]:
from sklearn.impute import SimpleImputer

# Impute Missing Values with Mean
imputer = SimpleImputer(strategy='mean')
df.loc[:, df.columns] = imputer.fit_transform(df)

In [10]:
df.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Mean,Flow Bytes/s,Flow Packets/s,...,ACK Flag Count,RST Flag Count,Fwd Packets/s,Bwd Packets/s,Packet Length Mean,Init_Win_bytes_forward,Subflow Fwd Packets,Active Mean,Idle Mean,Label
0,3,2,0,12,0,6,6.0,0.0,4000000.0,666666.6667,...,1,0,666666.6667,0.0,6.0,33,2,0.0,0.0,0
1,109,1,1,6,6,6,6.0,6.0,110091.7,18348.62385,...,1,0,9174.311927,9174.311927,6.0,29,1,0.0,0.0,0
2,52,1,1,6,6,6,6.0,6.0,230769.2,38461.53846,...,1,0,19230.76923,19230.76923,6.0,29,1,0.0,0.0,0
3,34,1,1,6,6,6,6.0,6.0,352941.2,58823.52941,...,1,0,29411.76471,29411.76471,6.0,31,1,0.0,0.0,0
4,3,2,0,12,0,6,6.0,0.0,4000000.0,666666.6667,...,1,0,666666.6667,0.0,6.0,32,2,0.0,0.0,0


#### Feature Scaling

In [11]:
from sklearn.preprocessing import StandardScaler

# Standard Scaling
scaler = StandardScaler()
X = df.drop("Label", axis=1)
y = df["Label"]
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [12]:
print(X)

         Flow Duration  Total Fwd Packets  Total Backward Packets  \
0            -0.439347          -0.009819               -0.010421   
1            -0.439344          -0.011153               -0.009418   
2            -0.439345          -0.011153               -0.009418   
3            -0.439346          -0.011153               -0.009418   
4            -0.439347          -0.009819               -0.010421   
...                ...                ...                     ...   
2830738      -0.438390          -0.007151               -0.008416   
2830739      -0.439337          -0.009819               -0.008416   
2830740      -0.439344          -0.009819               -0.009418   
2830741      -0.408187          -0.004484               -0.008416   
2830742      -0.436526          -0.007151               -0.008416   

         Total Length of Fwd Packets  Total Length of Bwd Packets  \
0                          -0.053765                    -0.007142   
1                          -0.054

#### Handle Imbalanced Dataset with SMOTE

In [13]:
from sklearn.model_selection import train_test_split

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Save X_train and y_train
X_train.to_csv("data/processed/X_train.csv", index=False)
y_train.to_csv("data/processed/y_train.csv", index=False)

# Save X_test and y_test
X_test.to_csv("data/processed/X_test.csv", index=False)
y_test.to_csv("data/processed/y_test.csv", index=False)

OSError: Cannot save file into a non-existent directory: 'data\processed'