In [1]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

In [2]:
data_path = r"../data/raw/MachineLearningCSV"

In [3]:
# Recursively get ALL CSV files
csv_files = []
for root, dirs, files in os.walk(data_path):
    for file in files:
        if file.endswith(".csv"):
            csv_files.append(os.path.join(root, file))

print("Total CSV files found:", len(csv_files))

Total CSV files found: 8


In [4]:
# Load all CSV files into a single DataFrame
dfs = []
for file_path in csv_files:
    print("Loading:", file_path)
    df = pd.read_csv(file_path, low_memory=False)
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)
print("Initial Dataset Shape:", data.shape)

Loading: ../data/raw/MachineLearningCSV\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Loading: ../data/raw/MachineLearningCSV\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Loading: ../data/raw/MachineLearningCSV\Friday-WorkingHours-Morning.pcap_ISCX.csv
Loading: ../data/raw/MachineLearningCSV\Monday-WorkingHours.pcap_ISCX.csv
Loading: ../data/raw/MachineLearningCSV\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Loading: ../data/raw/MachineLearningCSV\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Loading: ../data/raw/MachineLearningCSV\Tuesday-WorkingHours.pcap_ISCX.csv
Loading: ../data/raw/MachineLearningCSV\Wednesday-workingHours.pcap_ISCX.csv
Initial Dataset Shape: (2830743, 79)


In [5]:
# Drop unnecessary columns
columns_to_drop = [
    'Flow ID', 'Source IP', 'Destination IP', 'Timestamp',
    'Fwd Header Length.1', 'Bwd Header Length.1'
]

data = data.drop(columns=[c for c in columns_to_drop if c in data.columns], errors='ignore')

In [6]:
# Remove missing values
data = data.dropna()
print("After Dropping NA:", data.shape)

After Dropping NA: (2829385, 79)


In [7]:
# Remove duplicate rows
data = data.drop_duplicates()
print("After Deduplication:", data.shape)

After Deduplication: (2522009, 79)


In [8]:
# Drop low-variance columns
low_variance_cols = [col for col in data.columns if data[col].nunique() <= 1]
data = data.drop(columns=low_variance_cols)
print("Dropped low-variance cols:", low_variance_cols)

Dropped low-variance cols: [' Bwd PSH Flags', ' Bwd URG Flags', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']


In [9]:
# Convert categorical label to numeric
if "Label" in data.columns:
    le = LabelEncoder()
    data["Label"] = le.fit_transform(data["Label"])

In [10]:
# Reset Index
data.reset_index(drop=True, inplace=True)

In [12]:
# Save cleaned dataset
cleaned_path = r"../data/cleaned"
os.makedirs(cleaned_path, exist_ok=True)
output_file = os.path.join(cleaned_path, "cicids2017_cleaned_nids.csv")
data.to_csv(output_file, index=False)

print("Cleaned dataset saved:", output_file)
print("Final Shape:", data.shape)

Cleaned dataset saved: ../data/cleaned\cicids2017_cleaned_nids.csv
Final Shape: (2522009, 71)
