In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [2]:
df = pd.read_csv('/home/sagemaker-user/cybersecurity-tensor-ad/data/raw/cicids2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')

print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")

Shape: 225,745 rows × 79 columns


In [3]:
print(df.columns.tolist())
print("="*80)

print(df.info())

[' Destination Port', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count', ' URG Flag 

In [4]:
print("Shape:", df.shape)
print(df.head())

Shape: (225745, 79)
    Destination Port   Flow Duration   Total Fwd Packets  \
0              54865               3                   2   
1              55054             109                   1   
2              55055              52                   1   
3              46236              34                   1   
4              54863               3                   2   

    Total Backward Packets  Total Length of Fwd Packets  \
0                        0                           12   
1                        1                            6   
2                        1                            6   
3                        1                            6   
4                        0                           12   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                             0                       6   
1                             6                       6   
2                             6                       6   
3                           

In [5]:
# Remove inconsistent column names with extra spaces
df.columns = df.columns.str.strip()

In [6]:
class_counts = df['Label'].value_counts()
total = len(df)
for label, count in class_counts.items():
    pct = (count / total) * 100
    print(f"{label}: {count:,} ({pct:.1f}%)")

DDoS: 128,027 (56.7%)
BENIGN: 97,718 (43.3%)


In [7]:
missing = df.isnull().sum()
print(missing[missing > 0])

Flow Bytes/s    4
dtype: int64


In [8]:
print(f"\nDuplicate rows: {df.duplicated().sum():,}")


Duplicate rows: 2,633


In [9]:
max_count = class_counts.max()
print("\nClass Imbalance Ratios:")
for label, count in class_counts.items():
    ratio = max_count / count
    print(f"{label}: 1:{ratio:.2f}")


Class Imbalance Ratios:
DDoS: 1:1.00
BENIGN: 1:1.31


In [10]:
print("\n" + "="*80)
print("KEY FINDINGS:")
print("="*80)
print(f"1. Dataset has {df.duplicated().sum():,} duplicates → REMOVE")
print(f"2. Class imbalance ratio up to 1:{(max_count/class_counts.min()):.0f} → NEED SMOTE")
print(f"3. Missing values: {df.isnull().sum().sum()} → {'HANDLE' if df.isnull().sum().sum() > 0 else 'NONE'}")
print(f"4. Infinite values: {np.isinf(df.select_dtypes(include=[np.number])).sum().sum()} → REPLACE")
print(f"5. Columns to drop: Flow ID, IPs, Ports, Timestamp → METADATA")


KEY FINDINGS:
1. Dataset has 2,633 duplicates → REMOVE
2. Class imbalance ratio up to 1:1 → NEED SMOTE
3. Missing values: 4 → HANDLE
4. Infinite values: 64 → REPLACE
5. Columns to drop: Flow ID, IPs, Ports, Timestamp → METADATA
