# Milestone 2

Link to our [README](README.md)

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# import sklearn

## Load data

In [None]:
# Use this path if you aren't using colab:
# base_path = './detection+of+iot+botnet+attacks+n+baiot/'
base_path = 'data/'

# One sub-folder of base_path per device; the folder name is also stored in
# the 'Device' column of every row loaded from it.
folders = [
    'SimpleHome_XCS7_1003_WHT_Security_Camera',
    'SimpleHome_XCS7_1002_WHT_Security_Camera',
    'Samsung_SNH_1011_N_Webcam',
    'Provision_PT_838_Security_Camera',
    'Provision_PT_737E_Security_Camera',
    'Philips_B120N10_Baby_Monitor',
    'Ennio_Doorbell',
    'Ecobee_Thermostat',
    'Danmini_Doorbell',
]

# 1. Load benign traffic
# Collect the per-device frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously loaded row on each
# iteration. The row index is not reset here, so index labels repeat across
# devices.
benign_frames = []
for folder in folders:
    curr_csv = pd.read_csv(base_path + folder + '/benign_traffic.csv')
    curr_csv['Device'] = folder
    benign_frames.append(curr_csv)
benign_traffic = pd.concat(benign_frames)
benign_traffic

In [None]:
# RUN ONCE FOR EXTRACTION. Prerequisites (install these first):
# pip install patool
# winget install --id 7zip.7zip
import patoolib

In [None]:
#RUN ONCE FOR EXTRACTION!!

# For every device folder, unpack each attack archive that exists on disk
# into a sibling directory named after the archive.
for folder in folders:
    device_dir = base_path + folder
    for archive_name in ("gafgyt_attacks", "mirai_attacks"):
        rar_path = device_dir + "/" + archive_name + ".rar"
        if not os.path.exists(rar_path):
            continue
        patoolib.extract_archive(rar_path, outdir=device_dir + "/" + archive_name + "/")



In [None]:
# Spot-check one extracted attack file (third entry of `folders`).
sample_csv_path = f"{base_path}{folders[2]}/gafgyt_attacks/junk.csv"
test = pd.read_csv(sample_csv_path)
test

In [None]:
# Load ALL attack data from all devices and both attack types

# 2. Load attack traffic (gafgyt and mirai)
# Every CSV under <device>/<botnet>_attacks/ is tagged with its device, its
# botnet family and the name of the file it came from. Frames are collected
# in a list and concatenated once, because pd.concat inside the loop re-copies
# every previously loaded row on each iteration.
attack_frames = []
print("Loading attack data...")
for folder in folders:
    print(f"Processing {folder}...")
    for botnet in ('gafgyt', 'mirai'):
        attack_folder = base_path + folder + '/' + botnet + '_attacks/'
        # Skip botnet folders that are missing for this device.
        if not os.path.isdir(attack_folder):
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order of the combined frame reproducible between runs.
        csv_files = sorted(f for f in os.listdir(attack_folder) if f.endswith('.csv'))
        print(f"  Found {len(csv_files)} {botnet} CSV files")

        for csv_filename in csv_files:
            curr_csv = pd.read_csv(attack_folder + csv_filename)
            curr_csv['Device'] = folder
            curr_csv['Botnet_Type'] = botnet
            curr_csv['Attack_Type'] = csv_filename
            attack_frames.append(curr_csv)
            print(f"    Added {csv_filename} - {len(curr_csv)} rows")

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no attack CSV was found.
attack_traffic = pd.concat(attack_frames) if attack_frames else pd.DataFrame()
print(f"\nTotal attack traffic loaded: {len(attack_traffic)} rows")
# Left as a bare expression so it stays the cell's final expression.
attack_traffic.head() if len(attack_traffic) > 0 else print("No attack data found")

In [None]:
# Add binary classification labels (0 = benign, 1 = attack) and merge datasets

print("Adding binary classification labels...")

# Add labels to benign traffic (0 = benign)
benign_traffic['is_attack'] = 0
benign_traffic['Botnet_Type'] = "Benign"
benign_traffic['Attack_Type'] = "Benign"

print(f"Benign traffic: {len(benign_traffic)} rows with label 0")

# Add labels to attack traffic (1 = attack)
if len(attack_traffic) > 0:
    attack_traffic['is_attack'] = 1
    print(f"Attack traffic: {len(attack_traffic)} rows with label 1")

    # Print shapes before merging
    print("\nDataframe shapes before merging:")
    print(f"  Benign traffic shape: {benign_traffic.shape}")
    print(f"  Attack traffic shape: {attack_traffic.shape}")

    # Merge all data into one big dataset
    print("\nMerging benign and attack data...")
    complete_dataset = pd.concat([benign_traffic, attack_traffic], ignore_index=True)

else:
    print("No attack data found, using only benign data")
    # reset_index returns a new frame with a fresh 0..n-1 index, matching the
    # ignore_index=True merge above and avoiding an alias of benign_traffic.
    complete_dataset = benign_traffic.reset_index(drop=True)

# Columns added by this notebook rather than read from the CSV files; they
# must not be counted as features.
metadata_columns = ['Device', 'Botnet_Type', 'Attack_Type', 'is_attack']
feature_count = len(complete_dataset.columns.difference(metadata_columns))

print(f"Total rows: {len(complete_dataset):,}")
print(f"Features: {feature_count} (excluding {', '.join(metadata_columns)})")


# Checking for missing values
missing_values = complete_dataset.isnull().sum()
print("\nMissing values in each column:")
# Only columns that actually contain missing values are listed.
print(missing_values[missing_values > 0])

print(f"   Shape: {complete_dataset.shape}")

complete_dataset.head()

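Use the merged dataset from above as `df` (point `df` at a different DataFrame if needed), then summarize each column's dtype, number of unique values, and inferred measurement scale.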

In [None]:
# Work on the merged dataset under the shorter name `df`.
# This is an alias, not a copy: both names refer to the same DataFrame.
df = complete_dataset

def infer_scale(series: pd.Series) -> str:
    """Guess the measurement scale of a column from its dtype and values.

    This is a rule-of-thumb classification for exploratory summaries, not a
    statistical test.

    Args:
        series: Column to classify.

    Returns:
        One of ``"Nominal"``, ``"Binary (Nominal)"``, ``"Ordinal"``,
        ``"Ratio"``, ``"Interval"`` or ``"Unknown"`` (any dtype not covered
        by the rules below).
    """
    # Text-like columns: object, categorical, and pandas' dedicated string
    # dtype, which does not compare equal to "object".
    if pd.api.types.is_object_dtype(series) or isinstance(
        series.dtype, (pd.CategoricalDtype, pd.StringDtype)
    ):
        return "Nominal"
    if pd.api.types.is_bool_dtype(series):
        return "Binary (Nominal)"
    if pd.api.types.is_integer_dtype(series):
        distinct = series.nunique()
        # A two-valued integer column (e.g. a 0/1 flag) is a binary label,
        # not an ordered code.
        if distinct == 2:
            return "Binary (Nominal)"
        if distinct < 20:   # few integer levels → ordinal codes
            return "Ordinal"
        return "Ratio"
    if pd.api.types.is_float_dtype(series):
        # Floats count as ratio-scaled only when they are non-negative and
        # zero actually occurs in the data; everything else is interval.
        if (series.min(skipna=True) >= 0) and (series == 0).any():
            return "Ratio"
        return "Interval"
    return "Unknown"

def _describe_column(name):
    """Return the summary record (name, dtype, unique count, scale) for one df column."""
    column = df[name]
    return {
        "Column": name,
        "dtype": str(column.dtype),
        "Unique": column.nunique(),
        "Scale": infer_scale(column),
    }


# One record per column of df, in column order, shown as a table.
summary = [_describe_column(name) for name in df.columns]

scales_df = pd.DataFrame(summary)
print(scales_df)


In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags.
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {num_duplicates}")

In [None]:
# 2x2 grid of pie charts describing the class balance of the loaded data.
fig, axes = plt.subplots(2, 2, figsize=(10,5))
pie_style = {"autopct": "%1.1f%%", "startangle": 90}

# Top-left: attack rows vs benign rows.
class_sizes = [attack_traffic.shape[0], benign_traffic.shape[0]]
axes[0, 0].pie(class_sizes, labels=["Attack", "Benign"], **pie_style)
axes[0, 0].set_title("Benign vs Attack Distribution")

# Top-right: share of each botnet family among the attack rows.
botnet_counts = attack_traffic["Botnet_Type"].value_counts()
axes[0, 1].pie(botnet_counts, labels=botnet_counts.index, **pie_style)
axes[0, 1].set_title("Botnet_Type Distribution")

# Bottom row: attack-type breakdown within each botnet family.
per_botnet_panels = (
    (axes[1, 0], 'mirai', "Attack_Type Distribution for Mirai"),
    (axes[1, 1], 'gafgyt', "Attack_Type Distribution for BASHLITE"),
)
for ax, botnet, title in per_botnet_panels:
    botnet_rows = attack_traffic[attack_traffic.Botnet_Type == botnet]
    attack_counts = botnet_rows["Attack_Type"].value_counts()
    ax.pie(attack_counts, labels=attack_counts.index, **pie_style)
    ax.set_title(title)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of df.
numeric_columns = df.select_dtypes(include="number")
corr = numeric_columns.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(corr, mask=upper_mask, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation heatmap - lower triangle")
fig.tight_layout()
plt.show()