# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
from collections import defaultdict

# Show up to 50 columns when wide frames are printed.
pd.set_option('display.max_columns', 50)

# =========================================================
# CONFIGURATION
# =========================================================
# Root of the data tree. Defaults to the original workstation location; set
# the IOT_IDS_DATA_ROOT environment variable to run on another machine
# without editing this file.
data_root = os.environ.get("IOT_IDS_DATA_ROOT", r"D:\IoT_IDS_Thesis\data")
raw_root = os.path.join(data_root, "raw")

# Dataset name -> location on disk. "Bot-IoT" is a folder that is searched
# recursively for CSV files; the other two entries are single CSV files.
data_paths = {
    "Bot-IoT": os.path.join(
        raw_root, "Bot-IoT", "Bot-IoT_Dataset", "Dataset", "Entire_Dataset"
    ),
    "UNSW-NB15": os.path.join(
        raw_root,
        "UNSW-NB15",
        "UNSW-NB15 dataset",
        "CSV Files",
        "Training and Testing Sets",
        "UNSW_NB15_training-set.csv",
    ),
    "TON_IoT": os.path.join(
        raw_root,
        "TON_IoT",
        "Train_Test_datasets",
        "Train_Test_Network_dataset",
        "train_test_network.csv",
    ),
}

# Folder that receives the summary CSV; created up front so the final write
# cannot fail on a missing directory.
output_dir = os.path.join(data_root, "docs")
os.makedirs(output_dir, exist_ok=True)

# Dataset name -> DataFrame held in memory.
datasets = {}
# Dataset name -> dict with "total_rows", "missing_total" and "label_dists".
metadata = {}

# =========================================================
# BOT-IoT DTYPES (MEMORY OPTIMIZED)
# =========================================================
# Column -> dtype map handed to pd.read_csv for every Bot-IoT file. Narrow
# numeric types and 'category' for repeated text values keep each chunk small.
bot_iot_dtypes = {
    'pkSeqID': 'int32',
    # Kept at float64: float32 has a 24-bit significand, so a value around
    # 1.5e9 would be rounded to a multiple of 128.
    # NOTE(review): assumes stime/ltime hold Unix-epoch seconds - confirm
    # against the Bot-IoT feature description.
    'stime': 'float64',
    'flgs': 'category',
    'proto': 'category',
    'saddr': 'category',
    # Ports are read as text rather than integers.
    # NOTE(review): presumably because some raw values are not plain decimal
    # numbers - verify against the data.
    'sport': 'object',
    'daddr': 'category',
    'dport': 'object',
    'pkts': 'int32',
    'bytes': 'int64',
    'state': 'category',
    'ltime': 'float64',  # same precision concern as 'stime'
    'dur': 'float32',
    'mean': 'float32',
    'stddev': 'float32',
    'sum': 'float32',
    'min': 'float32',
    'max': 'float32',
    'spkts': 'int32',
    'dpkts': 'int32',
    'sbytes': 'int64',
    'dbytes': 'int64',
    'rate': 'float32',
    'attack': 'int8',
    'category': 'category',
    'subcategory': 'category'
}

# Label columns whose value counts are accumulated over the whole dataset
# while it is streamed in chunks.
BOT_IOT_LABELS = ['attack', 'category', 'subcategory']

# =========================================================
# LOAD BOT-IoT (CHUNKED)
# =========================================================
print("\nüîπ Loading Bot-IoT dataset...")
bot_iot_folder = data_paths["Bot-IoT"]

# Running aggregates over the full dataset. Only a small row sample is kept
# in memory; everything else is reduced chunk by chunk.
label_dists = {k: defaultdict(int) for k in BOT_IOT_LABELS}
missing_total = pd.Series(0, index=list(bot_iot_dtypes), dtype="int64")
total_rows = 0
samples = []

# glob returns matches in arbitrary order; sorting makes the sampled rows
# (taken from the first chunks read) the same on every run.
csv_files = sorted(
    glob.glob(os.path.join(bot_iot_folder, "**/*.csv"), recursive=True)
)

if not csv_files:
    print(f"   ‚ö†Ô∏è No CSV files found under {bot_iot_folder}")

for file in csv_files:
    try:
        for chunk in pd.read_csv(
            file,
            dtype=bot_iot_dtypes,
            low_memory=False,
            chunksize=100_000
        ):
            total_rows += len(chunk)
            # add(..., fill_value=0) keeps the counts of columns that exist on
            # only one side. A plain `+=` aligns the two indexes and turns
            # every such entry into NaN, silently losing its count.
            missing_total = missing_total.add(
                chunk.isna().sum(), fill_value=0
            ).astype("int64")

            for lbl in BOT_IOT_LABELS:
                if lbl in chunk.columns:
                    for k, v in chunk[lbl].value_counts().items():
                        label_dists[lbl][k] += v

            # Keep the first 1000 rows of at most five chunks as the sample.
            if len(samples) < 5:
                samples.append(chunk.head(1000))

        print(f"   ‚úÖ Processed {os.path.basename(file)}")

    except Exception as e:
        # Best effort: a file that cannot be parsed is reported and skipped.
        # Chunks of that file read before the failure stay in the totals.
        print(f"   ‚ùå Error processing {file}: {e}")

if samples:
    datasets["Bot-IoT"] = pd.concat(samples, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame so the remaining datasets can still be processed.
    datasets["Bot-IoT"] = pd.DataFrame(columns=list(bot_iot_dtypes))

metadata["Bot-IoT"] = {
    "total_rows": total_rows,
    "missing_total": missing_total,
    "label_dists": {k: pd.Series(v) for k, v in label_dists.items()}
}

print(f"‚úî Bot-IoT rows: {total_rows:,} (sample loaded: {datasets['Bot-IoT'].shape})")

# =========================================================
# LOAD OTHER DATASETS
# =========================================================
# Each of these datasets is a single CSV file, so it is read whole.
for name in ("UNSW-NB15", "TON_IoT"):
    print(f"\nüîπ Loading {name}...")
    df = pd.read_csv(data_paths[name], low_memory=False)
    datasets[name] = df
    # No streamed label counts for these datasets: the label analysis reads
    # them from the frame itself.
    metadata[name] = dict(
        total_rows=df.shape[0],
        missing_total=df.isna().sum(),
        label_dists=None,
    )
    print(f"   ‚úÖ Loaded {df.shape}")

# =========================================================
# CLEANING FUNCTIONS
# =========================================================
def clean_dataset(df):
    """Return a cleaned, independent copy of *df*.

    Steps, in order:

    1. Drop columns whose name starts with ``Unnamed`` (the placeholder
       pandas assigns to columns that have no header).
    2. Normalise column names: strip surrounding whitespace, then replace
       spaces and dots with underscores.
    3. Drop columns that hold at most one distinct non-null value.

    Args:
        df: DataFrame with string column names.

    Returns:
        A new DataFrame. The input is not modified, and the result owns its
        data, so later in-place assignments on it are safe.
    """
    kept = df.loc[:, ~df.columns.str.contains("^Unnamed")]
    # regex=False makes both replacements literal on every pandas version;
    # "." is a regex metacharacter and the default of `regex` has changed.
    kept.columns = (
        kept.columns.str.strip()
        .str.replace(" ", "_", regex=False)
        .str.replace(".", "_", regex=False)
    )
    # nunique() ignores NaN, so a column of one value plus NaN is dropped too.
    # copy() detaches the result from the intermediate selection.
    return kept.loc[:, kept.nunique() > 1].copy()

# Swap every loaded frame for its cleaned version and report the new shape.
for name in list(datasets):
    cleaned = clean_dataset(datasets[name])
    datasets[name] = cleaned
    print(f"üßπ Cleaned {name}: {cleaned.shape}")

# =========================================================
# STANDARDIZE COLUMN NAMES
# =========================================================
# Short flow-field names -> the common names used across all datasets.
rename_map = dict(
    saddr='src_ip',
    sport='src_port',
    daddr='dst_ip',
    dport='dst_port',
    dur='duration',
    sbytes='src_bytes',
    dbytes='dst_bytes',
)

for name, df in datasets.items():
    # Only the entries whose source column exists in this frame are applied.
    present = {old: new for old, new in rename_map.items() if old in df.columns}
    df.rename(columns=present, inplace=True)

# =========================================================
# DATASET OVERVIEW
# =========================================================
# Print a banner, headline figures and the first five rows of every dataset.
for name, df in datasets.items():
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"{name} DATASET OVERVIEW")
    print(banner)
    # The record count comes from the metadata, not from the frame in memory.
    print(f"Records: {metadata[name]['total_rows']:,}")
    print(f"Features: {df.shape[1]}")
    in_memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory (sample): {in_memory_mb:.2f} MB")
    print(df.head(5).to_string())

# =========================================================
# LABEL ANALYSIS (SAFE)
# =========================================================
# Print the ten most frequent values of every label column and, for labels
# with at most 20 distinct values, draw a log-scale bar chart.
for name, df in datasets.items():
    print(f"\nüîé {name} - Label Analysis")

    if name == "Bot-IoT":
        # The Bot-IoT frame is only a row sample, so use the distributions
        # accumulated over the full dataset during loading.
        label_cols = BOT_IOT_LABELS
        label_dists = metadata[name]['label_dists']
    else:
        label_cols = [c for c in df.columns if 'label' in c.lower() or 'class' in c.lower()]

    if not label_cols:
        print("   ‚ö†Ô∏è No label columns found")
        continue

    for lbl in label_cols:
        print(f"\n--- {lbl} ---")

        if name == "Bot-IoT":
            # Explicit dtype: constructing an empty Series without one is
            # deprecated.
            counts = label_dists.get(lbl, pd.Series(dtype="int64"))
            # The accumulated counts are in first-seen order; sort them so
            # head(10) shows the most frequent values, as value_counts() does
            # for the other datasets.
            counts = counts.sort_values(ascending=False)
        else:
            counts = df[lbl].value_counts()

        if counts.empty:
            print("   ‚ö†Ô∏è Empty distribution, skipping plot")
            continue

        print(counts.head(10).to_string())

        if len(counts) <= 20:
            # One explicit figure per label: on backends where plt.show()
            # does not clear the current figure, drawing on the implicit axes
            # would stack every chart on top of the previous one.
            fig, ax = plt.subplots()
            counts.astype(int).plot(kind='bar', ax=ax)
            ax.set_title(f"{name} - {lbl}")
            ax.set_ylabel("Count (log scale)")
            ax.set_yscale("log")
            fig.tight_layout()
            plt.show()
            # Release the figure so repeated plots do not accumulate in memory.
            plt.close(fig)

# =========================================================
# BASIC ML PREPROCESSING (SAMPLE SAFE)
# =========================================================
# Mean-impute numeric columns and one-hot encode 'proto' / 'service' where
# present, then store the processed frame back under the same name.
for name, df in datasets.items():
    print(f"\n‚öôÔ∏è Preprocessing {name}...")

    num_cols = df.select_dtypes(include=np.number).columns
    # fillna with a Series fills column by column, matched on the column
    # label, and leaves every other column untouched. It returns a new frame,
    # so the stored frame is not mutated through a slice and no
    # chained-assignment warning is raised.
    df = df.fillna(df[num_cols].mean())

    cat_cols = [c for c in ['proto', 'service'] if c in df.columns]
    if cat_cols:
        # Explicit dtype: the get_dummies default changed from uint8 to bool
        # in pandas 2.0, and bool columns are not selected by np.number, which
        # would change the numeric-feature count in the final summary.
        df = pd.get_dummies(df, columns=cat_cols, dtype="uint8")

    # Rebinding an existing key does not resize the dict, so this is safe
    # while iterating over datasets.items().
    datasets[name] = df
    print("   ‚úÖ Done")

# =========================================================
# FINAL SUMMARY
# =========================================================
# One row per dataset. Row and missing-value totals come from the metadata;
# feature counts and memory are measured on the frame held in memory.
summary = [
    {
        "Dataset": name,
        "Total_Rows": metadata[name]['total_rows'],
        "Features": df.shape[1],
        "Numeric_Features": df.select_dtypes(include=np.number).shape[1],
        "Categorical_Features": df.select_dtypes(include=['object', 'category']).shape[1],
        "Missing_Values": int(metadata[name]['missing_total'].sum()),
        "Memory_MB": round(df.memory_usage(deep=True).sum() / 1024**2, 2),
    }
    for name, df in datasets.items()
]

summary_df = pd.DataFrame(summary)
print("\nüìå FINAL SUMMARY")
print(summary_df.to_string(index=False))

# Persist the table next to the other generated documents.
summary_path = os.path.join(output_dir, "dataset_analysis_summary.csv")
summary_df.to_csv(summary_path, index=False)
print(f"\nüíæ Summary saved to: {summary_path}")