# In [None]:
import polars as pl
from pathlib import Path
import csv

#  Define dataset folders

# Root folder holding the raw per-day CSV exports.
DATA_DIR = Path("D:/Dissertation 2025/Dataset")
# Folder that receives the cleaned per-day CSV files.
OUT_DIR = Path("D:/Dissertation 2025/Results")

# Day name -> path of that day's raw CSV inside DATA_DIR.
files = dict(
    Monday=DATA_DIR.joinpath("Monday-WorkingHours.pcap_ISCX.csv"),
    Tuesday=DATA_DIR.joinpath("Tuesday-WorkingHours.pcap_ISCX.csv"),
    Wednesday=DATA_DIR.joinpath("Wednesday-workingHours.pcap_ISCX.csv"),
)

# Echo the configured locations so a wrong path is spotted early.
print(" Dataset folder:", DATA_DIR)
print(" Output folder :", OUT_DIR)


#  Read ONLY the header (super safe)

def safe_preview_header(path):
    """Return the column names found in the first record of a CSV file.

    Only the first CSV record is parsed, so the cost does not depend on the
    number of data rows in the file.

    Args:
        path: Location of the CSV file (anything accepted by ``open``).

    Returns:
        The header fields as a list of strings, exactly as written in the
        file (surrounding whitespace is NOT stripped, so the names stay
        usable as column identifiers for a reader that does not strip them
        either). An empty list is returned for an empty file.
    """
    # latin-1 maps every byte to a character, so decoding can never fail on
    # stray non-UTF-8 bytes. newline="" is what the csv module requires so
    # that it can handle line endings (and quoted newlines) itself.
    with open(path, "r", encoding="latin-1", newline="") as f:
        # Default of [] avoids leaking a bare StopIteration on an empty file.
        return next(csv.reader(f), [])


# Day name -> list of raw column names read from that day's CSV header.
headers = {}
for day, csv_path in files.items():
    headers[day] = safe_preview_header(csv_path)

# Dump every column name per day so the schemas can be compared by eye.
print("\n=== Header Extraction Complete ===")
for day in headers:
    columns = headers[day]
    print(f"\n{day} Columns ({len(columns)}):")
    for name in columns:
        print(" -", name)


# Detect leakage columns (Flow ID, IP, Port, Timestamp, Protocol)

def find_leakage_cols(header):
    """Pick out the column names that identify a flow rather than describe it.

    A column is selected when its lower-cased name contains any of the
    identifier fragments below (flow id, IP address, port, timestamp,
    protocol). Matching is by substring, so surrounding whitespace in the
    name does not matter.

    Args:
        header: Iterable of column names.

    Returns:
        A new list with the matching names, unchanged and in header order.
    """
    fragments = (
        "flow id",
        "src ip",
        "dst ip",
        "source ip",
        "destination ip",
        "port",
        "timestamp",
        "protocol",
    )
    return [
        name
        for name in header
        if any(fragment in name.lower() for fragment in fragments)
    ]


# Day name -> identifier-like columns that will be dropped for that day.
leakage_cols = {}
for day, day_header in headers.items():
    leakage_cols[day] = find_leakage_cols(day_header)

print("\n=== Leakage Columns Detected ===")
for day in leakage_cols:
    print(day, ":", leakage_cols[day])


# Attack family mapping

# Maps a raw attack label (upper-cased) to its coarse attack family.
# Labels that are absent from this table have no family assigned here.
fam_map = {
    "FTP-PATATOR": "BruteForce",
    "SSH-PATATOR": "BruteForce",
    "DOS HULK": "DoS",
    "DOS GOLDENEYE": "DoS",
    "DOS SLOWLORIS": "DoS",
    # Fourth DoS variant; presumably present next to the three above in the
    # Wednesday capture -- verify against the dataset's label list.
    "DOS SLOWHTTPTEST": "DoS",
    # Mis-decoded ("mojibake") dash spellings, kept for backward
    # compatibility. NOTE: they contain a lowercase "â", so they can never
    # equal an upper-cased label; the variants below are the ones that match.
    "WEB ATTACK â€“ SQL INJECTION": "Web",
    "WEB ATTACK â€“ XSS": "Web",
    "WEB ATTACK â€“ BRUTE FORCE": "Web",
    # Plain ASCII hyphen.
    "WEB ATTACK - SQL INJECTION": "Web",
    "WEB ATTACK - XSS": "Web",
    "WEB ATTACK - BRUTE FORCE": "Web",
    # Real en dash (U+2013), written as an escape so the source file's own
    # encoding cannot corrupt it again.
    "WEB ATTACK \u2013 SQL INJECTION": "Web",
    "WEB ATTACK \u2013 XSS": "Web",
    "WEB ATTACK \u2013 BRUTE FORCE": "Web",
    # Replacement character (U+FFFD), which is what a lossy UTF-8 decode
    # yields when the dash is stored as a single non-UTF-8 byte.
    "WEB ATTACK \ufffd SQL INJECTION": "Web",
    "WEB ATTACK \ufffd XSS": "Web",
    "WEB ATTACK \ufffd BRUTE FORCE": "Web",
    "DDOS": "DDoS",
    "PORTSCAN": "PortScan",
    "BOT": "Botnet",
    "INFILTRATION": "Infiltration",
}


# Process each file in streaming mode


def preprocess_day(day, path, header, leak_list, output_dir):
    """Clean one day's CSV lazily and write it to ``<day>_clean.csv``.

    The pipeline drops the leakage columns, renames the label column to
    ``Label`` (upper-cased), and appends three columns:

    * ``y_binary`` -- 0 for ``BENIGN`` rows, 1 for every other label.
    * ``y_family`` -- ``BENIGN`` for benign rows, the ``fam_map`` family for
      known attack labels, ``OtherAttack`` for anything else (including a
      missing label).
    * ``day``      -- the ``day`` argument repeated on every row.

    Args:
        day: Name of the day; used for the ``day`` column and output name.
        path: ``pathlib.Path`` of the raw CSV file.
        header: Column names of that file, as they appear in its header row.
        leak_list: Column names to drop before writing.
        output_dir: ``pathlib.Path`` of the folder that receives the output;
            created if it does not exist yet.

    Returns:
        The path of the written CSV file.

    Raises:
        ValueError: If no ``label``/``attack`` column is present in ``header``.
    """
    print(f"\nProcessing {day}: {path.name}")

    # Header names may carry surrounding whitespace, so compare on the
    # stripped name but keep the raw name: it is what the reader will see.
    label_col = next(
        (col for col in header if col.strip().lower() in {"label", "attack"}),
        None,
    )
    if label_col is None:
        raise ValueError(f"No label column found in {path.name}")

    print(f"ðŸ‘‰ Label column for {day} =", label_col)

    # infer_schema_length=0 skips type inference, so every column is read as
    # a string; nothing is loaded until the sink at the end runs the query.
    lf = pl.scan_csv(
        path,
        has_header=True,
        infer_schema_length=0,
        ignore_errors=True
    )

    # Drop leakage columns
    if leak_list:
        lf = lf.drop(list(leak_list))

    # Normalize label to uppercase & rename column
    lf = lf.rename({label_col: "Label"})
    lbl_up = pl.col("Label").str.to_uppercase()

    # Create y_binary
    y_binary = (lbl_up != "BENIGN").cast(pl.Int8).alias("y_binary")

    # y_family must be looked up on the UPPER-CASED label because fam_map's
    # keys are upper-case. Expr.replace leaves unmatched values untouched, so
    # the fallback has to be an explicit branch rather than a fill_null.
    y_family = (
        pl.when(lbl_up == "BENIGN")
        .then(pl.lit("BENIGN"))
        .when(lbl_up.is_in(list(fam_map)))
        .then(lbl_up.replace(fam_map))
        .otherwise(pl.lit("OtherAttack"))
        .alias("y_family")
    )

    # Add columns (all expressions are evaluated against the pre-update frame)
    lf = lf.with_columns([
        lbl_up.alias("Label"),
        y_binary,
        y_family,
        pl.lit(day).alias("day")
    ])

    # Output file; make sure the destination folder exists before sinking.
    output_dir.mkdir(parents=True, exist_ok=True)
    out_path = output_dir / f"{day}_clean.csv"
    print(f"Saving cleaned file to: {out_path}")

    # Stream the result to CSV instead of collecting the whole frame first.
    lf.sink_csv(out_path)

    print(f"Finished {day}")
    return out_path


# Run preprocessing for each day
# Day name -> path of the cleaned CSV written for that day.
output_paths = {}

for day in files:
    output_paths[day] = preprocess_day(
        day=day,
        path=files[day],
        header=headers[day],
        leak_list=leakage_cols[day],
        output_dir=OUT_DIR,
    )

# Summary of everything that was written.
print("\nCleaned files saved:")
for day, saved_to in output_paths.items():
    print(day, "â†’", saved_to)
