In [None]:
import pandas as pd
import random

# ========================
# --- Windows Event Log ---
# ========================
def classify_event_threat(row):
    """Assign a single threat label to one Windows event-log record.

    Every heuristic that matches adds points to a candidate label, and the
    label with the highest total wins. Ties are broken with
    ``random.choice``. A record that matches no heuristic is given one
    randomly chosen label, so the function always returns a label.

    Args:
        row: Record exposing ``.get(key, default)`` (a dict or a pandas
            Series). Columns consulted: EventID, LogHost,
            LogonTypeDescription, Source, AuthenticationPackage,
            ProcessName, ParentProcessName, DomainName. Absent columns are
            treated as empty strings.

    Returns:
        str: One of the eight label names used as keys of the tally below.
    """
    def text(column):
        # Stringify first so non-string cells (numbers, NaN) are comparable.
        return str(row.get(column, '')).lower()

    event_id = str(row.get('EventID', '')).strip()
    log_host = text('LogHost')
    logon_kind = text('LogonTypeDescription')
    source = text('Source')
    auth_package = text('AuthenticationPackage')
    process = text('ProcessName')
    parent = text('ParentProcessName')
    domain_name = text('DomainName')

    # Key order matters: it fixes the candidate order seen by random.choice.
    tally = dict.fromkeys(
        (
            "Remote Access Attempt",
            "Brute Force Login",
            "Pass-the-Hash Attack",
            "Malicious Script Execution",
            "Post-Compromise Activity",
            "Malware Execution",
            "Lateral Movement",
            "Drive-By Download",
        ),
        0,
    )

    remote_session = "remoteinteractive" in logon_kind

    # --- logon / authentication heuristics ---
    if remote_session and event_id in ("4624", "4648"):
        tally["Remote Access Attempt"] += 2
    if "network" in logon_kind and (event_id == "4625" or "failed" in source):
        tally["Brute Force Login"] += 3
    if "ntlm" in auth_package and domain_name not in ("corp", "internal"):
        tally["Pass-the-Hash Attack"] += 4
    if "kerberos" in auth_package and event_id in ("4768", "4769", "4776"):
        tally["Pass-the-Hash Attack"] += 2

    # --- process heuristics ---
    if process in ("powershell.exe", "wmic.exe") and parent in ("winword.exe", "excel.exe"):
        tally["Malicious Script Execution"] += 4
    if remote_session and process in ("cmd.exe", "powershell.exe"):
        tally["Post-Compromise Activity"] += 3
    if process.endswith((".vbs", ".js")):
        tally["Malware Execution"] += 3
    if process in ("mimikatz.exe", "procdump.exe"):
        tally["Post-Compromise Activity"] += 4
    if "internal" in log_host and "internal" in domain_name:
        tally["Lateral Movement"] += 3
    if process.endswith(".exe") and parent in ("chrome.exe", "iexplore.exe", "firefox.exe"):
        tally["Drive-By Download"] += 3

    # Nothing matched: give one arbitrary label a single point.
    if not any(tally.values()):
        tally[random.choice(list(tally))] = 1

    best = max(tally.values())
    leaders = [label for label, points in tally.items() if points == best]
    return random.choice(leaders)


# ========================
# --- NetFlow Threats ---
# ========================
def classify_netflow_threat(row):
    """Pick a threat label for one NetFlow record by additive rule scoring.

    Two families of rules add points to candidate labels: keyword hints
    found in the optional ``shap_explanation`` text, and numeric thresholds
    on the flow measurements. The highest-scoring label is returned.

    Ties, including the case where no rule fires at all, are resolved with
    ``random.choice``, the same way ``classify_event_threat`` and
    ``classify_threat_balanced`` do it. Previously ``max(scores, key=...)``
    returned the first key on a tie, so every flow that matched no rule was
    labelled "DDoS".

    Args:
        row: Record supporting ``row[key]`` and ``.get`` (a dict or a pandas
            Series) with Duration, SrcPackets, DstPackets, SrcBytes,
            DstPort, Protocol, SrcDevice and DstDevice. ``shap_explanation``
            is optional. Comparisons against NaN evaluate false, so a NaN
            measurement simply fails the rules that use it.

    Returns:
        str: One of the eight label names used as keys of ``scores``.

    Raises:
        KeyError: If a required column is missing from ``row``.
    """
    dur = row['Duration']
    spk = row['SrcPackets']
    dpk = row['DstPackets']
    sbytes = row['SrcBytes']
    dport = row['DstPort']
    proto = str(row['Protocol']).lower()
    shap_text = str(row.get('shap_explanation', '')).lower()

    scores = {
        "DDoS": 0,
        "DoS": 0,
        "Port Scan": 0,
        "Brute Force Login": 0,
        "Data Exfiltration": 0,
        "Malware Communication": 0,
        "Lateral Movement": 0,
        "Malware C2": 0
    }

    # Hints taken from the SHAP explanation text
    if "low srcpackets" in shap_text and "low duration" in shap_text:
        scores["Port Scan"] += 3
    if "high srcbytes" in shap_text:
        scores["Data Exfiltration"] += 3
    if "high dstpackets" in shap_text:
        scores["DDoS"] += 3
    if "low srcbytes" in shap_text and "long duration" in shap_text:
        scores["Malware C2"] += 2
    if "many srcpackets" in shap_text and ("dstport" in shap_text or "login" in shap_text):
        scores["Brute Force Login"] += 3

    # Thresholds on the raw flow measurements
    if spk > 5000 and dur < 10 and dpk > 1000:
        scores["DDoS"] += 4
    if spk > 1000 and dur < 5:
        scores["DoS"] += 3
    if dport > 1024 and spk < 500 and dur < 2:
        scores["Port Scan"] += 3
    if dport in (21, 22, 23, 80, 443) and spk > 50 and dur < 20:
        scores["Brute Force Login"] += 3
    if sbytes > 2_000_000 and dur < 120:
        scores["Data Exfiltration"] += 4
    if proto in ("tcp", "udp") and sbytes < 5000 and dur > 60:
        scores["Malware Communication"] += 3
    if "internal" in str(row['SrcDevice']).lower() and "internal" in str(row['DstDevice']).lower():
        scores["Lateral Movement"] += 3

    # No rule fired: give one arbitrary label a point instead of letting the
    # first dictionary key win by default.
    # NOTE(review): a random label mirrors the other classifiers in this
    # file; an explicit "unclassified" label may be preferable.
    if not any(scores.values()):
        scores[random.choice(list(scores))] = 1

    top = max(scores.values())
    leaders = [label for label, points in scores.items() if points == top]
    return random.choice(leaders)


# ========================
# --- Load & Process ---
# ========================
# Windows Event Log: label every record with the event-log heuristics.
df_wls = pd.read_csv("../datasets/wls_day-02.csv")
df_wls['Threat_Type'] = df_wls.apply(classify_event_threat, axis=1)

# NetFlow: coerce the measurement columns to numbers (unparseable cells
# become NaN), then label every flow.
df_net = pd.read_csv("../datasets/netflow_day-02.csv")
num_cols = ['Duration', 'SrcPackets', 'DstPackets', 'SrcBytes', 'DstBytes', 'DstPort']
df_net[num_cols] = df_net[num_cols].apply(pd.to_numeric, errors='coerce')
df_net['Threat_Type'] = df_net.apply(classify_netflow_threat, axis=1)

# Stack both sources into one frame with a fresh index; columns that exist
# in only one source are filled with NaN for the other.
df_combined = pd.concat([df_wls, df_net], ignore_index=True)

# Persist the labelled records and report how many rows each label received.
df_combined.to_csv("combined_classified_threats.csv", index=False)
print(df_combined['Threat_Type'].value_counts())

Threat_Type
DDoS                          1021290
Port Scan                       21553
Brute Force Login                6197
Post-Compromise Activity          679
Pass-the-Hash Attack              655
Malicious Script Execution        630
Lateral Movement                  613
Remote Access Attempt             612
Drive-By Download                 601
Malware Execution                 599
Data Exfiltration                  91
DoS                                55
Name: count, dtype: int64


In [None]:
import pandas as pd
import random

# ---------- NETFLOW CLASSIFICATION ----------
# Load the NetFlow export and force the measurement columns to numeric
# dtype; cells that cannot be parsed become NaN.
df_net = pd.read_csv("../datasets/netflow_day-02.csv")

num_cols = ['Duration', 'SrcPackets', 'DstPackets', 'SrcBytes', 'DstBytes', 'DstPort']
df_net[num_cols] = df_net[num_cols].apply(pd.to_numeric, errors='coerce')

def classify_threat_balanced(row):
    """Label one NetFlow record, breaking ties and empty matches at random.

    A table of rules is evaluated: keyword hints from the optional
    ``shap_explanation`` text first, then numeric thresholds on the flow
    measurements. Each rule that fires adds its points to one label. The
    label with the highest total is returned. If several labels share the
    top score one of them is drawn with ``random.choice``, and if no rule
    fires a random label is given a single point first.

    Args:
        row: Record supporting ``row[key]`` and ``.get`` (a dict or a pandas
            Series) with Duration, SrcPackets, DstPackets, SrcBytes,
            DstBytes, DstPort, Protocol, SrcDevice and DstDevice.
            ``shap_explanation`` is optional.

    Returns:
        str: One of the eight label names listed in ``points`` below.

    Raises:
        KeyError: If a required column is missing from ``row``.
    """
    # DstBytes is read (so a missing column still raises) but no rule uses it.
    duration, src_pkts, dst_pkts, src_bytes, _dst_bytes, dst_port = (
        row[name]
        for name in ('Duration', 'SrcPackets', 'DstPackets',
                     'SrcBytes', 'DstBytes', 'DstPort')
    )
    protocol = str(row['Protocol']).lower()
    hints = str(row.get('shap_explanation', '')).lower()

    # Key order fixes the candidate order handed to random.choice.
    points = dict.fromkeys(
        (
            "DDoS", "DoS", "Port Scan", "Brute Force Login",
            "Data Exfiltration", "Malware Communication",
            "Lateral Movement", "Malware C2",
        ),
        0,
    )

    # (label, points awarded, did the rule fire?)
    rules = [
        # SHAP hints
        ("Port Scan", 3, "low srcpackets" in hints and "low duration" in hints),
        ("Data Exfiltration", 3, "high srcbytes" in hints),
        ("DDoS", 2, "high dstpackets" in hints),
        ("Malware C2", 2, "low srcbytes" in hints and "long duration" in hints),
        ("Brute Force Login", 3,
         "many srcpackets" in hints and ("dstport" in hints or "login" in hints)),
        # Feature patterns
        ("DDoS", 2, src_pkts > 5000 and duration < 10 and dst_pkts > 1000),
        ("DoS", 3, src_pkts > 1000 and duration < 5),
        ("Port Scan", 3, dst_port > 1024 and src_pkts < 500 and duration < 2),
        ("Brute Force Login", 3,
         dst_port in (21, 22, 23, 80, 443) and src_pkts > 50 and duration < 20),
        ("Data Exfiltration", 4, src_bytes > 2_000_000 and duration < 120),
        ("Malware Communication", 3,
         protocol in ("tcp", "udp") and src_bytes < 5000 and duration > 60),
        ("Lateral Movement", 3,
         "internal" in str(row['SrcDevice']).lower()
         and "internal" in str(row['DstDevice']).lower()),
    ]
    for label, award, fired in rules:
        if fired:
            points[label] += award

    # Variety balancing: an unmatched flow still receives some label.
    if not any(points.values()):
        points[random.choice(list(points))] = 1

    best = max(points.values())
    leaders = [label for label, total in points.items() if total == best]
    return random.choice(leaders)

# Label every flow, then keep only the numeric features, the protocol and
# device columns, and the label.
df_net = df_net.assign(
    Threat_Type=df_net.apply(classify_threat_balanced, axis=1)
)[[*num_cols, 'Protocol', 'SrcDevice', 'DstDevice', 'Threat_Type']]

# ---------- WINDOWS LOG CLASSIFICATION ----------
df_win = pd.read_csv("../datasets/wls_day-02.csv")  # replace with actual path

def classify_windows_threat(row):
    """Label one Windows log record from its process and parent-process names.

    Rules are checked in a fixed order and the first match wins. A record
    that matches no rule receives one of three fallback labels chosen with
    ``random.choice``.

    Only ProcessName and ParentProcessName are consulted. The former unused
    ``DomainName`` lookup was removed because it made the function raise
    ``KeyError`` on records lacking a column that no rule reads.

    Args:
        row: Record supporting ``row[key]`` (a dict or a pandas Series) with
            ProcessName and ParentProcessName. Values are stringified and
            lower-cased before matching.

    Returns:
        str: The matched label, or a random pick from "Drive-By Download",
        "Brute Force Login" and "Data Exfiltration".

    Raises:
        KeyError: If ProcessName or ParentProcessName is missing.
    """
    pname = str(row['ProcessName']).lower()
    parent = str(row['ParentProcessName']).lower()

    # Order matters: earlier rules take precedence over later ones.
    if "mimikatz" in pname or "sekurlsa" in pname:
        return "Pass-the-Hash Attack"
    if "powershell" in pname and "-enc" in pname:
        return "Malicious Script Execution"
    if "rundll32" in pname and ".dll" in pname:
        return "Malware Execution"
    if "wmic" in pname or "psexec" in pname:
        return "Lateral Movement"
    if "cmd.exe" in parent and "net user" in pname:
        return "Privilege Escalation"
    if "remote" in pname or "rdp" in pname:
        return "Remote Access Attempt"

    # Nothing matched: fall back to an arbitrary label.
    return random.choice([
        "Drive-By Download", "Brute Force Login", "Data Exfiltration"
    ])

# Label every Windows record, then keep only the relevant features and the
# label.
df_win = df_win.assign(
    Threat_Type=df_win.apply(classify_windows_threat, axis=1)
)[[
    'UserName', 'EventID', 'LogHost', 'LogonID', 'DomainName',
    'ParentProcessName', 'ParentProcessID', 'ProcessName', 'Time',
    'ProcessID', 'LogonTypeDescription', 'Source', 'AuthenticationPackage',
    'LogonType', 'Threat_Type',
]]

# ---------- MERGE & BALANCE ----------
# Stack NetFlow and Windows records; columns present in only one source are
# NaN for rows coming from the other.
df_combined = pd.concat([df_net, df_win], ignore_index=True)

target_count = 500  # samples per class

# Draw exactly target_count rows per label. Sampling with replacement is
# used only when a class has fewer rows than the target. A class with
# exactly target_count rows is kept whole (previously it was resampled with
# replacement, duplicating some rows and dropping others).
balanced_data = [
    group.sample(
        target_count,
        replace=len(group) < target_count,
        random_state=42,
    )
    for _, group in df_combined.groupby('Threat_Type')
]

df_final = pd.concat(balanced_data)

df_final.to_csv("classified_threats_balanced_big.csv", index=False)

print(df_final['Threat_Type'].value_counts())