# Global Cyberattack Pattern Analysis — Data Mining (Anaconda Edition)

**Notebook:** 03_preprocess_feature_engineering.ipynb  
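Create the Fast/Medium/Slow speed target — `exploit_speed` when a publication date is available, otherwise `response_speed` (days from `dateAdded` to `dueDate`) — and build train/test splits.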

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path


# Project root: the working directory, or its parent when the kernel was
# started from inside a "notebooks" folder.
_start_dir = Path.cwd()
ROOT = _start_dir.parent if _start_dir.name == "notebooks" else _start_dir

# Raw input CSV, read from <ROOT>/data.
DATA = ROOT / "data" / "cisa_kev.csv"
print("Using:", DATA)
df = pd.read_csv(DATA)


# Lower-cased column name -> actual column label, for case-insensitive lookups.
colmap = {}
for _label in df.columns:
    colmap[_label.lower()] = _label


def col(name):
    """Return the real column label matching ``name`` case-insensitively.

    Returns None when the loaded frame has no such column.
    """
    key = name.lower()
    return colmap[key] if key in colmap else None


print("Columns:", df.columns.tolist())
df.head(3)


Using: C:\Users\asus\Downloads\global_cyberattack_dm_anaconda\global_cyberattack_dm\data\cisa_kev.csv
Columns: ['cveID', 'vendorProject', 'product', 'vulnerabilityName', 'dateAdded', 'shortDescription', 'requiredAction', 'dueDate', 'knownRansomwareCampaignUse', 'notes', 'cwes']


Unnamed: 0,cveID,vendorProject,product,vulnerabilityName,dateAdded,shortDescription,requiredAction,dueDate,knownRansomwareCampaignUse,notes,cwes
0,CVE-2025-54253,Adobe,Experience Manager (AEM) Forms,Adobe Experience Manager Forms Code Execution ...,2025-10-15,Adobe Experience Manager Forms in JEE contains...,"Apply mitigations per vendor instructions, fol...",2025-11-05,Unknown,https://helpx.adobe.com/security/products/aem-...,
1,CVE-2025-47827,IGEL,IGEL OS,IGEL OS Use of a Key Past its Expiration Date ...,2025-10-14,IGEL OS contains a use of a key past its expir...,"Apply mitigations per vendor instructions, fol...",2025-11-04,Unknown,https://msrc.microsoft.com/update-guide/en-US/...,CWE-324
2,CVE-2025-24990,Microsoft,Windows,Microsoft Windows Untrusted Pointer Dereferenc...,2025-10-14,Microsoft Windows Agere Modem Driver contains ...,"Apply mitigations per vendor instructions, fol...",2025-11-04,Unknown,https://msrc.microsoft.com/update-guide/en-US/...,CWE-822


In [2]:

# Convert every date-like column that is present to datetime; values that
# cannot be parsed become NaT (errors="coerce").
date_keys = ("dateadded", "duedate", "datepublished", "disclosuredate")
for real_name in (col(key) for key in date_keys):
    if real_name is None:
        continue
    df[real_name] = pd.to_datetime(df[real_name], errors="coerce")


# Calendar features taken from the dateAdded column, when it exists.
da = col("dateadded")
if da is not None:
    added = df[da].dt
    df["year_added"] = added.year
    df["month_added"] = added.month


# Resolve the vendor, product and CWE columns; each is None when absent.
vp, prod, cwes = (col(key) for key in ("vendorproject", "product", "cwes"))


if cwes is not None:
    # Keep the text before the first comma, trimmed; empty results become NaN.
    cwe_text = df[cwes].fillna("").astype(str)
    first_cwe = cwe_text.str.split(",", n=1).str[0].str.strip()
    df["cwe_primary"] = first_cwe.replace({"": np.nan})


# Boolean flag: is the vulnerability known to be used in ransomware campaigns?
kr = col("knownransomwarecampaignuse")
if kr is not None:
    # The KEV catalog encodes this field as "Known" / "Unknown". Without
    # "known" in the accepted set every row maps to False and the feature is
    # constant. The generic truthy spellings are kept for other encodings.
    truthy_values = ["known", "yes", "true", "y", "1"]
    df["ransomware_known"] = (
        df[kr].astype(str).str.strip().str.lower().isin(truthy_values)
    )


# Preview the engineered columns. Only columns that actually exist are
# selected: a None entry (missing vendor/product) or an absent derived column
# inside the selection list would raise a KeyError.
preview_cols = [
    name
    for name in ["year_added", "month_added", vp, prod, "cwe_primary", "ransomware_known"]
    if name is not None and name in df.columns
]
df[preview_cols].head()


Unnamed: 0,year_added,month_added,vendorProject,product,cwe_primary,ransomware_known
0,2025,10,Adobe,Experience Manager (AEM) Forms,,False
1,2025,10,IGEL,IGEL OS,CWE-324,False
2,2025,10,Microsoft,Windows,CWE-822,False
3,2025,10,Microsoft,Windows,CWE-284,False
4,2025,10,Rapid7,Velociraptor,CWE-276,False


In [3]:
# Build the Fast/Medium/Slow target from whichever date columns are available.
da = col("dateadded")
dd = col("duedate")
dp = col("datepublished")

# Both target definitions are measured relative to dateAdded, so fail early
# with a readable message instead of a bare `KeyError: None` from df[None].
if da is None:
    raise KeyError("Cannot build the target: no 'dateAdded' column in the data.")

if dp is not None and df[dp].notna().any():
    # Days between datePublished and dateAdded.
    days = (df[da] - df[dp]).dt.days
    target_name = "exploit_speed"  # Fast/Medium/Slow
elif dd is not None:
    # No usable publication date: fall back to days from dateAdded to dueDate.
    days = (df[dd] - df[da]).dt.days
    target_name = "response_speed"
else:
    raise KeyError(
        "Cannot build the target: need a 'datePublished' or 'dueDate' column."
    )

df["time_to_days"] = days

# pd.cut uses right-closed intervals here: (-1, 30] -> Fast,
# (30, 180] -> Medium, (180, 10000] -> Slow. Missing values and values
# outside these edges are left as NaN and are excluded from modelling later.
bins = [-1, 30, 180, 10_000]
labels = ["Fast", "Medium", "Slow"]
df[target_name] = pd.cut(df["time_to_days"], bins=bins, labels=labels)

print(target_name, "class counts:")
print(df[target_name].value_counts(dropna=False))
df[[target_name, "time_to_days"]].head(10)


response_speed class counts:
response_speed
Fast      1180
Slow       242
NaN         15
Medium       5
Name: count, dtype: int64


Unnamed: 0,response_speed,time_to_days
0,Fast,21
1,Fast,21
2,Fast,21
3,Fast,21
4,Fast,21
5,Fast,21
6,Fast,21
7,Fast,21
8,Fast,21
9,Fast,21


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import joblib


# Candidate features in a fixed order; keep only those present in the frame.
candidate_features = [vp, prod, "cwe_primary", "year_added", "month_added", "ransomware_known"]
feature_cols = [c for c in candidate_features if c and c in df.columns]

# Rows without a target label cannot be used for supervised learning.
mask = df[target_name].notna()
X = df.loc[mask, feature_cols].copy()
y = df.loc[mask, target_name].copy()

# Text columns are one-hot encoded, everything else is treated as numeric.
# Checking only `dtype == "object"` misses pandas' dedicated string dtypes,
# which would send text columns to the median imputer; test both kinds.
cat_cols = [
    c for c in X.columns
    if pd.api.types.is_object_dtype(X[c]) or pd.api.types.is_string_dtype(X[c])
]
num_cols = [c for c in X.columns if c not in cat_cols]

print("Features used:", feature_cols)
print("Categorical:", cat_cols)
print("Numeric:", num_cols)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (categories unseen at fit time are ignored at transform time).
categorical_steps = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("oh", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: fill gaps with the column median.
numeric_steps = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
])

preprocess = ColumnTransformer(transformers=[
    ("cat", categorical_steps, cat_cols),
    ("num", numeric_steps, num_cols),
])

# 80/20 hold-out split, stratified on the target and seeded for repeatability.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

print("Train size:", X_train.shape, " Test size:", X_test.shape)
print("y_train counts:\n", y_train.value_counts())

# Persist feature metadata and the train/test splits for later notebooks.
METRICS_DIR = ROOT / "reports" / "metrics"
METRICS_DIR.mkdir(parents=True, exist_ok=True)

# File name -> payload; each payload is written with joblib into METRICS_DIR.
artifacts = {
    "feature_meta.joblib": {
        "feature_cols": feature_cols,
        "cat_cols": cat_cols,
        "num_cols": num_cols,
        "target_name": target_name,
    },
    "splits.joblib": {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    },
}
for artifact_name, payload in artifacts.items():
    joblib.dump(payload, METRICS_DIR / artifact_name)

print("Saved artifacts to:", METRICS_DIR)


Features used: ['vendorProject', 'product', 'cwe_primary', 'year_added', 'month_added', 'ransomware_known']
Categorical: ['vendorProject', 'product', 'cwe_primary']
Numeric: ['year_added', 'month_added', 'ransomware_known']
Train size: (1141, 6)  Test size: (286, 6)
y_train counts:
 response_speed
Fast      944
Slow      193
Medium      4
Name: count, dtype: int64
Saved artifacts to: C:\Users\asus\Downloads\global_cyberattack_dm_anaconda\global_cyberattack_dm\reports\metrics


In [6]:
import os

# Save the preprocessed dataset for later steps.
# The file is written under ROOT/data, next to the raw CSV, so its location
# does not depend on whether the kernel was started from the project root or
# from the notebooks/ folder. It is written once.
PREPROCESSED = ROOT / "data" / "cisa_preprocessed.csv"
PREPROCESSED.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(PREPROCESSED, index=False)
print("✅ Saved preprocessed data to:", PREPROCESSED)


✅ Saved preprocessed data to: data/cisa_preprocessed.csv
✅ Saved preprocessed data to: data/cisa_preprocessed.csv
