# Global Cyberattack Pattern Analysis — Data Mining (Anaconda Edition)

**Notebook:** 03_preprocess_feature_engineering.ipynb  
Create the target `exploit_speed` by binning the day gap (Fast: 0–30, Medium: 31–180, Slow: 181–10,000 days) and build stratified 80/20 train/test splits.

In [None]:
import pandas as pd, numpy as np, os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the KEV extract. Only the date columns that actually exist in the file
# are passed to `parse_dates`: pandas raises ValueError when a `parse_dates`
# column is missing, which would make the PublishDate/DueDate fallback logic
# below unreachable.
_kev_path = "data/cisa_kev.csv"
_kev_header = pd.read_csv(_kev_path, nrows=0).columns
_date_cols = [c for c in ("DateAdded", "PublishDate", "DueDate") if c in _kev_header]
df = pd.read_csv(_kev_path, parse_dates=_date_cols, dayfirst=False)

# Both target definitions are measured relative to DateAdded, so fail early
# with a clear message rather than a KeyError further down.
if "DateAdded" not in df.columns:
    raise ValueError("Need DateAdded to construct target.")

if "PublishDate" in df.columns and df["PublishDate"].notna().any():
    # Preferred definition: whole days from PublishDate to DateAdded.
    df["time_to_exploit_days"] = (df["DateAdded"] - df["PublishDate"]).dt.days
elif "DueDate" in df.columns:
    # Fallback: whole days from DateAdded to DueDate.
    # NOTE(review): this is a different quantity from the branch above
    # (DueDate - DateAdded rather than DateAdded - PublishDate); confirm it is
    # an acceptable proxy for the target.
    df["time_to_exploit_days"] = (df["DueDate"] - df["DateAdded"]).dt.days
else:
    raise ValueError("Need PublishDate or DueDate to construct target.")

# Right-inclusive bins: (-1, 30] -> Fast, (30, 180] -> Medium,
# (180, 10000] -> Slow. Missing values and values outside (-1, 10000] become
# NaN in `exploit_speed`.
# NOTE(review): a gap of -1 days or less is therefore left unlabeled; confirm
# that is intended.
bins = [-1, 30, 180, 10_000]
labels = ["Fast", "Medium", "Slow"]
df["exploit_speed"] = pd.cut(df["time_to_exploit_days"], bins=bins, labels=labels)

# Derive calendar features from DateAdded when that column is available.
if "DateAdded" in df.columns:
    df["year_added"], df["month_added"] = (
        df["DateAdded"].dt.year,
        df["DateAdded"].dt.month,
    )

# Keep only the candidate features that are actually present in the frame,
# preserving the order listed here.
candidate_features = ("VendorProject", "Product", "CWE_ID", "year_added", "month_added")
features = [col for col in candidate_features if col in df.columns]

# Rows without a target label cannot be used for training or evaluation.
mask = ~df["exploit_speed"].isna()
X = df.loc[mask, features]
y = df.loc[mask, "exploit_speed"]

# Split the feature columns by dtype. Anything that is not numeric is treated
# as categorical, so text columns stored with a pandas string or `category`
# dtype (not only `object`) are routed to the categorical pipeline instead of
# the median imputer.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode with handle_unknown="ignore".
cat_pipeline = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("oh", OneHotEncoder(handle_unknown="ignore")),
])

# Remaining columns: fill gaps with the column median.
num_pipeline = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
])

# Route each column group through its own pipeline.
preprocess = ColumnTransformer(transformers=[
    ("cat", cat_pipeline, cat_cols),
    ("num", num_pipeline, num_cols),
])

# Hold out 20% of the labeled rows for testing, stratified on the target and
# seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Save minimal artifacts for reuse
import joblib
import pathlib

# Both artifact paths are built from `art_dir`, so the directory that gets
# created is always the one that is written to.
art_dir = pathlib.Path("reports/metrics")
art_dir.mkdir(parents=True, exist_ok=True)

# Column metadata needed to rebuild the preprocessing step.
# NOTE: joblib files are pickle-based; only load them from trusted locations.
joblib.dump(
    {"features": features, "cat_cols": cat_cols, "num_cols": num_cols},
    art_dir / "feature_meta.joblib",
)

# The raw (untransformed) train/test partitions.
joblib.dump(
    {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test},
    art_dir / "splits.joblib",
)
print("Prepared splits and metadata.")