In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score

# Processed-data directory, relative to the notebook's working directory.
DATA = Path("../data/processed")
features_csv = DATA / "report_features.csv"

# Load the per-report feature table; `receivedate` is parsed as a datetime.
df = pd.read_csv(features_csv, parse_dates=["receivedate"])

# Sanity check: frame dimensions and the first 15 column names.
df.shape, df.columns[:15]

((100, 39),
 Index(['safetyreportid', 'receivedate', 'sex', 'age_mean', 'age_bin',
        'n_reactions', 'n_drugs', 'aspirin_present', 'is_serious_report',
        'react_Anaemia', 'react_Blood count abnormal', 'react_Constipation',
        'react_Contusion', 'react_Death', 'react_Diarrhoea'],
       dtype='object'))

In [2]:
TARGET = "is_serious_report"
cat_cols = ["sex", "age_bin"]

# Work on a copy of `df` (assign returns a new frame), replacing each
# categorical column with its integer category code. Missing categories get
# code -1 from `.cat.codes`, so the fillna below does not touch them.
df_model = df.assign(
    **{name: df[name].astype("category").cat.codes for name in cat_cols}
)

# Remaining missing values are filled with 0 across the whole frame.
df_model = df_model.fillna(0)

# Columns that are not model inputs: the label itself plus the two columns
# dropped alongside it (report id and receive date).
non_feature_cols = [TARGET, "safetyreportid", "receivedate"]
X = df_model.drop(columns=non_feature_cols)
y = df_model[TARGET]

# 75/25 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

X_train.shape, X_test.shape


((75, 36), (25, 36))

In [3]:
# Random forest baseline: 200 trees, depth capped at 10, fixed seed,
# all cores. `fit` returns the estimator itself, so construction and
# training are chained.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10, random_state=42, n_jobs=-1
).fit(X_train, y_train)

# Score column 1 of predict_proba on the hold-out set
# (presumably the positive class for a 0/1 target -- confirm via rf.classes_).
preds = rf.predict_proba(X_test)[:, 1]

# `roc` and `pr` are reused when the metrics are written to disk.
roc, pr = roc_auc_score(y_test, preds), average_precision_score(y_test, preds)

print("RandomForest ROC-AUC:", roc)
print("RandomForest PR-AUC:", pr)


RandomForest ROC-AUC: 0.6458333333333334
RandomForest PR-AUC: 0.6203285992759677


In [4]:
# Gradient boosting baseline: 200 stages of depth-3 trees, learning rate 0.1,
# fixed seed. `fit` returns the estimator itself, so construction and
# training are chained.
gb = GradientBoostingClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42
).fit(X_train, y_train)

# Score column 1 of predict_proba on the hold-out set
# (presumably the positive class for a 0/1 target -- confirm via gb.classes_).
preds_gb = gb.predict_proba(X_test)[:, 1]

# `roc_gb` and `pr_gb` are reused when the metrics are written to disk.
roc_gb, pr_gb = (
    roc_auc_score(y_test, preds_gb),
    average_precision_score(y_test, preds_gb),
)

print("GradientBoosting ROC-AUC:", roc_gb)
print("GradientBoosting PR-AUC:", pr_gb)


GradientBoosting ROC-AUC: 0.6597222222222222
GradientBoosting PR-AUC: 0.6432342430149447


In [5]:
# 5-fold cross-validated ROC-AUC for the random forest configuration,
# computed on the full feature matrix rather than only the training split.
cv_scores = cross_val_score(
    estimator=rf,
    X=X,
    y=y,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)

# Report the mean across folds.
mean_cv_auc = cv_scores.mean()
print("RandomForest CV ROC-AUC:", mean_cv_auc)


RandomForest CV ROC-AUC: 0.5600503663003662


In [6]:
import joblib, json

# Output directory for serialized models and metrics; created if absent.
MODEL_DIR = Path("../models")
MODEL_DIR.mkdir(exist_ok=True)

# Persist each fitted estimator under its descriptive file name.
for filename, estimator in (
    ("random_forest.joblib", rf),
    ("gradient_boosting.joblib", gb),
):
    joblib.dump(estimator, MODEL_DIR / filename)

# Hold-out metrics per model, cast to plain floats so they are JSON-serializable.
metrics = {
    model_name: {"roc_auc": float(auc), "pr_auc": float(ap)}
    for model_name, auc, ap in (
        ("random_forest", roc, pr),
        ("gradient_boosting", roc_gb, pr_gb),
    )
}

metrics_path = MODEL_DIR / "improved_metrics.json"
with metrics_path.open("w") as f:
    json.dump(metrics, f, indent=2)

print("Saved improved models + metrics.")


Saved improved models + metrics.


In [7]:
# Save the Random Forest and Gradient Boosting models under short names.
joblib.dump(rf, MODEL_DIR / "rf_model.joblib")
joblib.dump(gb, MODEL_DIR / "gb_model.joblib")

# Save the training feature names (in column order) next to each model, as
# "<model stem>.json". Both models were fit on the same X_train, so the list
# is identical for both.
# Fix: the gradient-boosting file was previously written as
# "gb_model.json.json" (doubled extension); it is now "gb_model.json",
# matching "rf_model.json" and the "gb_model.joblib" stem.
feature_names = list(X_train.columns)
for stem in ("rf_model", "gb_model"):
    with open(MODEL_DIR / f"{stem}.json", "w") as f:
        json.dump(feature_names, f, indent=2)

print("RF and GB models saved successfully!")

RF and GB models saved successfully!
