In [13]:
#Cell 1 - imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score


In [14]:
# Cell 2 — define label and features

# paths
DATA = Path("../data/processed")
features_path = DATA / "report_features.csv"

# load features
df = pd.read_csv(features_path, parse_dates=["receivedate"])
print("Data shape:", df.shape)

# define label
label = "is_serious_report"

# drop columns not used as features
ignore_cols = ["safetyreportid", "receivedate", label]
X = df.drop(columns=ignore_cols)
y = df[label]

# identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
print("Categorical columns:", list(categorical_cols))

print("Positive class rate:", y.mean())


Data shape: (100, 39)
Categorical columns: ['age_bin']
Positive class rate: 0.36


In [15]:
# Cell 3 — train/test split (time-based)

# --- Safe train/test split that works for any dataset size ---
test_ratio = 0.2  # 20% for test
test_size = max(1, int(len(df) * test_ratio))  # ensure at least 1 row in test

train = df.iloc[:-test_size]
test = df.iloc[-test_size:]

print("Total rows:", len(df))
print("Train rows:", len(train), "Test rows:", len(test))

X_train, y_train = train.drop(columns=ignore_cols), train[label]
X_test, y_test = test.drop(columns=ignore_cols), test[label]

# --- Encode categorical columns ---
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Align test columns to train + fill NaNs
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
X_train_encoded = X_train_encoded.fillna(0)
X_test_encoded = X_test_encoded.fillna(0)

print("Encoded train shape:", X_train_encoded.shape, "Encoded test shape:", X_test_encoded.shape)

Total rows: 100
Train rows: 80 Test rows: 20
Encoded train shape: (80, 38) Encoded test shape: (20, 38)


In [16]:
# Cell 4 — fit baseline logistic regression

model = LogisticRegression(max_iter=200, class_weight="balanced")
model.fit(X_train_encoded, y_train)

# predict probabilities and classes
y_pred_proba = model.predict_proba(X_test_encoded)[:,1]
y_pred = model.predict(X_test_encoded)

# evaluation
roc = roc_auc_score(y_test, y_pred_proba)
pr = average_precision_score(y_test, y_pred_proba)

print(f"ROC AUC: {roc:.3f}")
print(f"PR AUC: {pr:.3f}")


ROC AUC: 0.556
PR AUC: 0.507


In [17]:
# Cell 6 — save model and metrics
import joblib, json

MODEL_DIR = Path("../models")
MODEL_DIR.mkdir(exist_ok=True)

joblib.dump(model, MODEL_DIR / "logreg_baseline.joblib")

metrics = {
    "roc_auc": float(roc),
    "pr_auc": float(pr),
    "n_train": int(len(y_train)),
    "n_test": int(len(y_test))
}
with open(MODEL_DIR / "baseline_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

print("Saved model + metrics.")

Saved model + metrics.
