In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [3]:
# Read the processed modelling table produced upstream
model_df = pd.read_csv("../data/processed/child_mom_model_table.csv")

# Column roles: prediction target, row identifiers, per-dose vaccine indicators
target_col = "missed_any"
id_cols = ["cluster", "household", "woman_line"]
vaccine_cols = [
    "bcg",
    "dpt1", "dpt2", "dpt3",
    "polio0", "polio1", "polio2", "polio3",
    "measles1",
]

# A missing vaccine entry is counted as "dose not received" (0)
model_df[vaccine_cols] = model_df[vaccine_cols].fillna(0)

# Features are every column except the identifiers and the target.
# NOTE(review): the vaccine indicators stay in X; if `missed_any` was derived
# from them upstream this would leak the target - TODO confirm against the
# notebook that built this table.
X = model_df.drop(columns=[*id_cols, target_col])
y = model_df[target_col].astype(int)

X.shape, y.value_counts(normalize=True)


((5753, 18),
 missed_any
 0    0.608726
 1    0.391274
 Name: proportion, dtype: float64)

02 – Modelling: predicting missed vaccinations

Goal: build and compare predictive models for `missed_any` using the features
prepared in `01_eda.ipynb`. We will:

- Train a baseline logistic regression model.
- Train a stronger tree-based model (Random Forest).
- Use stratified train/test split and cross-validation.
- Compare ROC AUC, recall for `missed_any = 1`, and confusion matrices.


In [4]:
from sklearn.model_selection import train_test_split

# Stratified split to keep class balance similar in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

X_train.shape, X_test.shape


((4602, 18), (1151, 18))

In [5]:
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    confusion_matrix,
)

def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report its test-set performance.

    The estimator is fitted in place, so the caller's ``model`` object is
    trained once this function returns and can be reused afterwards.

    Parameters
    ----------
    name : str
        Label printed in the header and stored under ``"model"`` in the result.
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train
        Data passed to ``model.fit``.
    X_test, y_test
        Held-out data on which every reported metric is computed.

    Returns
    -------
    dict
        ``{"model": name, "roc_auc": ..., "recall_pos": ...}`` where
        ``recall_pos`` is the recall for label ``1``.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Column index 1 of predict_proba is used as the score for label 1
    y_proba = model.predict_proba(X_test)[:, 1]

    # Compute each metric once and reuse it for both the printout and the
    # returned summary, instead of re-running the scorers.
    roc_auc = roc_auc_score(y_test, y_proba)
    # Ask for the positive-class recall directly rather than digging it out of
    # the classification report through a stringified label key.
    recall_pos = recall_score(y_test, y_pred, pos_label=1)

    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc)
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    return {
        "model": name,
        "roc_auc": roc_auc,
        "recall_pos": recall_pos,
    }


In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Baseline model: features are standardised before an unweighted logistic regression
log_reg = make_pipeline(
    StandardScaler(),
    LogisticRegression(class_weight=None, max_iter=5000),
)

# Start the shared results list with the baseline's test-set metrics
results = [
    evaluate_model("Logistic Regression", log_reg, X_train, X_test, y_train, y_test)
]



=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       701
           1       0.90      0.83      0.86       450

    accuracy                           0.90      1151
   macro avg       0.90      0.88      0.89      1151
weighted avg       0.90      0.90      0.89      1151

ROC AUC: 0.9484688540180695
Confusion matrix:
 [[658  43]
 [ 77 373]]


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest: 300 trees with no explicit depth cap (max_depth=None);
# the split/leaf minimums bound how small a node may become.
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    min_samples_split=5,
)

# Fit on the training split, print test metrics, and record them
results.append(
    evaluate_model("Random Forest", rf, X_train, X_test, y_train, y_test)
)



=== Random Forest ===
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       701
           1       0.96      0.97      0.96       450

    accuracy                           0.97      1151
   macro avg       0.97      0.97      0.97      1151
weighted avg       0.97      0.97      0.97      1151

ROC AUC: 0.9923395149786021
Confusion matrix:
 [[681  20]
 [ 14 436]]


In [8]:
import pandas as pd

# One row per evaluated model, one column per key of the metrics dicts
results_df = pd.DataFrame.from_records(results)
results_df


Unnamed: 0,model,roc_auc,recall_pos
0,Logistic Regression,0.948469,0.828889
1,Random Forest,0.99234,0.968889


In [9]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Shuffled, stratified 5-fold splitter with a fixed seed, reused for every model
# so that all of them are scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def cv_auc(model, X, y, cv):
    """Cross-validate ``model`` on ROC AUC and report the per-fold scores.

    Prints the array of fold scores and their mean, then returns the array
    produced by ``cross_val_score``.
    """
    fold_aucs = cross_val_score(model, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)

    mean_auc = fold_aucs.mean()
    print("CV AUC scores:", fold_aucs)
    print("Mean CV AUC:", mean_auc)

    return fold_aucs

# Score both estimators on the same folds; the header strings carry the
# blank line that separates the two reports.
for header, estimator in (
    ("Logistic Regression – CV", log_reg),
    ("\nRandom Forest – CV", rf),
):
    print(header)
    _ = cv_auc(estimator, X, y, cv)


Logistic Regression – CV
CV AUC scores: [0.94753368 0.96288477 0.96146341 0.94293333 0.94673651]
Mean CV AUC: 0.9523103411475118

Random Forest – CV
CV AUC scores: [0.9863988  0.99284514 0.99262274 0.98947302 0.99099365]
Mean CV AUC: 0.9904666694019479


In [10]:
import numpy as np

# Pair each feature name with the fitted forest's importance score and rank
# them from most to least important (original row labels are kept).
rf_feature_importance = (
    pd.DataFrame({"feature": X.columns, "importance": rf.feature_importances_})
    .sort_values(by="importance", ascending=False)
)

rf_feature_importance.head(15)


Unnamed: 0,feature,importance
0,dob_cmc,0.361723
17,n_received,0.162813
16,measles1,0.137835
15,polio3,0.055005
14,polio2,0.043175
11,dpt3,0.032828
8,bcg,0.031629
9,dpt1,0.030548
2,age,0.025824
4,region,0.022563


Model comparison summary

We compared two models using the same train/test split and 5-fold cross-validation:

- Logistic Regression (with StandardScaler)  
  - Test ROC AUC ≈ 0.95  
  - Recall for `missed_any` ≈ 0.83  
  - Mean CV ROC AUC ≈ 0.95

- Random Forest  
  - Test ROC AUC ≈ 0.99  
  - Recall for `missed_any` ≈ 0.97  
  - Mean CV ROC AUC ≈ 0.99

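Random Forest clearly outperforms logistic regression, especially on recall for missed vaccinations, which is critical if the goal is to identify as many at-risk children as possible. Feature importance shows that the child's date of birth (`dob_cmc`, effectively a proxy for age), the total number of doses received (`n_received`), and the individual measles/polio/DPT dose indicators carry most of the predictive signal. Caveat: `n_received` and the dose indicators describe the same vaccination record that `missed_any` summarises, so these scores may be inflated by target leakage; how `missed_any` was constructed in `01_eda.ipynb` should be checked before treating these features as legitimate predictors.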


In [12]:
from sklearn.metrics import precision_score, recall_score

# Test-set probabilities from the Random Forest; column index 1 is kept
# (presumably the `missed_any = 1` class - verify against `rf.classes_`).
# Relies on `rf` having already been fitted earlier in the notebook.
rf_proba = rf.predict_proba(X_test)[:, 1]


In [13]:
thresholds = [0.3, 0.5, 0.7]

# Sweep the decision cut-off: flag a child when the predicted probability is
# at or above the threshold, then score those labels against the test targets.
for thr in thresholds:
    rf_pred_thr = np.where(rf_proba >= thr, 1, 0)
    prec, rec = (
        metric(y_test, rf_pred_thr) for metric in (precision_score, recall_score)
    )
    print(f"Threshold {thr:.1f} -> precision={prec:.3f}, recall={rec:.3f}")


Threshold 0.3 -> precision=0.938, recall=0.978
Threshold 0.5 -> precision=0.956, recall=0.969
Threshold 0.7 -> precision=0.971, recall=0.909


Threshold analysis for Random Forest (missed_any = 1)

Using the RandomForest model’s predicted probabilities on the test set:

| Threshold | Precision (missed_any=1) | Recall (missed_any=1) |
|----------|---------------------------|------------------------|
| 0.3      | 0.938                     | 0.978                  |
| 0.5      | 0.956                     | 0.969                  |
| 0.7      | 0.971                     | 0.909                  |

Interpretation

- Lowering the decision threshold from 0.5 → 0.3 slightly reduces precision (0.956 → 0.938) but increases recall from 0.969 to 0.978 – we catch more children who truly missed at least one dose.
- Raising the threshold to 0.7 gives only a small gain in precision (0.971) but recall drops to 0.909, meaning more children who actually missed vaccinations would not be flagged.
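- For a real reminder system, recall is more critical than squeezing out the last bit of precision. A threshold around 0.3–0.4 is therefore more appropriate if the programme’s priority is to minimise the risk of leaving out children who need follow-up. Note that only 0.3, 0.5 and 0.7 were evaluated above, and on the test set itself, so the exact cut-off should be confirmed on separate validation data before it is used in practice.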
