In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


In [3]:
from pathlib import Path
import pandas as pd

data_path = Path("../data/child_mom_model_df.csv")
model_df = pd.read_csv(data_path)

vaccine_cols = ["bcg", "dpt1", "dpt2", "dpt3",
                "polio0", "polio1", "polio2", "polio3", "measles1"]

model_df[vaccine_cols] = model_df[vaccine_cols].fillna(0)


In [4]:
model_df.shape, model_df["missed_any"].value_counts(normalize=True)


((5753, 22),
 missed_any
 0    0.608726
 1    0.391274
 Name: proportion, dtype: float64)

In [5]:
from sklearn.ensemble import RandomForestClassifier

target_col = "missed_any"

# Some ID-like columns *might* be present – drop them if they exist
possible_id_cols = ["cluster", "household", "woman_line", "caseid"]
id_cols = [c for c in possible_id_cols if c in model_df.columns]

feature_cols = [c for c in model_df.columns if c not in id_cols + [target_col]]

X = model_df[feature_cols]
y = model_df[target_col].astype(int)

X.shape, y.value_counts(normalize=True)


((5753, 18),
 missed_any
 0    0.608726
 1    0.391274
 Name: proportion, dtype: float64)

In [6]:
rf_final = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced"
)

rf_final.fit(X, y)


In [7]:
import numpy as np

# Probability of "missed_any = 1"
model_df["rf_proba"] = rf_final.predict_proba(X)[:, 1]

model_df[["rf_proba"]].describe()


Unnamed: 0,rf_proba
count,5753.0
mean,0.391149
std,0.46786
min,0.0
25%,0.0
50%,0.013254
75%,0.99
max,1.0


In [8]:
low_thr = 0.2
high_thr = 0.6

def assign_risk(p):
    if p < low_thr:
        return "low"
    elif p < high_thr:
        return "medium"
    else:
        return "high"

model_df["risk_band"] = model_df["rf_proba"].apply(assign_risk)

model_df["risk_band"].value_counts(normalize=True)


risk_band
low       0.585260
high      0.390579
medium    0.024161
Name: proportion, dtype: float64

In [9]:
summary = (
    model_df
    .groupby("risk_band")
    .agg(
        n=("missed_any", "size"),
        prevalence_missed=("missed_any", "mean")
    )
    .sort_values("prevalence_missed")
)

total_n = summary["n"].sum()
total_missed = model_df["missed_any"].sum()

summary["share_of_population"] = summary["n"] / total_n
summary["share_of_all_missed"] = (
    summary["prevalence_missed"] * summary["n"] / total_missed
)

print(summary.to_string(index=False))


   n  prevalence_missed  share_of_population  share_of_all_missed
3367           0.000000             0.585260             0.000000
 139           0.086331             0.024161             0.005331
2247           0.996440             0.390579             0.994669


Interpretation: Who drives most missed vaccinations?

Using the final Random Forest model (class_weight='balanced'), we predicted risk of missing any recommended vaccine and grouped children into three bands:

- Low risk: `rf_proba < 0.2`
- Medium risk: `0.2 ≤ rf_proba < 0.6`
- High risk: `rf_proba ≥ 0.6`

Summary:

- Share of population
  - Low: ~58%
  - Medium: ~2%
  - High: ~39%

- Prevalence of missed vaccinations
  - Low: ~0%
  - Medium: ~8–9%
  - High: ~99%

- Share of all missed vaccinations
  - Low: ~0%
  - Medium: ~0.5%
  - High: ~99.5%

Implications for reminder programmes

- Almost all missed vaccinations come from the high-risk band, even though it is < half of children.
- Medium-risk children are rare but still contribute some missed doses; they could be a secondary priority.
- Low-risk children are the majority but contribute almost no missed doses; they might not need intensive reminders.

This suggests that focusing SMS / community-health-worker reminders on the high-risk group is the most efficient use of resources.
