# DengAI: Predicting Disease Spread

Forecasting weekly dengue cases in San Juan and Iquitos using climate, environmental, and public health features.


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb

# Read the training feature table and the matching label table.
features = pd.read_csv('dengue_features_train.csv')
labels = pd.read_csv('dengue_labels_train.csv')

# Join the two tables on their shared (city, year, weekofyear) key and
# convert the week start column to real datetimes for later date filtering.
merge_keys = ["city", "year", "weekofyear"]
df = features.merge(labels, on=merge_keys)
df["week_start_date"] = pd.to_datetime(df["week_start_date"])


In [None]:
# Impute gaps city by city: forward-fill first, then back-fill whatever is
# still missing at the start of each city's series. Grouping by city keeps
# one city's readings from being carried into the other city's rows, which a
# whole-frame fill does at the boundary between the two cities.
# GroupBy.ffill()/bfill() also replace fillna(method=...), which is
# deprecated in pandas 2.x.
# Assumes rows are in chronological order within each city and that "city"
# itself has no missing values -- TODO confirm.
fill_cols = [col for col in df.columns if col != "city"]
filled = df.groupby("city", sort=False)[fill_cols].ffill()
filled = filled.groupby(df["city"], sort=False).bfill()
# Write back into the existing frame so `df` is updated in place, as before.
df[fill_cols] = filled


In [None]:
# Build one independent frame per city code ("sj" / "iq"), each tagged with
# a constant numeric city_code column.
is_sj = df["city"] == "sj"
is_iq = df["city"] == "iq"

sj = df.loc[is_sj].assign(city_code=0)
iq = df.loc[is_iq].assign(city_code=1)


In [None]:
def create_lag_features(df, features, lags):
    """Add row-shifted copies of the given columns to ``df``.

    For every column name in ``features`` and every offset in ``lags`` a new
    column ``"<feature>_lag<lag>"`` is written holding that column shifted
    down by ``lag`` rows (the first ``lag`` rows become NaN).

    The frame is modified in place and also returned for convenience.
    """
    combos = [(column, offset) for column in features for offset in lags]
    for column, offset in combos:
        df[f"{column}_lag{offset}"] = df[column].shift(offset)
    return df

# Climate columns whose recent history is exposed to the model.
lag_features = [
    "precipitation_amt_mm",
    "reanalysis_air_temp_k",
    "reanalysis_relative_humidity_percent",
]
# Add 1-, 2- and 3-row shifted copies of each of those columns to `sj`.
sj = create_lag_features(df=sj, features=lag_features, lags=[1, 2, 3])


In [None]:
def create_rolling_features(df, features, windows):
    """Add trailing rolling-mean columns to ``df``.

    For every column name in ``features`` and every size in ``windows`` a new
    column ``"<feature>_roll<window>"`` is written with the mean over the
    current row and the preceding ``window - 1`` rows. ``min_periods=1``
    means the leading rows average over however many rows exist so far
    instead of producing NaN.

    The frame is modified in place and also returned for convenience.
    """
    combos = [(column, width) for column in features for width in windows]
    for column, width in combos:
        roller = df[column].rolling(window=width, min_periods=1)
        df[f"{column}_roll{width}"] = roller.mean()
    return df

# Trailing means over 3, 5, 8 and 12 rows for the same columns that were lagged.
sj = create_rolling_features(
    df=sj,
    features=lag_features,
    windows=[3, 5, 8, 12],
)


In [None]:
# Hand-coded 0/1 indicator columns for fixed date ranges in the `sj` series.
# Both endpoints of every range are inclusive.
week_start = sj["week_start_date"]

sj["intervention_active"] = week_start.between("1994-11-01", "1995-03-31").astype(int)

in_first_outbreak = week_start.between("1991-08-01", "1992-03-31")
in_second_outbreak = week_start.between("1994-05-01", "1995-06-30")
sj["major_outbreak_season"] = (in_first_outbreak | in_second_outbreak).astype(int)

sj["post_hurricane_georges"] = week_start.between("1998-09-21", "1999-05-31").astype(int)

# Linear ramp: 0 for years up to 1990, falling by 0.1 per calendar year,
# and held at -1 from 2000 onwards.
years_from_1990 = week_start.dt.year - 1990
sj["migration_index"] = (years_from_1990 / 10.0).clip(lower=0, upper=1).mul(-1)

# One- and two-row lagged copies of each indicator.
for flag in ("intervention_active", "major_outbreak_season", "post_hurricane_georges"):
    for step in (1, 2):
        sj[f"{flag}_lag{step}"] = sj[flag].shift(step)

# Shifting leaves NaN in the leading rows; drop every row that still has a gap.
sj.dropna(inplace=True)


In [None]:
# Feature matrix: every column except the identifiers and the target.
X = sj.drop(columns=["city", "week_start_date", "total_cases"])
y = sj["total_cases"]

# Scaler fitted on all rows. It is kept so `scaler` / `X_scaled` remain
# available to later cells, but it is NOT used for the cross-validation
# scores below: its mean/variance would include each fold's held-out rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Expanding-window splits: each fold trains on earlier rows and tests on
# the rows that follow them.
tscv = TimeSeriesSplit(n_splits=5)

# Final model: XGBoost
model = xgb.XGBRegressor(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0
)

rmse_scores = []
mae_scores = []

for train_idx, test_idx in tscv.split(X):
    # Fit the scaling statistics on the training slice only, then apply them
    # unchanged to the held-out slice, so no test-set information leaks into
    # the features the model is trained on.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_idx])
    X_test = fold_scaler.transform(X.iloc[test_idx])
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae_scores.append(mean_absolute_error(y_test, y_pred))

# NOTE(review): after the loop `model` holds the fit from the last fold's
# training slice only (the final test slice was never trained on). Refit on
# all rows before forecasting if that is the intent -- confirm against the
# cells that follow.
print("Average RMSE:", np.mean(rmse_scores))
print("Average MAE:", np.mean(mae_scores))
