In [1]:
from pathlib import Path
ROOT = Path("..")       # notebook folder -> project root
DATA_DIR = ROOT / "datasets"
STATION_HOUR_CSV = DATA_DIR / "station_hour.csv"

print("Notebook cwd:", Path.cwd())
print("Expected station_hour path:", STATION_HOUR_CSV)
print("Exists?", STATION_HOUR_CSV.exists())


Notebook cwd: c:\Users\91934\airaware\notebooks
Expected station_hour path: ..\datasets\station_hour.csv
Exists? True


In [2]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path("..") / "datasets"
STATION_HOUR_CSV = DATA_DIR / "station_hour.csv"

# quick head + columns (very fast)
pd.options.display.max_columns = 200
print("Reading first 5 rows...")
df_head = pd.read_csv(STATION_HOUR_CSV, nrows=5)
display(df_head)
print("\nColumns:")
print(df_head.columns.tolist())


Reading first 5 rows...


Unnamed: 0,StationId,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,AP001,2017-11-24 17:00:00,60.5,98.0,2.35,30.8,18.25,8.5,0.1,11.85,126.4,0.1,6.1,0.1,,
1,AP001,2017-11-24 18:00:00,65.5,111.25,2.7,24.2,15.07,9.77,0.1,13.17,117.12,0.1,6.25,0.15,,
2,AP001,2017-11-24 19:00:00,80.0,132.0,2.1,25.18,15.15,12.02,0.1,12.08,98.98,0.2,5.98,0.18,,
3,AP001,2017-11-24 20:00:00,81.5,133.25,1.95,16.25,10.23,11.58,0.1,10.47,112.2,0.2,6.72,0.1,,
4,AP001,2017-11-24 21:00:00,75.25,116.0,1.43,17.48,10.43,12.03,0.1,9.12,106.35,0.2,5.75,0.08,,



Columns:
['StationId', 'Datetime', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket']


In [3]:
# Cell 2 — parameters and load (sample-friendly)
import os
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path("..")
DATA_DIR = ROOT / "datasets"
STATION_HOUR_CSV = DATA_DIR / "station_hour.csv"

# --- Config: set SAMPLE_CITY to a city name for quick iteration, or None to load all (may be heavy) ---
SAMPLE_CITY = "Delhi"   # set to None to load full dataset (may need lots of RAM)
CHUNKSIZE = 200000      # for chunk reading when filtering by city

print("station_hour path:", STATION_HOUR_CSV)
print("SAMPLE_CITY:", SAMPLE_CITY)

if SAMPLE_CITY:
    # load in chunks and filter by city (case-insensitive)
    chunks = []
    for chunk in pd.read_csv(STATION_HOUR_CSV, chunksize=CHUNKSIZE):
        if "City" in chunk.columns:
            mask = chunk["City"].str.contains(SAMPLE_CITY, case=False, na=False)
            filtered = chunk[mask]
            if not filtered.empty:
                chunks.append(filtered)
    if len(chunks) == 0:
        print("No rows found for city filter. Loading first chunk as sample.")
        df = pd.read_csv(STATION_HOUR_CSV, nrows=CHUNKSIZE)
    else:
        df = pd.concat(chunks, ignore_index=True)
else:
    # load full file (be careful on low-RAM machines)
    df = pd.read_csv(STATION_HOUR_CSV)

print("Loaded shape:", df.shape)



station_hour path: ..\datasets\station_hour.csv
SAMPLE_CITY: Delhi


  for chunk in pd.read_csv(STATION_HOUR_CSV, chunksize=CHUNKSIZE):


No rows found for city filter. Loading first chunk as sample.
Loaded shape: (200000, 16)


In [4]:
# Cell 3 — sanitize and datetime
# Replace spaces/dots in column names to safe identifiers, keep original column mapping in mind
df.columns = [c.strip().replace(" ", "_").replace(".", "_") for c in df.columns]
print("Columns after sanitization:\n", df.columns.tolist())

# Ensure Datetime column exists (we saw 'Datetime')
if "Datetime" not in df.columns:
    raise RuntimeError("Datetime column not found after sanitize. Check column names.")

df["Datetime"] = pd.to_datetime(df["Datetime"], errors="coerce")
print("Datetime sample:", df["Datetime"].head())

# Sort per station for time-series ops
if "StationId" in df.columns:
    df = df.sort_values(["StationId", "Datetime"]).reset_index(drop=True)
else:
    df = df.sort_values(["Datetime"]).reset_index(drop=True)

print("After sort shape:", df.shape)


Columns after sanitization:
 ['StationId', 'Datetime', 'PM2_5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket']
Datetime sample: 0   2017-11-24 17:00:00
1   2017-11-24 18:00:00
2   2017-11-24 19:00:00
3   2017-11-24 20:00:00
4   2017-11-24 21:00:00
Name: Datetime, dtype: datetime64[ns]
After sort shape: (200000, 16)


In [5]:
# Cell 5 — time features and lags
df["hour"] = df["Datetime"].dt.hour
df["day"] = df["Datetime"].dt.day
df["month"] = df["Datetime"].dt.month
df["dayofweek"] = df["Datetime"].dt.dayofweek

LAGS = [1, 2, 3]
ROLL_WINDOW = 3

# Create lags for main target PM2_5 (and create for AQI optionally)
target_col = "PM2_5"    # primary numeric target
extra_lag_cols = ["AQI"] if "AQI" in df.columns else []

lag_source_cols = [target_col] + extra_lag_cols

# Add lags and rolling mean per station
def make_lags(group):
    group = group.sort_values("Datetime")
    for col in lag_source_cols:
        for lag in LAGS:
            group[f"{col}_lag{lag}"] = group[col].shift(lag)
        group[f"{col}_roll{ROLL_WINDOW}"] = group[col].rolling(window=ROLL_WINDOW, min_periods=1).mean()
    return group

if "StationId" in df.columns:
    df = df.groupby("StationId").apply(make_lags).reset_index(drop=True)
else:
    df = make_lags(df)

# Show how many NaNs introduced by lags
lag_cols = [f"{target_col}_lag{l}" for l in LAGS] + [f"{target_col}_roll{ROLL_WINDOW}"]
print("Lag columns sample:", lag_cols)
print("NaNs from lags:", df[lag_cols].isnull().sum().to_dict())


Lag columns sample: ['PM2_5_lag1', 'PM2_5_lag2', 'PM2_5_lag3', 'PM2_5_roll3']
NaNs from lags: {'PM2_5_lag1': 40537, 'PM2_5_lag2': 40547, 'PM2_5_lag3': 40557, 'PM2_5_roll3': 36927}


  df = df.groupby("StationId").apply(make_lags).reset_index(drop=True)


In [6]:
# Cell 6 — create +1/+2/+3 hour targets and drop rows missing them
HORIZONS = [1, 2, 3]
for h in HORIZONS:
    df[f"PM2_5_t_plus_{h}"] = df.groupby("StationId")[target_col].shift(-h)

# Drop rows that have NaN in any target or in required lag features
required_cols_for_model = lag_cols + [target_col]  # these must be non-null
target_cols = [f"PM2_5_t_plus_{h}" for h in HORIZONS]
df_model = df.dropna(subset=required_cols_for_model + target_cols).reset_index(drop=True)

print("After creating targets: model dataset shape:", df_model.shape)


After creating targets: model dataset shape: (147964, 31)


In [8]:
# Cell 7 — select features & targets
# Build a conservative feature list — include lags, rolling mean, some pollutant cols, and time features
feature_cols = []

# Add PM2_5 and its lags/roll
feature_cols += [target_col] + [f"{target_col}_lag{l}" for l in LAGS] + [f"{target_col}_roll{ROLL_WINDOW}"]

# Add other pollutant columns if present
for c in ["PM10","NO","NO2","NOx","NH3","CO","SO2","O3","Benzene","Toluene","Xylene"]:
    if c in df_model.columns:
        feature_cols.append(c)

# Add time features
feature_cols += ["hour", "dayofweek", "month"]

# Ensure uniqueness & presence
feature_cols = [c for c in feature_cols if c in df_model.columns]
print("Final feature columns (count):", len(feature_cols))
print(feature_cols[:40])

X = df_model[feature_cols].astype(float)
y = df_model[[f"PM2_5_t_plus_{h}" for h in HORIZONS]].astype(float)

print("X shape:", X.shape)
print("y shape:", y.shape)


Final feature columns (count): 19
['PM2_5', 'PM2_5_lag1', 'PM2_5_lag2', 'PM2_5_lag3', 'PM2_5_roll3', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'hour', 'dayofweek', 'month']
X shape: (147964, 19)
y shape: (147964, 3)


In [9]:
# Cell 8 — time-based train/test split
# Use a simple time split to avoid leakage: keep last 15% as test
split_idx = int(len(df_model) * 0.85)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes:", X_test.shape, y_test.shape)


Train shapes: (125769, 19) (125769, 3)
Test shapes: (22195, 19) (22195, 3)


In [14]:
# Cell 9 — train model
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

# Tuned parameters for AQI multi-step prediction
xgb_params = {
    "n_estimators": 600,        # Higher for better accuracy
    "learning_rate": 0.05,      # Smaller for smoother learning
    "max_depth": 8,             # Can capture complex patterns
    "subsample": 0.8,           # Avoid overfitting
    "colsample_bytree": 0.8,    # Feature sampling
    "reg_alpha": 0.1,           # L1 regularization (helps with sparsity)
    "reg_lambda": 1.0,          # L2 regularization
    "min_child_weight": 3,      # Prevents overfitting small data splits
    "gamma": 0.1,               # Minimum loss reduction for further partition
    "random_state": 42,
    "n_jobs": -1,
    "tree_method": "hist"       # Faster on large datasets
}

xgbr = XGBRegressor(**xgb_params)

# Multi-output wrapper for predicting multiple future hours
model = MultiOutputRegressor(xgbr)

print("Training tuned XGBoost model… (will take some time)")
model.fit(X_train, y_train)
print("Training finished.")



Training tuned XGBoost model… (will take some time)
Training finished.


In [15]:
# Cell 10 — evaluation
from sklearn.metrics import mean_absolute_error

pred = model.predict(X_test)
mae_vals = [mean_absolute_error(y_test.iloc[:,i], pred[:,i]) for i in range(pred.shape[1])]

for i, mae in enumerate(mae_vals, start=1):
    print(f"MAE for +{i} hour: {mae:.3f}")


MAE for +1 hour: 25.959
MAE for +2 hour: 38.100
MAE for +3 hour: 44.680


In [16]:
# Cell 11 — save model and feature list
import joblib

MODEL_DIR = Path("..") / "backend" / "app" / "ml_model"
MODEL_DIR.mkdir(parents=True, exist_ok=True)
MODEL_PATH = MODEL_DIR / "aqi_rfr_multi.joblib"
FEATURES_PATH = MODEL_DIR / "feature_columns.txt"

joblib.dump(model, MODEL_PATH)
with open(FEATURES_PATH, "w", encoding="utf-8") as f:
    for col in feature_cols:
        f.write(col + "\n")

print("Saved model to:", MODEL_PATH)
print("Saved feature list to:", FEATURES_PATH)


Saved model to: ..\backend\app\ml_model\aqi_rfr_multi.joblib
Saved feature list to: ..\backend\app\ml_model\feature_columns.txt
