In [2]:
import pandas as pd
import glob
import os

# Load every per-station CSV in the raw Mumbai folder and stack the usable
# (timestamp, PM2.5) readings into one long frame, `mumbai_raw`, with a
# `station` column derived from the file name.
folder_path = "../data/raw/mumbai/"

# Sort the matches: glob returns them in arbitrary, filesystem-dependent
# order, which would make the row order of `mumbai_raw` vary between runs.
files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

# Fail early with a readable message; pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" error further down.
if not files:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

dfs = []

for file in files:
    df = pd.read_csv(file)

    # Standardize column names (headers may carry stray whitespace)
    df.columns = df.columns.str.strip()

    # Parse date; unparseable timestamps become NaT and are dropped below
    df["From Date"] = pd.to_datetime(df["From Date"], errors="coerce")

    # Keep essentials
    df = df[["From Date", "PM2.5"]].copy()

    # Force PM2.5 to numeric so text placeholders become NaN instead of
    # surviving dropna and breaking numeric comparisons later on
    df["PM2.5"] = pd.to_numeric(df["PM2.5"], errors="coerce")

    # Drop bad rows
    df = df.dropna(subset=["From Date", "PM2.5"])

    # Add station name: the file name minus its extension only (str.replace
    # would also strip any ".csv" that appears inside the name itself)
    df["station"] = os.path.splitext(os.path.basename(file))[0]

    dfs.append(df)

mumbai_raw = pd.concat(dfs, ignore_index=True)


In [3]:
def pm25_to_aqi(pm):
    """Map a PM2.5 reading to an AQI value by piecewise-linear interpolation.

    Breakpoints on the PM2.5 axis are 30, 60, 90, 120 and 250, mapping to
    AQI 50, 100, 200, 300 and 400 respectively. Readings above 250 keep the
    slope of the 120-250 segment, so the result is not capped. No clamping is
    applied at the low end either: a negative reading yields a negative AQI.

    Parameters
    ----------
    pm : number
        PM2.5 reading.

    Returns
    -------
    float
        Interpolated AQI. A NaN input fails every comparison and comes back
        as NaN through the final open-ended segment.
    """
    # First segment starts at the origin, so there is no offset term.
    if pm <= 30:
        return pm * 50 / 30

    # (upper PM bound, lower PM bound, AQI at lower bound, AQI span, PM width)
    bounded_segments = (
        (60, 30, 50, 50, 30),
        (90, 60, 100, 100, 30),
        (120, 90, 200, 100, 30),
        (250, 120, 300, 100, 130),
    )
    for upper, lower, base, span, width in bounded_segments:
        if pm <= upper:
            return base + (pm - lower) * span / width

    # Open-ended top segment: everything above 250 (and NaN).
    return 400 + (pm - 250) * 100 / 130

# Derive an AQI value for each reading from its PM2.5 value, element by element.
pm25_series = mumbai_raw["PM2.5"]
mumbai_raw["AQI"] = pm25_series.apply(pm25_to_aqi)


In [4]:
# Tag each reading with the calendar date of its timestamp.
mumbai_raw["date"] = mumbai_raw["From Date"].dt.date

# One row per date: the mean AQI over every reading carrying that date
# (all stations pooled together, since only "date" is used as the key).
mumbai_daily = mumbai_raw.groupby("date", as_index=False)["AQI"].mean()

# The group keys are plain date objects; convert them back to datetime64.
mumbai_daily["date"] = pd.to_datetime(mumbai_daily["date"])


In [5]:
# Persist the daily AQI table for downstream use.
processed_path = "../data/processed/mumbai_daily_aqi.csv"

# Create the output folder first: to_csv does not create missing parent
# directories, so the export would fail on a checkout where the processed
# folder has not been created yet.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

mumbai_daily.to_csv(processed_path, index=False)


In [6]:
import plotly.express as px

# Line chart of the daily mean AQI over time.
fig = px.line(
    data_frame=mumbai_daily,
    x="date",
    y="AQI",
    title="Mumbai Daily AQI (PM2.5-based)",
    labels={"AQI": "Average AQI"},
)

# Thicker line, and a single hover box shared across the x position.
fig.update_traces(line={"width": 3})
fig.update_layout(hovermode="x unified")

fig.show()
