In [1]:
import pandas as pd

df = pd.read_csv("../data/raw/major_city.csv")

df.head()


Unnamed: 0,City,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Delhi,2015-01-01,153.3,241.7,182.9,33.0,81.3,38.5,1.87,64.5,83.6,18.93,20.81,8.32,204.5,Severe
1,Mumbai,2015-01-01,70.5,312.7,195.0,42.0,122.5,31.5,7.22,83.8,108.0,2.01,19.41,2.86,60.9,Satisfactory
2,Chennai,2015-01-01,174.1,275.4,56.2,68.8,230.9,28.5,8.56,60.8,43.9,19.07,10.19,9.63,486.5,Severe
3,Kolkata,2015-01-01,477.2,543.9,14.1,76.4,225.9,45.6,2.41,42.1,171.1,9.31,11.65,9.39,174.4,Very Poor
4,Bangalore,2015-01-01,171.6,117.7,123.3,12.4,61.9,49.7,1.26,79.7,164.3,6.04,12.74,9.59,489.7,Good


In [2]:
df["Datetime"] = pd.to_datetime(df["Datetime"], errors="coerce")

# drop bad rows
df = df.dropna(subset=["Datetime"])


In [3]:
df = df[["City", "Datetime", "AQI"]]


In [4]:
df["year"] = df["Datetime"].dt.year


In [5]:
cities = ["Delhi", "Mumbai", "Chennai", "Kolkata", "Bangalore"]

df = df[df["City"].isin(cities)]


In [6]:
df = df[df["AQI"] > 0]
df = df[df["AQI"] < 1000]   # remove outliers if any


In [7]:
pre_covid = df[(df["year"] >= 2015) & (df["year"] <= 2019)]
post_covid = df[(df["year"] >= 2021) & (df["year"] <= 2024)]


In [8]:
pre_avg = pre_covid.groupby("City")["AQI"].mean().reset_index()
pre_avg.columns = ["City", "AQI_pre_covid"]

post_avg = post_covid.groupby("City")["AQI"].mean().reset_index()
post_avg.columns = ["City", "AQI_post_covid"]


In [9]:
compare = pre_avg.merge(post_avg, on="City")

compare["AQI_change"] = compare["AQI_post_covid"] - compare["AQI_pre_covid"]

compare = compare.sort_values("AQI_pre_covid", ascending=False)

compare


Unnamed: 0,City,AQI_pre_covid,AQI_post_covid,AQI_change
4,Mumbai,252.071358,255.262329,3.190971
1,Chennai,251.701041,251.290623,-0.410418
2,Delhi,250.209529,256.274675,6.065146
3,Kolkata,248.68598,254.486516,5.800536
0,Bangalore,247.484885,249.936003,2.451118


In [10]:
df.to_csv("../data/processed/clean_major_5_cities.csv", index=False)


In [11]:
pre_covid.to_csv("../data/processed/pre_covid_clean.csv", index=False)
post_covid.to_csv("../data/processed/post_covid_clean.csv", index=False)


In [12]:
compare.to_csv("city_pre_post_covid_summary.csv", index=False)


In [13]:
import plotly.graph_objects as go

# highlight Delhi
colors_pre = ["crimson" if c == "Delhi" else "#9ecae1" for c in compare["City"]]
colors_post = ["darkred" if c == "Delhi" else "#3182bd" for c in compare["City"]]

fig = go.Figure()

# Pre-COVID bars
fig.add_trace(go.Bar(
    x=compare["City"],
    y=compare["AQI_pre_covid"],
    name="Pre-COVID Avg AQI (2015–2019)",
    marker_color=colors_pre
))

# Post-COVID bars
fig.add_trace(go.Bar(
    x=compare["City"],
    y=compare["AQI_post_covid"],
    name="Post-COVID Avg AQI (2021–2024)",
    marker_color=colors_post
))

fig.update_layout(
    title="City-wise AQI Before vs After COVID",
    xaxis_title="City",
    yaxis_title="Average AQI",
    barmode="group",
    height=520,
    template="plotly_white",
    hovermode="x unified"
)

fig.show()


In [14]:
import pandas as pd

df = pd.read_csv("../data/raw/major_city.csv")

# fix date
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# numeric AQI
df["AQI Value"] = pd.to_numeric(df["AQI Value"], errors="coerce")

# drop bad rows
df = df.dropna(subset=["Date", "AQI Value"])

# keep major cities only
major = ["Delhi", "Mumbai", "Chennai", "Kolkata", "Bangalore"]
df = df[df["City"].isin(major)]


KeyError: 'Date'

In [15]:
import pandas as pd

df = pd.read_csv("../data/raw/major_city.csv")

# convert datetime
df["Datetime"] = pd.to_datetime(df["Datetime"], errors="coerce")

# numeric AQI
df["AQI"] = pd.to_numeric(df["AQI"], errors="coerce")

# drop bad rows
df = df.dropna(subset=["Datetime", "AQI"])


In [16]:
major = ["Delhi", "Mumbai", "Chennai", "Kolkata", "Bangalore"]

df = df[df["City"].isin(major)]


In [17]:
monthly = (
    df
    .set_index("Datetime")
    .groupby("City")["AQI"]
    .resample("M")
    .mean()
    .reset_index()
)


ValueError: Invalid frequency: M. Failed to parse with error message: ValueError("'M' is no longer supported for offsets. Please use 'ME' instead.")

In [18]:
# ensure AQI numeric
df["AQI"] = pd.to_numeric(df["AQI"], errors="coerce")

# drop bad AQI rows
df = df.dropna(subset=["AQI"])

# remove impossible AQI values
df = df[(df["AQI"] > 0) & (df["AQI"] < 1000)]


In [19]:
major = ["Delhi", "Mumbai", "Chennai", "Kolkata", "Bangalore"]
df = df[df["City"].isin(major)]


In [20]:
df = df[(df["Datetime"].dt.year >= 2018) & (df["Datetime"].dt.year <= 2024)]


In [21]:
import plotly.express as px

fig = px.line(
    monthly,
    x="Datetime",
    y="AQI",
    color="City",
    line_shape="spline",
    title="AQI Wave Pattern — COVID Impact Across Major Indian Cities"
)

# make Delhi stand out
for trace in fig.data:
    if trace.name == "Delhi":
        trace.line.width = 5
    else:
        trace.line.width = 2
        trace.opacity = 0.45

# lockdown marker
fig.add_vline(
    x="2020-03-25",
    line_dash="dash",
    line_width=2,
    annotation_text="COVID Lockdown"
)

fig.update_layout(
    template="plotly_white",
    hovermode="x unified",
    height=560
)

fig.show()


NameError: name 'monthly' is not defined

In [22]:
import pandas as pd

df = pd.read_csv("../data/raw/major_city.csv")

# fix datetime
df["Datetime"] = pd.to_datetime(df["Datetime"], errors="coerce")

# fix AQI
df["AQI"] = pd.to_numeric(df["AQI"], errors="coerce")

# drop bad rows
df = df.dropna(subset=["Datetime", "AQI"])

# remove outliers
df = df[(df["AQI"] > 0) & (df["AQI"] < 1000)]

df.head()


Unnamed: 0,City,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Delhi,2015-01-01,153.3,241.7,182.9,33.0,81.3,38.5,1.87,64.5,83.6,18.93,20.81,8.32,204.5,Severe
1,Mumbai,2015-01-01,70.5,312.7,195.0,42.0,122.5,31.5,7.22,83.8,108.0,2.01,19.41,2.86,60.9,Satisfactory
2,Chennai,2015-01-01,174.1,275.4,56.2,68.8,230.9,28.5,8.56,60.8,43.9,19.07,10.19,9.63,486.5,Severe
3,Kolkata,2015-01-01,477.2,543.9,14.1,76.4,225.9,45.6,2.41,42.1,171.1,9.31,11.65,9.39,174.4,Very Poor
4,Bangalore,2015-01-01,171.6,117.7,123.3,12.4,61.9,49.7,1.26,79.7,164.3,6.04,12.74,9.59,489.7,Good


In [23]:
major = ["Delhi", "Mumbai", "Chennai", "Kolkata", "Bangalore"]
df = df[df["City"].isin(major)]


In [24]:
df = df[(df["Datetime"].dt.year >= 2018) & (df["Datetime"].dt.year <= 2024)]


In [26]:
monthly = (
    df
    .set_index("Datetime")
    .groupby("City")["AQI"]
    .resample("ME")
    .mean()
    .reset_index()
)

monthly.head()


Unnamed: 0,City,Datetime,AQI
0,Bangalore,2018-01-31,222.958065
1,Bangalore,2018-02-28,233.657143
2,Bangalore,2018-03-31,254.241935
3,Bangalore,2018-04-30,258.703333
4,Bangalore,2018-05-31,227.651613


In [30]:
import plotly.express as px

fig = px.line(
    monthly,
    x="Datetime",
    y="AQI",
    color="City",
    line_shape="spline",
    title="AQI Wave — COVID Impact Across Major Cities"
)

for trace in fig.data:
    if trace.name == "Delhi":
        trace.line.width = 5
    else:
        trace.line.width = 2
        trace.opacity = 0.45

# import pandas as pd

fig.add_vline(
    x=pd.Timestamp("2020-03-25"),
    line_dash="dash",
    line_width=2,
    annotation_text="COVID Lockdown",
    annotation_position="top"
)


fig.update_layout(
    template="plotly_white",
    hovermode="x unified",
    height=560
)

fig.show()


TypeError: Addition/subtraction of integers and integer-arrays with Timestamp is no longer supported.  Instead of adding/subtracting `n`, use `n * obj.freq`

In [29]:
import pandas as pd

fig.add_vline(
    x=pd.Timestamp("2020-03-25"),
    line_dash="dash",
    line_width=2,
    annotation_text="COVID Lockdown",
    annotation_position="top"
)


TypeError: Addition/subtraction of integers and integer-arrays with Timestamp is no longer supported.  Instead of adding/subtracting `n`, use `n * obj.freq`

In [31]:
import plotly.express as px
import pandas as pd

fig = px.line(
    monthly,
    x="Datetime",
    y="AQI",
    color="City",
    line_shape="spline",
    title="AQI Wave — COVID Impact Across Major Cities"
)

# highlight Delhi
for trace in fig.data:
    if trace.name == "Delhi":
        trace.line.width = 5
    else:
        trace.line.width = 2
        trace.opacity = 0.45

# lockdown marker — FIXED
fig.add_vline(
    x=pd.Timestamp("2020-03-25"),
    line_dash="dash",
    line_width=2,
    annotation_text="COVID Lockdown",
    annotation_position="top"
)

fig.update_layout(
    template="plotly_white",
    hovermode="x unified",
    height=560
)

fig.show()


TypeError: Addition/subtraction of integers and integer-arrays with Timestamp is no longer supported.  Instead of adding/subtracting `n`, use `n * obj.freq`

In [32]:
import plotly.express as px
import pandas as pd

fig = px.line(
    monthly,
    x="Datetime",
    y="AQI",
    color="City",
    line_shape="spline",
    title="AQI Wave — COVID Impact Across Major Cities"
)

# highlight Delhi
for trace in fig.data:
    if trace.name == "Delhi":
        trace.line.width = 5
    else:
        trace.line.width = 2
        trace.opacity = 0.45

# lockdown marker — stable method
lock_date = pd.Timestamp("2020-03-25")

fig.add_shape(
    type="line",
    x0=lock_date,
    x1=lock_date,
    y0=monthly["AQI"].min(),
    y1=monthly["AQI"].max(),
    line=dict(color="black", width=2, dash="dash")
)

fig.add_annotation(
    x=lock_date,
    y=monthly["AQI"].max(),
    text="COVID Lockdown",
    showarrow=True,
    arrowhead=2,
    yshift=10
)

fig.update_layout(
    template="plotly_white",
    hovermode="x unified",
    height=560
)

fig.show()


In [33]:
# split delhi vs others
delhi = monthly[monthly["City"] == "Delhi"]

others = (
    monthly[monthly["City"] != "Delhi"]
    .groupby("Datetime")["AQI"]
    .mean()
    .reset_index()
)

others["City"] = "Other Major Cities (avg)"


In [34]:
wave_df = pd.concat([delhi, others])


In [35]:
import plotly.express as px

fig = px.line(
    wave_df,
    x="Datetime",
    y="AQI",
    color="City",
    title="COVID Impact on AQI — Delhi vs Other Major Cities",
    markers=True
)

# styling
for trace in fig.data:
    if "Delhi" in trace.name:
        trace.line.width = 5
    else:
        trace.line.width = 3
        trace.opacity = 0.5
        trace.line.dash = "dot"

fig.update_layout(
    template="plotly_white",
    height=520,
    hovermode="x unified",
    legend_title="",
    xaxis_title="Time",
    yaxis_title="Average AQI"
)

fig.show()


In [36]:
import plotly.express as px

fig = px.line(
    monthly,
    x="Datetime",
    y="AQI",
    facet_col="City",
    facet_col_wrap=2,
    line_shape="spline",
    title="Monthly AQI Trends by City — COVID Impact"
)

fig.update_layout(
    template="plotly_white",
    height=700,
    showlegend=False
)

fig.show()


In [37]:
df["period"] = "Post"

df.loc[df["Datetime"].dt.year <= 2019, "period"] = "Pre-COVID"
df.loc[df["Datetime"].dt.year == 2020, "period"] = "COVID Year"


In [38]:
fig = px.box(
    df,
    x="City",
    y="AQI",
    color="period",
    title="AQI Distribution — Pre vs COVID vs Post",
)

fig.update_layout(
    template="plotly_white",
    height=520
)

fig.show()


In [39]:
yearly = (
    df.groupby(["City", df["Datetime"].dt.year])["AQI"]
    .mean()
    .reset_index(name="avg_aqi")
    .rename(columns={"Datetime":"year"})
)


In [40]:
fig = px.line(
    yearly,
    x="Datetime",
    y="avg_aqi",
    color="City",
    markers=True,
    title="Yearly AQI Recovery After COVID"
)

fig.update_layout(template="plotly_white")
fig.show()


ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['City', 'year', 'avg_aqi'] but received: Datetime

In [41]:
pre = df[df["Datetime"].dt.year <= 2019].groupby("City")["AQI"].mean()
covid = df[df["Datetime"].dt.year == 2020].groupby("City")["AQI"].mean()

drop = (covid - pre).reset_index()
drop.columns = ["City", "AQI_change"]


In [42]:
fig = px.bar(
    drop,
    x="City",
    y="AQI_change",
    title="AQI Change During COVID Year",
    color="AQI_change"
)

fig.update_layout(template="plotly_white")
fig.show()


In [43]:
# pre-covid baseline per city
baseline = (
    df[df["Datetime"].dt.year <= 2019]
    .groupby("City")["AQI"]
    .mean()
)

baseline


City
Bangalore    246.591096
Chennai      246.710000
Delhi        248.650548
Kolkata      254.143562
Mumbai       258.259726
Name: AQI, dtype: float64

In [44]:
df["AQI_indexed"] = df.apply(
    lambda r: r["AQI"] / baseline[r["City"]],
    axis=1
)


In [45]:
indexed_monthly = (
    df
    .set_index("Datetime")
    .groupby("City")["AQI_indexed"]
    .resample("ME")
    .mean()
    .reset_index()
)


In [46]:
import plotly.express as px

fig = px.line(
    indexed_monthly,
    x="Datetime",
    y="AQI_indexed",
    color="City",
    line_shape="spline",
    title="Post-COVID AQI Recovery — Relative to Each City's Baseline"
)

# styling — highlight Delhi
for trace in fig.data:
    if trace.name == "Delhi":
        trace.line.width = 5
    else:
        trace.line.width = 2
        trace.opacity = 0.45

fig.update_layout(
    template="plotly_white",
    hovermode="x unified",
    yaxis_title="AQI vs Pre-COVID Baseline (=1.0)",
    height=560
)

fig.show()


In [47]:
# extra smoothing (3-month rolling)
indexed_monthly["smooth"] = (
    indexed_monthly
    .groupby("City")["AQI_indexed"]
    .transform(lambda s: s.rolling(3, center=True).mean())
)


In [50]:
import plotly.express as px

fig = px.line(
    indexed_monthly,
    x="Datetime",
    y="smooth",
    color="City",
    line_shape="spline",
    title="Post-COVID AQI Recovery — Relative to Each City's Normal Level"
)

# highlight Delhi only
for trace in fig.data:
    if trace.name == "Delhi":
        trace.line.width = 6
    else:
        trace.line.width = 2
        trace.opacity = 0.35

# add COVID band instead of vline (more readable)
fig.add_vrect(
    x0="2020-03-01",
    x1="2020-06-01",
    fillcolor="gray",
    opacity=0.12,
    layer="below",
    annotation_text="Lockdown Period",
    annotation_position="top left"
)

fig.update_layout(
    template="plotly_white",
    height=560,
    hovermode="x unified",
    yaxis_title="AQI vs Baseline (1.0 = Pre-COVID Normal)",
    legend_title="City"
)

fig.show()
fig.write_html("../plots/recovery_covid.html")


In [49]:
indexed_monthly["Group"] = indexed_monthly["City"].apply(
    lambda x: "Delhi" if x == "Delhi" else "Other Cities"
)

grouped = (
    indexed_monthly
    .groupby(["Datetime","Group"])["smooth"]
    .mean()
    .reset_index()
)

px.line(grouped, x="Datetime", y="smooth", color="Group",
        line_shape="spline",
        title="Delhi vs Other Cities — AQI Recovery Pattern")
