<a href="https://colab.research.google.com/github/sonupatel24/air-quality-forecasting/blob/main/Streamlit_Air_Quality_Forecasting_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
from prophet import Prophet

In [None]:
# Load the daily air-quality table; later cells rely on its 'Date' and 'Air_Quality' columns.
df = pd.read_csv(filepath_or_buffer='/content/air_quality_clean.csv')

In [None]:
# Preview the first five rows right after loading.
df.head(n=5)

Unnamed: 0,Date,Air_Quality
0,2018-07-01,
1,2018-07-02,44.375
2,2018-07-03,56.65
3,2018-07-04,53.681818
4,2018-07-05,62.875


In [None]:
# Count the missing values in each column.
print(df.isna().sum())

Date           0
Air_Quality    2
dtype: int64


In [None]:
# Impute missing AQI readings with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df['Air_Quality'] selection: that chained
# in-place form triggers a pandas FutureWarning and no longer updates `df`
# once pandas 3.0 treats the selection as a copy.
df['Air_Quality'] = df['Air_Quality'].fillna(df['Air_Quality'].median())
print(df.isnull().sum())

Date           0
Air_Quality    0
dtype: int64



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [None]:
# Convert the 'Date' column to pandas datetimes; the resample and .dt accessors used later require it.
df = df.assign(Date=pd.to_datetime(df['Date']))

In [None]:
# Re-check the first five rows after imputation and date parsing.
df.head(n=5)

Unnamed: 0,Date,Air_Quality
0,2018-07-01,72.0
1,2018-07-02,44.375
2,2018-07-03,56.65
3,2018-07-04,53.681818
4,2018-07-05,62.875


In [None]:
# Resample to monthly average AQI (assuming AQI column is named 'Air_Quality').
# 'ME' (month end) is the replacement for the deprecated 'M' alias; each bin is
# still labelled with the last calendar day of its month.
monthly_df = df.resample('ME', on='Date').mean().reset_index()


'M' is deprecated and will be removed in a future version, please use 'ME' instead.



In [None]:
# Display the monthly frame as this cell's output.
# NOTE(review): the saved output shows columns named ds/y, which only exist
# after the rename cell further down — looks like this cell was last run out
# of order; re-run the notebook top to bottom to confirm.
monthly_df

Unnamed: 0,ds,y
0,2018-07-31,48.561607
1,2018-08-31,43.944256
2,2018-09-30,67.420808
3,2018-10-31,136.220247
4,2018-11-30,119.173152
...,...,...
61,2023-08-31,74.771505
62,2023-09-30,42.723168
63,2023-10-31,107.565860
64,2023-11-30,92.601993


In [None]:
# Prophet expects the timestamp column to be named 'ds' and the target column 'y'.
monthly_df = monthly_df.rename({'Date': 'ds', 'Air_Quality': 'y'}, axis='columns')

In [None]:
# Move the target onto the log1p scale, i.e. y <- log(1 + y).
monthly_df = monthly_df.assign(y=np.log1p(monthly_df['y']))

In [None]:
# Cap extreme values: anything above log1p(300) on the transformed scale is
# clipped down to that bound.
monthly_df = monthly_df.assign(
    y=monthly_df['y'].clip(upper=np.log1p(300))  # adjust upper bound if needed
)

In [None]:
# ---------------------
@st.cache_resource
def train_model(data):
    """Fit a default-configured Prophet model on ``data`` and return it.

    The function is decorated with ``st.cache_resource`` so Streamlit can
    reuse the fitted model rather than refitting it on every call.

    Args:
        data: Frame handed straight to ``Prophet.fit``; the notebook passes
            the monthly frame with ``ds`` and ``y`` columns.

    Returns:
        The fitted ``Prophet`` instance.
    """
    # Prophet.fit returns the fitted model object itself, so it can be returned directly.
    return Prophet().fit(data)

# Keep a spinner visible while the model is being trained (or fetched from cache).
training_spinner = st.spinner("⏳ Training forecast model...")
with training_spinner:
    model = train_model(monthly_df)

INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpdv584g9y/50po1y_h.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpdv584g9y/56dhty_o.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=48012', 'data', 'file=/tmp/tmpdv584g9y/50po1y_h.json', 'init=/tmp/tmpdv584g9y/56dhty_o.json', 'output', 'file=/tmp/tmpdv584g9y/prophet_model71u2uj_d/prophet_model-20250817063454.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
06:34:54 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
06:34:54 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


In [None]:
import plotly.graph_objs as go

# Build the forecast frame that the traces below read from. No cell in the
# notebook defined `forecast`, so on a fresh kernel this cell raised NameError.
# NOTE(review): the 12-month horizon is an assumption — set it to the horizon
# the analysis actually needs.
FORECAST_MONTHS = 12
future = model.make_future_dataframe(periods=FORECAST_MONTHS, freq='ME')
forecast = model.predict(future)

# The model was trained on log1p(AQI), so expm1 maps the prediction and its
# uncertainty bounds back to the original AQI scale.
forecast = forecast.assign(
    yhat_original=np.expm1(forecast['yhat']),
    yhat_upper_original=np.expm1(forecast['yhat_upper']),
    yhat_lower_original=np.expm1(forecast['yhat_lower']),
)

# Plot the point forecast with dotted upper/lower bounds.
fig = go.Figure()
fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat_original'], mode='lines', name='Predicted AQI'))
fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat_upper_original'], mode='lines', name='Upper Bound', line=dict(dash='dot')))
fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat_lower_original'], mode='lines', name='Lower Bound', line=dict(dash='dot')))
fig.update_layout(title="Monthly Average AQI Forecast", xaxis_title="Date", yaxis_title="AQI")
fig.show()

In [None]:
import plotly.express as px

# Line chart of the day-level AQI series over time.
fig_daily = px.line(
    data_frame=df,
    x='Date',
    y='Air_Quality',
    title='Daily AQI Trend',
)
fig_daily.show()


In [None]:
import plotly.express as px

# Prepare data for heatmap: tag every daily row with its calendar year and month name.
df_heatmap = df.copy()
df_heatmap['Year'] = df_heatmap['Date'].dt.year
df_heatmap['Month'] = df_heatmap['Date'].dt.month_name()

# Mean AQI for each (year, month) cell: years as rows, months as columns.
heatmap_data = df_heatmap.pivot_table(index='Year', columns='Month', values='Air_Quality', aggfunc='mean')

# Reorder columns to be in chronological order of months.
# reindex is used instead of heatmap_data[month_order] so that a month absent
# from the data becomes an all-NaN column rather than raising KeyError.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
heatmap_data = heatmap_data.reindex(columns=month_order)

# Create heatmap
fig_heatmap = px.imshow(heatmap_data,
                         title='Monthly vs Yearly Average AQI Heatmap',
                         labels={'x': 'Month', 'y': 'Year', 'color': 'Average AQI'},
                         color_continuous_scale='Viridis',
                         x=heatmap_data.columns,
                         y=heatmap_data.index)

# Put the month labels along the top edge of the heatmap.
fig_heatmap.update_layout(xaxis={'side': 'top'})
fig_heatmap.show()

In [None]:
# Rank calendar months by pollution level, using the year-by-month table
# (heatmap_data) built for the heatmap.

# Average each month's column over all years.
monthly_avg_aqi = heatmap_data.mean(axis=0)

# Highest-pollution months first.
sorted_monthly_avg_aqi = monthly_avg_aqi.sort_values(ascending=False)

print("Months with highest average pollution across all years:", sorted_monthly_avg_aqi, sep="\n")

# How to read the ranking: a run of neighbouring months at the top points to
# a seasonal peak. For example, November, December and January leading the
# list would point to winter as the most polluted season.

Months with highest average pollution across all years:
Month
February     110.959513
December     107.284117
January      101.470948
November      96.080077
March         95.163700
October       88.038356
April         87.500785
May           64.154755
September     52.458745
August        49.583154
June          48.909661
July          43.044551
dtype: float64


In [None]:
import plotly.express as px

# Distribution of the daily AQI values as a histogram.
fig_hist = px.histogram(
    data_frame=df,
    x='Air_Quality',
    title='Distribution of Daily AQI',
)
fig_hist.show()

# Same values as a box plot, which makes outliers easy to spot.
fig_box = px.box(
    data_frame=df,
    y='Air_Quality',
    title='Box Plot of Daily AQI',
)
fig_box.show()
