# Libraries

In [None]:
# Dataset manipulation modules
import numpy as np
import pandas as pd

# Iteration and naming tools
import re
from itertools import compress, product

# Plot tools
import matplotlib.pyplot as plt
import seaborn as sns

# P-values, Z-scores calculation tools, stats tools
import scipy

# Linear Regression tools
import statsmodels.api as sm

# Error calculation
import tensorflow as tf

# Ignoring warnings
import warnings
warnings.filterwarnings("ignore")

# train/ test split
from sklearn.model_selection import train_test_split

# Fourier
from numpy.fft import *


# Dataset

In [None]:
# Load the raw data, discard the unused "To Date" column and expose "From Date" as "Date"
df = (
    pd.read_csv("../input/calculated-aqi-caaqm-central-university-hyd/Raw_DATA.csv")
    .drop(columns=["To Date"])
    .rename(columns={"From Date": "Date"})
)

# Parse the date column (day comes first in the source strings)
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Every column after the first is coerced to numeric; unparseable entries become NaN
for column in df.columns[1:]:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# AIR QUALITY INDEX CALCULATION

## SI functions

In [None]:
# Sub-Index calculation functions (as per Indian Air Quality Standards)

# PM2.5
def SI_PM_25(x):
    """Return the PM2.5 sub-index for concentration ``x``.

    The concentration is mapped piecewise-linearly using the breakpoints
    30, 60, 90, 120 and 250. Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 30:
        return x * 50 / 30
    if x <= 60:
        return 50 + (x - 30) * 50 / 30
    if x <= 90:
        return 100 + (x - 60) * 100 / 30
    if x <= 120:
        return 200 + (x - 90) * 100 / 30
    if x <= 250:
        return 300 + (x - 120) * 100 / 130
    if x > 250:
        return 400 + (x - 250) * 100 / 130
    return x

# PM10
def SI_PM_10(x):
    """Return the PM10 sub-index for concentration ``x``.

    Up to 100 the sub-index equals the concentration itself; above that it
    is mapped piecewise-linearly using the breakpoints 250, 350 and 430.
    Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 100:
        # Both lower bands (<=50 and 50-100) map the value onto itself
        return x
    if x <= 250:
        return 100 + (x - 100) * 100 / 150
    if x <= 350:
        return 200 + (x - 250)
    if x <= 430:
        return 300 + (x - 350) * 100 / 80
    if x > 430:
        return 400 + (x - 430) * 100 / 80
    return x

# NO2
def SI_NO2(x):
    """Return the NO2 sub-index for concentration ``x``.

    The concentration is mapped piecewise-linearly using the breakpoints
    40, 80, 180, 280 and 400. Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 40:
        return x * 50 / 40
    if x <= 80:
        return 50 + (x - 40) * 50 / 40
    if x <= 180:
        return 100 + (x - 80) * 100 / 100
    if x <= 280:
        return 200 + (x - 180) * 100 / 100
    if x <= 400:
        return 300 + (x - 280) * 100 / 120
    if x > 400:
        return 400 + (x - 400) * 100 / 120
    return x

# NH3
def SI_NH3(x):
    """Return the NH3 sub-index for concentration ``x``.

    The concentration is mapped piecewise-linearly using the breakpoints
    200, 400, 800, 1200 and 1800. Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 200:
        return x * 50 / 200
    if x <= 400:
        return 50 + (x - 200) * 50 / 200
    if x <= 800:
        return 100 + (x - 400) * 100 / 400
    if x <= 1200:
        return 200 + (x - 800) * 100 / 400
    if x <= 1800:
        return 300 + (x - 1200) * 100 / 600
    if x > 1800:
        return 400 + (x - 1800) * 100 / 600
    return x

# SO2
def SI_SO2(x):
    """Return the SO2 sub-index for concentration ``x``.

    The concentration is mapped piecewise-linearly using the breakpoints
    40, 80, 380, 800 and 1600. Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 40:
        return x * 50 / 40
    if x <= 80:
        return 50 + (x - 40) * 50 / 40
    if x <= 380:
        return 100 + (x - 80) * 100 / 300
    if x <= 800:
        return 200 + (x - 380) * 100 / 420
    if x <= 1600:
        return 300 + (x - 800) * 100 / 800
    if x > 1600:
        return 400 + (x - 1600) * 100 / 800
    return x

# CO
def SI_CO(x):
    """Return the CO sub-index for concentration ``x``.

    The concentration is mapped piecewise-linearly using the breakpoints
    1, 2, 10, 17 and 34. Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 1:
        return x * 50 / 1
    if x <= 2:
        return 50 + (x - 1) * 50 / 1
    if x <= 10:
        return 100 + (x - 2) * 100 / 8
    if x <= 17:
        return 200 + (x - 10) * 100 / 7
    if x <= 34:
        return 300 + (x - 17) * 100 / 17
    if x > 34:
        return 400 + (x - 34) * 100 / 17
    return x

# OZONE
def SI_OZONE(x):
    """Return the ozone sub-index for concentration ``x``.

    The concentration is mapped piecewise-linearly using the breakpoints
    50, 100, 168, 208 and 748. Missing values are returned unchanged.

    The band above 748 is measured from its own lower breakpoint (748), like
    every other band in this family of functions. It previously subtracted
    400 (the sub-index base), which made the index jump by roughly 64 points
    between x=748 and x=749.
    """
    if pd.isna(x):
        return x
    if x <= 50:
        return x * 50 / 50
    if x <= 100:
        return 50 + (x - 50) * 50 / 50
    if x <= 168:
        return 100 + (x - 100) * 100 / 68
    if x <= 208:
        return 200 + (x - 168) * 100 / 40
    if x <= 748:
        return 300 + (x - 208) * 100 / 539
    if x > 748:
        # Offset from the 748 breakpoint, reusing the width of the previous band
        return 400 + (x - 748) * 100 / 539
    return x

## SI Calculation

In [None]:
# Initiating a Dataframe with dates
# Sub-index frame, seeded with the date column
subIndex = pd.DataFrame(df['Date'])

# (pollutant column in df, sub-index function) pairs, in output-column order
pollutant_converters = [
    ("PM2.5", SI_PM_25),
    ("PM10", SI_PM_10),
    ("NO2", SI_NO2),
    ("NH3", SI_NH3),
    ("SO2", SI_SO2),
    ("CO", SI_CO),
    ("Ozone", SI_OZONE),
]

# Each pollutant gets a "<name> SI" column computed element-wise by its function
for pollutant, converter in pollutant_converters:
    subIndex[pollutant + ' SI'] = df[pollutant].apply(converter)

## AQI Calculation

In [None]:
# AQI is calculated as per Indian AQI calculation standards

# Initiating Dataframe with Dates
aqi = pd.DataFrame(df['Date'])

# Sub-index columns that take part in the AQI
si_columns = ['PM2.5 SI', 'PM10 SI', 'NO2 SI', 'NH3 SI', 'SO2 SI', 'CO SI', 'Ozone SI']

# True where a sub-index is available for that date
available = subIndex[si_columns].notna()

# A date gets an AQI only if PM2.5 or PM10 is available ...
has_particulate = available['PM2.5 SI'] | available['PM10 SI']
# ... and at least three pollutant sub-indices are available in total
has_enough_pollutants = available.sum(axis=1) >= 3

# AQI is the largest available sub-index (NaN entries are skipped by max);
# dates that fail either condition stay NaN.
# Computed column-wise instead of writing through aqi['AQI'][ind], a chained
# assignment that is not guaranteed to modify the frame.
aqi['AQI'] = subIndex[si_columns].max(axis=1).where(has_particulate & has_enough_pollutants)

## Changing Index and removing Year 2017

In [None]:
# Re-index the raw data on 'Date' at daily frequency; dates introduced by the
# re-index are forward-filled
df = df.set_index('Date').asfreq('D', method="ffill")

# Same daily re-index for the sub-indices
subIndex = subIndex.set_index('Date').asfreq('D', method="ffill")

# Same daily re-index for the AQI
aqi = aqi.set_index('Date').asfreq('D', method="ffill")


df_list = [df, subIndex, aqi]

# Dropping year 2017 Data as it is inconsistent with other years.
# The drop is in place, so df, subIndex and aqi themselves are modified.
for frame in df_list:
    frame.drop(frame.loc["2017"].index, inplace=True)


## Removing Outliers

In [None]:
# Drop missing AQI values, then keep only the rows whose z-score is below 3
# in absolute value in every column, so the regression is not driven by outliers
aqi = aqi.dropna()
within_three_sigma = (np.abs(scipy.stats.zscore(aqi)) < 3).all(axis=1)
aqi = aqi[within_three_sigma]

# DATA ANALYSIS

## Trend

In [None]:
# Sample positions 0..len(aqi)-1 serve as the time axis
t = np.arange(len(aqi))

# Least-squares straight line through the AQI values: p = [slope, intercept]
p = np.polyfit(t, aqi.AQI.values, 1)
slope, intercept = p
y = slope * t + intercept

plt.plot(t, y)

After fitting a first-degree polynomial (i.e. a straight line) to the data, we obtained its coefficients and plotted the fitted line.
From the graph, we can conclude that the overall AQI trend decreases over time.

## Seasonality

In [None]:
# The AQI values are the signal whose spectrum is inspected
signal = aqi.AQI.values

# Power spectral density estimate and the matching frequencies
freq, PSD = scipy.signal.periodogram(signal)

# Only the ten lowest frequencies are plotted
lowest = slice(None, 10)
plt.plot(freq[lowest], PSD[lowest])

In [None]:
# Period is the reciprocal of the frequency (the zero frequency yields an infinite period)
period = 1 / freq
periodogram = pd.DataFrame({"Period": period, "Power Spectral Density": PSD}).round(2)

periodogram.head(10)

Inspecting the spectral density associated with each period shows that the data is seasonal: the densities at periods 307, 410 and 1230 are the highest.

# SARIMA with Fourier terms

SARIMA can't handle long seasonal periods such as 300 or 600. So we extrapolate the seasonality using a Fourier transform and use it as an exogenous regressor for the SARIMA model.
Since the Fourier extrapolation is used as the regressor, we use 7 as the seasonal period for SARIMA.

## Train-Test Split

In [None]:
# Chronological split: no shuffling, the last 10% of the rows form the test set
train, test = train_test_split(aqi, shuffle=False, test_size=0.1)

for label, part in (("Train data shape: ", train), ("Test data shape: ", test)):
    print(label, part.shape)

## Fourier Extrapolation

In [None]:
# Training-set AQI values; this is the signal handed to fourierExtrapolation below
x = train.AQI.values

def fourierExtrapolation(x, n_predict, n_harm=20):
    """Extend a 1-D signal using a linear trend plus its lowest Fourier harmonics.

    Parameters
    ----------
    x : array-like
        Observed signal of length ``n``.
    n_predict : int
        Number of samples to extrapolate beyond the end of ``x``.
    n_harm : int, optional
        Number of harmonics kept in the model (default 20, the value that was
        previously hard-coded). The zero frequency is always kept, so at most
        ``1 + 2 * n_harm`` spectral components are used.

    Returns
    -------
    numpy.ndarray
        Array of length ``n + n_predict``: the reconstruction over the observed
        range followed by the extrapolated samples.
    """
    x = np.asarray(x, dtype=float)
    n = x.size

    t = np.arange(0, n)
    p = np.polyfit(t, x, 1)             # find linear trend in x

    # Only the slope is removed; the constant level stays in the signal and is
    # carried by the zero-frequency component.
    x_notrend = x - p[0] * t

    x_freqdom = np.fft.fft(x_notrend)   # detrended x in frequency domain
    f = np.fft.fftfreq(n)               # frequencies

    # Component indexes ordered by absolute frequency, lower -> higher
    indexes = sorted(range(n), key=lambda i: np.absolute(f[i]))

    t = np.arange(0, n + n_predict)
    restored_sig = np.zeros(t.size)

    # Sum the kept components as cosines evaluated on the extended time axis
    for i in indexes[:1 + n_harm * 2]:
        ampli = np.absolute(x_freqdom[i]) / n   # amplitude
        phase = np.angle(x_freqdom[i])          # phase
        restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase)

    # Add the linear trend back over the extended range
    return restored_sig + p[0] * t

# Reconstruct the training range and extrapolate one value per test row
n_test = len(test)
exog = fourierExtrapolation(x, n_test)

print("Shape of Exogenous Regressor (Fourier Extraction) for seasonality: ", exog.shape)

In [None]:
# Attach the Fourier series as the 'exog' column: the first len(train) values
# cover the training range, the remainder is the extrapolation over the test range.
# .assign returns new frames, so the frames produced by the split (and the aqi
# frame they were taken from) are not mutated and the cell can be re-run safely.
n_train = len(train)
train = train.assign(exog=exog[:n_train])
test = test.assign(exog=exog[n_train:])

## SARIMA

In [None]:
# ACF and PACF of the training AQI, used for a first guess of the model orders
fig, (ax_acf, ax_pacf) = plt.subplots(2, 1, figsize=(15, 7))

panels = (
    (ax_acf, sm.graphics.tsa.plot_acf),
    (ax_pacf, sm.graphics.tsa.plot_pacf),
)
for axis, draw in panels:
    axis.set_xlabel("Lag")
    draw(train.AQI, lags=20, ax=axis)

fig.suptitle("ACF and PACF of AQI Data")
fig.tight_layout()
plt.show()

The PACF is significant up to the 1st lag and insignificant after that.
The best model is then selected using the AIC (Akaike Information Criterion).

### Best Model

In [None]:
# Candidate orders: each of p, q, P and Q is tried at 0 and 1
ps = range(0, 2)
qs = range(0, 2)
Ps = range(0, 2)
Qs = range(0, 2)

# Differencing orders are fixed at zero (non-seasonal d, seasonal D)
d = 0
D = 0

# Every (p, q, P, Q) combination, materialised so it can be iterated and counted
parameters = product(ps, qs, Ps, Qs)
parameters_list = [combo for combo in parameters]
len(parameters_list)


def bestmodel(parameters_list, df, d=0, D=0, seasonal_period=7):
    """Fit one SARIMAX model per parameter combination and keep the lowest-AIC one.

    Parameters
    ----------
    parameters_list : iterable of (p, q, P, Q)
        Candidate non-seasonal and seasonal AR/MA orders.
    df : DataFrame
        Must expose ``AQI`` (endogenous series) and ``exog`` (exogenous regressor).
    d, D : int, optional
        Non-seasonal and seasonal differencing orders. Both default to 0, the
        values this function used to read from module-level globals.
    seasonal_period : int, optional
        Seasonal period passed to SARIMAX (default 7, previously hard-coded).

    Returns
    -------
    (results, best_model)
        ``results`` is a list of ``[param, aic]`` for every model that fitted;
        ``best_model`` is the fitted result with the lowest AIC.

    Raises
    ------
    RuntimeError
        If no candidate yields a model with a comparable AIC. Previously this
        surfaced as an UnboundLocalError on ``best_model``.
    """
    results = []
    best_model = None
    best_aic = float("inf")

    for param in parameters_list:
        try:
            # Silence fitting warnings locally rather than changing the
            # process-wide warning filters.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                model = sm.tsa.statespace.SARIMAX(
                    df.AQI,
                    exog=df.exog,
                    order=(param[0], d, param[1]),
                    seasonal_order=(param[2], D, param[3], seasonal_period),
                    enforce_stationarity=False,
                ).fit(disp=-1)
        except (ValueError, np.linalg.LinAlgError):
            # Skip combinations the model rejects or cannot solve numerically
            print('wrong parameters:', param)
            continue

        aic = model.aic
        results.append([param, aic])

        if aic < best_aic:
            best_model = model
            best_aic = aic

    if best_model is None:
        raise RuntimeError("bestmodel: no parameter combination produced a fitted model with a comparable AIC")

    return results, best_model

# Grid search on the training frame: results holds [params, AIC] per fitted model, best_model is the lowest-AIC fit
results, best_model = bestmodel(parameters_list, train)

In [None]:
# Candidates ranked by AIC (lowest first), followed by the summary of the selected model
result_table = pd.DataFrame(results, columns=['parameters (p,q,P,Q)', 'aic'])
ranked = result_table.sort_values(by='aic')
print(ranked.head())
print(best_model.summary())

In [None]:
# Diagnostic plots of the selected model, drawn by its plot_diagnostics method
best_model.plot_diagnostics(figsize=(18, 8))
plt.show()

### Forecast

In [None]:
fig, ax = plt.subplots(figsize=(15, 5))

# Observed AQI series
aqi.plot(ax=ax)

# Forecast one step per test row, feeding the extrapolated Fourier series as exog
forecast_result = best_model.get_forecast(len(test), exog=test.exog)
fcast = forecast_result.summary_frame()
fcast.index = test.index  # align the forecast with the test dates

# Dashed line for the forecast mean, shaded band between the interval bounds
fcast['mean'].plot(ax=ax, style='k--')
ax.fill_between(fcast.index, fcast['mean_ci_lower'], fcast['mean_ci_upper'], color='k', alpha=0.1)

ax.set_title("ARIMA with fourier terms FORECAST")
ax.legend(["True Value", "ARIMA Prediction"])
ax.set_xlabel("DATE")
ax.set_ylabel("AQI Value")

### Error

In [None]:
# Forecast errors of the SARIMA model on the test set, computed with NumPy
# (no deep-learning framework is needed for two scalar metrics)
true_values = test.AQI.values
pred_values = fcast['mean'].values

residuals = true_values - pred_values
mae_a = np.mean(np.abs(residuals))          # mean absolute error
rmse_a = np.sqrt(np.mean(residuals ** 2))   # root mean squared error
print("MAE: ",mae_a)
print("RMSE: ",rmse_a)

## Fourier Forecast

In [None]:
fig, ax = plt.subplots(figsize=(15, 5))

# Observed AQI, then the Fourier series over the training and the test range
aqi.plot(ax=ax)
for fourier_part in (train.exog, test.exog):
    ax.plot(fourier_part)

ax.legend(["True Value", "Fourier Series used for Training", "Predicted Fourier Terms"])
ax.set_title("FOURIER Forecast")

### Error

In [None]:
# Errors of the pure Fourier extrapolation on the test set, computed with NumPy
# (no deep-learning framework is needed for two scalar metrics)
true_values = test.AQI.values
pred_values = test.exog.values

residuals = true_values - pred_values
mae_f = np.mean(np.abs(residuals))          # mean absolute error
rmse_f = np.sqrt(np.mean(residuals ** 2))   # root mean squared error
print("MAE: ",mae_f)
print("RMSE: ",rmse_f)

## Conclusion

In [None]:
# Side-by-side errors: first entry is SARIMA with Fourier regressor, second is Fourier alone
mae, rmse = [mae_a, mae_f], [rmse_a, rmse_f]
model_names = ["ARIMA WITH FOURIER", "FOURIER"]

print("DATAFRAME WITH RESPECTIVE ERRORS")
pd.DataFrame({"MAE": mae, "RMSE": rmse}, index=model_names)

**SARIMA DOESN'T WORK WELL FOR LONG SEASONAL PERIODS, SO WE TRIED MODELING WITH FOURIER TERMS AS EXOGENOUS REGRESSORS. HOWEVER, THE FOURIER FORECAST ALONE HAS SLIGHTLY BETTER MAE AND RMSE THAN SARIMA.**

**IF WE INSTEAD USE THE MONTHLY MEAN AS THE DATA POINT AND 12 MONTHS AS THE SEASONAL PERIOD, WE CAN GET BETTER MONTHLY PREDICTIONS.**

**SINCE WE NEED DAILY PREDICTIONS, WE TOOK THE APPROACH OF USING FOURIER TERMS AS THE SEASONAL COMPONENT.**

**EVEN THOUGH SARIMA HASN'T GIVEN BETTER PREDICTIONS THAN FOURIER, WE CAN ACHIEVE BETTER ACCURACY WITH SARIMA BY INCREASING THE NUMBER OF DATA POINTS.**