# LINEAR REGRESSION TO PREDICT AQI (TIME SERIES)

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
print("Folders and files in Dataset:")
print(50 * "-")
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

## Importing modules to work on dataset

In [None]:
# Dataset manipulation modules
import numpy as np
import pandas as pd

# Iteration and naming tools
import re
from itertools import compress

# Plot tools
import matplotlib.pyplot as plt
import seaborn as sns

# P-values, Z-scores calculation tools
import scipy

# Linear Regression tools
import statsmodels.api as sm

# Error calculation
import tensorflow as tf

# Ignoring warnings
import warnings
warnings.filterwarnings("ignore")

# Sections in the notebook
---
1. [**Glimpse of Raw Dataset**](#GLimpse)
2. [**Changing Raw to Useful Dataset**](#Useful)
3. [**Sub-Index Calculation (as per Indian Air Quality Standards)**](#Sub)
4. [**Calculation of AQI from Sub-Indices**](#AQI)
5. [**Final Dataframes obtained after Calculations**](#AC)
6. [**Overview of relations between features and AQI**](#Overview)
7. [**Linear Regression**](#LR)
    1. [**With Features**](#WF)
    2. [**With Features from Past day**](#WPF)
    3. [**With Itself**](#WI)
        1. [*Previous day*](#Pd)
        2. [*Previous significant days*](#Psd)
8. [**Final Errors for each Regression**](#final)


<div id="GLimpse"/>

# 1. Glimpse of Raw Dataset
---
---

In [None]:
# Reading CSV file
# The raw export is loaded into the module-level frame `df` used by the later cells.

df = pd.read_csv("../input/calculated-aqi-caaqm-central-university-hyd/Raw_DATA.csv")
# Display label stored as a plain attribute; Elements() reads it back through df.Name.
df.Name ="RAW DATA"

def Elements(df):
    """Print a short structural summary of a labelled DataFrame.

    The report lists the shape, the column index and the ``DataFrame.info``
    listing, separated by dashed rules.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. Its ``Name`` attribute is used as the label in the
        headings; when the attribute is absent the generic label
        ``"Dataframe"`` is used instead of raising ``AttributeError``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    label = str(getattr(df, "Name", "Dataframe"))
    rule = '-' * 100

    print(f"Elements in {label}:")
    print(rule)
    print(f"shape of {label} = {df.shape}")
    print(rule)
    print(f"columns of {label} = {df.columns}")
    print(rule)
    print("Info of Data:")
    print('.' * 20)
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()

# Preview the first five rows, then print the structural summary of the raw frame.
print(df.head(5))
print('-'*50)
Elements(df)

<div id="Useful"/>

# 2. Changing Raw to Useful Data

---
---

In [None]:
# Clean-up is done in place so the same `df` object (and its Name label) is kept.

# Drop the interval-end column, which is not used afterwards
df.drop(columns=["To Date"], inplace=True)

# The interval-start column becomes the date column
df.rename(columns={"From Date": "Date"}, inplace=True)

# Parse the dates, reading the day before the month
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)

# Every column after the first is converted to numbers; unparseable entries become NaN
for element in df.columns[1:]:
    df[element] = pd.to_numeric(df[element], errors="coerce")

In [None]:
# Preview and summarise the frame again after the clean-up above.
print(df.head(5))
print('-'*50)
Elements(df)

<div id="Sub"/>

# 3. Sub-Index Calculation (as per Indian Air Quality Standards)
---
---

In [None]:
# Sub-Index calculation functions (as per Indian Air Quality Standards)

# PM2.5
def SI_PM_25(x):
    """Return the PM2.5 sub-index for the concentration ``x``.

    The concentration is mapped piecewise-linearly onto the sub-index scale
    with breakpoints at 30, 60, 90, 120 and 250; above 250 the last slope is
    continued. Missing values are handed back unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 30:
        return x*50/30
    if 30 < x <= 60:
        return 50+((x-30)*50/30)
    if 60 < x <= 90:
        return 100+((x-60)*100/30)
    if 90 < x <= 120:
        return 200+((x-90)*100/30)
    if 120 < x <= 250:
        return 300+((x-120)*100/130)
    if x > 250:
        return 400+((x-250)*100/130)
    # Fallback for values that match none of the bands above
    return x

# PM10
def SI_PM_10(x):
    """Return the PM10 sub-index for the concentration ``x``.

    Up to 100 the sub-index equals the concentration; above that the mapping
    is piecewise linear with breakpoints at 250, 350 and 430, and the last
    slope is continued beyond 430. Missing values are handed back unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 50:
        return x
    if 50 < x <= 100:
        return x
    if 100 < x <= 250:
        return 100+((x-100)*100/150)
    if 250 < x <= 350:
        return 200+(x-250)
    if 350 < x <= 430:
        return 300+((x-350)*100/80)
    if x > 430:
        return 400+((x-430)*100/80)
    # Fallback for values that match none of the bands above
    return x

# NO2
def SI_NO2(x):
    """Return the NO2 sub-index for the concentration ``x``.

    Piecewise-linear mapping with breakpoints at 40, 80, 180, 280 and 400;
    above 400 the last slope is continued. Missing values are handed back
    unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 40:
        return x*50/40
    if 40 < x <= 80:
        return 50+((x-40)*50/40)
    if 80 < x <= 180:
        return 100+((x-80)*100/100)
    if 180 < x <= 280:
        return 200+((x-180)*100/100)
    if 280 < x <= 400:
        return 300+((x-280)*100/120)
    if x > 400:
        return 400+((x-400)*100/120)
    # Fallback for values that match none of the bands above
    return x

# NH3
def SI_NH3(x):
    """Return the NH3 sub-index for the concentration ``x``.

    Piecewise-linear mapping with breakpoints at 200, 400, 800, 1200 and
    1800; above 1800 the last slope is continued. Missing values are handed
    back unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 200:
        return x*50/200
    if 200 < x <= 400:
        return 50+((x-200)*50/200)
    if 400 < x <= 800:
        return 100+((x-400)*100/400)
    if 800 < x <= 1200:
        return 200+((x-800)*100/400)
    if 1200 < x <= 1800:
        return 300+((x-1200)*100/600)
    if x > 1800:
        return 400+((x-1800)*100/600)
    # Fallback for values that match none of the bands above
    return x

# SO2
def SI_SO2(x):
    """Return the SO2 sub-index for the concentration ``x``.

    Piecewise-linear mapping with breakpoints at 40, 80, 380, 800 and 1600;
    above 1600 the last slope is continued. Missing values are handed back
    unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 40:
        return x*50/40
    if 40 < x <= 80:
        return 50+((x-40)*50/40)
    if 80 < x <= 380:
        return 100+((x-80)*100/300)
    if 380 < x <= 800:
        return 200+((x-380)*100/420)
    if 800 < x <= 1600:
        return 300+((x-800)*100/800)
    if x > 1600:
        return 400+((x-1600)*100/800)
    # Fallback for values that match none of the bands above
    return x

# CO
def SI_CO(x):
    """Return the CO sub-index for the concentration ``x``.

    Piecewise-linear mapping with breakpoints at 1, 2, 10, 17 and 34; above
    34 the last slope is continued. Missing values are handed back unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 1:
        return x*50/1
    if 1 < x <= 2:
        return 50+((x-1)*50/1)
    if 2 < x <= 10:
        return 100+((x-2)*100/8)
    if 10 < x <= 17:
        return 200+((x-10)*100/7)
    if 17 < x <= 34:
        return 300+((x-17)*100/17)
    if x > 34:
        return 400+((x-34)*100/17)
    # Fallback for values that match none of the bands above
    return x

# OZONE
def SI_OZONE(x):
    """Return the ozone sub-index for the concentration ``x``.

    Piecewise-linear mapping with breakpoints at 50, 100, 168, 208 and 748.
    Each band starts exactly where the previous one ends, so the sub-index is
    continuous: 50 -> 50, 100 -> 100, 168 -> 200, 208 -> 300, 748 -> 400.
    Above 748 the slope of the last band is continued.

    Parameters
    ----------
    x : float
        Ozone concentration; may be missing (NaN/None).

    Returns
    -------
    float
        The sub-index, or ``x`` itself when ``x`` is missing.
    """
    if pd.isna(x):
        return x
    if x <= 50:
        return x*50/50
    if x <= 100:
        return 50+((x-50)*50/50)
    if x <= 168:
        return 100+((x-100)*100/68)
    if x <= 208:
        return 200+((x-168)*100/40)
    if x <= 748:
        # The band 208..748 is 540 wide; dividing by 540 makes 748 map to
        # exactly 400 instead of slightly overshooting it.
        return 300+((x-208)*100/540)
    # The top band must be measured from its own lower breakpoint (748);
    # measuring from 400 made the sub-index jump by about 65 points at 748.
    return 400+((x-748)*100/540)

In [None]:
# Pollutant column of df -> function that turns its concentrations into a sub-index.
# The insertion order fixes the column order of the resulting frame.
si_functions = {
    "PM2.5": SI_PM_25,
    "PM10": SI_PM_10,
    "NO2": SI_NO2,
    "NH3": SI_NH3,
    "SO2": SI_SO2,
    "CO": SI_CO,
    "Ozone": SI_OZONE,
}

# The sub-index frame starts with the dates and gets one "<pollutant> SI" column per pollutant
subIndex = pd.DataFrame(df['Date'])
for pollutant, to_sub_index in si_functions.items():
    subIndex[f"{pollutant} SI"] = df[pollutant].apply(to_sub_index)

# Label read by Elements()
subIndex.Name = "Sub-Index Dataframe"

print(subIndex.head(5))
print(50 * '-')
Elements(subIndex)
print("** SI means Sub-Index")

<div id="AQI"/>

# 4. Calculation of AQI from Sub-Indices
---
---

In [None]:
# AQI is calculated as per Indian AQI calculation standards

# Sub-index columns that take part in the AQI
si_columns = ['PM2.5 SI', 'PM10 SI', 'NO2 SI', 'NH3 SI', 'SO2 SI', 'CO SI', 'Ozone SI']
si_values = subIndex[si_columns]

# A date gets an AQI only when at least one of PM2.5 / PM10 is available
# and at least three sub-indices are available in total.
has_particulate = si_values[['PM2.5 SI', 'PM10 SI']].notna().any(axis=1)
has_enough = si_values.notna().sum(axis=1) >= 3

# Initiating Dataframe with Dates
aqi = pd.DataFrame(df['Date'])

# AQI is the largest available sub-index of the date (max skips missing
# values); dates failing the availability rule stay NaN.
# The whole column is assigned at once: the former row-by-row chained
# assignment (aqi['AQI'][ind] = ...) is not guaranteed to write into the
# frame, and with warnings suppressed it could silently leave every AQI empty.
aqi['AQI'] = si_values.max(axis=1).where(has_particulate & has_enough)

# Label read by Elements()
aqi.Name = 'AQI Data'
print(aqi.head(5))
print('-'*50)
Elements(aqi)

<div id="AC"/>

# 5. Final Dataframes obtained after Calculations
---
---

In [None]:
# converting the concentrations (df) to a Date index with daily frequency for resampling;
# rows for calendar days that asfreq has to insert are forward-filled from the previous row
df = df.set_index('Date').asfreq('D', method="ffill")

# converting subIndex to set freq of days for resampling
subIndex = subIndex.set_index('Date').asfreq('D', method="ffill")

# converting AQI to set freq of days for resampling
aqi = aqi.set_index('Date').asfreq('D', method="ffill")

In [None]:
def print_dfs(lst):
    """Print the head, shape and column index of every DataFrame in ``lst``.

    Each frame is reported as a block of lines separated by 100-dash rules.
    Nothing is returned.
    """
    rule = '-' * 100
    for frame in lst:
        report = [
            rule,
            "Dataframe",
            str(frame.head()),
            rule,
            f"shape of Dataframe = {frame.shape}",
            rule,
            f"columns of Dataframe{frame.columns}",
            rule,
        ]
        print("\n".join(report))

# The list holds references to the three frames, so the in-place drop below
# also changes df, subIndex and aqi themselves.
df_list = [df, subIndex, aqi]

# Dropping year 2017 Data as it is inconsistent with other years.
# itr.loc["2017"] selects the rows of the Date index that fall in 2017.
for itr in df_list:
    itr.drop(itr.loc["2017"].index, inplace=True)

print_dfs(df_list)

<div id="Overview"/>

# 6. Overview of relations between features and AQI
---
---

In [None]:
def percent_missing(lst_dfs):
    """Summarise the missing values of every DataFrame in ``lst_dfs``.

    Returns a list with one table per input frame. Each table is indexed by
    the frame's columns, ordered by descending number of missing values, and
    has two columns: ``Total`` (count of missing values) and ``Percent``
    (that count as a percentage of the rows, rounded to two decimals).
    """
    summaries = []
    for frame in lst_dfs:
        total = frame.isnull().sum().sort_values(ascending=False)
        percent = (total / len(frame) * 100).round(2)
        table = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
        summaries.append(table)
    return summaries
# One missing-value table per frame in df_list, printed in the same order.
pm_list = percent_missing(df_list)
for pm in pm_list: print(pm)

In [None]:
# Concentrations and the AQI column side by side, aligned on the Date index
Conc_df = pd.concat([df, aqi['AQI']], axis=1)

print("CORRELATION MATRIX")
# Correlation between Concentrations and AQI
# Only the 'AQI' column of the correlation matrix is shown, rounded to two decimals
Conc_df.corr().round(2)['AQI']

In [None]:
# Pairplots between concentrations and AQI
# One scatter panel per pollutant concentration (x) against AQI (y)
print("PAIRPLOTS CONCENTRATIONS VS AQI")
sns.pairplot(Conc_df, x_vars=["PM2.5", "PM10", "NO2", "NH3", "SO2", "CO", "Ozone"], y_vars=["AQI"])

In [None]:
# Sub-indices and the AQI column side by side; AQI ends up as the last column
SI_df = pd.concat([subIndex, aqi['AQI']], axis=1)

print("CORRELATION MATRIX SUB-INDEX VS AQI")
# Correlation between Sub-Indices and AQI
SI_df.corr().round(2)['AQI']

In [None]:
# Pairplots between Sub-Indices and AQI
print("PAIRPLOTS SUB-INDICES VS AQI")
sns.pairplot(SI_df, x_vars=["PM2.5 SI", "PM10 SI", "NO2 SI", "NH3 SI", "SO2 SI", "CO SI", "Ozone SI"], y_vars=["AQI"])

#### Conclusion from Overview
- The distributions and plots of Concentrations and Sub-Indices do not differ much, because each Sub-Index is calculated directly from its Concentration.
- Both the Concentrations and the Sub-Indices of PM2.5, PM10 and NO2 are highly correlated with AQI.
- However, as per the plots, PM2.5, PM10 and NH3 have a better chance of predicting AQI, as their points accumulate in a nearly linear fashion.
- A positive correlation and a regression can be drawn using the PM2.5, PM10 and NH3 Sub-Indices.
- Even SO2 could be considered, but it will not be useful because it turns out to be statistically insignificant, as will be seen in later cells.
- From here on, Sub-Indices will be used as features, since Sub-Indices and AQI share the same range of values.

<div id="LR"/>

# 7. LINEAR REGRESSION
---
---

In [None]:
# Removing not available values and outliers for regression
# Rows with any missing value are dropped first
Regdf = SI_df.dropna()
# z-scores are computed for the remaining rows of every column
z_scores = scipy.stats.zscore(Regdf)
abs_z_scores = np.abs(z_scores)
# A row is kept only when all of its values lie within 3 standard deviations
filtered_entries = (abs_z_scores<3).all(axis=1)
Regdf = Regdf[filtered_entries]

In [None]:
# Ordinary least squares of AQI on every sub-index column
y = Regdf['AQI']
# All columns except the last one (AQI) are used as regressors
x = Regdf[Regdf.columns[:-1]]
# statsmodels does not add an intercept on its own
x = sm.add_constant(x)
regressor_OLS = sm.OLS(y, x).fit()
regressor_OLS.summary()

#### The p-values of NO2, NH3 and SO2 are greater than 0.05 (the significance level), so they are unfit to use for regression.
#### CO and Ozone are unfit as well, since their coefficients are very small.

<div id="WF"/>

## 7.A. With Features
---

In [None]:
# Ordinary least squares of AQI on the first two columns of Regdf
# (the PM2.5 and PM10 sub-indices, given the column order built earlier)
y = Regdf['AQI']
x = Regdf[Regdf.columns[:2]]
x = sm.add_constant(x)
regressor_OLS = sm.OLS(y, x).fit()
regressor_OLS.summary()

### Predicted value and True value plot over time

In [None]:
plt.figure(figsize=(12,6))
plt.grid()
# In-sample fitted values of the latest regression and the matching observed AQI;
# both names are reused by the error and residual cells that follow
pred_value= regressor_OLS.fittedvalues.copy()
true_val = Regdf['AQI'].copy()

# Plot of last 150 values
plt.plot(true_val[-150:])
plt.plot(pred_value[-150:])

# Labels
plt.xlabel("Date")
plt.ylabel("AQI Value")
plt.legend(["True Value", "Predicted Value"])

### Error in prediction

In [None]:
# Mean absolute error and root-mean-squared error between fitted and observed AQI;
# .numpy() extracts the value from the TensorFlow result. Kept for the final comparison table.
mae_f = tf.keras.metrics.mean_absolute_error(y_pred=pred_value, y_true=true_val).numpy()
rmse_f = tf.sqrt(tf.losses.mean_squared_error(y_pred=pred_value, y_true=true_val)).numpy()
print("MAE: ",mae_f)
print("RMSE: ",rmse_f)

### Residuals and their normal fit

In [None]:
plt.figure(figsize=(12,6))
plt.grid()

# Residual = observed AQI minus fitted AQI
residuals_f = true_val-pred_value

# Scatter of the first 400 fitted values against their residuals
plt.scatter(pred_value[:400],residuals_f[:400])

# Zero-residual reference line
plt.hlines(y=0, xmin =0, xmax=165 , linewidth=.5, color='red')
plt.title("Predicted Values vs Residuals")
# Each axis label is set exactly once: a second xlabel/ylabel call would
# simply replace the first, so only the labels matching this plot are kept.
plt.xlabel("Predicted value")
plt.ylabel("Residual")

In [None]:
fig, ax = plt.subplots(figsize=(12,6))
plt.grid()
# Probability plot of the residuals drawn on ax; of the returned fit
# (slope, intercept, r) only the correlation coefficient r is kept
_,(_,_,r) = scipy.stats.probplot(residuals_f,plot= ax, fit=True)

print('-'*100)
print("Plot to show how deviated the residuals are from normal distribution")
print('x'*100)
print("R square = "+ str(r**2))
print('-'*100)

<div id="WPF"/>

## 7.B. With past features
---

In [None]:
# Regression
# AQI of a row is regressed on the first two sub-index columns of the previous row;
# the first row is skipped because it has no predecessor.
# NOTE(review): Regdf lost rows to dropna() and the z-score filter, so shift(1) steps back
# one remaining row, which is not necessarily the previous calendar day - confirm.
y = Regdf['AQI'][1:]
x = Regdf[Regdf.columns[:2]].shift(1)[1:]
x = sm.add_constant(x)
regressor_OLS = sm.OLS(y, x).fit()
regressor_OLS.summary()

### Predicted value and True value plot over time

In [None]:
plt.figure(figsize=(12,6))
plt.grid()
# Fitted values of the lagged-feature regression and the observed AQI of the same rows
pred_value= regressor_OLS.fittedvalues.copy()
true_val = Regdf['AQI'][1:].copy()

# Plot of last 150 values
plt.plot(true_val[-150:])
plt.plot(pred_value[-150:])

# Labels
plt.xlabel("Date")
plt.ylabel("AQI Value")
plt.legend(["True Value", "Predicted Value"])

### Error in prediction

In [None]:
# MAE and RMSE of the lagged-feature regression, kept for the final comparison table
mae_pf = tf.keras.metrics.mean_absolute_error(y_pred=pred_value, y_true=true_val).numpy()
rmse_pf = tf.sqrt(tf.losses.mean_squared_error(y_pred=pred_value, y_true=true_val)).numpy()
print("MAE: ",mae_pf)
print("RMSE: ",rmse_pf)

### Residuals and their normal fit

In [None]:
plt.figure(figsize=(12,6))
plt.grid()

# Residual = observed AQI minus fitted AQI (the name residuals_f is reused from the previous section)
residuals_f = true_val-pred_value

# Scatter of the first 400 fitted values against their residuals
plt.scatter(pred_value[:400],residuals_f[:400])

# Zero-residual reference line
plt.hlines(y=0, xmin =0, xmax=165 , linewidth=.5, color='red')
plt.title("Predicted Values vs Residuals")
plt.xlabel("Predicted value")
plt.ylabel("Residual")

In [None]:
fig, ax = plt.subplots(figsize=(12,6))
plt.grid()
# Probability plot of the residuals; only the correlation coefficient r of the fit is kept
_,(_,_,r) = scipy.stats.probplot(residuals_f,plot= ax, fit=True)

print('-'*100)
print("Plot to show how deviated the residuals are from normal distribution")
print('x'*100)
print("R square = "+ str(r**2))
print('-'*100)

<div id="WI"/>

## 7.C. With Itself
---


<div id="Pd"/>

### 7.C.a. Previous Day

In [None]:
# AQI of a row is regressed on the AQI of the previous row of Regdf; the first row has no predecessor.
# NOTE(review): rows were removed from Regdf earlier, so the previous row is
# not necessarily the previous calendar day - confirm.
y = Regdf['AQI'][1:]
x = Regdf['AQI'].shift(1)[1:]
x = sm.add_constant(x)
regressor_OLS = sm.OLS(y, x).fit()
regressor_OLS.summary()

### Predicted value and True value plot over time

In [None]:
plt.figure(figsize=(12,6))
plt.grid()
# Fitted values of the previous-row regression and the observed AQI of the same rows
pred_value= regressor_OLS.fittedvalues.copy()
true_val = Regdf['AQI'][1:].copy()

# Plot of last 150 values
plt.plot(true_val[-150:])
plt.plot(pred_value[-150:])

# Labels
plt.xlabel("Date")
plt.ylabel("AQI Value")
plt.legend(["True Value", "Predicted Value"])

### Error in prediction

In [None]:
# MAE and RMSE of the previous-row regression, kept for the final comparison table
mae_ps = tf.keras.metrics.mean_absolute_error(y_pred=pred_value, y_true=true_val).numpy()
rmse_ps = tf.sqrt(tf.losses.mean_squared_error(y_pred=pred_value, y_true=true_val)).numpy()
print("MAE: ",mae_ps)
print("RMSE: ",rmse_ps)

### Residuals and their normal fit

In [None]:
plt.figure(figsize=(12,6))
plt.grid()

# Residual = observed AQI minus fitted AQI (the name residuals_f is reused from the earlier sections)
residuals_f = true_val-pred_value

# Scatter of the first 400 fitted values against their residuals
plt.scatter(pred_value[:400],residuals_f[:400])

# Zero-residual reference line
plt.hlines(y=0, xmin =0, xmax=165 , linewidth=.5, color='red')
plt.title("Predicted Values vs Residuals")
plt.xlabel("Predicted value")
plt.ylabel("Residual")

In [None]:
fig, ax = plt.subplots(figsize=(12,6))
plt.grid()
# Probability plot of the residuals; only the correlation coefficient r of the fit is kept
_,(_,_,r) = scipy.stats.probplot(residuals_f,plot= ax, fit=True)

print('-'*100)
print("Plot to show how deviated the residuals are from normal distribution")
print('x'*100)
print("R square = "+ str(r**2))
print('-'*100)

<div id="Psd"/>
    
### 7.C.b. Previous Significant Days

In [None]:
# Builds lagged copies of a series: one column per requested lag.
# window can be a list of lags or a range.
def Window_dataframe(df, window):
    """Return a frame with one lagged copy of ``df`` per entry of ``window``.

    Column ``'Past Day - <i>'`` holds ``df`` shifted down by ``i`` rows. The
    first ``max(window)`` rows are cut off, so the result starts at the first
    row for which the largest lag is available.
    """
    offset = max(window)
    lagged = pd.DataFrame(index=df.index[offset:])
    for lag in window:
        lagged[f'Past Day - {lag}'] = df.shift(lag)[offset:]
    return lagged

In [None]:
# Lagged AQI columns for lags 1 to 14
s = Window_dataframe(aqi, range(1,15))

# Lags and current AQI side by side (AQI is the last column); rows that lack
# any lag or the AQI itself are removed by dropna()
Regdf = pd.concat([s,aqi[2:]], axis=1).dropna()

# Same outlier rule as before: keep rows whose values all lie within 3 standard deviations
z_scores = scipy.stats.zscore(Regdf)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores<3).all(axis=1)
Regdf = Regdf[filtered_entries]

In [None]:
# Ordinary least squares of AQI on all lag columns (every column except the last)
y = Regdf['AQI']
x = Regdf[Regdf.columns[:-1]]
x = sm.add_constant(x)
regressor_OLS = sm.OLS(y, x).fit()
regressor_OLS.summary()

### The previous day and the 12th day in the past are the most statistically significant

In [None]:
# Lagged AQI columns for lags 1 and 12 only
s = Window_dataframe(aqi, [1,12])

# Lags and current AQI side by side; incomplete rows are dropped
Regdf = pd.concat([s,aqi], axis=1).dropna()

# Keep rows whose values all lie within 3 standard deviations
z_scores = scipy.stats.zscore(Regdf)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores<3).all(axis=1)
Regdf = Regdf[filtered_entries]

# Ordinary least squares of AQI on the two lag columns
y = Regdf['AQI']
x = Regdf[Regdf.columns[:-1]]
x = sm.add_constant(x)
regressor_OLS = sm.OLS(y, x).fit()
regressor_OLS.summary()

### Predicted value and True value plot over time

In [None]:
plt.figure(figsize=(12,6))
plt.grid()
# Fitted values of the lag-window regression and the observed AQI of the same rows
pred_value= regressor_OLS.fittedvalues.copy()
true_val = Regdf['AQI'].copy()

# Plot of last 150 values
plt.plot(true_val[-150:])
plt.plot(pred_value[-150:])

# Labels
plt.xlabel("Date")
plt.ylabel("AQI Value")
plt.legend(["True Value", "Predicted Value"])

### Error in prediction

In [None]:
# MAE and RMSE of the lag-window regression, kept for the final comparison table
mae_pw = tf.keras.metrics.mean_absolute_error(y_pred=pred_value, y_true=true_val).numpy()
rmse_pw = tf.sqrt(tf.losses.mean_squared_error(y_pred=pred_value, y_true=true_val)).numpy()
print("MAE: ",mae_pw)
print("RMSE: ",rmse_pw)

### Residuals and their normal fit

In [None]:
plt.figure(figsize=(12,6))
plt.grid()

# Residual = observed AQI minus fitted AQI (the name residuals_f is reused from the earlier sections)
residuals_f = true_val-pred_value

# Scatter of the first 400 fitted values against their residuals
plt.scatter(pred_value[:400],residuals_f[:400])

# Zero-residual reference line
plt.hlines(y=0, xmin =0, xmax=165 , linewidth=.5, color='red')
plt.title("Predicted Values vs Residuals")
plt.xlabel("Predicted value")
plt.ylabel("Residual")

In [None]:
fig, ax = plt.subplots(figsize=(12,6))
plt.grid()
# Probability plot of the residuals; only the correlation coefficient r of the fit is kept
_,(_,_,r) = scipy.stats.probplot(residuals_f,plot= ax, fit=True)

print('-'*100)
print("Plot to show how deviated the residuals are from normal distribution")
print('x'*100)
print("R square = "+ str(r**2))
print('-'*100)

<div id='final'/>

# FINAL ERRORS OF EACH REGRESSION

In [None]:
# Errors collected from the four regressions, in the same order as the row labels below
mae_lst = [mae_f, mae_pf, mae_ps, mae_pw]
rmse_lst = [rmse_f, rmse_pf, rmse_ps, rmse_pw]
# One row per regression, with its RMSE and MAE side by side
lm_df = pd.DataFrame({"RMSE": rmse_lst, "MAE": mae_lst}, index=["Regression with Features", "Regression with Past Features", "Regression with Past self", "Regression with Past Window"])
lm_df