In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
# Load the raw daily air-quality readings (one row per city per day).
df = pd.read_csv("../data/raw/city_day.csv")

# The dataset covers many cities; this notebook is implemented for a single one.
# City picked: 'Delhi'
# Reason: Delhi has the richest pollution recordings, so it gives us the most data.

# In the research paper, the authors explicitly mention that the contribution of
# meteorological factors to AQI prediction is negligible.
# The dataset available here does not contain those features in any case.

# Keep only the Delhi rows and renumber them from 0 (the old index is discarded).
df = df[df['City'] == 'Delhi'].reset_index(drop=True)
display(df)

# 'City' is constant after the filter above; 'AQI_Bucket' is dropped along with it.
df = df.drop(['City', 'AQI_Bucket'], axis=1)
display(df)

# Column types before the conversion: 'Date' is read in as object dtype.
display(df.dtypes)
df['Date'] = pd.to_datetime(df['Date'])

# Column types after the conversion: 'Date' is now a datetime column.
display(df.dtypes)

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Delhi,2015-01-01,313.22,607.98,69.16,36.39,110.59,33.85,15.20,9.25,41.68,14.36,24.86,9.84,472.0,Severe
1,Delhi,2015-01-02,186.18,269.55,62.09,32.87,88.14,31.83,9.54,6.65,29.97,10.55,20.09,4.29,454.0,Severe
2,Delhi,2015-01-03,87.18,131.90,25.73,30.31,47.95,69.55,10.61,2.65,19.71,3.91,10.23,1.99,143.0,Moderate
3,Delhi,2015-01-04,151.84,241.84,25.01,36.91,48.62,130.36,11.54,4.63,25.36,4.26,9.71,3.34,319.0,Very Poor
4,Delhi,2015-01-05,146.60,219.13,14.01,34.92,38.25,122.88,9.20,3.33,23.20,2.80,6.21,2.96,325.0,Very Poor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004,Delhi,2020-06-27,39.80,155.94,10.88,21.46,22.47,31.43,0.87,10.38,18.88,1.69,19.99,0.43,112.0,Moderate
2005,Delhi,2020-06-28,59.52,308.65,12.67,21.60,23.86,29.27,0.94,10.70,18.05,1.71,25.13,1.74,196.0,Moderate
2006,Delhi,2020-06-29,44.86,184.12,10.50,21.57,21.94,27.97,0.88,11.58,26.61,2.13,23.80,1.13,233.0,Poor
2007,Delhi,2020-06-30,39.80,91.98,5.99,17.96,15.44,28.48,0.84,10.51,37.29,1.57,16.37,0.49,114.0,Moderate


Unnamed: 0,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
0,2015-01-01,313.22,607.98,69.16,36.39,110.59,33.85,15.20,9.25,41.68,14.36,24.86,9.84,472.0
1,2015-01-02,186.18,269.55,62.09,32.87,88.14,31.83,9.54,6.65,29.97,10.55,20.09,4.29,454.0
2,2015-01-03,87.18,131.90,25.73,30.31,47.95,69.55,10.61,2.65,19.71,3.91,10.23,1.99,143.0
3,2015-01-04,151.84,241.84,25.01,36.91,48.62,130.36,11.54,4.63,25.36,4.26,9.71,3.34,319.0
4,2015-01-05,146.60,219.13,14.01,34.92,38.25,122.88,9.20,3.33,23.20,2.80,6.21,2.96,325.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004,2020-06-27,39.80,155.94,10.88,21.46,22.47,31.43,0.87,10.38,18.88,1.69,19.99,0.43,112.0
2005,2020-06-28,59.52,308.65,12.67,21.60,23.86,29.27,0.94,10.70,18.05,1.71,25.13,1.74,196.0
2006,2020-06-29,44.86,184.12,10.50,21.57,21.94,27.97,0.88,11.58,26.61,2.13,23.80,1.13,233.0
2007,2020-06-30,39.80,91.98,5.99,17.96,15.44,28.48,0.84,10.51,37.29,1.57,16.37,0.49,114.0


Date        object
PM2.5      float64
PM10       float64
NO         float64
NO2        float64
NOx        float64
NH3        float64
CO         float64
SO2        float64
O3         float64
Benzene    float64
Toluene    float64
Xylene     float64
AQI        float64
dtype: object

Date       datetime64[ns]
PM2.5             float64
PM10              float64
NO                float64
NO2               float64
NOx               float64
NH3               float64
CO                float64
SO2               float64
O3                float64
Benzene           float64
Toluene           float64
Xylene            float64
AQI               float64
dtype: object

In [92]:
# Per-column count of missing values.
# (The authors of the research paper dropped rows with missing values instead.)
display(df.isnull().sum())

# 'Xylene' has far more missing values than any other column, so the whole
# column is dropped rather than imputed.
df1 = df.drop('Xylene', axis=1)

# Remaining gaps are filled by linear interpolation between neighbouring rows.
df2 = df1.interpolate(method='linear')
display(df2)

# Re-check the per-column missing counts after interpolation.
display(df2.isnull().sum())

Date         0
PM2.5        2
PM10        77
NO           2
NO2          2
NOx          0
NH3          9
CO           0
SO2        110
O3          84
Benzene      0
Toluene      0
Xylene     781
AQI         10
dtype: int64

Unnamed: 0,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI
0,2015-01-01,313.22,607.98,69.16,36.39,110.59,33.85,15.20,9.25,41.68,14.36,24.86,472.0
1,2015-01-02,186.18,269.55,62.09,32.87,88.14,31.83,9.54,6.65,29.97,10.55,20.09,454.0
2,2015-01-03,87.18,131.90,25.73,30.31,47.95,69.55,10.61,2.65,19.71,3.91,10.23,143.0
3,2015-01-04,151.84,241.84,25.01,36.91,48.62,130.36,11.54,4.63,25.36,4.26,9.71,319.0
4,2015-01-05,146.60,219.13,14.01,34.92,38.25,122.88,9.20,3.33,23.20,2.80,6.21,325.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004,2020-06-27,39.80,155.94,10.88,21.46,22.47,31.43,0.87,10.38,18.88,1.69,19.99,112.0
2005,2020-06-28,59.52,308.65,12.67,21.60,23.86,29.27,0.94,10.70,18.05,1.71,25.13,196.0
2006,2020-06-29,44.86,184.12,10.50,21.57,21.94,27.97,0.88,11.58,26.61,2.13,23.80,233.0
2007,2020-06-30,39.80,91.98,5.99,17.96,15.44,28.48,0.84,10.51,37.29,1.57,16.37,114.0


Date       0
PM2.5      0
PM10       0
NO         0
NO2        0
NOx        0
NH3        0
CO         0
SO2        0
O3         0
Benzene    0
Toluene    0
AQI        0
dtype: int64

In [93]:
# Feature engineering: pollution in India also depends on the date and the
# season of the year, so calendar-derived columns are added as extra features
# for model training.
#
# The features are built with .assign(), which returns a NEW frame. Writing
# `df = df2` followed by `df['day'] = ...` would only alias df2 and silently
# add these columns to df2 as well, leaving the output of the previous cell
# stale and making df2 depend on whether this cell has been run.
df = (
    df2
    .assign(
        day=lambda d: d['Date'].dt.day,
        month=lambda d: d['Date'].dt.month,
        year=lambda d: d['Date'].dt.year,
        # pandas weekday numbering: Monday=0 ... Sunday=6
        weekday=lambda d: d['Date'].dt.weekday,
        dayofyear=lambda d: d['Date'].dt.dayofyear,
        # 1 for Saturday/Sunday (weekday 5 or 6), 0 otherwise; assign evaluates
        # keyword arguments in order, so 'weekday' already exists here
        is_weekend=lambda d: (d['weekday'] >= 5).astype(int),
    )
    # the raw timestamp is fully represented by the derived columns above
    .drop(columns=['Date'])
)

display(df)
# Final dataset maps as follows: [pollutant features + time based features] -> [target variable = AQI]

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,day,month,year,weekday,dayofyear,is_weekend
0,313.22,607.98,69.16,36.39,110.59,33.85,15.20,9.25,41.68,14.36,24.86,472.0,1,1,2015,3,1,0
1,186.18,269.55,62.09,32.87,88.14,31.83,9.54,6.65,29.97,10.55,20.09,454.0,2,1,2015,4,2,0
2,87.18,131.90,25.73,30.31,47.95,69.55,10.61,2.65,19.71,3.91,10.23,143.0,3,1,2015,5,3,1
3,151.84,241.84,25.01,36.91,48.62,130.36,11.54,4.63,25.36,4.26,9.71,319.0,4,1,2015,6,4,1
4,146.60,219.13,14.01,34.92,38.25,122.88,9.20,3.33,23.20,2.80,6.21,325.0,5,1,2015,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004,39.80,155.94,10.88,21.46,22.47,31.43,0.87,10.38,18.88,1.69,19.99,112.0,27,6,2020,5,179,1
2005,59.52,308.65,12.67,21.60,23.86,29.27,0.94,10.70,18.05,1.71,25.13,196.0,28,6,2020,6,180,1
2006,44.86,184.12,10.50,21.57,21.94,27.97,0.88,11.58,26.61,2.13,23.80,233.0,29,6,2020,0,181,0
2007,39.80,91.98,5.99,17.96,15.44,28.48,0.84,10.51,37.29,1.57,16.37,114.0,30,6,2020,1,182,0


In [None]:
# Chronological split: every row up to and including 2019 is used for training,
# every later row is held out for testing.
train = df[df['year'] <= 2019]
test = df[df['year'] > 2019]
display(train, test)

# Separate the feature matrix from the target variable ('AQI') for each split.
X_train, y_train = train.drop('AQI', axis=1), train['AQI']
X_test, y_test = test.drop('AQI', axis=1), test['AQI']

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,day,month,year,weekday,dayofyear,is_weekend
0,313.22,607.98,69.16,36.39,110.59,33.85,15.20,9.25,41.68,14.36,24.86,472.0,1,1,2015,3,1,0
1,186.18,269.55,62.09,32.87,88.14,31.83,9.54,6.65,29.97,10.55,20.09,454.0,2,1,2015,4,2,0
2,87.18,131.90,25.73,30.31,47.95,69.55,10.61,2.65,19.71,3.91,10.23,143.0,3,1,2015,5,3,1
3,151.84,241.84,25.01,36.91,48.62,130.36,11.54,4.63,25.36,4.26,9.71,319.0,4,1,2015,6,4,1
4,146.60,219.13,14.01,34.92,38.25,122.88,9.20,3.33,23.20,2.80,6.21,325.0,5,1,2015,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,240.07,371.45,28.66,54.60,54.95,52.64,1.39,7.88,27.55,4.62,32.86,372.0,27,12,2019,4,361,0
1822,275.60,426.12,60.13,55.98,83.47,50.47,1.96,10.26,27.92,6.70,39.22,425.0,28,12,2019,5,362,1
1823,326.79,500.15,32.73,57.76,61.51,49.61,1.86,10.25,35.11,6.28,33.00,455.0,29,12,2019,6,363,1
1824,333.43,486.35,54.97,55.61,79.04,50.75,2.24,11.15,26.44,8.00,47.12,506.0,30,12,2019,0,364,0


Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,day,month,year,weekday,dayofyear,is_weekend
1826,372.14,483.87,109.53,64.18,135.00,56.85,3.15,18.49,39.58,8.50,60.79,492.0,1,1,2020,2,1,0
1827,327.04,439.31,120.76,64.80,143.71,58.26,3.32,19.95,36.50,9.36,73.11,485.0,2,1,2020,3,2,0
1828,228.90,337.04,111.09,55.65,131.13,53.02,2.80,14.93,27.82,6.96,66.80,426.0,3,1,2020,4,3,0
1829,220.60,327.22,86.58,57.67,110.20,53.46,2.20,12.85,29.94,6.81,48.18,366.0,4,1,2020,5,4,1
1830,170.21,257.94,29.99,48.25,54.30,47.94,1.23,9.59,30.21,3.70,28.90,354.0,5,1,2020,6,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004,39.80,155.94,10.88,21.46,22.47,31.43,0.87,10.38,18.88,1.69,19.99,112.0,27,6,2020,5,179,1
2005,59.52,308.65,12.67,21.60,23.86,29.27,0.94,10.70,18.05,1.71,25.13,196.0,28,6,2020,6,180,1
2006,44.86,184.12,10.50,21.57,21.94,27.97,0.88,11.58,26.61,2.13,23.80,233.0,29,6,2020,0,181,0
2007,39.80,91.98,5.99,17.96,15.44,28.48,0.84,10.51,37.29,1.57,16.37,114.0,30,6,2020,1,182,0


In [None]:
# storing the processed dataset

# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell fails on a fresh checkout.
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to any of the files.
X_train.to_csv(PROCESSED_DIR / "X_train.csv", index=False)
y_train.to_csv(PROCESSED_DIR / "y_train.csv", index=False)

X_test.to_csv(PROCESSED_DIR / "X_test.csv", index=False)
y_test.to_csv(PROCESSED_DIR / "y_test.csv", index=False)