In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
print(os.listdir("../input"))

In [None]:
# Load the full India air-quality CSV (the file is ISO-8859-1 encoded).
# Data from years 1987-2015
aq = pd.read_csv(
    '../input/india-air-quality-data/data.csv',
    encoding="ISO-8859-1",
)
aq.tail(5)

In [None]:
aq.shape

In [None]:
# Extracting Tamil Nadu state data alone.
# .copy() gives `tn` its own data rather than a slice tied to `aq`, so the
# in-place edits made to `tn` in later cells are applied reliably and do not
# trigger pandas' SettingWithCopy behaviour.
tn = aq.query('state=="Tamil Nadu" ').copy()
tn.sample(2)

In [None]:
tn.shape

In [None]:
tn.describe(include = 'all')

# Feature Engineering

##### Removing unnecessary columns

In [None]:
# Drop the station/agency metadata columns, which no later cell reads.
# The result is assigned back instead of using inplace=True, so the cell never
# mutates a frame that may still be a view of `aq`.
tn = tn.drop(columns=['stn_code','sampling_date','agency','location_monitoring_station'])
tn.sample(2)

In [None]:
tn.isnull().sum()

Almost 97% of the `pm2_5` values are missing, so the `pm2_5` column is dropped.

In [None]:
# Remove the mostly-empty pm2_5 column.
# Assigned back rather than dropped in place, so a possibly-view frame is never mutated.
tn = tn.drop(columns=['pm2_5'])
tn.head(2)

To fill the missing values sensibly, the rows first need to be sorted in chronological order.

In [None]:
tn.dtypes

In [None]:
# To sort based on dates, the date should be of "datetime" datatype. 
#So converting "object" data type to "datetime" datatype

In [None]:
# Parse the YYYY-MM-DD strings in "date" into datetime values, then show
# the resulting column dtypes.
parsed_dates = pd.to_datetime(tn['date'], format='%Y-%m-%d')
tn['date'] = parsed_dates
tn.info()

In [None]:
# sort_values returns a new frame, so the result must be assigned back;
# otherwise `tn` keeps its original row order and the forward-fill that
# follows is not chronological.
# mergesort is stable: rows sharing a date keep their original relative order.
tn = tn.sort_values(by='date', kind='mergesort')
tn.head()

In [None]:
# Forward-fill gaps in the pollutant readings with the previous row's value.
# Done as one assignment because Series.fillna(..., inplace=True) on a column
# selection may act on a temporary copy and leave `tn` unchanged; the
# fillna(method=...) spelling is also deprecated in recent pandas.
# NOTE(review): the fill runs across location boundaries — confirm that is intended.
pollutant_cols = ['so2', 'no2', 'rspm', 'spm']
tn[pollutant_cols] = tn[pollutant_cols].ffill()

In [None]:
tn.isnull().sum()

In [None]:
# Even after replacement, we have 1636 missing values in rspm
# Show the four rows at positions 1634-1637 (a window around position 1636).
# A slice is used so each row appears once; listing the rows individually
# had repeated position 1636 and never showed the fourth row.
tn.iloc[1634:1638]

In [None]:
# This means rspm was not recorded up to 2002; it has been measured only from 2004 onwards.
# We can either omit it or separate the data set into two parts: before 2004 and from 2004 onwards.
# Here, for simplicity, the rspm column is deleted.

In [None]:
# Remove the rspm column.
# Assigned back rather than dropped in place, so a possibly-view frame is never mutated.
tn = tn.drop(columns=['rspm'])
tn.head()

In [None]:
tn.isnull().sum()

In [None]:
# Dealing with 354 missing values of type

In [None]:
typ=sns.countplot(x ="type",data = tn)
typ.set_xticklabels(typ.get_xticklabels(), rotation=90);

Some types appear under several different labels, so each group of variants is merged into a single label.

In [None]:
# Collapse spelling variants of the same area type into one canonical label.
# The mapping is applied in a single pass and assigned back: calling
# Series.replace(..., inplace=True) on a column selection can modify a
# temporary copy and leave `tn` unchanged.
type_aliases = {
    "Industrial Areas": "Industrial",
    "Industrial Area": "Industrial",
    "Residential and others": "Residential",
    "Residential, Rural and other Areas": "Residential",
}
tn['type'] = tn['type'].replace(type_aliases)

In [None]:
typ=sns.countplot(x ="type",data = tn)
typ.set_xticklabels(typ.get_xticklabels(), rotation=90);

In [None]:
datacount_ty =sns.countplot(x ="location",hue = 'type',data = tn);
datacount_ty.set_xticklabels(datacount_ty.get_xticklabels(), rotation=90);

In [None]:
# Rows with missing "types": keep every row that has a NaN in any column
# and preview the first 20 of them.
rows_with_nan = tn.isnull().any(axis=1)
null_data = tn[rows_with_nan]
null_data.head(20)

"Residential" is the most frequent (modal) type, so the 354 missing values in `type` are filled with "Residential".

In [None]:
# Replace every missing area type with the label "Residential".
tn['type'] = tn['type'].fillna("Residential")

In [None]:
tn.isnull().sum()

In [None]:
#Finding hidden missing values. (i.e. zeros)

In [None]:
# Count, per column, how many cells are exactly 0.
is_zero = tn.eq(0)
aaa = is_zero.astype(int).sum(axis=0)
print(aaa)

In [None]:
# Also we can see the "locations" repeated.
# Madras - Chennai, # Turicorin-Tuticorin
# Replacing them into single value

In [None]:
# Merge alternative spellings/names of the same city into one label.
# Assigned back rather than replaced in place: Series.replace(..., inplace=True)
# on a column selection can modify a temporary copy and leave `tn` unchanged.
location_aliases = {
    "Turicorin": "Tuticorin",
    "Madras": "Chennai",
}
tn['location'] = tn['location'].replace(location_aliases)

In [None]:
datacount_ty =sns.countplot(x ="location",hue = 'type',data = tn);
datacount_ty.set_xticklabels(datacount_ty.get_xticklabels(), rotation=90);

In [None]:
tn.head()

# Data Visualization

In [None]:
datacount =sns.countplot(x ="location",data = tn);
datacount.set_xticklabels(datacount.get_xticklabels(), rotation=90);

In [None]:
# Per-location average of so2, no2 and spm (pivot_table aggregates with the
# mean when no aggfunc is given).
loc = tn.pivot_table(index='location', values=['so2','no2','spm'])
loc

In [None]:
maxso2 = loc.sort_values(by='so2',ascending=False)
maxso2.loc[:,['so2']].head(10).plot(kind='bar'); # Based on average values

In [None]:
maxno2 = loc.sort_values(by='no2',ascending=False);
maxno2.loc[:,['no2']].head(10).plot(kind='bar');

In [None]:
maxspm = loc.sort_values(by='spm',ascending=False);
maxspm.loc[:,['spm']].head(10).plot(kind='bar');

# Calculating AQI

In [None]:
def calculate_si(so2):
    """Map an so2 reading to its bucket label "s1" .. "s6".

    Upper bounds are inclusive: <=40 -> "s1", <=80 -> "s2", <=380 -> "s3",
    <=800 -> "s4", <=1600 -> "s5", anything above 1600 -> "s6".
    A value that satisfies none of the comparisons (e.g. NaN) yields 0.
    """
    upper_bounds = ((40, "s1"), (80, "s2"), (380, "s3"), (800, "s4"), (1600, "s5"))
    for bound, label in upper_bounds:
        if so2 <= bound:
            return label
    # NaN compares False against everything, so it falls through to 0.
    return "s6" if so2 > 1600 else 0
# Label every so2 reading, then preview the last few (so2, si) pairs.
tn['si'] = tn['so2'].map(calculate_si)
ds = tn.loc[:, ['so2', 'si']]
ds.tail()

In [None]:
def calculate_ni(no2):
    """Map an no2 reading to its bucket label "n1" .. "n6".

    Upper bounds are inclusive: <=40 -> "n1", <=80 -> "n2", <=180 -> "n3",
    <=280 -> "n4", <=400 -> "n5", anything above 400 -> "n6".
    A value that satisfies none of the comparisons (e.g. NaN) yields 0.
    """
    if no2 <= 40:
        return "n1"
    if no2 <= 80:
        return "n2"
    if no2 <= 180:
        return "n3"
    if no2 <= 280:
        return "n4"
    if no2 <= 400:
        return "n5"
    if no2 > 400:
        return "n6"
    # Only reachable when every comparison above is False (NaN input).
    return 0
# Label every no2 reading, then preview the last few (no2, ni) pairs.
tn['ni'] = tn['no2'].map(calculate_ni)
dn = tn.loc[:, ['no2', 'ni']]
dn.tail()

In [None]:
def calculate_spi(spm):
    """Map an spm reading to its bucket label "sp1" .. "sp6".

    Upper bounds are inclusive: <=40 -> "sp1", <=80 -> "sp2", <=180 -> "sp3",
    <=280 -> "sp4", <=400 -> "sp5", anything above 400 -> "sp6".
    A value that satisfies none of the comparisons (e.g. NaN) yields 0.

    NOTE(review): these cut-offs are identical to the ones calculate_ni uses
    for no2 — confirm they are the intended breakpoints for spm.
    """
    if spm <= 40:
        spi = "sp1"
    elif spm <= 80:
        spi = "sp2"
    elif spm <= 180:
        spi = "sp3"
    elif spm <= 280:
        spi = "sp4"
    elif spm <= 400:
        spi = "sp5"
    elif spm > 400:
        spi = "sp6"
    else:
        # Every comparison was False, which only happens for NaN.
        spi = 0
    return spi
# Label every spm reading, then preview the last few (spm, spi) pairs.
tn['spi'] = tn['spm'].map(calculate_spi)
dsp = tn.loc[:, ['spm', 'spi']]
dsp.tail()

In [None]:
tn.sample(2)

In [None]:
# AQI
def calculate_aqi(si,ni,spi):
    """Return the largest of the three pollutant values.

    Parameters
    ----------
    si, ni, spi : numeric
        The three values to compare (the caller decides what they represent).

    Returns
    -------
    The maximum of the non-missing inputs. When the maximum is shared by two
    or three inputs, that shared value is returned (strict ">" comparisons
    would report 0 for any tie). NaN inputs are skipped; if all three are NaN
    the result is 0.
    """
    # NaN is the only value that is not equal to itself, so `v == v` filters
    # missing readings without needing an import.
    available = [v for v in (si, ni, spi) if v == v]
    if not available:
        return 0
    return max(available)
# Row-wise AQI built from the three pollutant columns.
# NOTE(review): the raw so2/no2/spm values are passed here, not the si/ni/spi
# labels computed in the cells above — confirm this is the intended definition.
tn['AQI'] = tn.apply(
    lambda row: calculate_aqi(row['so2'], row['no2'], row['spm']),
    axis=1,
)

In [None]:
tn.head()

In [None]:
# Average AQI per location (pivot_table aggregates with the mean by default).
aq_wise = tn.pivot_table(index='location', values=['AQI'])
aq_wise

In [None]:
maxaqi = aq_wise.sort_values(by='AQI',ascending=False)
maxaqi.loc[:,['AQI']].head(37).plot(kind='bar')

In [None]:
# Average AQI per date (pivot_table aggregates with the mean by default).
date_wise = tn.pivot_table(index='date', values=['AQI'])
date_wise

In [None]:
date_wise.loc[:,['AQI']].head(30).plot(kind='bar')

# Training Data

In [None]:
dum1 = pd.get_dummies(tn['type'])
dum2 = pd.get_dummies(tn['location'])
tn['year'] = tn['date'].dt.year

In [None]:
td = pd.concat([tn, dum1, dum2], axis = 1)
td.head()

In [None]:
# Remove the raw categorical, pollutant, bucket-label and date columns
# from the training frame, then spot-check two random rows.
unused_cols = ['state','location','type','so2','no2','spm','si','ni','spi','date']
td = td.drop(columns=unused_cols)
td.sample(2)

In [None]:
td.corr()

"year" has good correlation with "AQI" when compared to others

In [None]:
yr_wise = pd.pivot_table(td, values=['AQI'],index='year')
yr_wise.loc[:,['AQI']].head(30).plot(kind='bar')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the target; the target is AQI.
X = td.drop(columns="AQI")
y = td.loc[:, "AQI"]

In [None]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.30,random_state=25)

# Model fittings

### Simple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lin_mod = LinearRegression()
lin_mod.fit(X_train, y_train)

In [None]:
lin_mod.score(X_train, y_train )

In [None]:
lin_mod.score(X_test, y_test)

In [None]:
# Less Score. Underfitting

### Polynomial Regression 

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# Degree-2, interaction-only expansion of the feature matrix.
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train2 = poly.fit_transform(X_train)
# The transformer is fitted on the training split only; the test split is
# transformed with that same fitted object, never refitted.
X_test2 = poly.transform(X_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, y_train)

y_pred = poly_clf.predict(X_test2)

In [None]:
print(poly_clf.score(X_train2, y_train))

In [None]:
print(poly_clf.score(X_test2, y_test))

In [None]:
# Trying with higher degrees

In [None]:
# Degree-3, interaction-only expansion.
poly = PolynomialFeatures(degree=3, interaction_only=True)
X_train2 = poly.fit_transform(X_train)
# Fit on the training split only; the test split is just transformed.
X_test2 = poly.transform(X_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, y_train)

y_pred = poly_clf.predict(X_test2)
print(poly_clf.score(X_train2, y_train))
print(poly_clf.score(X_test2, y_test))

In [None]:
# degree = 3 has less scores than degree = 2

In [None]:
# Degree-4, interaction-only expansion.
poly = PolynomialFeatures(degree=4, interaction_only=True)
X_train2 = poly.fit_transform(X_train)
# Fit on the training split only; the test split is just transformed.
X_test2 = poly.transform(X_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, y_train)

y_pred = poly_clf.predict(X_test2)
print(poly_clf.score(X_train2, y_train))
print(poly_clf.score(X_test2, y_test))

In [None]:
# Score is close to that of degree = 2, but still lower.

In [None]:
# Degree-5, interaction-only expansion.
poly = PolynomialFeatures(degree=5, interaction_only=True)
X_train2 = poly.fit_transform(X_train)
# Fit on the training split only; the test split is just transformed.
X_test2 = poly.transform(X_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, y_train)

y_pred = poly_clf.predict(X_test2)
print(poly_clf.score(X_train2, y_train))
print(poly_clf.score(X_test2, y_test))

In [None]:
# Score reduces as degree increases

In [None]:
# Degree-6, interaction-only expansion.
poly = PolynomialFeatures(degree=6, interaction_only=True)
X_train2 = poly.fit_transform(X_train)
# Fit on the training split only; the test split is just transformed.
X_test2 = poly.transform(X_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, y_train)

y_pred = poly_clf.predict(X_test2)
print(poly_clf.score(X_train2, y_train))
print(poly_clf.score(X_test2, y_test))

In [None]:
# Same score as prev degree. 
#Underfitting

### K-Nearest Neighbour

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from scipy.stats import zscore

In [None]:
# Standardise every feature column to z-scores.
# NOTE(review): XScaled is not referenced by any later cell shown here — the
# KNN models below are fitted on the unscaled X_train/X_test. Confirm whether
# the scaled features were meant to be split and used instead.
XScaled = X.apply(zscore)

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 27, metric = 'euclidean')

In [None]:
NNH.fit(X_train,y_train)

In [None]:
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

The score is better than that of the linear regression models. Trying different values of `n_neighbors`.

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 30) # default metric = 'minkowski'
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 55)
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 70)
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

In [None]:
# If we increase n_neighbors beyond 55,
# the train score increases but the test score decreases. So 55 is the optimum.

Trying with different "metrics"

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 55, metric = 'euclidean')
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

In [None]:
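# Compared with the default "minkowski" run above.
# NOTE: scikit-learn's default is minkowski with p=2, which is the Euclidean distance,
# so the same n_neighbors should give an identical score — re-check this comparison.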

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 35, metric = 'euclidean')
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 30, metric = 'euclidean')
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

35 is the optimum one

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 35, metric = 'manhattan')
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

In [None]:
# Better than euclidean

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 45, metric = 'manhattan')
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

In [None]:
NNH = KNeighborsRegressor(n_neighbors = 55, metric = 'manhattan')
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

In [None]:
# 45 is optimum

In [None]:
# Therefore best solution is for 
NNH = KNeighborsRegressor(n_neighbors = 45, metric = 'manhattan')
NNH.fit(X_train,y_train)
predicted_labels = NNH.predict(X_test)
print(NNH.score(X_train, y_train))
print(NNH.score(X_test,y_test))

### SVM

In [None]:
from sklearn import svm
from sklearn.svm import SVR

In [None]:
reg= svm.SVR(kernel='rbf',gamma='auto', C=2)
reg.fit(X_train,y_train)

In [None]:
predicted_labels = reg.predict(X_test)
print(reg.score(X_train,y_train))
print(reg.score(X_test,y_test))

In [None]:
# Score is less than KNN. Trying with other "C"

In [None]:
reg= svm.SVR(kernel='rbf',gamma='auto', C=150)
reg.fit(X_train,y_train)
predicted_labels = reg.predict(X_test)
print(reg.score(X_train,y_train))
print(reg.score(X_test,y_test))

In [None]:
reg= svm.SVR(kernel='rbf',gamma='auto', C=160)
reg.fit(X_train,y_train)
predicted_labels = reg.predict(X_test)
print(reg.score(X_train,y_train))
print(reg.score(X_test,y_test))

In [None]:
reg= svm.SVR(kernel='rbf',gamma='auto', C=163)
reg.fit(X_train,y_train)
predicted_labels = reg.predict(X_test)
print(reg.score(X_train,y_train))
print(reg.score(X_test,y_test))

In [None]:
# As C increases beyond 160, the training score increases but the test score decreases.

In [None]:
reg= svm.SVR(kernel='sigmoid',gamma='auto', C=80)
reg.fit(X_train,y_train)
predicted_labels = reg.predict(X_test)
print(reg.score(X_train,y_train))
print(reg.score(X_test,y_test))

In [None]:
# Using poly kernel takes lot of time to run

In [None]:
# Optimum value for SVM is
reg= svm.SVR(kernel='rbf',gamma='auto', C=160)
reg.fit(X_train,y_train)
predicted_labels = reg.predict(X_test)
print(reg.score(X_train,y_train))
print(reg.score(X_test,y_test))

But not as good as KNN

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Depth-limited regression tree with a fixed seed.
# NOTE(review): criterion='mse' was renamed 'squared_error' in scikit-learn 1.0
# and the old spelling was later removed — confirm against the installed version.
dTree= DecisionTreeRegressor(criterion='mse',splitter='best',random_state=25,max_depth=5)

In [None]:
dTree.fit(X_train,y_train)

In [None]:
print(dTree.score(X_train,y_train)) 
print(dTree.score(X_test,y_test))

Trying with different "max_depth"

In [None]:
dTree= DecisionTreeRegressor(criterion='mse',splitter='best',random_state=25,max_depth=14)
dTree.fit(X_train,y_train)
print(dTree.score(X_train,y_train)) 
print(dTree.score(X_test,y_test))

In [None]:
# No improvements in score after "max_depth = 14"
# Trying with different criteria

In [None]:
# Same tree model with the absolute-error split criterion and max_depth=20.
# NOTE(review): criterion='mae' was renamed 'absolute_error' in scikit-learn 1.0
# and the old spelling was later removed — confirm against the installed version.
dTree= DecisionTreeRegressor(criterion='mae',splitter='best',random_state=25,max_depth=20)
dTree.fit(X_train,y_train)
print(dTree.score(X_train,y_train)) 
print(dTree.score(X_test,y_test))

In [None]:
dTree= DecisionTreeRegressor(criterion='friedman_mse',splitter='best',random_state=25,max_depth=15)
dTree.fit(X_train,y_train)
print(dTree.score(X_train,y_train)) 
print(dTree.score(X_test,y_test))

In [None]:
# friedman_mse same as mse

In [None]:
# Optimum is 
dTree= DecisionTreeRegressor(criterion='mse',splitter='best',random_state=25,max_depth=14)
dTree.fit(X_train,y_train)
print(dTree.score(X_train,y_train)) 
print(dTree.score(X_test,y_test))

In [None]:
dTree= DecisionTreeRegressor(criterion='mse',splitter='best',random_state=25,max_depth=14)
dTree.fit(X_train,y_train)
dTree_tr=dTree.score(X_train,y_train)
dTree_ts=dTree.score(X_test,y_test)

Better than KNN

### Bagging

In [None]:
from sklearn.ensemble import BaggingRegressor

In [None]:
# Bag 9 copies of the decision tree configured in the previous section.
# NOTE(review): newer scikit-learn releases renamed `base_estimator` to
# `estimator` — confirm against the installed version.
bgr= BaggingRegressor (n_estimators=9,base_estimator=dTree,random_state=25)
bgr=bgr.fit(X_train,y_train)
print(bgr.score(X_train,y_train))
print(bgr.score(X_test,y_test))

In [None]:
# trying with different "n_estimators"

In [None]:
bgr= BaggingRegressor (n_estimators=12,base_estimator=dTree,random_state=25)
bgr=bgr.fit(X_train,y_train)
print(bgr.score(X_train,y_train))
print(bgr.score(X_test,y_test))

In [None]:
# Increase in "n_estimators" increases train score but decreases test score. 
#so "n_estimators = 9" is good

Not as good as Decision Tree

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
adr= AdaBoostRegressor (n_estimators=5,random_state=25, loss ='linear') # loss = 'linear' is default
adr=adr.fit(X_train,y_train)
print(adr.score(X_train,y_train))
print(adr.score(X_test,y_test))

In [None]:
# trying with different "n_estimators"

In [None]:
adr= AdaBoostRegressor (n_estimators=15,random_state=25,loss ='linear')
adr=adr.fit(X_train,y_train)
print(adr.score(X_train,y_train))
print(adr.score(X_test,y_test))

In [None]:
# Increase in "n_estimators" increases train score but decreases test score. 
#so "n_estimators = 5" is good

In [None]:
adr= AdaBoostRegressor (n_estimators=7,random_state=25,loss ='square')
adr=adr.fit(X_train,y_train)
print(adr.score(X_train,y_train))
print(adr.score(X_test,y_test))

In [None]:
adr= AdaBoostRegressor (n_estimators=5,random_state=25,loss ='exponential')
adr=adr.fit(X_train,y_train)
print(adr.score(X_train,y_train))
print(adr.score(X_test,y_test))

Not good as Decision Tree

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
gbr= GradientBoostingRegressor (n_estimators=10,random_state=25)
gbr=gbr.fit(X_train,y_train)
print(gbr.score(X_train,y_train))
print(gbr.score(X_test,y_test))

In [None]:
# trying with different "n_estimators"

In [None]:
gbr= GradientBoostingRegressor (n_estimators=400,random_state=25)
gbr=gbr.fit(X_train,y_train)
print(gbr.score(X_train,y_train))
print(gbr.score(X_test,y_test))

In [None]:
gbr= GradientBoostingRegressor (n_estimators=410,random_state=25)
gbr=gbr.fit(X_train,y_train)
print(gbr.score(X_train,y_train))
print(gbr.score(X_test,y_test))

In [None]:
# Increase in "n_estimators" beyond 400, increases train score but decreases test score. so "n_estimators = 400" is good

In [None]:
# Optimum is 
gbr= GradientBoostingRegressor (n_estimators=400,random_state=25)
gbr=gbr.fit(X_train,y_train)
print(gbr.score(X_train,y_train))
print(gbr.score(X_test,y_test))

In [None]:
gbr= GradientBoostingRegressor (n_estimators=400,random_state=25)
gbr=gbr.fit(X_train,y_train)
gbr_tr= gbr.score(X_train,y_train)
gbr_ts= gbr.score(X_test,y_test)

Very near to Decision Tree.

Score of Decision Tree

train - 0.7320163141352926
test - 0.7764637553626321

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rfr= RandomForestRegressor (n_estimators=10,random_state=25,max_features=5)
rfr=rfr.fit(X_train,y_train)
print(rfr.score(X_train,y_train))
print(rfr.score(X_test,y_test))

In [None]:
# trying with different "n_estimators"

In [None]:
rfr= RandomForestRegressor (n_estimators=11,random_state=25,max_features=5)
rfr=rfr.fit(X_train,y_train)
print(rfr.score(X_train,y_train))
print(rfr.score(X_test,y_test))

In [None]:
# No effect

In [None]:
# trying with different "max_features"

In [None]:
rfr= RandomForestRegressor (n_estimators=10,random_state=25,max_features=10)
rfr=rfr.fit(X_train,y_train)
print(rfr.score(X_train,y_train))
print(rfr.score(X_test,y_test))

In [None]:
rfr= RandomForestRegressor (n_estimators=10,random_state=25,max_features=10)
rfr=rfr.fit(X_train,y_train)
rfr_tr = rfr.score(X_train,y_train)
rfr_ts = rfr.score(X_test,y_test)

In [None]:
# the above one is optimum

Score Very similar to Decision Tree, Gradient Boosting



# Therefore the models which perform well are

In [None]:
# One row per shortlisted model: (name, train score, test score).
score_rows = [
    ('DecisionTree', dTree_tr, dTree_ts),
    ('GradientBoosting', gbr_tr, gbr_ts),
    ('RandomForest', rfr_tr, rfr_ts),
]
score_res = pd.DataFrame(score_rows, columns=['Model', 'Train Score', 'Test Score'])
score_res