In [1]:
#Importing libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [2]:
#Reading csv
df = pd.read_csv('AirQuality.csv')
df.head(5)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,3/10/2004,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,3/10/2004,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,3/10/2004,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,3/10/2004,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,3/10/2004,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


In [3]:
df.describe()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
mean,-34.207524,1048.990061,-159.090093,1.865683,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032,9.778305,39.48538,-6.837604
std,77.65717,329.83271,139.789093,41.380206,342.333252,257.433866,321.993552,126.940455,467.210125,456.938184,43.203623,51.216145,38.97667
min,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,0.6,921.0,-200.0,4.0,711.0,50.0,637.0,53.0,1185.0,700.0,10.9,34.1,0.6923
50%,1.5,1053.0,-200.0,7.9,895.0,141.0,794.0,96.0,1446.0,942.0,17.2,48.6,0.9768
75%,2.6,1221.0,-200.0,13.6,1105.0,284.0,960.0,133.0,1662.0,1255.0,24.1,61.9,1.2962
max,11.9,2040.0,1189.0,63.7,2214.0,1479.0,2683.0,340.0,2775.0,2523.0,44.6,88.7,2.231


In [4]:
#Check for null columns
df.isnull().any()

Date             False
Time             False
CO(GT)           False
PT08.S1(CO)      False
NMHC(GT)         False
C6H6(GT)         False
PT08.S2(NMHC)    False
NOx(GT)          False
PT08.S3(NOx)     False
NO2(GT)          False
PT08.S4(NO2)     False
PT08.S5(O3)      False
T                False
RH               False
AH               False
dtype: bool

In [5]:
#Drop unnecessary features
df = df.drop(columns=['Date','Time'])
df.head()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


In [6]:
#Features and Labels
x = df.drop(columns='AH')
y = df['AH']

In [7]:
x.shape

(9357, 12)

In [8]:
y.shape

(9357,)

In [9]:
#Test Train split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(7485, 12)
(1872, 12)
(7485,)
(1872,)


In [10]:
#Linear Regression
reg = LinearRegression()
reg.fit(x_train,y_train)

LinearRegression()

In [11]:
y_pred = reg.predict(x_test)
y_pred

array([ 1.51914663,  0.54257991,  0.32375332, ...,  1.37462825,
       -0.50174175, -0.57144627])

In [12]:
#RMSE
mean_squared_error(y_test, y_pred,squared = False)

0.9741717330994394

In [13]:
#Random Forest Regressor
rf = RandomForestRegressor()

In [14]:
rf.fit(x_train,y_train)

RandomForestRegressor()

In [15]:
y_pred_1 = rf.predict(x_test)
y_pred_1

array([1.045699, 0.721499, 0.962898, ..., 1.228229, 0.322155, 0.802081])

In [16]:
#RMSE
mean_squared_error(y_test, y_pred_1, squared = False)

0.014902388432068033

In [17]:
#SVM
s = SVR()

In [18]:
s.fit(x_train,y_train)

SVR()

In [19]:
y_pred_2 = s.predict(x_test)
y_pred_2

array([ 1.21577953,  0.52734597,  1.23270881, ...,  0.87185329,
       -0.11531876,  1.18513876])

In [20]:
#RMSE
mean_squared_error(y_test, y_pred_2, squared = False)

1.8793802617809459