In [1]:
# importing some important modules into our workspace

import pandas as pd 
import numpy as np

from sklearn.model_selection import GridSearchCV , train_test_split
from sklearn.metrics import make_scorer , r2_score , confusion_matrix 
from sklearn.cross_validation import ShuffleSplit

from sklearn.tree import DecisionTreeRegressor 



In [2]:
# Load the air-quality spreadsheet and discard the 'Date' and 'Time' columns,
# keeping every remaining column for modelling.

data = pd.read_excel('./AirQualityUCI.xlsx').drop(['Date','Time'],axis=1)
data.head()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [3]:
# Split the frame into prediction targets and model inputs.
# The target column names are listed once and reused for both selections, so
# DataFrame.drop receives plain column labels instead of a whole DataFrame
# (the latter only worked because iterating a DataFrame yields its column names).
target_columns = ['T','RH','AH']
predict = data[target_columns]
features = data.drop(target_columns,axis=1)
print("Features available is :")
print(features.head())
print("Values to be predictited is :")
print(predict.head())

Features available is :
   CO(GT)  PT08.S1(CO)  NMHC(GT)   C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  \
0     2.6      1360.00       150  11.881723        1045.50    166.0   
1     2.0      1292.25       112   9.397165         954.75    103.0   
2     2.2      1402.00        88   8.997817         939.25    131.0   
3     2.2      1375.50        80   9.228796         948.25    172.0   
4     1.6      1272.25        51   6.518224         835.50    131.0   

   PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)  
0       1056.25    113.0       1692.00      1267.50  
1       1173.75     92.0       1558.75       972.25  
2       1140.00    114.0       1554.50      1074.00  
3       1092.00    122.0       1583.75      1203.25  
4       1205.00    116.0       1490.00      1110.00  
Values to be predictited is :
       T         RH        AH
0  13.60  48.875001  0.757754
1  13.30  47.700000  0.725487
2  11.90  53.975000  0.750239
3  11.00  60.000000  0.786713
4  11.15  59.575001  0.788794


In [4]:
# Scoring helper: R^2 between true and predicted target values.

def performance_metrics(y_true,y_predict):
    """Return the R^2 score of ``y_predict`` measured against ``y_true``.

    Thin wrapper around ``sklearn.metrics.r2_score`` called with its default
    options.

    Parameters
    ----------
    y_true : ground-truth target values.
    y_predict : predicted target values.

    Returns
    -------
    The value produced by ``r2_score(y_true, y_predict)``.
    """
    return r2_score(y_true,y_predict)

In [5]:
# Hold out part of the data for the final evaluation.
# A fixed random_state makes the split (and every number derived from it)
# identical on each Restart & Run All; without it the rows are reshuffled on
# every execution and the reported depth/score cannot be reproduced.

X_train,X_test,y_train,y_test = train_test_split(features,predict,random_state=10)

print("Size of Training data set is {}".format(X_train.shape))

Size of Training data set is (7017, 10)


In [6]:
# Model selection: tune the depth of a decision-tree regressor.

def train_model(X,y,random_state=10):
    """Grid-search ``max_depth`` for a decision-tree regressor and return the winner.

    Candidate depths 1..10 are compared with 10 shuffled train/validation
    splits (20% of the rows held out per split), scored with
    ``performance_metrics`` wrapped by ``make_scorer``.

    Parameters
    ----------
    X : feature table used for fitting.
    y : target values aligned with ``X``.
    random_state : seed used for both the shuffle splitter and the tree, so
        repeated calls on the same data select the same model. Defaults to 10,
        the seed the splitter has always used.

    Returns
    -------
    The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    # The ShuffleSplit imported at the top of the notebook comes from the legacy
    # sklearn.cross_validation module, whose constructor takes the sample count
    # and ``n_iter``. That module is gone from current scikit-learn, so the
    # model_selection class is imported locally and shadows the legacy name.
    from sklearn.model_selection import ShuffleSplit

    # The new-style splitter learns the number of samples when it is asked to
    # split, so only the split count / size / seed are configured here.
    cv_set = ShuffleSplit(n_splits=10, test_size=0.2, random_state=random_state)

    scoring_fun = make_scorer(performance_metrics)

    # Seed the tree as well so repeated fits on the same data are identical.
    reg = DecisionTreeRegressor(random_state=random_state)

    params = {'max_depth': list(range(1,11))}

    grid = GridSearchCV(reg, scoring=scoring_fun, param_grid=params, cv=cv_set)
    grid.fit(X,y)

    return grid.best_estimator_

In [7]:
# Tune the tree on the training split and report the depth the search selected.
reg = train_model(X_train,y_train)
best_depth = reg.get_params()['max_depth']
print("Best param for the estimator is {}".format(best_depth))

Best param for the estimator is 10


In [8]:
# Score the tuned tree on the held-out split; the bare last expression is
# what the cell displays.
predicted_array = reg.predict(X_test)

performance_metrics(y_true=y_test, y_predict=predicted_array)

0.9776617575339813