In [34]:
#Loading required libraries
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn') #set same style for all plots
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
pd.options.mode.chained_assignment = None
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import randint as sp_randint
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor,BaggingRegressor,AdaBoostRegressor,BaggingRegressor
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
import keras

output = pd.DataFrame(index=None, columns=['model','train_r2_score','test_r2_score'])

## 1. Data import

In [35]:
data = pd.read_csv('ENB2012_data.csv')
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [None]:
data.columns = ['Relative_Compactness', 'Surface_Area', 'Wall_Area', 'Roof_Area', 'Overall_Height',
                'Orientation', 'Glazing_Area', 'Glazing_Area_Distribution', 'Heating_Load', 'Cooling_Load']

#### Spliting the data in X and Y

In [36]:
X = data[['Relative_Compactness', 'Surface_Area', 'Wall_Area', 'Roof_Area', 'Overall_Height',
                'Orientation', 'Glazing_Area', 'Glazing_Area_Distribution']]
Y = data[[ 'Heating_Load', 'Cooling_Load']]
Y1=data[['Heating_Load']]
Y2=data[['Cooling_Load']]
print(X.isnull().sum())

X1    0
X2    0
X3    0
X4    0
X5    0
X6    0
X7    0
X8    0
dtype: int64


No null treatment or data preprocessing required.

#### Dividing each observation in either of the 3 categories (High Efficient, Average Efficient & Low efficient)

In [37]:
Z = data[['Heating_Load','Cooling_Load']]

# The total load is the sum of the individual heating and cooling load
Z['Overall_load'] = Z['Heating_Load'] + Z['Cooling_Load']
Z['class'] = 1

Z.loc[Z.Overall_load < 42,['class']] = 0
Z.loc[Z.Overall_load > 70 , ['class']] = 2

#Efficiency classes
yclass=Z['class']

X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, yclass, random_state = 5)


#### Using MinMax scaler for scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
MinMax = MinMaxScaler(feature_range= (0,1))
X_train_class = MinMax.fit_transform(X_train_class)
X_test_class = MinMax.transform(X_test_class)

#### Now dividing the whole dataset into train and test

In [38]:
#Test-Train split of data for regression models
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 5)

X_train_div, X_test_div, y1_train, y1_test = train_test_split(X, Y1, random_state = 5)

X_train_div, X_test_div, y2_train, y2_test = train_test_split(X, Y2, random_state = 5)

### Scaling the data using MinMax Scaler

In [39]:
# MinMax Scaling done
from sklearn.preprocessing import MinMaxScaler
MinMax = MinMaxScaler(feature_range= (0,1))
X_train = MinMax.fit_transform(X_train)
X_test = MinMax.transform(X_test)

## 2. Modeling

## Linear Regression
#### As the output variables are 2 we will be using Multioutput regressor

In [40]:
linear = LinearRegression()
multiOutput_linear = MultiOutputRegressor(linear,n_jobs=-1)
multiOutput_linear.fit(X_train,y_train)
train_r2_score=r2_score(y_train,multiOutput_linear.predict(X_train))
test_r2_score=r2_score(y_test,multiOutput_linear.predict(X_test))
output = output.append(pd.Series({'model':'Linear Regressor', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output

Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354


#### KNN regressor

In [41]:
param_grid = {'weights':['distance', 'uniform'], 'n_neighbors':range(3,75)}
grid_search_knn = MultiOutputRegressor(GridSearchCV(KNeighborsRegressor(), param_grid, cv=5,return_train_score=True))
grid_search_knn.fit(X_train, y_train)
train_r2_score=r2_score(y_train,grid_search_knn.predict(X_train))
test_r2_score=r2_score(y_test,grid_search_knn.predict(X_test))
output = output.append(pd.Series({'model':'KNN Regressor', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output

Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354
1,KNN Regressor,0.935276,0.90454


#### Random Forest Regressor

In [42]:
model = RandomForestRegressor( random_state=5, n_estimators= 700, n_jobs = -1)

param_grid = {'max_features':['auto', 'log2'], 'max_depth':[5,10,15,20,30,50,60]}

grid_search_rf = MultiOutputRegressor(GridSearchCV(model, param_grid, cv= 5,return_train_score=True))
grid_search_rf.fit(X_train, y_train)
train_r2_score=r2_score(y_train,grid_search_rf.predict(X_train))
test_r2_score=r2_score(y_test,grid_search_rf.predict(X_test))
output = output.append(pd.Series({'model':'Random Forest Regressor', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output

Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354
1,KNN Regressor,0.935276,0.90454
2,Random Forest Regressor,0.997856,0.984413


#### Linear SVM

In [43]:
param_grid = {"C": [1e0, 1e1, 1e2, 1e3, 1e4],"gamma": np.logspace(-2, 1, 2, 3, 5)}
grid_search_svm = MultiOutputRegressor(GridSearchCV(SVR(kernel='linear'), param_grid, cv=10,return_train_score=True))

grid_search_svm.fit(X_train, y_train)
train_r2_score=r2_score(y_train,grid_search_svm.predict(X_train))
test_r2_score=r2_score(y_test,grid_search_svm.predict(X_test))
output = output.append(pd.Series({'model':'SVM Linear', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output

Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354
1,KNN Regressor,0.935276,0.90454
2,Random Forest Regressor,0.997856,0.984413
3,SVM Linear,0.897939,0.896891


### Kernalized SVM 

In [44]:
param_grid = {"C": [1e0, 1e1, 1e2, 1e3, 1e4],"gamma": np.logspace(-2, 1, 2, 3, 5)}
grid_search_svm_rbf = MultiOutputRegressor(GridSearchCV(SVR(kernel='rbf'), param_grid, cv=10,return_train_score=True))

grid_search_svm_rbf.fit(X_train, y_train)
train_r2_score=r2_score(y_train,grid_search_svm_rbf.predict(X_train))
test_r2_score=r2_score(y_test,grid_search_svm_rbf.predict(X_test))
output = output.append(pd.Series({'model':'SVM RBF', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output

Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354
1,KNN Regressor,0.935276,0.90454
2,Random Forest Regressor,0.997856,0.984413
3,SVM Linear,0.897939,0.896891
4,SVM RBF,0.992589,0.986182


## 3. Ensembling

### Bagging:
#### Bagging ensembler using Decision Tree Regressor as base model

In [45]:
from sklearn.ensemble import BaggingRegressor


param_grid = {'max_samples':[5,10], 'max_features':[1,2,3,4,5,6,7]}
bagging_DT = MultiOutputRegressor(GridSearchCV(BaggingRegressor(DecisionTreeRegressor(),n_estimators=750), 
                                                          param_grid,cv= 10,return_train_score=True))

bagging_DT.fit(X_train, y_train)
train_r2_score=r2_score(y_train,bagging_DT.predict(X_train))
test_r2_score=r2_score(y_test,bagging_DT.predict(X_test))
output = output.append(pd.Series({'model':'Multi Output DT Bagging', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output



Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354
1,KNN Regressor,0.935276,0.90454
2,Random Forest Regressor,0.997856,0.984413
3,SVM Linear,0.897939,0.896891
4,SVM RBF,0.992589,0.986182
5,Multi Output DT Bagging,0.875499,0.880045


### Ada Boosting:
#### Ada Boosting on Decision Tree Regressor

In [46]:
param =  { "n_estimators": [100,500] }

base_dt=DecisionTreeRegressor()
ada_dt = AdaBoostRegressor(base_estimator=base_dt,learning_rate = 0.7,random_state=5)
adaboost_dt = MultiOutputRegressor(GridSearchCV(ada_dt,param_grid=param))
adaboost_dt.fit(X_train, y_train)
train_r2_score=r2_score(y_train,adaboost_dt.predict(X_train))
test_r2_score=r2_score(y_test,adaboost_dt.predict(X_test))
output = output.append(pd.Series({'model':'Adaboost_DecisionTree', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output

Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354
1,KNN Regressor,0.935276,0.90454
2,Random Forest Regressor,0.997856,0.984413
3,SVM Linear,0.897939,0.896891
4,SVM RBF,0.992589,0.986182
5,Multi Output DT Bagging,0.875499,0.880045
6,Adaboost_DecisionTree,0.999908,0.983499


#### Ada Boosting on KNN regressor

In [47]:
param =  { "n_estimators": [100,500,1000] }

base_knn=KNeighborsRegressor()
ada_knn = AdaBoostRegressor(base_estimator=base_knn,learning_rate = 0.7,random_state=10)
adaboost_knn = MultiOutputRegressor(GridSearchCV(ada_knn,param_grid=param))
adaboost_knn.fit(X_train, y_train)
train_r2_score=r2_score(y_train,adaboost_knn.predict(X_train))
test_r2_score=r2_score(y_test,adaboost_knn.predict(X_test))
output = output.append(pd.Series({'model':'KNN Adaboost', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output

Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354
1,KNN Regressor,0.935276,0.90454
2,Random Forest Regressor,0.997856,0.984413
3,SVM Linear,0.897939,0.896891
4,SVM RBF,0.992589,0.986182
5,Multi Output DT Bagging,0.875499,0.880045
6,Adaboost_DecisionTree,0.999908,0.983499
7,KNN Adaboost,0.972943,0.92925


#### Ada Boosting on Linear SVM regressor

In [48]:
param =  { "n_estimators": [100,500,1000] }


base_svr=SVR(kernel='linear')
ada_svr = AdaBoostRegressor(base_estimator=base_svr,learning_rate = 0.7,random_state=10)
adaboost_svr = MultiOutputRegressor(GridSearchCV(ada_svr,param_grid=param,n_jobs=-1),n_jobs=-1)
adaboost_svr.fit(X_train, y_train)
train_r2_score=r2_score(y_train,adaboost_svr.predict(X_train))
test_r2_score=r2_score(y_test,adaboost_svr.predict(X_test))
output = output.append(pd.Series({'model':'Adaboost_LinearSVM', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output

Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354
1,KNN Regressor,0.935276,0.90454
2,Random Forest Regressor,0.997856,0.984413
3,SVM Linear,0.897939,0.896891
4,SVM RBF,0.992589,0.986182
5,Multi Output DT Bagging,0.875499,0.880045
6,Adaboost_DecisionTree,0.999908,0.983499
7,KNN Adaboost,0.972943,0.92925
8,Adaboost_LinearSVM,0.896437,0.893522


### Gradient Boosting regressor

In [49]:
param = {"min_samples_split": [3,4,5],
         "max_features": [2,3,4,5,6],
         'n_estimators': [100,500,1000]}

gradient = GradientBoostingRegressor(learning_rate =1.1,random_state=10)
gradient_mr = MultiOutputRegressor(GridSearchCV(gradient,param_grid=param,n_jobs=-1),n_jobs=-1)
gradient_mr.fit(X_train, y_train)
train_r2_score=r2_score(y_train,gradient_mr.predict(X_train))
test_r2_score=r2_score(y_test,gradient_mr.predict(X_test))
output = output.append(pd.Series({'model':'Gradient Boosting Regressor', 'train_r2_score':train_r2_score,'test_r2_score':test_r2_score}),ignore_index=True )
output

Unnamed: 0,model,train_r2_score,test_r2_score
0,Linear Regressor,0.902539,0.899354
1,KNN Regressor,0.935276,0.90454
2,Random Forest Regressor,0.997856,0.984413
3,SVM Linear,0.897939,0.896891
4,SVM RBF,0.992589,0.986182
5,Multi Output DT Bagging,0.875499,0.880045
6,Adaboost_DecisionTree,0.999908,0.983499
7,KNN Adaboost,0.972943,0.92925
8,Adaboost_LinearSVM,0.896437,0.893522
9,Gradient Boosting Regressor,0.999541,0.996787


## 4. Classification modeling
### After running multioutput regressors we will now be running classification models

In [50]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [51]:
#Logistic Regression

param_grid = {'C':[0.01, 0.1, 0.5, 1, 2, 5, 10, 15, 20]}

model = LogisticRegression()

grid_search = GridSearchCV(model, param_grid, cv=10)
grid_search.fit(X_train_class, y_train_class)

print('Best parameters for efficiency classification {}'.format(grid_search.best_params_))

print('The Train Accuracy score for Logistic Reression is',accuracy_score(y_train_class, grid_search.predict(X_train_class)))
print('The Test Accuracy score for Logistic Reression is',accuracy_score(y_test_class, grid_search.predict(X_test_class)))

Best parameters for efficiency classification {'C': 10}
The Train Accuracy score for Logistic Reression is 0.875
The Test Accuracy score for Logistic Reression is 0.8802083333333334


In [52]:
#Linear SVM

param_grid = {'C':[0.001, 0.01, 0.1, 0.5, 1, 10]}

model = LinearSVC()

model = LogisticRegression()

grid_search_Log_Class = GridSearchCV(model, param_grid, cv=5)
grid_search_Log_Class.fit(X_train_class, y_train_class)

print('Best parameters for efficiency classification {}'.format(grid_search_Log_Class.best_params_))

print('The Train Accuracy score for Linear SVC is',accuracy_score(y_train_class, grid_search_Log_Class.predict(X_train_class)))
print('The Test Accuracy score for Linear SVC is',accuracy_score(y_test_class, grid_search_Log_Class.predict(X_test_class)))

Best parameters for efficiency classification {'C': 10}
The Train Accuracy score for Linear SVC is 0.875
The Test Accuracy score for Linear SVC is 0.8802083333333334


In [53]:
param_grid = {'C':[0.001, 0.01, 0.1, 0.5, 1, 10, 50, 100, 1000], 'gamma':[0.001, 0.01, 0.1, 0.5, 1, 10]}

model = SVC(kernel='rbf')

grid_search_SVC_Class = GridSearchCV(model, param_grid, cv=10)
grid_search_SVC_Class.fit(X_train_class, y_train_class)

print('Best parameters for efficiency classification {}'.format(grid_search_SVC_Class.best_params_))

print('The Train Accuracy score for SVM rbf is',accuracy_score(y_train_class, grid_search_SVC_Class.predict(X_train_class)))
print('The Test Accuracy score for SVM rbf is',accuracy_score(y_test_class, grid_search_SVC_Class.predict(X_test_class)))

Best parameters for efficiency classification {'C': 50, 'gamma': 0.5}
The Train Accuracy score for SVM rbf is 0.9704861111111112
The Test Accuracy score for SVM rbf is 0.984375


In [54]:
param_grid = {'max_features':['auto', 'log2'], 'max_depth':[5,10,15,20,50,60]}

model = RandomForestClassifier(n_estimators= 700, n_jobs = -1, random_state = 5)

grid_search_RF_Class = GridSearchCV(model, param_grid, cv=5)
grid_search_RF_Class.fit(X_train_class, y_train_class)

print('Best parameters for efficiency classification {}'.format(grid_search_RF_Class.best_params_))

print('The Train Accuracy score for random forest classifier is',accuracy_score(y_train_class, grid_search_RF_Class.predict(X_train_class)))
print('The Test Accuracy score for random forest classifier is',accuracy_score(y_test_class, grid_search_RF_Class.predict(X_test_class)))

Best parameters for efficiency classification {'max_depth': 10, 'max_features': 'auto'}
The Train Accuracy score for random forest classifier is 1.0
The Test Accuracy score for random forest classifier is 0.96875


In [55]:
param_grid = {'max_features':['auto', 'log2'], 'max_depth':[5,10,15,20,50,60], 'learning_rate':[0.01,0.1,0.7,1]}

model =  GradientBoostingClassifier(random_state = 5, n_estimators = 500)


grid_search_GB_Class = GridSearchCV(model, param_grid, cv=5)
grid_search_GB_Class.fit(X_train_class, y_train_class)

print('Best parameters for efficiency classification {}'.format(grid_search_GB_Class.best_params_))

print('The Train Accuracy score for Gradient Boosting Classifier is',accuracy_score(y_train_class, grid_search_GB_Class.predict(X_train_class)))
print('The Test Accuracy score for Gradient Boosting Classifier is',accuracy_score(y_test_class, grid_search_GB_Class.predict(X_test_class)))

Best parameters for efficiency classification {'learning_rate': 0.01, 'max_depth': 5, 'max_features': 'log2'}
The Train Accuracy score for Gradient Boosting Classifier is 1.0
The Test Accuracy score for Gradient Boosting Classifier is 0.9791666666666666


## 4. Neural Network Regression model

In [56]:
from keras.models import Sequential
from keras.layers import Dense

# fix random seed for reproducibility
np.random.seed(5)

In [57]:
from keras.wrappers.scikit_learn import KerasRegressor

def create_model_regressor_heating():
    #create model
    model = Sequential()
    model.add(Dense(8, input_dim=8, kernel_initializer='normal', activation='relu'))
    model.add(Dense(4, kernel_initializer = 'normal', activation = 'relu')) # hidden layer
    model.add(Dense(10, kernel_initializer = 'normal', activation = 'linear'))
    model.add(Dense(1, kernel_initializer='normal'))
    #compile model
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
    return model

In [58]:
from keras.wrappers.scikit_learn import KerasRegressor

def create_model_regressor_cooling():
    #create model
    model2 = Sequential()
    model2.add(Dense(8, input_dim=8, kernel_initializer='normal', activation='relu'))
    model2.add(Dense(4, kernel_initializer = 'normal', activation = 'relu')) # hidden layer
    model2.add(Dense(10, kernel_initializer = 'normal', activation = 'linear'))
    model2.add(Dense(1, kernel_initializer='normal'))
    #compile model
    model2.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
    #return model
    return model2

In [59]:
#Model training

param_grid = {'epochs':[50, 100, 200] , 'batch_size':[20, 50, 100]}

In [60]:
model1 = KerasRegressor(build_fn = create_model_regressor_cooling , verbose = 0)

grid_search_Keras_Reg = GridSearchCV(model1 , param_grid , cv =5)

grid_search_Keras_Reg.fit(X_train_div, y1_train)

print('Best parameters for cooling load {}'.format(grid_search_Keras_Reg.best_params_))

print('The Train R2 score is',r2_score(y1_train, grid_search_Keras_Reg.predict(X_train_div)))
print('The Test R2 score is',r2_score(y1_test, grid_search_Keras_Reg.predict(X_test_div)))

Best parameters for cooling load {'batch_size': 20, 'epochs': 200}
The Train R2 score is 0.8447339385412662
The Test R2 score is 0.8184779982513859


In [61]:
model2 = KerasRegressor(build_fn = create_model_regressor_heating , verbose = 0)
grid_search_Keras2 = GridSearchCV(model2 , param_grid , cv =10)

grid_search_Keras2.fit(X_train_div, y2_train)
print('Best parameters for heating load {}'.format(grid_search_Keras2.best_params_))

print('The Train R2 score is',r2_score(y2_train, grid_search_Keras2.predict(X_train_div)))
print('The Test R2 score is',r2_score(y2_test, grid_search_Keras2.predict(X_test_div)))

Best parameters for heating load {'batch_size': 20, 'epochs': 200}
The Train R2 score is 0.8120563702859663
The Test R2 score is 0.7998327631743293


## 5. Neural Network Classification model

In [62]:
from keras.utils import np_utils

# convert integers to dummy variables (i.e. one hot encoded)
y_train = np_utils.to_categorical(y_train_class)
y_test = np_utils.to_categorical(y_test_class)


#### Creating model function for NN

In [63]:
from keras.wrappers.scikit_learn import KerasClassifier

def model_classifier():
    #create model
    model = Sequential()
    model.add(Dense(8, input_dim=8, activation='relu'))
    model.add(Dense(4, activation='relu')) #hidden layer
    model.add(Dense(3,activation='sigmoid')) #output layer
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


#### Grid Search for NN

In [64]:
param_grid = {'epochs':[50, 200] , 'batch_size':[10, 50, 100]}

model = KerasClassifier(build_fn = model_classifier , verbose = 0)

grid_search_Keras_Class = GridSearchCV(model , param_grid , cv =10)

grid_search_Keras_Class.fit(X_train_class, y_train)

print('Best parameters for efficiency classification {}'.format(grid_search_Keras_Class.best_params_))

Best parameters for efficiency classification {'batch_size': 10, 'epochs': 200}


In [65]:
from sklearn.metrics import accuracy_score
print('The Train Accuracy score is',accuracy_score(y_train_class, grid_search_Keras_Class.predict(X_train_class)))
print('The Test Accuracy score is',accuracy_score(y_test_class, grid_search_Keras_Class.predict(X_test_class)))

The Train Accuracy score is 0.9635416666666666
The Test Accuracy score is 0.9635416666666666
