In [20]:
import pandas as pd
import numpy as np
import operator
from math import sqrt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from matplotlib import pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor  
from sklearn import cross_validation, metrics   
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import LeaveOneLabelOut
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import datetime

In [4]:
energy_data = pd.read_csv('energydata_complete.csv')
energy_data['date'] = pd.to_datetime(energy_data.date)
energy_data['year'] = energy_data.date.dt.year
energy_data['month'] = energy_data.date.dt.month
energy_data['day'] = energy_data.date.dt.day
energy_data['hours'] = energy_data.date.dt.hour
energy_data['minutes'] = energy_data.date.dt.minute
energy_data['seconds'] = energy_data.date.dt.second
energy_data['week'] = energy_data.date.dt.week
energy_data['day_name'] = energy_data.date.dt.weekday_name
energy_data['day_of_week'] = energy_data.date.dt.dayofweek
energy_data['weekday'] = ((energy_data.date.dt.dayofweek // 5 == 1).astype(int))

day_name_encoding = pd.get_dummies(energy_data['day_name'], drop_first=True)
data = pd.concat([energy_data,day_name_encoding],axis=1 )

In [5]:
X = energy_data[['hours','T1','RH_1','RH_5','RH_6','RH_8','T_out','Windspeed','Visibility','Tdewpoint']]
y = energy_data['Appliances']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [10]:
mertic_info = pd.DataFrame({'r2_train': [], 'r2_test': [], 'rms_train':[], 'rms_test': [],'mae_train': [],'mae_test':[],'mape_train':[],'mape_test':[]})
rmse_dict = {} 

In [16]:
def calc_mertic_info(modelname, model, X_train_scale, y_train, X_test_scale, y_test):
    global mertic_info
    y_train_predicted = model.predict(X_train)
    y_test_predicted = model.predict(X_test)
            
    r2_train = metrics.r2_score(y_train, y_train_predicted)
    r2_test = metrics.r2_score(y_test, y_test_predicted)
    
    rms_train = np.sqrt(metrics.mean_squared_error(y_train, y_train_predicted))
    rms_test = np.sqrt(metrics.mean_squared_error(y_test, y_test_predicted))
        
    mae_train = metrics.mean_absolute_error(y_train, y_train_predicted)
    mae_test = metrics.mean_absolute_error(y_test, y_test_predicted)
        
    mape_train = np.mean(np.abs((y_train - y_train_predicted) / y_train)) * 100
    mape_test = np.mean(np.abs((y_test - y_test_predicted) / y_test)) * 100
        
    rmse_dict[modelname] = rms_test
        
    df_local = pd.DataFrame({'Model':[modelname],
                            'r2_train': [r2_train],
                            'r2_test': [r2_test],
                            'rms_train':[rms_train], 
                            'rms_test': [rms_test],
                            'mae_train': [mae_train],
                            'mae_test': [mae_test],
                            'mape_train':[mape_train],
                            'mape_test':[mape_test]})
        
    mertic_info = pd.concat([mertic_info, df_local])
    return mertic_info

### Adding estimator1: StandardScaler & LinearRegression

In [18]:
pipe_lr = Pipeline([('scl', StandardScaler()),('clf', LinearRegression(normalize=True))])
grid_params_lr =[{}]
gs_lr = GridSearchCV(estimator=pipe_lr, param_grid=grid_params_lr, cv=10) 
gs_lr.fit(X_train, y_train)
calc_mertic_info('Regression', gs_lr, X_train, y_train, X_test, y_test)
print('LinearRegression completed')

LinearRegression completed


### Adding estimator2: StandardScaler & RandomForestRegression

In [22]:
pipe_rf = Pipeline([('scl', StandardScaler()),('rf', RandomForestRegressor(n_estimators=115,max_features=6,random_state=42))])
grid_params_rf = [{}]
gs_rf = GridSearchCV(estimator=pipe_rf, param_grid=grid_params_rf, cv=10)
gs_rf.fit(X_train, y_train)
calc_mertic_info('RandomForest', gs_rf, X_train, y_train, X_test, y_test)
print('RandomForest completed')

RandomForest completed


### Adding estimator3: Neural Network Regression

In [23]:
pipe_nn = Pipeline([('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
                    ('neural network', MLPRegressor(activation = 'logistic',learning_rate='adaptive',alpha=0.5))])
grid_params_nn = [{}]
gs_nn = GridSearchCV(estimator=pipe_nn, param_grid=grid_params_nn, cv=10)
gs_nn.fit(X_train, y_train)
calc_mertic_info('Nueral Network', gs_nn, X_train, y_train, X_test, y_test)
print('Neural Network completed')



Neural Network completed




### Exporting Regression metrics 

In [24]:
optimum_model =  min(rmse_dict.items(),key=operator.itemgetter(1))[0]
print('Model Analysis: ', optimum_model)

print('METRIC INFO:')
print(mertic_info)

mertic_info.to_csv('Metric_Info.csv')

Model Analysis:  RandomForest
METRIC INFO:
            Model   mae_test  mae_train  mape_test  mape_train   r2_test  \
0      Regression  55.335889  55.651561  67.366540   65.439100  0.082277   
0      Regression  55.335889  55.651561  67.366540   65.439100  0.082277   
0    RandomForest  30.922860  11.991840  30.980940   11.864655  0.580944   
0  Nueral Network  55.317005  56.092054  67.682968   66.345399  0.077118   

   r2_train   rms_test  rms_train  
0  0.089554  95.480525  98.713194  
0  0.089554  95.480525  98.713194  
0  0.940003  64.520113  25.340324  
0  0.075978  95.748517  99.446455  
