In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model, ensemble, svm, tree, neural_network

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
result = {}

In [3]:
hhids=[26, 59, 77, 86, 93, 94, 101, 114, 115, 160, 171, 187]


for hhid in hhids: 
    result[hhid] = []
    print('Start :: Process on household {}...'.format(hhid))
    df = pd.read_csv('./data/added_hhdata_{}.csv'.format(hhid), index_col=0)
    df = df.dropna()
    st = []
    ct = 0
    for idx, row in df.iterrows():
        if row.GH < 2000 and row.GH > -1000:
            st.append(row)
        else:
            ct += 1

    df = pd.DataFrame(data=st, columns=df.columns)
    used_feature = ['use', 'is_weekday', 'temperature', 'cloud_cover', 'wind_speed','GH','use_month','use_week']
    ordered_features = ['use', 'GH', 'is_weekday', 'temperature', 'cloud_cover','wind_speed','use_month','use_week']
    
    Y = list(df.use)[1:]
    try:
        Y.append(df.use.iloc[0])
    except:
        break
    Y = np.array(Y)

    X = df[used_feature]
    X = np.array(X)
    X.shape

    temp_df = pd.DataFrame(data=X, columns=used_feature)
    temp_df = temp_df[ordered_features]
    temp_df['y_use'] = Y
    values = temp_df.values

    # normalize features
    scaler = MinMaxScaler()
    y_gt = values[:,-1:]
    scaled = scaler.fit_transform(values)
    values = scaled

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        test_size=0.10,
                                                        random_state=666)


    classifiers = [
        linear_model.Ridge(alpha=1.0, random_state=0),
        linear_model.Lasso(alpha=0.55, random_state=0),
        linear_model.BayesianRidge(alpha_1=1e-06, alpha_2=1e-06),
        linear_model.LassoLars(alpha=0.55),
        linear_model.LinearRegression(),
        ensemble.RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=75, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
        tree.DecisionTreeRegressor(),
        neural_network.MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
    ]


    print('Start :: Find the best model for this household...')
    for clf in classifiers:
        clf.fit(X_train, Y_train)
#         print(clf)
        yhat = clf.predict(X_test)
        scores = cross_val_score(clf, X_train, Y_train)
        rmse = np.sqrt(mean_squared_error(Y_test, yhat))
        mae = mean_absolute_error(Y_test, yhat)
#         print('RMSE =>', rmse)
#         print('MAE =>', mae)
#         print('CV Score =>', scores)
        model_dict = {
            'name': clf.__class__.__name__,
            'rmse': rmse,
            'mae': mae,
        }
        result[hhid].append(model_dict)
#         print('')

Start :: Process on household 26...
Start :: Find the best model for this household...
Start :: Process on household 59...
Start :: Find the best model for this household...
Start :: Process on household 77...
Start :: Find the best model for this household...
Start :: Process on household 86...
Start :: Find the best model for this household...
Start :: Process on household 93...
Start :: Find the best model for this household...
Start :: Process on household 94...
Start :: Find the best model for this household...
Start :: Process on household 101...
Start :: Find the best model for this household...
Start :: Process on household 114...
Start :: Find the best model for this household...
Start :: Process on household 115...
Start :: Find the best model for this household...
Start :: Process on household 160...
Start :: Find the best model for this household...
Start :: Process on household 171...
Start :: Find the best model for this household...
Start :: Process on household 187...
S

In [4]:
result

{26: [{'mae': 0.54206365540012957,
   'name': 'Ridge',
   'rmse': 0.76857753617810642},
  {'mae': 0.69538239985188588, 'name': 'Lasso', 'rmse': 0.92371099319519134},
  {'mae': 0.54218756600939721,
   'name': 'BayesianRidge',
   'rmse': 0.76860188631067738},
  {'mae': 0.88787367797656014,
   'name': 'LassoLars',
   'rmse': 1.1175194888997482},
  {'mae': 0.54205456653001327,
   'name': 'LinearRegression',
   'rmse': 0.76857585317168664},
  {'mae': 0.50362547525391543,
   'name': 'RandomForestRegressor',
   'rmse': 0.73728371526373715},
  {'mae': 0.7141137838620889,
   'name': 'DecisionTreeRegressor',
   'rmse': 1.1283773245793345},
  {'mae': 0.58564791693382157,
   'name': 'MLPRegressor',
   'rmse': 0.86191502343182469}],
 59: [{'mae': 0.50360657598070391,
   'name': 'Ridge',
   'rmse': 0.76627507201043854},
  {'mae': 0.68716451224175223, 'name': 'Lasso', 'rmse': 0.91770564431209745},
  {'mae': 0.50375231520250863,
   'name': 'BayesianRidge',
   'rmse': 0.76628241558768095},
  {'mae': 1.

In [5]:
final = []
for k, v in result.items():
    for i in result[k]:
        final.append([str(k), i['name'], i['rmse'], i['mae']])
col = ['household_id', 'alg', 'RMSE', 'MAE']
final = pd.DataFrame(data=final, columns=col)
final.to_csv('HL.csv')

In [6]:
final = pd.read_csv('HL.csv', index_col=0)
final

Unnamed: 0,household_id,alg,RMSE,MAE
0,26,Ridge,0.768578,0.542064
1,26,Lasso,0.923711,0.695382
2,26,BayesianRidge,0.768602,0.542188
3,26,LassoLars,1.117519,0.887874
4,26,LinearRegression,0.768576,0.542055
5,26,RandomForestRegressor,0.737284,0.503625
6,26,DecisionTreeRegressor,1.128377,0.714114
7,26,MLPRegressor,0.861915,0.585648
8,59,Ridge,0.766275,0.503607
9,59,Lasso,0.917706,0.687165


In [7]:
gb = final.groupby('alg')

In [8]:
N = 8
ind = range(N)
mean = list(gb['RMSE'].describe()['mean'].drop('MLPRegressor'))
mean.append(0.58233023935)
std = list(gb['RMSE'].describe()['std'].drop('MLPRegressor'))
std.append(0.26252271524)

In [9]:
ind, mean, std

(range(0, 8),
 [0.57186958814917743,
  0.7740723183782986,
  0.72166327176078993,
  0.88676988075149266,
  0.57180090583878085,
  0.53685207676283775,
  0.57194754079546206,
  0.58233023935],
 [0.2435311945171322,
  0.31109763874789259,
  0.28386085578173559,
  0.37357621809216263,
  0.24364811404647096,
  0.21837773962981044,
  0.24333198295263084,
  0.26252271524])

In [10]:
ghi_mean = [170.3193060658875, 170.41247073758, 170.29892587653836, 176.40821097466304, 170.33880718520192, 120.17416697811706, 170.33778501001555, 141.517]
ghi_std = [29.422574643451615, 31.113985531908828, 29.424709124388524, 30.161418098094288, 29.4215409738363, 21.356142977582262, 29.424478413240163, 21.3441]

In [11]:
# fig = plt.figure()
# ax = fig.add_subplot(111)
# # ax2 = ax.twinx()
# ax.set_ylabel('RMSE1')
# # ax2.set_ylabel('RMSE2')


# # ax.bar(ind, ghi_mean, 0.3, yerr=ghi_std, color='red', align='center')
# # ax.autoscale(tight=True)
# plt.show()

plt.bar(ind, mean, 0.4, yerr=std, align='center')
plt.ylabel('Home Energy Consumption ($KWh$)')
plt.xlabel('Model')
plt.title('Prediction RMSE of different models')
plt.xticks(ind, ('BR', 'DT', 'Lasso', 'LasLar', 'LR', 'RF', 'Ridge', 'LSTM'))
# plt.yticks(np.arange(0, 2))
plt.axhline(y=gb['RMSE'].describe()['mean']['RandomForestRegressor'], linewidth=0.15)
plt.savefig('hl.png')

In [12]:
fig = plt.figure()