In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model, ensemble, svm, tree, neural_network

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
result = {}

In [3]:
# hhids = [26, 59]
hhids = [26, 59, 77, 86, 93, 94, 101, 114, 115, 160, 171, 187]

for hhid in hhids: 
    result[hhid] = []
    print('Start :: Process on household {}...'.format(hhid))
    df = pd.read_csv('./data/added_hhdata_{}.csv'.format(hhid), index_col=0)
    st = []
    ct = 0
    for idx, row in df.iterrows():
        if row.GH < 2000 and row.GH > -1000:
            st.append(row)
        else:
            ct += 1

    # print(ct)
    df = pd.DataFrame(data=st, columns=df.columns)
    used_feature = ['localhour', 'temperature', 'cloud_cover', 'wind_speed','GH','GH_month','GH_week']
    ordered_features = ['GH', 'localhour', 'temperature', 'cloud_cover','wind_speed','GH_month','GH_week']
    lh = df.apply(lambda x: int(x.localhour[11:13]), axis=1)
    df.localhour = lh

    Y = list(df.GH)[1:]
    Y.append(df.GH.iloc[0])
    Y = np.array(Y)

    X = df[used_feature]
    X = np.array(X)
    X.shape

    temp_df = pd.DataFrame(data=X, columns=used_feature)
    temp_df = temp_df[ordered_features]
    temp_df['y_GH'] = Y
    values = temp_df.values

    # normalize features
    scaler = MinMaxScaler()
    y_gt = values[:,-1:]
    scaled = scaler.fit_transform(values)
    values = scaled

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        test_size=0.10,
                                                        random_state=666)


    classifiers = [
        linear_model.Ridge(alpha=1.0, random_state=0),
        linear_model.Lasso(alpha=0.55, random_state=0),
        linear_model.BayesianRidge(alpha_1=1e-06, alpha_2=1e-06),
        linear_model.LassoLars(alpha=0.55),
        linear_model.LinearRegression(),
        ensemble.RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=75, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
        tree.DecisionTreeRegressor(),
        neural_network.MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
    ]


    print('Start :: Find the best model for this household...')
    for clf in classifiers:
        clf.fit(X_train, Y_train)
        print(clf)
        yhat = clf.predict(X_test)
        scores = cross_val_score(clf, X_train, Y_train)
        rmse = np.sqrt(mean_squared_error(Y_test, yhat))
        mae = mean_absolute_error(Y_test, yhat)
        print('RMSE =>', rmse)
        print('MAE =>', mae)
        print('CV Score =>', scores)
        model_dict = {
            'name': clf.__class__.__name__,
            'rmse': rmse,
            'mae': mae,
        }
        result[hhid].append(model_dict)
        print('')

Start :: Process on household 26...
Start :: Find the best model for this household...
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=0, solver='auto', tol=0.001)
RMSE => 157.807596312
MAE => 102.236546121
CV Score => [ 0.66918512  0.61617114  0.69707543]

Lasso(alpha=0.55, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False)
RMSE => 157.775520369
MAE => 102.145127215
CV Score => [ 0.66901279  0.61646565  0.69683   ]

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)
RMSE => 157.818960394
MAE => 102.091509335
CV Score => [ 0.66819881  0.61693413  0.69559507]

LassoLars(alpha=0.55, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500,

Start :: Find the best model for this household...
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=0, solver='auto', tol=0.001)
RMSE => 157.807596312
MAE => 102.236546121
CV Score => [ 0.66918512  0.61617114  0.69707543]

Lasso(alpha=0.55, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False)
RMSE => 157.775520369
MAE => 102.145127215
CV Score => [ 0.66901279  0.61646565  0.69683   ]

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)
RMSE => 157.818960394
MAE => 102.091509335
CV Score => [ 0.66819881  0.61693413  0.69559507]

LassoLars(alpha=0.55, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False

Start :: Find the best model for this household...
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=0, solver='auto', tol=0.001)
RMSE => 157.807596312
MAE => 102.236546121
CV Score => [ 0.66918512  0.61617114  0.69707543]

Lasso(alpha=0.55, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False)
RMSE => 157.775520369
MAE => 102.145127215
CV Score => [ 0.66901279  0.61646565  0.69683   ]

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)
RMSE => 157.818960394
MAE => 102.091509335
CV Score => [ 0.66819881  0.61693413  0.69559507]

LassoLars(alpha=0.55, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False

RMSE => 56.4536947487
MAE => 33.6635620833
CV Score => [ 0.92995129  0.94716532  0.89220748]

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
RMSE => 127.149010848
MAE => 41.5562171667
CV Score => [ 0.87544914  0.90326421  0.86188619]

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
RMSE => 149.438219506
MAE => 98.1111964955
CV Score => [ 0.76572319

In [4]:
final = []
for k, v in result.items():
    for i in result[k]:
        final.append([str(k), i['name'], i['rmse'], i['mae']])
col = ['household_id', 'alg', 'RMSE', 'MAE']
final = pd.DataFrame(data=final, columns=col)
final.to_csv('GHI.csv')

In [5]:
final

Unnamed: 0,household_id,alg,RMSE,MAE
0,26,Ridge,157.807596,102.236546
1,26,Lasso,157.775520,102.145127
2,26,BayesianRidge,157.818960,102.091509
3,26,LassoLars,169.291042,123.384916
4,26,LinearRegression,157.807838,102.237228
5,26,RandomForestRegressor,121.981909,59.849076
6,26,DecisionTreeRegressor,185.566940,81.655118
7,26,MLPRegressor,154.121465,90.691203
8,59,Ridge,157.807596,102.236546
9,59,Lasso,157.775520,102.145127


In [6]:
gb = final.groupby('alg')

In [7]:
N = 9
ind = range(N)
mean = list(gb['RMSE'].describe()['mean'])
mean.append(141.517)
std = list(gb['RMSE'].describe()['std'])
std.append(21.3441)

In [8]:
print(mean, std)

[155.6466811141064, 179.27062523053692, 155.46572681956789, 165.48748935550694, 155.49257923487031, 155.26119920542416, 119.18989546199481, 155.49051061437476, 141.517] [23.440991520739214, 17.100186345380038, 23.79326666813116, 25.996144816408886, 23.779444050008575, 9.4396759061028579, 20.853456280790219, 23.785352476493319, 21.3441]


In [9]:
p1 = plt.bar(ind, mean, 0.4, yerr=std)

plt.ylabel('GHI ($W/m^2$)')
plt.title('Prediction RMSE of different models')
plt.xlabel('Model')
plt.xticks(ind, ('BR', 'DT', 'Lasso', 'LasLar', 'LR', 'MLP', 'RF', 'Ridge', 'LSTM'))
# plt.yticks(np.arange(50, 2))
plt.axhline(y=gb['RMSE'].describe()['mean']['RandomForestRegressor'], linewidth=0.15)
plt.savefig('ghi.png')