In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
plt.rc('font', family='serif')
plt.rc('axes', labelsize=12)
plt.rc('xtick', labelsize=10, color='grey')
plt.rc('ytick', labelsize=10, color='grey')
plt.rc('legend', fontsize=12, loc='lower left')
plt.rc('figure', titlesize=12)
plt.rc('savefig', dpi=330, bbox='tight')
%matplotlib inline

default_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [3]:
import utils
from sklearn.metrics import mean_absolute_error, mean_squared_error

import lightgbm as lgb

In [None]:
def generate_fig_path(fig_name):
    '''
    Build the full path (directory + file name) of a figure.

    The returned string is intended to be handed to plt.savefig.
    Relies on the notebook-level variable PATH_TO_ROOT, which must be
    defined before this function is called.
    '''
    return f'{PATH_TO_ROOT}/results/fig/section3.4 tabular data model/{fig_name}'

def generate_input_data_path(data_name):
    '''
    Build the path of an input CSV file from its name (without extension).

    Relies on the notebook-level variable PATH_TO_ROOT, which must be
    defined before this function is called.
    '''
    return f'{PATH_TO_ROOT}/results/data/{data_name}.csv'

def generate_result_data_path(data_name):
    '''
    Build the path of the CSV file where prediction results are saved,
    from its name (without extension).

    Relies on the notebook-level variable PATH_TO_ROOT, which must be
    defined before this function is called.
    '''
    return f'{PATH_TO_ROOT}/results/data/predict/{data_name}.csv'

In [4]:
def lightGBM_train(region, params, num_boost_round=5000):
    '''
    Train a LightGBM regressor on the daily electricity demand of one region.

    Reads the `{region}_daily` input CSV, keeps the rows up to 2019-07-01,
    fits the model on the training split, prints the train/test RMSE, writes
    the predictions for the whole data set to the `lgbm_{region}` result CSV
    and returns a frame ready for plotting.

    Parameters
    ----------
    region : str
        Region name used to build the input and result file names.
    params : dict
        LightGBM parameters forwarded to lgb.train.
    num_boost_round : int, default 5000
        Number of boosting rounds forwarded to lgb.train.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'ds', with columns 'y' (observed demand), 'yhat'
        (model prediction) and 'train' (True for rows whose index label
        appears in the 'ds' column of the training split).
    '''
    target_col = 'Electricity demand, daily sum, (GWh)'
    # Order matters: the categorical indices below refer to positions in this list.
    feature_cols = ['Temperature, daily mean (degC)', 'Temperature, daily peak (degC)',
                    'Holiday', 'dayOfWeek', 'Month']
    categorical_idx = [2, 3, 4]  # Holiday, dayOfWeek, Month

    # read the data
    data = pd.read_csv(generate_input_data_path(f'{region}_daily'), index_col=0)
    data.index = pd.to_datetime(data.index)
    data = data.truncate(after='2019-07-01')  # do not use the last year to avoid the influence of COVID

    # Build an independent frame (assign/copy) so that the columns added here
    # and further down never write into a slice of `data`
    # (avoids SettingWithCopyWarning).
    data_lgmb = (
        data[['Temperature, daily mean (degC)', 'Temperature, daily peak (degC)',
              'Holiday', 'Weekend', target_col]]
        .assign(Month=lambda d: d.index.month,
                dayOfWeek=lambda d: d.index.weekday)
        .dropna()
        .copy()
    )

    # train_test split
    # NOTE(review): utils.prepare_data is assumed to return frames that carry
    # the dates in a 'ds' column (it is read below) - confirm against utils.
    data_train, data_test = utils.prepare_data(data_lgmb, train_ratio=0.75)
    X_train = data_train[feature_cols].values
    X_test = data_test[feature_cols].values
    y_train = data_train[target_col].values
    y_test = data_test[target_col].values
    X_all = data_lgmb[feature_cols].values

    d_train = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_idx)

    # train and print the errors
    regr = lgb.train(params, d_train, num_boost_round)

    # mean_squared_error expects (y_true, y_pred)
    rmse_train = mean_squared_error(y_train, regr.predict(X_train)) ** 0.5
    rmse_test = mean_squared_error(y_test, regr.predict(X_test)) ** 0.5
    print(f'-------City: {region}--------------')
    print(f'RMSE for Train: {rmse_train}')
    print(f'RMSE for Test: {rmse_test}')

    # make prediction on the whole data set and save the results for model comparison
    data_lgmb['yhat_lgbm'] = regr.predict(X_all)
    if 'ds' not in data_lgmb.columns:
        # Nothing in this function creates 'ds'; fall back to the datetime
        # index so the dates are still written to the CSV (index=False below).
        data_lgmb['ds'] = data_lgmb.index
    data_lgmb.index = data_lgmb['ds']
    data_lgmb.to_csv(generate_result_data_path(f'lgbm_{region}'), index=False)

    # prepare the data for plotting
    # rename() returns a new frame, so the assignments below are safe.
    verif_plot = data_lgmb[[target_col, 'yhat_lgbm']].rename(
        columns={target_col: 'y', 'yhat_lgbm': 'yhat'})
    verif_plot['train'] = False
    verif_plot.loc[data_train['ds'], 'train'] = True

    return verif_plot