# Table of contents <a id="0"></a>
* [Imports](#imports)
* [Select modules](#select-modules)
* [Define models](#define-models)
* [Select models](#select-models)
* [Cross validation](#cross-validation)
* [Fit and predict test](#fit-and-predict-test)

In [None]:
import os

# Keep a handle on the builtin before the name `print` is rebound below.
__print__ = print


def print(string):
    """Echo *string* through the system shell, then print it normally.

    The message is emitted twice: once via the shell's ``echo`` and once via
    the builtin ``print`` saved in ``__print__``.
    NOTE(review): presumably the shell echo is there so messages show up in
    the Kaggle kernel log -- confirm.

    NOTE(review): *string* is interpolated into the shell command without
    escaping, so double quotes or shell metacharacters in the message are
    interpreted by the shell. Only pass trusted text.
    """
    shell_command = 'echo "{}"'.format(string)
    os.system(shell_command)
    __print__(string)

## Imports <a id="imports"></a>

In [None]:
# System imports
import os, gc, time, joblib
import numpy as np, pandas as pd
import ashrae_scripts as scp
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

# Models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression, Lasso
from xgboost import XGBRegressor
import lightgbm as lgb

# Utilities
from sklearn.metrics import accuracy_score, mean_squared_log_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from mlxtend.regressor import StackingCVRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
# Detect the execution environment from the contents of the working
# directory: a 'kernel-metadata.json' file marks a local checkout, anything
# else is treated as a Kaggle kernel.
# os.listdir() replaces the former `!ls -a` / `!dir /b` shell magics: the
# second magic overwrote the first, so detection only worked on hosts where
# `dir /b` succeeds, and the result shadowed the builtin `dir`.
dir_entries = os.listdir('.')
output_compression_type = 'gzip'
if 'kernel-metadata.json' in dir_entries:
    # Local environment
    src = 'Local'
    compression_type = 'gzip'
    data_folder = '../../../data/'
    output_folder = '../../../data/'
else:
    # Kaggle environment
    src = 'Kaggle'
    compression_type = None
    data_folder = '../input/'
    output_folder = ''

# The dataset-relative file names are the same in both environments.
df_path = data_folder + 'ashrae-train-test-1-0/df_label_enc.csv.gz'
df_test_path = data_folder + 'ashrae-train-test-1-0/df_test_label_enc.csv.gz'

print('Environment set to [{env}]'.format(env=src))
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

## Select modules <a id="select-modules"></a>
[Go back to top](#0)

In [None]:
# Switches selecting which notebook stages run.
predict_lgbm, perform_cross_validation = False, True
predict_train, predict_test, generate_output = True, False, False

# `debug` caps the number of CSV rows read; `scikit` picks the scikit-learn
# style LightGBM wrapper over the native training API.
debug, scikit = True, True

# Only the row cap depends on the debug flag; the boosting settings are the
# same in both modes.
rows = 1000000 if debug else None
rounds = 20000
stopping_rounds = 50

In [None]:
# Column schema as (column name, pandas dtype, feature flag). The flag is 1
# for columns later selected as model inputs and 0 for the target column.
_column_specs = [
    ('building_id', 'int16', 1),
    ('meter', 'int8', 1),
    ('site_id', 'int8', 1),
    ('primary_use', 'int8', 1),
    ('year_built', 'int16', 1),
    ('floor_count', 'int8', 1),
    ('air_temperature', 'float16', 1),
    ('cloud_coverage', 'float16', 1),
    ('dew_temperature', 'float16', 1),
    ('precip_depth_1_hr', 'float16', 1),
    ('sea_level_pressure', 'float16', 1),
    ('wind_direction', 'float16', 1),
    ('wind_speed', 'float16', 1),
    ('meter_reading', 'float16', 0),
    ('square_feet', 'float64', 1),
    ('month', 'int8', 1),
    ('day', 'int8', 1),
    ('hour', 'int8', 1),
    ('day_of_week', 'int8', 1),
    ('weekend', 'int8', 1),
    ('night', 'int8', 1),
]

# Expanded record form of the schema, one dict per column.
dtypes = [
    {'col_name': name, 'data_type': kind, 'feature_col': flag}
    for name, kind, flag in _column_specs
]

# Column name -> dtype mapping in the shape pandas.read_csv expects.
dtype = {spec['col_name']: spec['data_type'] for spec in dtypes}
# Load the training frame with the compact dtypes declared above; `rows`
# caps how many lines are read (None reads everything).
df = pd.read_csv(df_path, dtype=dtype, parse_dates=['timestamp'], nrows=rows)

# Drop every row whose target is +inf.
# NOTE(review): presumably these come from values overflowing float16 --
# confirm.
overflow_rows = df.index[df['meter_reading'] == np.inf]
df.drop(index=overflow_rows, inplace=True)

# Scaler instance; only referenced from commented-out code in this notebook.
sc = StandardScaler()

In [None]:
# Split the frame into the model inputs (columns flagged feature_col == 1)
# and the target series.
target_col = 'meter_reading'
feature_cols = [spec['col_name'] for spec in dtypes if spec['feature_col'] == 1]

y = df[target_col]
X = df[feature_cols]

# Observed target range; later cells clip predictions into it.
min_val, max_val = y.min(), y.max()
# X = sc.fit_transform(X)

In [None]:
# Outside debug runs the full frame is no longer needed once X and y have
# been extracted, so release it.
if not debug:
    del df
    # The call parentheses were missing before, so no collection ever ran.
    gc.collect()

In [None]:
# Hold out 20% of the rows for validation. The split is needed by the
# LightGBM cell and the cross-validation cell as well as the stack-training
# cell, so build it whenever any of them is enabled (previously only
# `predict_train` triggered it and the other cells hit a NameError).
# The seed matches the random_state=13 used by the estimators so that runs
# are reproducible.
if predict_train or predict_lgbm or perform_cross_validation:
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, train_size=0.8, random_state=13)

# X and y are only needed again for the per-site test predictions.
if not debug and not predict_test:
    del X
    del y
gc.collect()

In [None]:
def _rmsle_value(actual, predicted):
    """Return the RMSLE of *predicted* against *actual*.

    Predictions are clipped into [min_val, max_val] before the logs are taken.
    """
    clipped = np.clip(predicted, min_val, max_val)
    log_diff = np.log1p(clipped) - np.log1p(actual)
    return np.sqrt(np.mean(np.power(log_diff, 2)))


if scikit:
    # using scikit api: metric receives (y_true, y_pred)
    def rmsle(y_true, y_pred):
        """Custom eval metric returning (name, value, is_higher_better)."""
        return 'RMSLE', _rmsle_value(y_true, y_pred), False
else:
    # using native api: metric receives (preds, dataset)
    def rmsle(preds, train_data):
        """Custom eval metric returning (name, value, is_higher_better)."""
        return 'RMSLE', _rmsle_value(train_data.get_label(), preds), False

In [None]:
if scikit:
    # scikit-learn style estimator, configured here and fitted in a later cell.
    _sk_params = {
        'metrics': 'rmsle',
        'learning_rate': 0.05,
        'objective': 'regression',
        'bagging_freq': 5,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'num_round': rounds,
        'verbose': -1,
    }
    gbm = lgb.LGBMRegressor(**_sk_params)
else:
    # Parameter dict for the native lgb.train() API.
    params = dict(
        boosting_type='gbdt',
        objective='regression',
        metrics='rmsle',
        learning_rate=0.05,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=5,
        verbose=-1,
    )

In [None]:
# Fit LightGBM on the training split with early stopping on the validation
# split, then report the validation RMSLE.
if predict_lgbm:
    if scikit:
        gbm.fit(X_train, y_train,
                eval_set=[(X_val, y_val)],
                eval_metric=rmsle,
                early_stopping_rounds=stopping_rounds)
        y_pred = gbm.predict(X_val)
    else:
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val)
        # `rounds` is the configured boosting budget; the previous
        # `num_round` name was never defined and raised a NameError.
        gbm = lgb.train(params=params,
                        train_set=lgb_train,
                        num_boost_round=rounds,
                        valid_sets=lgb_val,
                        feval=rmsle,
                        early_stopping_rounds=stopping_rounds)
        y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
    # Clip once for both branches; mean_squared_log_error rejects negatives.
    y_pred = np.clip(y_pred, min_val, max_val)
    # Use a separate name for the score so the `rmsle` metric function is
    # not overwritten and this cell can be re-run.
    rmsle_lgbm = mean_squared_log_error(y_val, y_pred)**0.5
    print('Root mean square log error: {:.2f}'.format(rmsle_lgbm))

## Define models <a id="define-models"></a>
[Go back to top](#0)

In [None]:
# Candidate regressors. Every estimator that accepts a seed uses the same one.
_seed = 13

# NOTE(review): `normalize=` was removed from scikit-learn linear models in
# release 1.2 -- confirm the pinned scikit-learn version still accepts it.
lr = LinearRegression(fit_intercept=False, normalize=True)
lasso = Lasso(normalize=True, random_state=_seed)
svr = LinearSVR(random_state=_seed)
rf = RandomForestRegressor(n_estimators=5, random_state=_seed)

# Rebinds `gbm`, replacing any LightGBM model created in an earlier cell.
gbm = lgb.LGBMRegressor(
    boosting_type='gbdt',
    num_leaves=500,
    metrics='rmsle',
    learning_rate=0.05,
    objective='regression',
    bagging_freq=5,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    num_round=rounds,
    verbose=-1,
    random_state=_seed,
)

## Select models <a id="select-models"></a>
[Go back to top](#0)

In [None]:
# Each entry is (estimator, label, cross_val flag); the flag decides whether
# the cross-validation cell scores that estimator on its own.
base_reg = [
    (gbm, 'LightGBM', 0),  # 1.3
    # (svr, 'SVR', 1),
    # (rf, 'RandomForest', 0),  # 0.67
    # (lasso, 'Lasso', 1),
    (lr, 'LinearRegression', 0),  # 1.77
]
meta_reg = [(lasso, 'Lasso1', 1)]  # 1
# meta_reg = [(rf, 'RandomForest', 1)]  # 0.69
meta, meta_label, meta_cross_val = meta_reg[0]

# Stack the base estimators under the chosen meta regressor; the meta
# regressor also receives the original features.
base_models = [entry[0] for entry in base_reg]
stack = StackingCVRegressor(
    regressors=base_models,
    meta_regressor=meta,
    use_features_in_secondary=True,
    random_state=13,
)
stack_reg = [(stack, 'Stack', 1)]

## Cross validation <a id="cross-validation"></a>
[Go back to top](#0)

In [None]:
if perform_cross_validation:
    def rmsle_crossval(y_true, y_pred):
        """Return the RMSLE of *y_pred* (clipped to the target range)."""
        y_pred = np.clip(y_pred, min_val, max_val)
        return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2)))

    # greater_is_better=False makes the scorer return the negated RMSLE.
    rmsle_scorer = make_scorer(rmsle_crossval, greater_is_better=False)
    models = base_reg + stack_reg
    #models = stack_reg

    for model, label, cross_val in models:
        # Only estimators flagged with cross_val == 1 are scored.
        if cross_val == 1:
            t1_start = time.perf_counter()
            # Undo the scorer's sign flip. The metric is already a root
            # error, so no further square root is applied (the previous
            # `** 0.5` was a leftover from neg_mean_squared_error scoring
            # and reported sqrt(RMSLE)).
            scores = -cross_val_score(model, X_train, y_train, cv=5,
                                      scoring=rmsle_scorer,
                                      verbose=0)
            t1_stop = time.perf_counter()
            print("Root mean squared log error score: %0.2f (+/- %0.2f) [%s]" % (
                scores.mean(), scores.std(), label))
            print('Elapsed time in minutes: {sec:.2f}\n'.format(sec=(t1_stop-t1_start)/60))

In [None]:
# Fit the stacked model on the training split and report its RMSLE on the
# validation split.
if predict_train:
    stack.fit(X_train, y_train)
    y_pred_stack = stack.predict(X_val)
    # Clip into the observed target range before scoring.
    y_pred_stack = np.clip(y_pred_stack, min_val, max_val)
    # Use a separate name for the score so the `rmsle` metric function
    # defined earlier is not overwritten by a float.
    rmsle_stack = mean_squared_log_error(y_val, y_pred_stack)**0.5
    print('Root mean square log error (stack): {:.2f}'.format(rmsle_stack))

In [None]:
if predict_test:
    # The test file is read with the feature columns' dtypes only; the
    # target column's entry is left out of the mapping.
    dtype = {
        spec['col_name']: spec['data_type']
        for spec in dtypes
        if spec['feature_col'] == 1
    }
    df_test = pd.read_csv(
        df_test_path,
        dtype=dtype,
        parse_dates=['timestamp'],
        nrows=rows,
    )
    #df_test = sc.transform(df_test)

## Fit and predict test <a id="fit-and-predict-test"></a>
[Go back to top](#0)

In [None]:
# Train one stacked model per site and predict that site's test rows.
if predict_test:
    sites = list(X['site_id'].unique())
    # Per-site result frames are collected here and concatenated once at the
    # end; DataFrame.append was removed in pandas 2.0 and re-copied the
    # accumulated frame on every iteration.
    site_outputs = []

    for site in sites:
        t1_start = time.perf_counter()
        print('Site [{site_no}] start...'.format(site_no=site))
        # A fresh stack per site so no fitted state carries over.
        stack = StackingCVRegressor(regressors=[model for model, label, cross_val in base_reg]
                                    , meta_regressor=meta
                                    , use_features_in_secondary=True
                                    , random_state=13)

        # Training rows for this site (X and y share the same index).
        train_mask = X['site_id'] == site
        X_stack = X.loc[train_mask, :]
        y_stack = y.loc[train_mask]
        stack.fit(X_stack, y_stack)

        # Predict only when the test set actually contains this site.
        test_mask = df_test['site_id'] == site
        if test_mask.any():
            pred_site = stack.predict(df_test.loc[test_mask, feature_cols])
            pred_site = np.clip(pred_site, min_val, max_val)
            site_outputs.append(pd.DataFrame({'row_id': df_test.loc[test_mask, 'row_id'],
                                              'meter_reading': pred_site}))
        print('Site [{site_no}] end!'.format(site_no=site))
        t1_stop = time.perf_counter()
        print('Elapsed time in minutes: {sec:.2f}'.format(sec=(t1_stop-t1_start)/60))

    if site_outputs:
        outputs = pd.concat(site_outputs, ignore_index=True)
    else:
        # No site had test rows: keep the expected columns so later cells
        # still find them.
        outputs = pd.DataFrame(columns=['row_id', 'meter_reading'])
    y_pred = outputs[target_col]

In [None]:
# After the per-site test predictions the training inputs are no longer
# needed; release them on full (non-debug) runs.
if predict_test and not debug:
    del X, y
    gc.collect()

In [None]:
# The test frame only exists when predict_test is set; drop it and run a
# garbage collection pass once it has been used.
if predict_test:
    del df_test
    gc.collect()

In [None]:
# Write the collected predictions to the working directory, without the
# frame index.
# NOTE(review): `outputs` is only created by the predict_test cell -- confirm
# both flags are enabled together.
if generate_output:
    submission_path = 'submission.csv'
    outputs.to_csv(submission_path, index=False)