# Table of contents <a id="0"></a>
* [Imports](#imports)
* [Select modules](#select-modules)
* [Select variables](#select-variables)
* [LGBM Params](#lgbm-params)
* [Define models](#define-models)
* [Select models](#select-models)
* [Cross validation](#cross-validation)
* [Fit and predict test](#fit-and-predict-test)

In [None]:
import builtins
import os
import shlex

# Always bind to the real builtin: re-running this cell would otherwise capture
# the override below and make it call itself recursively.
__print__ = builtins.print


def print(string):
    """Echo ``string`` through the system shell, then print it normally.

    The message is first written with the shell's ``echo`` via ``os.system``
    and then passed to the builtin ``print``.

    Args:
        string: The message to emit. It is converted with ``str`` before being
            handed to the shell.
    """
    # Quote the message so the shell treats it as one literal argument;
    # unquoted, characters such as `"`, `$` or backticks would be interpreted
    # by the shell (e.g. `$(...)` would be executed).
    os.system(f'echo {shlex.quote(str(string))}')
    __print__(string)

## Imports <a id="imports"></a>
[Go back to top](#0)

In [2]:
# System imports
import os, gc, time, joblib, copy, datetime
import numpy as np, pandas as pd
import ashrae_scripts as scp
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

# Models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression, Lasso
from xgboost import XGBRegressor
import lightgbm as lgb

# Utilities
from sklearn.metrics import accuracy_score, mean_squared_log_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from mlxtend.regressor import StackingCVRegressor
from sklearn.preprocessing import StandardScaler

ModuleNotFoundError: No module named 'lightgbm'

In [5]:
# Runtime environment: exactly one of the two configurations below is active.

# --- Local environment (active) ---
src, compression_type = 'Local', 'gzip'
data_folder = output_folder = '../../../data/'
df_path = data_folder + 'ashrae-train-test-1-0/df_label_enc.csv.gz'
df_test_path = data_folder + 'ashrae-train-test-1-0/df_test_label_enc.csv.gz'

# --- Kaggle environment (uncomment this group and comment the one above) ---
# src, compression_type = 'Kaggle', None
# data_folder, output_folder = '../input/', ''
# df_path = data_folder + 'ashrae-train-test-1-0/df_label_enc.csv.gz'
# df_test_path = data_folder + 'ashrae-train-test-1-0/df_test_label_enc.csv.gz'

print('Environment set to [{env}]'.format(env=src))

# To list every file available under the data folder:
# for dirname, _, filenames in os.walk(data_folder):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

Environment set to [Local]


## Select modules <a id="select-modules"></a>
[Go back to top](#0)

In [6]:
# Stage toggles that are read by later cells.
perform_cross_validation = False  # gates the per-site cross-validation cell
predict_test = False  # gates the cell that fits per-site models and dumps them with joblib

# Toggles that no visible cell reads.
predict_lgbm = False
predict_train = True
generate_output = False

## Select variables <a id="select-variables"></a>
[Go back to top](#0)

In [None]:
debug = True
scikit = True
RANDOM_STATE = 13

# Number of CSV rows to load (passed to read_csv as nrows); None reads the
# whole file.
rows = 5000000 if debug else None

# `cols` is only created by the site-wise prediction cell further down, so on
# a fresh kernel it does not exist yet and an unguarded reference raises
# NameError. Only strip 'site_id' when a previous run has already defined it.
if 'cols' in globals() and 'site_id' in cols:
    cols.remove('site_id')

# LightGBM
# Boosting rounds, handed to the regressor as num_round.
rounds = 10000 if debug else 25000
# Same value in debug and full runs.
# NOTE(review): no visible cell reads this; the site-wise fit passes a literal
# early_stopping_rounds=100 instead - confirm which value is intended.
stopping_rounds = 50

boosting_type = 'gbdt'
num_leaves = 300
min_data_in_leaf = 20
max_depth = None
metrics = 'rmsle'
learning_rate = 0.05
bagging_freq = 5
feature_fraction = 0.9
bagging_fraction = 0.8

In [None]:
# Column schema as (column name, storage dtype, feature flag). A flag of 1
# marks a model input; 0 marks a column that is loaded but not used as a
# feature (the target).
# NOTE(review): float16 tops out at 65504, so larger 'meter_reading' values
# would presumably load as inf - the next cell drops rows equal to np.inf;
# confirm that losing those rows is intended.
_COLUMN_SCHEMA = (
    ('building_id', 'int16', 1),
    ('meter', 'int8', 1),
    ('site_id', 'int8', 1),
    ('primary_use', 'int8', 1),
    ('year_built', 'int16', 1),
    ('floor_count', 'int8', 1),
    ('air_temperature', 'float16', 1),
    ('cloud_coverage', 'float16', 1),
    ('dew_temperature', 'float16', 1),
    ('precip_depth_1_hr', 'float16', 1),
    ('sea_level_pressure', 'float16', 1),
    ('wind_direction', 'float16', 1),
    ('wind_speed', 'float16', 1),
    ('meter_reading', 'float16', 0),
    ('square_feet', 'float64', 1),
    ('month', 'int8', 1),
    ('day', 'int8', 1),
    ('hour', 'int8', 1),
    ('day_of_week', 'int8', 1),
    ('weekend', 'int8', 1),
    ('night', 'int8', 1),
)

# Same list-of-dicts layout the later cells iterate over.
dtypes = [
    {'col_name': name, 'data_type': kind, 'feature_col': is_feature}
    for name, kind, is_feature in _COLUMN_SCHEMA
]

# Column -> dtype mapping handed to the CSV reader.
dtype = {entry['col_name']: entry['data_type'] for entry in dtypes}

# 'timestamp' is parsed as a date; `rows` caps how many rows are read.
df = pd.read_csv(
    df_path,
    dtype=dtype,
    parse_dates=['timestamp'],
    nrows=rows,
)

In [None]:
# Remove, in place, every row whose target equals +inf.
is_inf_target = df['meter_reading'] == np.inf
df.drop(index=df.loc[is_inf_target].index, inplace=True)

# Debug runs use a fixed list of sites; otherwise every site present in the
# loaded data is used.
sites = [0, 1, 2, 3, 4, 5, 6, 7] if debug else list(df['site_id'].unique())

# sc = StandardScaler()

In [None]:
# Model inputs are the schema entries flagged with feature_col == 1.
feature_cols = [col['col_name'] for col in dtypes if col['feature_col'] == 1]
target_col = 'meter_reading'

# Observed target range; later cells clip predictions into [min_val, max_val].
min_val = df[target_col].min()
max_val = df[target_col].max()

X = df[feature_cols]
y = df[target_col]

# 50/50 hold-out split. Seeded with RANDOM_STATE (the same seed the model
# uses) so the split, and therefore the reported validation scores, are
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.5, train_size=0.5, random_state=RANDOM_STATE)
# X = sc.fit_transform(X)

## LGBM Params <a id="lgbm-params"></a>
[Go back to top](#0)

In [7]:
if not scikit:
    # Native-API path: only the parameter dict is prepared here. No `gbm`
    # object is created on this branch, while the later cells deep-copy `gbm`.
    params = {
        'boosting_type': boosting_type,
        'num_leaves': num_leaves,
        'min_data_in_leaf': min_data_in_leaf,
        'max_depth': max_depth,
        'metrics': metrics,
        'learning_rate': learning_rate,
        'bagging_freq': bagging_freq,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction
    }

else:
    # scikit-learn wrapper path: build the template regressor that the later
    # cells copy once per fit.
    # NOTE(review): `verbose_eval` looks like an lgb.train() argument rather
    # than a model parameter - confirm it is accepted here.
    gbm = lgb.LGBMRegressor(
        boosting_type=boosting_type,
        num_leaves=num_leaves,
        min_data_in_leaf=min_data_in_leaf,
        max_depth=max_depth,
        metrics=metrics,
        learning_rate=learning_rate,
        bagging_freq=bagging_freq,
        feature_fraction=feature_fraction,
        bagging_fraction=bagging_fraction,
        num_round=rounds,
        random_state=RANDOM_STATE,
        verbose_eval=-1,
        verbose=-1,
    )

NameError: name 'scikit' is not defined

In [None]:
# Custom RMSLE evaluation metric. Both variants clip predictions into the
# observed target range [min_val, max_val] and return a 3-tuple
# (name, value, flag); the trailing False is presumably LightGBM's
# "is higher better" flag - verify against the LightGBM docs.
if scikit:
    # scikit-learn wrapper signature: (y_true, y_pred)
    def rmsle(y_true, y_pred):
        """Return ('RMSLE', root mean squared log error, False)."""
        clipped = np.clip(y_pred, min_val, max_val)
        log_gap = np.log1p(clipped) - np.log1p(y_true)
        return 'RMSLE', np.sqrt(np.mean(np.power(log_gap, 2))), False
else:
    # native API signature: (preds, train_data); labels come from the dataset
    def rmsle(preds, train_data):
        """Return ('RMSLE', root mean squared log error, False)."""
        labels = train_data.get_label()
        clipped = np.clip(preds, min_val, max_val)
        log_gap = np.log1p(clipped) - np.log1p(labels)
        return 'RMSLE', np.sqrt(np.mean(np.power(log_gap, 2))), False

In [None]:
# Display the regressor's parameters. The method must be called: a bare
# `gbm.get_params` only shows the bound-method object.
gbm.get_params()

## Define models <a id="define-models"></a>
[Go back to top](#0)

In [None]:
# svr = LinearSVR(random_state=RANDOM_STATE)
# lasso = Lasso(normalize=True, random_state=RANDOM_STATE)
# rf = RandomForestRegressor(n_estimators=5, random_state=RANDOM_STATE)
# lr = LinearRegression(fit_intercept=False, normalize=True)

# base_reg = [(gbm, 'LightGBM', 1) #1.3
#             #, (svr, 'SVR', 1)
#             #, (rf, 'RandomForest', 0) #0.67
#             , (lasso, 'Lasso', 1)
#             , (lr, 'LinearRegression', 1) #1.77
#            ]
# meta_reg = [(lasso, 'Lasso1', 0)] #1
# #meta_reg = [(rf, 'RandomForest', 1)] #0.69
# meta, meta_label, meta_cross_val=meta_reg[0]

# stack = StackingCVRegressor(regressors=[model for model, label, cross_val in base_reg]
#                             , meta_regressor=meta
#                             , use_features_in_secondary=True
#                             , random_state=RANDOM_STATE)
# stack_reg = [(stack, 'Stack', 0)] 

## Cross validation <a id="cross-validation"></a>
[Go back to top](#0)

In [None]:
if perform_cross_validation:
    def rmsle_crossval(y_true, y_pred):
        """Return the root mean squared log error as a plain float.

        Predictions are clipped into the observed target range
        [min_val, max_val] before the logarithm is taken.
        """
        y_pred = np.clip(y_pred, min_val, max_val)
        return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2)))

    # greater_is_better=False makes the scorer report the negated RMSLE.
    rmsle_scorer = make_scorer(rmsle_crossval, greater_is_better=False)

    # Cross-validate this fixed list of sites. (The list derived from
    # X['site_id'] was overwritten immediately, so it was never used.)
    sites = [0, 1, 2, 3, 4, 5, 6, 7]
    for site in sites:
        print('Site [{site_no}] start...'.format(site_no=site))
        site_index = list(X[X['site_id']==site].index)
        print('No of rows in site: [{rows}]'.format(rows=len(site_index)))

        X_site = X.loc[site_index, :]
        y_site = y[site_index]

        model = copy.deepcopy(gbm)
        t_start = time.perf_counter()
        # Undo the scorer's sign flip to get the RMSLE of each fold.
        # rmsle_crossval already takes the square root, so no further root is
        # applied here: the former `**0.5` (a leftover from
        # 'neg_mean_squared_error' scoring) reported sqrt(RMSLE).
        scores = -cross_val_score(model, X_site, y_site, cv=5
                                  , scoring=rmsle_scorer
                                  , verbose=0
                                 )
        print("Root mean squared log error score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

        t_stop = time.perf_counter()
        print('{label:<30s}: {value:.2f}'.format(
            label='Elapsed time in minutes',
            value=(t_stop-t_start)/60))
        print('{dt:%Y-%m-%d %H-%M-%S}'.format(dt=datetime.datetime.now()))
        print('Site [{site_no}] end!\n'.format(site_no=site))

In [None]:
# predict site-wise
# Fit one copy of `gbm` per site on the training half, predict that site's
# validation rows, then score all sites together.
site_frames = []  # one prediction frame per site, concatenated once at the end
cols = copy.deepcopy(feature_cols)

for site in sites:
    t_start = time.perf_counter()
    model = copy.deepcopy(gbm)
    site_index = list(X_train[X_train['site_id']==site].index)
    print('{label:<30s}: {value}'.format(
        label='Site',
        value=site))
    print('{label:<30s}: {value}'.format(
        label='Rows',
        value=len(site_index)))
    print('Start...')
    print('{dt:%Y-%m-%d %H-%M-%S}'.format(dt=datetime.datetime.now()))

    X_train_site = X_train.loc[site_index, cols]
    y_train_site = y_train[site_index]

    site_index_val = list(X_val[X_val['site_id']==site].index)
    X_val_site = X_val.loc[site_index_val, cols]
    y_val_site = y_val[site_index_val]

    # NOTE(review): early stopping uses a literal 100 here although
    # `stopping_rounds` is configured earlier - confirm which is intended.
    model.fit(X_train_site, y_train_site
              ,eval_set=[(X_val_site, y_val_site)]
              ,eval_metric=rmsle
              ,verbose=False
              ,early_stopping_rounds=100)

    y_pred_site = model.predict(X_val_site)
    site_frames.append(
        pd.DataFrame({'meter_reading': y_pred_site,'meter_reading_val': y_val_site}))
    print('Completed')
    print('{label:<30s}: {value:.2f}'.format(
        label='Elapsed time in minutes',
        value=(time.perf_counter()-t_start)/60))
    print('\n')

# A single concat replaces the per-iteration DataFrame.append, which copied
# the accumulated frame on every pass and is deprecated in pandas.
outputs = pd.concat(site_frames, ignore_index=True) if site_frames else pd.DataFrame()
y_pred = outputs[target_col]
y_pred = np.clip(y_pred, min_val, max_val)
y_val_site = outputs['meter_reading_val']
# Stored under its own name so the `rmsle` eval-metric function used in the
# loop above is not overwritten (which broke re-running this cell).
rmsle_sitewise = mean_squared_log_error(y_val_site, y_pred)**0.5
print('Root mean square log error (stack): {:.2f}'.format(rmsle_sitewise))

In [None]:
# predict full
# Fit a single copy of `gbm` on the whole training half and score it on the
# whole validation half.
t_start = time.perf_counter()
print('Start...')
print('{dt:%Y-%m-%d %H-%M-%S}'.format(dt=datetime.datetime.now()))

model = copy.deepcopy(gbm)
model.fit(X_train, y_train)
y_pred_full = model.predict(X_val)
# Clip into the observed target range before scoring.
y_pred_full = np.clip(y_pred_full, min_val, max_val)
# Stored under its own name so the `rmsle` eval-metric function is not
# overwritten by a float.
rmsle_full = mean_squared_log_error(y_val, y_pred_full)**0.5
# Plain literal: the former .format(site=site) had no placeholder to fill but
# still evaluated `site`, raising NameError unless a per-site loop ran first.
print('Completed')
print('{label:<30s}: {value:.2f}'.format(
    label='Elapsed time in minutes',
    value=(time.perf_counter()-t_start)/60))
print('\n')
print('Root mean square log error (full): {:.2f}'.format(rmsle_full))

In [None]:
if predict_test:
    # Fit a copy of `gbm` on all rows of each selected site and persist it.
    for site in sites:
        # Only sites whose id is below 1 are fitted; every other site is skipped.
        if not site < 1:
            continue

        t_start = time.perf_counter()
        model = copy.deepcopy(gbm)
        print('Site [{site_no}] start...'.format(site_no=site))
        print('{dt:%Y-%m-%d %H-%M-%S}'.format(dt=datetime.datetime.now()))

        # Row labels of this site, used to slice both features and target.
        rows_of_site = X[X['site_id'] == site]
        site_index = list(rows_of_site.index)
        X_site = X.loc[site_index, cols]
        y_site = y[site_index]
        model.fit(X_site, y_site)

        t_stop = time.perf_counter()
        elapsed_minutes = (t_stop - t_start) / 60
        print('{label:<30s}: {value:.2f}'.format(
            label='Elapsed time in minutes',
            value=elapsed_minutes))
        print('{dt:%Y-%m-%d %H-%M-%S}'.format(dt=datetime.datetime.now()))
        print('Site [{site_no}] end!\n'.format(site_no=site))

        # save model (written relative to the current working directory)
        model_name = 'lgbm_' + str(site) + '_.pkl'
        joblib.dump(model, model_name)



In [None]:
# model = copy.deepcopy(gbm)
# print('Full model start...')
# t1_start = time.perf_counter()

# X_full = X.loc[:, cols]
# y_full = copy.deepcopy(y)
# model.fit(X_full, y_full)

# t1_end = time.perf_counter()
# print('Model fit time: {sec:.2f}'.format(sec=(t1_end-t1_start)/60))
# # save model
# model_name = 'lgbm_full.pkl'
# joblib.dump(model, model_name)