In [39]:
# toggle to save space
mode = '_mean' if False else '_all'
print(mode)

_all


In [40]:
gbm_params = {
    'n_estimators' : 1500, # 500,  
    'max_depth' : -1,
    'learning_rate': 0.05,
    'bagging_fraction': 0.7, 
    
    'feature_fraction' : 0.9,
    'bagging_freq': 5,
    #'subsample' : 0.1,  # 
    #'subsample_freq' : 1,
    'num_leaves' : 31,
    'metric':'rmse',
    #'lambda_l1' : 1,  # Try defaults
    #'lambda_l2': 1, # Try defaults
    'verbose': 100
}

In [41]:
# TODO: add https://www.kaggle.com/corochann/optuna-tutorial-for-hyperparameter-optimization

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [43]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [44]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float32, 'year_built': np.float16, 'floor_count': np.float16},
}

def loadFile(name):
    for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
        if path.exists(dir_path + name + '.csv'):
            return  ConvertToDatetime().transform(
                pd.read_csv(dir_path + name + '.csv', dtype=file_dtype[name]))
        


In [45]:
building = loadFile('building_metadata')
pre_train = loadFile('train')
#test = loadFile('test')

weather_processed_df = pd.read_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')

In [46]:
def merge(x):
    weather_processed_df = pd.read_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')
    x = x.merge(building, on=['building_id'], how='left')
    gc.collect()
    x = x.merge(weather_processed_df, on=['site_id', 'timestamp'], how='left')
    gc.collect()
    x['site_id'] = x['site_id'].astype('int8');
    x['cloud_coverage'] = x['cloud_coverage'].astype('float16')
    gc.collect()
    return x
        
train = merge(pre_train)  
print(train)
print('!!!! Warning we are missing weather for '+ str(train['air_temperature'].isnull().sum())+' rows')
train = train.dropna(axis=0, subset=['air_temperature'])


          building_id  meter           timestamp  meter_reading  site_id  \
0                   0      0 2016-01-01 00:00:00       0.000000        0   
1                   1      0 2016-01-01 00:00:00       0.000000        0   
2                   2      0 2016-01-01 00:00:00       0.000000        0   
3                   3      0 2016-01-01 00:00:00       0.000000        0   
4                   4      0 2016-01-01 00:00:00       0.000000        0   
5                   5      0 2016-01-01 00:00:00       0.000000        0   
6                   6      0 2016-01-01 00:00:00       0.000000        0   
7                   7      0 2016-01-01 00:00:00       0.000000        0   
8                   8      0 2016-01-01 00:00:00       0.000000        0   
9                   9      0 2016-01-01 00:00:00       0.000000        0   
10                 10      0 2016-01-01 00:00:00       0.000000        0   
11                 11      0 2016-01-01 00:00:00       0.000000        0   
12          

[20216100 rows x 101 columns]


In [47]:
# See holiday notebook to generate, this is optional
holiday_df = None
if path.exists('../input/ashrae-energy-prediction-pickles/holiday_df.pickle'):
    holiday_df = pd.read_pickle('../input/ashrae-energy-prediction-pickles/holiday_df.pickle')
if holiday_df is not None:
    print(holiday_df.sample(20))

          site_id           timestamp                                 holiday
57370069       14 2018-12-25 14:00:00                           Christmas Day
54545077       14 2017-11-11 18:00:00                            Veterans Day
10100026        4 2016-07-04 14:00:00                        Independence Day
47993          15 2016-01-01 20:00:00                          New Year's Day
4416702         7 2016-03-25 01:00:00                             Good Friday
61477171       15 2018-11-22 04:00:00                            Thanksgiving
56828629       14 2018-10-08 06:00:00                            Columbus Day
61217671       15 2018-10-08 22:00:00                            Columbus Day
35786309        5 2017-12-25 09:00:00                           Christmas Day
209900          5 2016-01-04 19:00:00  New Year Holiday [Scotland] (Observed)
34257391        4 2018-01-15 18:00:00             Martin Luther King, Jr. Day
21346656        0 2018-01-01 04:00:00                          N

In [48]:
class MeterReadingLog1p(TransformerMixin):
  
    def transform(self, df, **transform_params):
        if 'meter_reading' in df.columns:
            df['meter_reading_log1p'] = np.log1p(df['meter_reading'])
            df = df.drop('meter_reading', axis=1)
        return df
    
    def fit(self, X, y=None, **fit_params):
        return self
print(train.sample(20, random_state=42))
print(MeterReadingLog1p().transform(train.sample(20, random_state=42)))
gc.collect()

          building_id  meter           timestamp  meter_reading  site_id  \
19993926         1389      1 2016-12-28 01:00:00      18.774500       15   
6530527          1240      3 2016-05-02 03:00:00       0.000000       14   
6484853           464      0 2016-05-01 08:00:00      13.480000        3   
18072271          656      0 2016-11-23 15:00:00      11.100000        5   
15944342          322      0 2016-10-16 18:00:00      24.889999        3   
4563775           643      0 2016-03-27 23:00:00      72.724197        4   
18118279         1278      0 2016-11-24 10:00:00     113.069000       14   
2121957           575      0 2016-02-08 11:00:00      69.300003        4   
18890113          890      2 2016-12-08 14:00:00     631.349976        9   
14306902         1390      2 2016-09-17 18:00:00     134.160004       15   
5056620           702      0 2016-04-05 22:00:00       9.500000        5   
6530049           991      2 2016-05-02 03:00:00       0.000000        9   
5177920     

[20 rows x 101 columns]
          building_id  meter           timestamp  site_id  \
19993926         1389      1 2016-12-28 01:00:00       15   
6530527          1240      3 2016-05-02 03:00:00       14   
6484853           464      0 2016-05-01 08:00:00        3   
18072271          656      0 2016-11-23 15:00:00        5   
15944342          322      0 2016-10-16 18:00:00        3   
4563775           643      0 2016-03-27 23:00:00        4   
18118279         1278      0 2016-11-24 10:00:00       14   
2121957           575      0 2016-02-08 11:00:00        4   
18890113          890      2 2016-12-08 14:00:00        9   
14306902         1390      2 2016-09-17 18:00:00       15   
5056620           702      0 2016-04-05 22:00:00        5   
6530049           991      2 2016-05-02 03:00:00        9   
5177920           224      0 2016-04-08 02:00:00        2   
19974616         1247      0 2016-12-27 17:00:00       14   
9989434            43      1 2016-07-02 15:00:00        0   


[20 rows x 101 columns]


27

In [115]:
meter_desc_columns={'mean': 'meter_mean', 'max': 'meter_max', 'min': 'meter_min', 'std':'meter_std'}

class CreateMeterDescDF(TransformerMixin):

    def transform(self, df, **transform_params):
        global _building_meter_desc_DF
        print(df.columns)
        if 'meter_reading_log1p' in df.columns:
            cols = ['mean']
            if mode == '_all':
                cols = ['mean','max','min','std']
            group = df.groupby(['building_id','meter'])['meter_reading_log1p']
            desc_DF = group.describe()[cols]
            desc_DF = desc_DF.reset_index()
            _building_meter_desc_DF = desc_DF.rename(columns=meter_desc_columns)
            gc.collect()
        return df 
    def fit(self, X, y=None, **fit_params):
        return self

#if 'meter_mean' not in train.columns:
#    print(building_meter_desc_DF)
#    train = train.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    #test = test.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    del building_meter_desc_DF
CreateMeterDescDF().transform(MeterReadingLog1p().transform(train.sample(2000, random_state=0)))    
print(_building_meter_desc_DF)
gc.collect()

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage',
       ...
       'meter_reading_log1p', 'meter_mean', 'meter_min', 'meter_max',
       'meter_std', 'dayofweek', 'hour', 'log_square_feet', 'holiday',
       'building_age'],
      dtype='object', length=110)
     building_id meter  meter_mean  meter_max  meter_min  meter_std
0              7     0    6.204909   6.204909   6.204909        NaN
1              7     1    8.746770   8.746770   8.746770        NaN
2              8     0    5.831633   5.831633   5.831633        NaN
3              9     0    4.528017   5.232920   4.153541   0.610861
4              9     1    4.106835   4.447588   3.766081   0.481898
5             14     1    6.516735   6.516735   6.516735        NaN
6             15     1    8.060471   8.060471   8.060471        NaN
7             16     0    7.063904   7.063904   7.063904        NaN
8            

18

In [None]:
class MergeMeterDescDF(TransformerMixin):
  
    def transform(self, df, **transform_params):
        # drop any columns to add
        df = df.drop(meter_desc_columns.values(), axis=1, errors='ignore') 
        return df.merge(_building_meter_desc_DF, on=['building_id','meter'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeMeterDescDF().transform(train.sample(2000, random_state=0)))

In [None]:
class MergeMeterDescDF(TransformerMixin):
  
    def transform(self, df, **transform_params):
        return df.merge(_building_meter_desc_DF, on=['building_id','meter'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self


In [50]:
# "As you can see above, this data looks weired until May 20. It is 
# reported in this discussion by @barnwellguy that All electricity
# meter is 0 until May 20 for site_id == 0. Let's remove these data 
# from training data."
# https://www.kaggle.com/kaushal2896/ashrae-eda-fe-lightgbm-1-13
class RmS0M0(TransformerMixin):
  
    def transform(self, df, **transform_params):
        return df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

    def fit(self, X, y=None, **fit_params):
        return self
    


In [51]:
# TODO: write filter to remove any 0 meter reading that continue more than N days (try 3)
# Also we need to account for this by meter

In [52]:
# TODO: try rolling with power

In [53]:
    
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df['holiday'] = df['holiday'].astype('category')
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self
# Test 
if holiday_df is not None:
    print(holiday_df.columns)
    print(AddHolidays().transform(train.head(20))[['holiday','timestamp']])

Index(['site_id', 'timestamp', 'holiday'], dtype='object')
           holiday  timestamp
0   New Year's Day 2016-01-01
1   New Year's Day 2016-01-01
2   New Year's Day 2016-01-01
3   New Year's Day 2016-01-01
4   New Year's Day 2016-01-01
5   New Year's Day 2016-01-01
6   New Year's Day 2016-01-01
7   New Year's Day 2016-01-01
8   New Year's Day 2016-01-01
9   New Year's Day 2016-01-01
10  New Year's Day 2016-01-01
11  New Year's Day 2016-01-01
12  New Year's Day 2016-01-01
13  New Year's Day 2016-01-01
14  New Year's Day 2016-01-01
15  New Year's Day 2016-01-01
16  New Year's Day 2016-01-01
17  New Year's Day 2016-01-01
18  New Year's Day 2016-01-01
19  New Year's Day 2016-01-01


In [54]:
class RmHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df = df.drop(df[df['holiday'].notnull()].index)
            df = df.drop(['holiday'], axis=1)
            gc.collect()
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self

# Test you should see the new years removed
#print(train.head(100000).merge(building, on='building_id', how='left').columns)
print(RmHolidays().transform(train.head(100000)))

       building_id  meter           timestamp  meter_reading  site_id  \
52897            0      0 2016-01-02 00:00:00       0.000000        0   
52898            1      0 2016-01-02 00:00:00       0.000000        0   
52899            2      0 2016-01-02 00:00:00       0.000000        0   
52900            3      0 2016-01-02 00:00:00       0.000000        0   
52901            4      0 2016-01-02 00:00:00       0.000000        0   
52902            5      0 2016-01-02 00:00:00       0.000000        0   
52903            6      0 2016-01-02 00:00:00       0.000000        0   
52904            7      0 2016-01-02 00:00:00       0.000000        0   
52905            8      0 2016-01-02 00:00:00       0.000000        0   
52906            9      0 2016-01-02 00:00:00       0.000000        0   
52907           10      0 2016-01-02 00:00:00       0.000000        0   
52908           11      0 2016-01-02 00:00:00       0.000000        0   
52909           12      0 2016-01-02 00:00:00      

[43848 rows x 106 columns]


In [55]:
class SetCatTypes(TransformerMixin):
    
    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col]= df[col].astype('category')
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [56]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(building.head(20)['square_feet'])

0       7432.0
1       2720.0
2       5376.0
3      23685.0
4     116607.0
5       8000.0
6      27926.0
7     121074.0
8      60809.0
9      27000.0
10    370773.0
11     49073.0
12     37100.0
13     99380.0
14     86250.0
15     83957.0
16     54644.0
17     15250.0
18    111891.0
19     18717.0
Name: square_feet, dtype: float32


In [57]:
# TODO: Play with scaling cloud coverage

In [58]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [59]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','year_built']]
        year_built_median = tmp_df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        del tmp_df, year_built_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(ImputeYearBuilt().transform(train.sample(20))['building_age'])

11696699     74
3366644      70
8907210      70
13401922     13
1770607      70
16260090     75
9570584     111
9738708      70
5697283      70
18578203     62
3493822      62
849390       70
8084183      67
8582067     102
9376707      70
3766760     101
7287512      74
16821237     76
15062882     70
19212787     70
Name: building_age, dtype: uint8


In [60]:
class ImputeFloorCount(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','floor_count']]
        floors_median = tmp_df['floor_count'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['floor_count'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = i_median
            else:
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = floors_median
        del tmp_df, floors_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

print(ImputeFloorCount().transform(train.sample(20))['floor_count'])

15556680    3.0
15594831    3.0
2314833     3.0
6119838     3.0
6844068     3.0
5939042     5.0
18367668    1.0
10211869    3.0
4363584     3.0
893088      3.0
19718426    3.0
767285      3.0
16125416    3.0
2776491     3.0
8158361     3.0
10712812    3.0
9101422     3.0
16024169    3.0
9622259     1.0
12581953    3.0
Name: floor_count, dtype: float16


In [61]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [62]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        # TODO: try week of year as numerical 
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('uint8') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('uint8')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [63]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [64]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [65]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [66]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [67]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [114]:
%%time

x_pre_pipes = Pipeline(
    steps=[
        ('meterReadingLog1p',MeterReadingLog1p()),
        ('rmS0M0', RmS0M0()),
        ('addTimeFeatures', AddTimeFeatures()),
        ('logSquareFeet', LogSquareFeet()),
        #('rmHolidays', RmHolidays()),
        #('addHolidays', AddHolidays()),
        ('createMeterDescDF', CreateMeterDescDF()), # note declares a globe variable to pass
        ('mergeMeterDescDF', MergeMeterDescDF()), # populates both test and train from global
        ('setCatTypes', SetCatTypes(['building_id', 'site_id', 'meter', 'primary_use'])),
        ('GC', GC())
    ]
)

train = x_pre_pipes.transform(train)
print(train.columns)

NameError: name 'MergeMeterDescDF' is not defined

In [70]:
# pre_a_pipes is for preprocessing that doesn't change impute
# values
x_fold_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        #('convertToDatetime', ConvertToDatetime()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('imputeFloorCount', ImputeFloorCount()),
        ('dropCols', DropCols(['timestamp','square_feet', 'year_built'])),
        ('GC', GC())
    ]
)

sample_train_X = x_fold_pipes.transform(train.sample(20))
print(sample_train_X.columns)
print(sample_train_X.dtypes)

Index(['building_id', 'meter', 'site_id', 'primary_use', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure',
       ...
       'meter_reading_log1p', 'meter_mean', 'meter_min', 'meter_max',
       'meter_std', 'dayofweek', 'hour', 'log_square_feet', 'holiday',
       'building_age'],
      dtype='object', length=107)
building_id                      category
meter                            category
site_id                          category
primary_use                      category
floor_count                       float16
air_temperature                   float16
cloud_coverage                    float16
dew_temperature                   float16
precip_depth_1_hr                 float16
sea_level_pressure                float16
wind_direction                    float16
wind_speed                        float16
s_radiation                       float64
air_temperature_mean_lag3         float16
air_temperature_m

In [71]:
# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
# trying no shuffle https://www.kaggle.com/c/ashrae-energy-prediction/discussion/115851#latest-666115
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=False, random_state=42)


In [81]:
## cross val multipule meter models
def cvTrainMeterEnsemble(train, gbm_params):
    meter_models = [ [], [], [], [] ]
    for train_index, val_index in kf.split(train, train['building_id']):
        f_train = x_fold_pipes.transform(train.loc[train_index])
        f_val = x_fold_pipes.transform(train.loc[val_index])
        for i in range(4):
            print(f'training meter {i}')
            f_train_m = f_train[f_train['meter'] == i]
            f_val_m = f_val[f_val['meter'] == i]
            gbm_params_m = gbm_params
            if i == 0:
                gbm_params_m['learning_rate']=0.04
            elif i == 1:
                gbm_params_m['learning_rate']=0.06
                gbm_params_m['bagging_fraction']=0.5
            elif i == 2:
                gbm_params_m['learning_rate']=0.05
                gbm_params_m['bagging_fraction']=0.8
            else:
                gbm_params_m['learning_rate']=0.04
                gbm_params_m['bagging_fraction']=0.9                
            gbm = LGBMRegressor(**gbm_params_m)
            print("traing meter " + str(i))
            gbm.fit(f_train_m.drop('meter_reading_log1p', axis=1), f_train_m['meter_reading_log1p'],
                eval_set=[(f_val_m.drop('meter_reading_log1p', axis=1), f_val_m['meter_reading_log1p'])],
                # https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114722#latest-660848
                # eval_metric=lbm_rmslee,
                early_stopping_rounds=20)
            meter_models[i].append(gbm)
            del f_train_m, f_val_m, gbm
            gc.collect()
        del f_train, f_val
        gc.collect()
    return meter_models


In [82]:
%%time
meter_models = cvTrainMeterEnsemble(train, gbm_params)

training meter 0
traing meter 0
[1]	valid_0's rmse: 1.52811
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.47813
[3]	valid_0's rmse: 1.43071
[4]	valid_0's rmse: 1.38563
[5]	valid_0's rmse: 1.34271
[6]	valid_0's rmse: 1.3018
[7]	valid_0's rmse: 1.26293
[8]	valid_0's rmse: 1.22704
[9]	valid_0's rmse: 1.19179
[10]	valid_0's rmse: 1.15848
[11]	valid_0's rmse: 1.12701
[12]	valid_0's rmse: 1.09713
[13]	valid_0's rmse: 1.06888
[14]	valid_0's rmse: 1.04232
[15]	valid_0's rmse: 1.01721
[16]	valid_0's rmse: 0.993401
[17]	valid_0's rmse: 0.971334
[18]	valid_0's rmse: 0.950811
[19]	valid_0's rmse: 0.931182
[20]	valid_0's rmse: 0.913796
[21]	valid_0's rmse: 0.896614
[22]	valid_0's rmse: 0.880657
[23]	valid_0's rmse: 0.865126
[24]	valid_0's rmse: 0.850701
[25]	valid_0's rmse: 0.836587
[26]	valid_0's rmse: 0.823666
[27]	valid_0's rmse: 0.811982
[28]	valid_0's rmse: 0.800543
[29]	valid_0's rmse: 0.790124
[30]	valid_0's rmse: 0.780676
[31]	valid_0's rmse: 0.77116
[32

[267]	valid_0's rmse: 0.636657
[268]	valid_0's rmse: 0.636657
[269]	valid_0's rmse: 0.636526
[270]	valid_0's rmse: 0.636594
[271]	valid_0's rmse: 0.636651
[272]	valid_0's rmse: 0.636623
[273]	valid_0's rmse: 0.636654
[274]	valid_0's rmse: 0.636542
[275]	valid_0's rmse: 0.636556
[276]	valid_0's rmse: 0.636689
[277]	valid_0's rmse: 0.636628
[278]	valid_0's rmse: 0.636598
[279]	valid_0's rmse: 0.636593
[280]	valid_0's rmse: 0.636579
[281]	valid_0's rmse: 0.636528
[282]	valid_0's rmse: 0.636284
[283]	valid_0's rmse: 0.636059
[284]	valid_0's rmse: 0.63615
[285]	valid_0's rmse: 0.636037
[286]	valid_0's rmse: 0.63595
[287]	valid_0's rmse: 0.635875
[288]	valid_0's rmse: 0.6358
[289]	valid_0's rmse: 0.635816
[290]	valid_0's rmse: 0.635629
[291]	valid_0's rmse: 0.635624
[292]	valid_0's rmse: 0.635623
[293]	valid_0's rmse: 0.635572
[294]	valid_0's rmse: 0.635542
[295]	valid_0's rmse: 0.635683
[296]	valid_0's rmse: 0.635654
[297]	valid_0's rmse: 0.635533
[298]	valid_0's rmse: 0.635411
[299]	valid_

[532]	valid_0's rmse: 0.629096
[533]	valid_0's rmse: 0.629075
[534]	valid_0's rmse: 0.629067
[535]	valid_0's rmse: 0.629049
[536]	valid_0's rmse: 0.628982
[537]	valid_0's rmse: 0.62892
[538]	valid_0's rmse: 0.628871
[539]	valid_0's rmse: 0.628862
[540]	valid_0's rmse: 0.628894
[541]	valid_0's rmse: 0.628841
[542]	valid_0's rmse: 0.628835
[543]	valid_0's rmse: 0.628853
[544]	valid_0's rmse: 0.628818
[545]	valid_0's rmse: 0.628875
[546]	valid_0's rmse: 0.629036
[547]	valid_0's rmse: 0.629
[548]	valid_0's rmse: 0.629009
[549]	valid_0's rmse: 0.628968
[550]	valid_0's rmse: 0.628888
[551]	valid_0's rmse: 0.628857
[552]	valid_0's rmse: 0.628673
[553]	valid_0's rmse: 0.628682
[554]	valid_0's rmse: 0.628683
[555]	valid_0's rmse: 0.628691
[556]	valid_0's rmse: 0.628725
[557]	valid_0's rmse: 0.628726
[558]	valid_0's rmse: 0.628713
[559]	valid_0's rmse: 0.628743
[560]	valid_0's rmse: 0.628658
[561]	valid_0's rmse: 0.628667
[562]	valid_0's rmse: 0.628575
[563]	valid_0's rmse: 0.628572
[564]	valid_

[796]	valid_0's rmse: 0.624806
[797]	valid_0's rmse: 0.624823
[798]	valid_0's rmse: 0.624795
[799]	valid_0's rmse: 0.624814
[800]	valid_0's rmse: 0.624844
[801]	valid_0's rmse: 0.624855
[802]	valid_0's rmse: 0.624854
[803]	valid_0's rmse: 0.624848
[804]	valid_0's rmse: 0.624799
[805]	valid_0's rmse: 0.624768
[806]	valid_0's rmse: 0.624777
[807]	valid_0's rmse: 0.624787
[808]	valid_0's rmse: 0.624816
[809]	valid_0's rmse: 0.624771
[810]	valid_0's rmse: 0.624734
[811]	valid_0's rmse: 0.624662
[812]	valid_0's rmse: 0.624573
[813]	valid_0's rmse: 0.624602
[814]	valid_0's rmse: 0.624575
[815]	valid_0's rmse: 0.624523
[816]	valid_0's rmse: 0.624565
[817]	valid_0's rmse: 0.624609
[818]	valid_0's rmse: 0.624662
[819]	valid_0's rmse: 0.624665
[820]	valid_0's rmse: 0.624711
[821]	valid_0's rmse: 0.624733
[822]	valid_0's rmse: 0.624692
[823]	valid_0's rmse: 0.624694
[824]	valid_0's rmse: 0.624659
[825]	valid_0's rmse: 0.624489
[826]	valid_0's rmse: 0.624488
[827]	valid_0's rmse: 0.62447
[828]	val

[183]	valid_0's rmse: 1.32805
[184]	valid_0's rmse: 1.32803
[185]	valid_0's rmse: 1.32788
[186]	valid_0's rmse: 1.32808
[187]	valid_0's rmse: 1.32821
[188]	valid_0's rmse: 1.32813
[189]	valid_0's rmse: 1.32798
[190]	valid_0's rmse: 1.32811
[191]	valid_0's rmse: 1.32805
[192]	valid_0's rmse: 1.3282
[193]	valid_0's rmse: 1.32796
[194]	valid_0's rmse: 1.32802
[195]	valid_0's rmse: 1.32815
[196]	valid_0's rmse: 1.32809
[197]	valid_0's rmse: 1.32809
[198]	valid_0's rmse: 1.328
[199]	valid_0's rmse: 1.32802
[200]	valid_0's rmse: 1.32807
[201]	valid_0's rmse: 1.32814
[202]	valid_0's rmse: 1.32785
[203]	valid_0's rmse: 1.32764
[204]	valid_0's rmse: 1.32782
[205]	valid_0's rmse: 1.32776
[206]	valid_0's rmse: 1.32774
[207]	valid_0's rmse: 1.32745
[208]	valid_0's rmse: 1.32743
[209]	valid_0's rmse: 1.32726
[210]	valid_0's rmse: 1.32731
[211]	valid_0's rmse: 1.32746
[212]	valid_0's rmse: 1.32752
[213]	valid_0's rmse: 1.32743
[214]	valid_0's rmse: 1.32715
[215]	valid_0's rmse: 1.32702
[216]	valid_0

[130]	valid_0's rmse: 1.50189
[131]	valid_0's rmse: 1.50177
[132]	valid_0's rmse: 1.50142
[133]	valid_0's rmse: 1.50089
[134]	valid_0's rmse: 1.50083
[135]	valid_0's rmse: 1.50158
[136]	valid_0's rmse: 1.50061
[137]	valid_0's rmse: 1.50059
[138]	valid_0's rmse: 1.50033
[139]	valid_0's rmse: 1.50016
[140]	valid_0's rmse: 1.49972
[141]	valid_0's rmse: 1.49978
[142]	valid_0's rmse: 1.49971
[143]	valid_0's rmse: 1.49976
[144]	valid_0's rmse: 1.4997
[145]	valid_0's rmse: 1.49975
[146]	valid_0's rmse: 1.49849
[147]	valid_0's rmse: 1.49852
[148]	valid_0's rmse: 1.49836
[149]	valid_0's rmse: 1.4979
[150]	valid_0's rmse: 1.49756
[151]	valid_0's rmse: 1.49756
[152]	valid_0's rmse: 1.49785
[153]	valid_0's rmse: 1.49815
[154]	valid_0's rmse: 1.49773
[155]	valid_0's rmse: 1.49742
[156]	valid_0's rmse: 1.49746
[157]	valid_0's rmse: 1.49751
[158]	valid_0's rmse: 1.49721
[159]	valid_0's rmse: 1.49753
[160]	valid_0's rmse: 1.49683
[161]	valid_0's rmse: 1.49682
[162]	valid_0's rmse: 1.49653
[163]	valid_

[44]	valid_0's rmse: 1.68739
[45]	valid_0's rmse: 1.68092
[46]	valid_0's rmse: 1.67598
[47]	valid_0's rmse: 1.67123
[48]	valid_0's rmse: 1.66535
[49]	valid_0's rmse: 1.66104
[50]	valid_0's rmse: 1.65652
[51]	valid_0's rmse: 1.65194
[52]	valid_0's rmse: 1.64729
[53]	valid_0's rmse: 1.64324
[54]	valid_0's rmse: 1.63983
[55]	valid_0's rmse: 1.63576
[56]	valid_0's rmse: 1.63333
[57]	valid_0's rmse: 1.62987
[58]	valid_0's rmse: 1.62621
[59]	valid_0's rmse: 1.62236
[60]	valid_0's rmse: 1.61894
[61]	valid_0's rmse: 1.61613
[62]	valid_0's rmse: 1.61265
[63]	valid_0's rmse: 1.6107
[64]	valid_0's rmse: 1.60688
[65]	valid_0's rmse: 1.60349
[66]	valid_0's rmse: 1.60074
[67]	valid_0's rmse: 1.59737
[68]	valid_0's rmse: 1.59524
[69]	valid_0's rmse: 1.59311
[70]	valid_0's rmse: 1.59077
[71]	valid_0's rmse: 1.58825
[72]	valid_0's rmse: 1.58636
[73]	valid_0's rmse: 1.58451
[74]	valid_0's rmse: 1.58243
[75]	valid_0's rmse: 1.57993
[76]	valid_0's rmse: 1.57862
[77]	valid_0's rmse: 1.57693
[78]	valid_0's 

[110]	valid_0's rmse: 0.516771
[111]	valid_0's rmse: 0.516574
[112]	valid_0's rmse: 0.51603
[113]	valid_0's rmse: 0.515786
[114]	valid_0's rmse: 0.515552
[115]	valid_0's rmse: 0.515255
[116]	valid_0's rmse: 0.51508
[117]	valid_0's rmse: 0.514771
[118]	valid_0's rmse: 0.51478
[119]	valid_0's rmse: 0.514452
[120]	valid_0's rmse: 0.514265
[121]	valid_0's rmse: 0.514094
[122]	valid_0's rmse: 0.51386
[123]	valid_0's rmse: 0.513719
[124]	valid_0's rmse: 0.513454
[125]	valid_0's rmse: 0.513206
[126]	valid_0's rmse: 0.512887
[127]	valid_0's rmse: 0.512568
[128]	valid_0's rmse: 0.512356
[129]	valid_0's rmse: 0.512087
[130]	valid_0's rmse: 0.511764
[131]	valid_0's rmse: 0.511696
[132]	valid_0's rmse: 0.511539
[133]	valid_0's rmse: 0.511636
[134]	valid_0's rmse: 0.511437
[135]	valid_0's rmse: 0.511345
[136]	valid_0's rmse: 0.5109
[137]	valid_0's rmse: 0.510602
[138]	valid_0's rmse: 0.510175
[139]	valid_0's rmse: 0.510071
[140]	valid_0's rmse: 0.509849
[141]	valid_0's rmse: 0.509601
[142]	valid_0'

[375]	valid_0's rmse: 0.481729
[376]	valid_0's rmse: 0.481699
[377]	valid_0's rmse: 0.481525
[378]	valid_0's rmse: 0.481369
[379]	valid_0's rmse: 0.481221
[380]	valid_0's rmse: 0.481276
[381]	valid_0's rmse: 0.481253
[382]	valid_0's rmse: 0.481177
[383]	valid_0's rmse: 0.481112
[384]	valid_0's rmse: 0.481088
[385]	valid_0's rmse: 0.481113
[386]	valid_0's rmse: 0.481137
[387]	valid_0's rmse: 0.481097
[388]	valid_0's rmse: 0.481155
[389]	valid_0's rmse: 0.481078
[390]	valid_0's rmse: 0.480997
[391]	valid_0's rmse: 0.480993
[392]	valid_0's rmse: 0.48101
[393]	valid_0's rmse: 0.480905
[394]	valid_0's rmse: 0.480905
[395]	valid_0's rmse: 0.480738
[396]	valid_0's rmse: 0.480574
[397]	valid_0's rmse: 0.48057
[398]	valid_0's rmse: 0.480568
[399]	valid_0's rmse: 0.480357
[400]	valid_0's rmse: 0.480401
[401]	valid_0's rmse: 0.480526
[402]	valid_0's rmse: 0.480384
[403]	valid_0's rmse: 0.48031
[404]	valid_0's rmse: 0.480334
[405]	valid_0's rmse: 0.480321
[406]	valid_0's rmse: 0.480311
[407]	valid

[640]	valid_0's rmse: 0.473249
[641]	valid_0's rmse: 0.473238
[642]	valid_0's rmse: 0.473149
[643]	valid_0's rmse: 0.473124
[644]	valid_0's rmse: 0.473088
[645]	valid_0's rmse: 0.473049
[646]	valid_0's rmse: 0.473074
[647]	valid_0's rmse: 0.473063
[648]	valid_0's rmse: 0.473076
[649]	valid_0's rmse: 0.473069
[650]	valid_0's rmse: 0.473015
[651]	valid_0's rmse: 0.473026
[652]	valid_0's rmse: 0.472964
[653]	valid_0's rmse: 0.472975
[654]	valid_0's rmse: 0.472938
[655]	valid_0's rmse: 0.472951
[656]	valid_0's rmse: 0.472895
[657]	valid_0's rmse: 0.472881
[658]	valid_0's rmse: 0.472878
[659]	valid_0's rmse: 0.472887
[660]	valid_0's rmse: 0.472877
[661]	valid_0's rmse: 0.472873
[662]	valid_0's rmse: 0.472879
[663]	valid_0's rmse: 0.472795
[664]	valid_0's rmse: 0.472777
[665]	valid_0's rmse: 0.472771
[666]	valid_0's rmse: 0.472788
[667]	valid_0's rmse: 0.472746
[668]	valid_0's rmse: 0.472683
[669]	valid_0's rmse: 0.472668
[670]	valid_0's rmse: 0.472721
[671]	valid_0's rmse: 0.472722
[672]	va

[93]	valid_0's rmse: 1.23946
[94]	valid_0's rmse: 1.23864
[95]	valid_0's rmse: 1.23805
[96]	valid_0's rmse: 1.23742
[97]	valid_0's rmse: 1.23634
[98]	valid_0's rmse: 1.236
[99]	valid_0's rmse: 1.23582
[100]	valid_0's rmse: 1.23497
[101]	valid_0's rmse: 1.23425
[102]	valid_0's rmse: 1.23388
[103]	valid_0's rmse: 1.23333
[104]	valid_0's rmse: 1.23309
[105]	valid_0's rmse: 1.23266
[106]	valid_0's rmse: 1.23202
[107]	valid_0's rmse: 1.23174
[108]	valid_0's rmse: 1.23121
[109]	valid_0's rmse: 1.23057
[110]	valid_0's rmse: 1.22939
[111]	valid_0's rmse: 1.22825
[112]	valid_0's rmse: 1.22794
[113]	valid_0's rmse: 1.22771
[114]	valid_0's rmse: 1.22724
[115]	valid_0's rmse: 1.22652
[116]	valid_0's rmse: 1.22592
[117]	valid_0's rmse: 1.22562
[118]	valid_0's rmse: 1.22538
[119]	valid_0's rmse: 1.22496
[120]	valid_0's rmse: 1.22467
[121]	valid_0's rmse: 1.22426
[122]	valid_0's rmse: 1.22326
[123]	valid_0's rmse: 1.22296
[124]	valid_0's rmse: 1.22303
[125]	valid_0's rmse: 1.22271
[126]	valid_0's rms

[367]	valid_0's rmse: 1.1856
[368]	valid_0's rmse: 1.18554
[369]	valid_0's rmse: 1.1855
[370]	valid_0's rmse: 1.18545
[371]	valid_0's rmse: 1.18553
[372]	valid_0's rmse: 1.18548
[373]	valid_0's rmse: 1.18568
[374]	valid_0's rmse: 1.18568
[375]	valid_0's rmse: 1.1857
[376]	valid_0's rmse: 1.18577
[377]	valid_0's rmse: 1.1857
[378]	valid_0's rmse: 1.18561
[379]	valid_0's rmse: 1.18567
[380]	valid_0's rmse: 1.18557
[381]	valid_0's rmse: 1.18594
[382]	valid_0's rmse: 1.18566
[383]	valid_0's rmse: 1.18563
[384]	valid_0's rmse: 1.1855
[385]	valid_0's rmse: 1.18497
[386]	valid_0's rmse: 1.1847
[387]	valid_0's rmse: 1.1847
[388]	valid_0's rmse: 1.18471
[389]	valid_0's rmse: 1.18453
[390]	valid_0's rmse: 1.18433
[391]	valid_0's rmse: 1.18426
[392]	valid_0's rmse: 1.18422
[393]	valid_0's rmse: 1.18416
[394]	valid_0's rmse: 1.18414
[395]	valid_0's rmse: 1.18397
[396]	valid_0's rmse: 1.1839
[397]	valid_0's rmse: 1.18385
[398]	valid_0's rmse: 1.18377
[399]	valid_0's rmse: 1.18373
[400]	valid_0's rm

[140]	valid_0's rmse: 1.51798
[141]	valid_0's rmse: 1.51777
[142]	valid_0's rmse: 1.5162
[143]	valid_0's rmse: 1.51595
[144]	valid_0's rmse: 1.51602
[145]	valid_0's rmse: 1.51495
[146]	valid_0's rmse: 1.5151
[147]	valid_0's rmse: 1.51498
[148]	valid_0's rmse: 1.51459
[149]	valid_0's rmse: 1.51463
[150]	valid_0's rmse: 1.51435
[151]	valid_0's rmse: 1.51375
[152]	valid_0's rmse: 1.51363
[153]	valid_0's rmse: 1.51346
[154]	valid_0's rmse: 1.51293
[155]	valid_0's rmse: 1.51281
[156]	valid_0's rmse: 1.51235
[157]	valid_0's rmse: 1.51203
[158]	valid_0's rmse: 1.51213
[159]	valid_0's rmse: 1.51201
[160]	valid_0's rmse: 1.51073
[161]	valid_0's rmse: 1.51064
[162]	valid_0's rmse: 1.51039
[163]	valid_0's rmse: 1.51071
[164]	valid_0's rmse: 1.51032
[165]	valid_0's rmse: 1.51001
[166]	valid_0's rmse: 1.51011
[167]	valid_0's rmse: 1.50913
[168]	valid_0's rmse: 1.50905
[169]	valid_0's rmse: 1.5088
[170]	valid_0's rmse: 1.50863
[171]	valid_0's rmse: 1.50829
[172]	valid_0's rmse: 1.50839
[173]	valid_0

[413]	valid_0's rmse: 1.47079
[414]	valid_0's rmse: 1.47082
[415]	valid_0's rmse: 1.4706
[416]	valid_0's rmse: 1.4703
[417]	valid_0's rmse: 1.47018
[418]	valid_0's rmse: 1.47006
[419]	valid_0's rmse: 1.47
[420]	valid_0's rmse: 1.46988
[421]	valid_0's rmse: 1.4699
[422]	valid_0's rmse: 1.46966
[423]	valid_0's rmse: 1.46963
[424]	valid_0's rmse: 1.46968
[425]	valid_0's rmse: 1.46953
[426]	valid_0's rmse: 1.46914
[427]	valid_0's rmse: 1.46912
[428]	valid_0's rmse: 1.46902
[429]	valid_0's rmse: 1.46908
[430]	valid_0's rmse: 1.46872
[431]	valid_0's rmse: 1.4687
[432]	valid_0's rmse: 1.46878
[433]	valid_0's rmse: 1.46857
[434]	valid_0's rmse: 1.46841
[435]	valid_0's rmse: 1.46841
[436]	valid_0's rmse: 1.46846
[437]	valid_0's rmse: 1.46844
[438]	valid_0's rmse: 1.46855
[439]	valid_0's rmse: 1.46873
[440]	valid_0's rmse: 1.46856
[441]	valid_0's rmse: 1.46865
[442]	valid_0's rmse: 1.46878
[443]	valid_0's rmse: 1.46889
[444]	valid_0's rmse: 1.46886
[445]	valid_0's rmse: 1.46896
[446]	valid_0's r

[230]	valid_0's rmse: 1.61238
[231]	valid_0's rmse: 1.61254
[232]	valid_0's rmse: 1.61248
[233]	valid_0's rmse: 1.61251
[234]	valid_0's rmse: 1.61242
[235]	valid_0's rmse: 1.61211
[236]	valid_0's rmse: 1.61198
[237]	valid_0's rmse: 1.61206
[238]	valid_0's rmse: 1.61201
[239]	valid_0's rmse: 1.61207
[240]	valid_0's rmse: 1.61213
[241]	valid_0's rmse: 1.61221
Early stopping, best iteration is:
[221]	valid_0's rmse: 1.61197
training meter 0
traing meter 0
[1]	valid_0's rmse: 1.55723
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.50742
[3]	valid_0's rmse: 1.45986
[4]	valid_0's rmse: 1.41467
[5]	valid_0's rmse: 1.37158
[6]	valid_0's rmse: 1.33071
[7]	valid_0's rmse: 1.2918
[8]	valid_0's rmse: 1.25584
[9]	valid_0's rmse: 1.22057
[10]	valid_0's rmse: 1.18722
[11]	valid_0's rmse: 1.15557
[12]	valid_0's rmse: 1.12556
[13]	valid_0's rmse: 1.09709
[14]	valid_0's rmse: 1.07011
[15]	valid_0's rmse: 1.04465
[16]	valid_0's rmse: 1.02077
[17]	valid_0's rmse: 0.99802

[253]	valid_0's rmse: 0.637481
[254]	valid_0's rmse: 0.637491
[255]	valid_0's rmse: 0.637442
[256]	valid_0's rmse: 0.637407
[257]	valid_0's rmse: 0.637552
[258]	valid_0's rmse: 0.637516
[259]	valid_0's rmse: 0.63745
[260]	valid_0's rmse: 0.637484
[261]	valid_0's rmse: 0.637405
[262]	valid_0's rmse: 0.637326
[263]	valid_0's rmse: 0.637342
[264]	valid_0's rmse: 0.637124
[265]	valid_0's rmse: 0.637072
[266]	valid_0's rmse: 0.637044
[267]	valid_0's rmse: 0.636979
[268]	valid_0's rmse: 0.636955
[269]	valid_0's rmse: 0.636945
[270]	valid_0's rmse: 0.636852
[271]	valid_0's rmse: 0.636834
[272]	valid_0's rmse: 0.636913
[273]	valid_0's rmse: 0.636964
[274]	valid_0's rmse: 0.636924
[275]	valid_0's rmse: 0.636871
[276]	valid_0's rmse: 0.636762
[277]	valid_0's rmse: 0.63674
[278]	valid_0's rmse: 0.636711
[279]	valid_0's rmse: 0.636636
[280]	valid_0's rmse: 0.636649
[281]	valid_0's rmse: 0.636596
[282]	valid_0's rmse: 0.636491
[283]	valid_0's rmse: 0.636472
[284]	valid_0's rmse: 0.636528
[285]	vali

[37]	valid_0's rmse: 1.5697
[38]	valid_0's rmse: 1.5606
[39]	valid_0's rmse: 1.55436
[40]	valid_0's rmse: 1.54702
[41]	valid_0's rmse: 1.53826
[42]	valid_0's rmse: 1.53258
[43]	valid_0's rmse: 1.52686
[44]	valid_0's rmse: 1.51955
[45]	valid_0's rmse: 1.51424
[46]	valid_0's rmse: 1.50966
[47]	valid_0's rmse: 1.50494
[48]	valid_0's rmse: 1.50053
[49]	valid_0's rmse: 1.4948
[50]	valid_0's rmse: 1.49028
[51]	valid_0's rmse: 1.48712
[52]	valid_0's rmse: 1.4826
[53]	valid_0's rmse: 1.47941
[54]	valid_0's rmse: 1.47443
[55]	valid_0's rmse: 1.47093
[56]	valid_0's rmse: 1.467
[57]	valid_0's rmse: 1.46396
[58]	valid_0's rmse: 1.46253
[59]	valid_0's rmse: 1.46022
[60]	valid_0's rmse: 1.45824
[61]	valid_0's rmse: 1.45575
[62]	valid_0's rmse: 1.45256
[63]	valid_0's rmse: 1.45011
[64]	valid_0's rmse: 1.44796
[65]	valid_0's rmse: 1.44645
[66]	valid_0's rmse: 1.44458
[67]	valid_0's rmse: 1.44253
[68]	valid_0's rmse: 1.44146
[69]	valid_0's rmse: 1.44002
[70]	valid_0's rmse: 1.43854
[71]	valid_0's rmse:

[53]	valid_0's rmse: 1.34728
[54]	valid_0's rmse: 1.34138
[55]	valid_0's rmse: 1.33833
[56]	valid_0's rmse: 1.33492
[57]	valid_0's rmse: 1.33255
[58]	valid_0's rmse: 1.32723
[59]	valid_0's rmse: 1.3237
[60]	valid_0's rmse: 1.32087
[61]	valid_0's rmse: 1.3181
[62]	valid_0's rmse: 1.31638
[63]	valid_0's rmse: 1.31509
[64]	valid_0's rmse: 1.31235
[65]	valid_0's rmse: 1.30933
[66]	valid_0's rmse: 1.3079
[67]	valid_0's rmse: 1.30582
[68]	valid_0's rmse: 1.30429
[69]	valid_0's rmse: 1.30267
[70]	valid_0's rmse: 1.29975
[71]	valid_0's rmse: 1.29822
[72]	valid_0's rmse: 1.2974
[73]	valid_0's rmse: 1.2953
[74]	valid_0's rmse: 1.29362
[75]	valid_0's rmse: 1.2918
[76]	valid_0's rmse: 1.29145
[77]	valid_0's rmse: 1.29112
[78]	valid_0's rmse: 1.29024
[79]	valid_0's rmse: 1.28879
[80]	valid_0's rmse: 1.2887
[81]	valid_0's rmse: 1.28773
[82]	valid_0's rmse: 1.28756
[83]	valid_0's rmse: 1.28692
[84]	valid_0's rmse: 1.28494
[85]	valid_0's rmse: 1.28483
[86]	valid_0's rmse: 1.28407
[87]	valid_0's rmse: 

[134]	valid_0's rmse: 0.687301
[135]	valid_0's rmse: 0.687052
[136]	valid_0's rmse: 0.686897
[137]	valid_0's rmse: 0.686673
[138]	valid_0's rmse: 0.686434
[139]	valid_0's rmse: 0.686309
[140]	valid_0's rmse: 0.686249
[141]	valid_0's rmse: 0.686178
[142]	valid_0's rmse: 0.686105
[143]	valid_0's rmse: 0.686036
[144]	valid_0's rmse: 0.685808
[145]	valid_0's rmse: 0.685857
[146]	valid_0's rmse: 0.685854
[147]	valid_0's rmse: 0.685768
[148]	valid_0's rmse: 0.685494
[149]	valid_0's rmse: 0.685389
[150]	valid_0's rmse: 0.685389
[151]	valid_0's rmse: 0.685348
[152]	valid_0's rmse: 0.685269
[153]	valid_0's rmse: 0.68521
[154]	valid_0's rmse: 0.685081
[155]	valid_0's rmse: 0.685136
[156]	valid_0's rmse: 0.685093
[157]	valid_0's rmse: 0.685117
[158]	valid_0's rmse: 0.685031
[159]	valid_0's rmse: 0.684995
[160]	valid_0's rmse: 0.684908
[161]	valid_0's rmse: 0.684785
[162]	valid_0's rmse: 0.684658
[163]	valid_0's rmse: 0.684701
[164]	valid_0's rmse: 0.684557
[165]	valid_0's rmse: 0.684534
[166]	val

[399]	valid_0's rmse: 0.673916
[400]	valid_0's rmse: 0.673931
[401]	valid_0's rmse: 0.673926
[402]	valid_0's rmse: 0.673918
[403]	valid_0's rmse: 0.67391
[404]	valid_0's rmse: 0.673915
[405]	valid_0's rmse: 0.673901
[406]	valid_0's rmse: 0.673876
[407]	valid_0's rmse: 0.673876
[408]	valid_0's rmse: 0.673958
[409]	valid_0's rmse: 0.673907
[410]	valid_0's rmse: 0.67391
[411]	valid_0's rmse: 0.673818
[412]	valid_0's rmse: 0.673783
[413]	valid_0's rmse: 0.673761
[414]	valid_0's rmse: 0.673805
[415]	valid_0's rmse: 0.67378
[416]	valid_0's rmse: 0.673788
[417]	valid_0's rmse: 0.673774
[418]	valid_0's rmse: 0.673743
[419]	valid_0's rmse: 0.673807
[420]	valid_0's rmse: 0.673856
[421]	valid_0's rmse: 0.673949
[422]	valid_0's rmse: 0.673979
[423]	valid_0's rmse: 0.673992
[424]	valid_0's rmse: 0.67405
[425]	valid_0's rmse: 0.674046
[426]	valid_0's rmse: 0.674054
[427]	valid_0's rmse: 0.674039
[428]	valid_0's rmse: 0.673962
[429]	valid_0's rmse: 0.674004
[430]	valid_0's rmse: 0.673954
[431]	valid_

[36]	valid_0's rmse: 1.56178
[37]	valid_0's rmse: 1.55716
[38]	valid_0's rmse: 1.5529
[39]	valid_0's rmse: 1.55022
[40]	valid_0's rmse: 1.54845
[41]	valid_0's rmse: 1.54512
[42]	valid_0's rmse: 1.54225
[43]	valid_0's rmse: 1.53974
[44]	valid_0's rmse: 1.5372
[45]	valid_0's rmse: 1.53461
[46]	valid_0's rmse: 1.53375
[47]	valid_0's rmse: 1.53072
[48]	valid_0's rmse: 1.52957
[49]	valid_0's rmse: 1.52838
[50]	valid_0's rmse: 1.52616
[51]	valid_0's rmse: 1.52383
[52]	valid_0's rmse: 1.52096
[53]	valid_0's rmse: 1.51929
[54]	valid_0's rmse: 1.51827
[55]	valid_0's rmse: 1.51601
[56]	valid_0's rmse: 1.51388
[57]	valid_0's rmse: 1.51235
[58]	valid_0's rmse: 1.5102
[59]	valid_0's rmse: 1.50991
[60]	valid_0's rmse: 1.50855
[61]	valid_0's rmse: 1.50876
[62]	valid_0's rmse: 1.50697
[63]	valid_0's rmse: 1.50618
[64]	valid_0's rmse: 1.50482
[65]	valid_0's rmse: 1.50457
[66]	valid_0's rmse: 1.50326
[67]	valid_0's rmse: 1.503
[68]	valid_0's rmse: 1.50243
[69]	valid_0's rmse: 1.50095
[70]	valid_0's rmse

[312]	valid_0's rmse: 1.46126
[313]	valid_0's rmse: 1.46123
[314]	valid_0's rmse: 1.46104
[315]	valid_0's rmse: 1.46108
[316]	valid_0's rmse: 1.46118
[317]	valid_0's rmse: 1.46135
[318]	valid_0's rmse: 1.4613
[319]	valid_0's rmse: 1.46137
[320]	valid_0's rmse: 1.4613
[321]	valid_0's rmse: 1.46104
[322]	valid_0's rmse: 1.46085
[323]	valid_0's rmse: 1.46084
[324]	valid_0's rmse: 1.46079
[325]	valid_0's rmse: 1.4609
[326]	valid_0's rmse: 1.46088
[327]	valid_0's rmse: 1.46085
[328]	valid_0's rmse: 1.4609
[329]	valid_0's rmse: 1.46096
[330]	valid_0's rmse: 1.46072
[331]	valid_0's rmse: 1.46083
[332]	valid_0's rmse: 1.46078
[333]	valid_0's rmse: 1.4607
[334]	valid_0's rmse: 1.46082
[335]	valid_0's rmse: 1.46085
[336]	valid_0's rmse: 1.46085
[337]	valid_0's rmse: 1.46088
[338]	valid_0's rmse: 1.46085
[339]	valid_0's rmse: 1.4609
[340]	valid_0's rmse: 1.46082
[341]	valid_0's rmse: 1.46067
[342]	valid_0's rmse: 1.46062
[343]	valid_0's rmse: 1.46054
[344]	valid_0's rmse: 1.4605
[345]	valid_0's r

[197]	valid_0's rmse: 1.5978
[198]	valid_0's rmse: 1.59778
[199]	valid_0's rmse: 1.59763
[200]	valid_0's rmse: 1.59754
[201]	valid_0's rmse: 1.59737
[202]	valid_0's rmse: 1.59747
[203]	valid_0's rmse: 1.59742
[204]	valid_0's rmse: 1.5966
[205]	valid_0's rmse: 1.59663
[206]	valid_0's rmse: 1.59664
[207]	valid_0's rmse: 1.59649
[208]	valid_0's rmse: 1.59635
[209]	valid_0's rmse: 1.59627
[210]	valid_0's rmse: 1.596
[211]	valid_0's rmse: 1.5961
[212]	valid_0's rmse: 1.59612
[213]	valid_0's rmse: 1.59589
[214]	valid_0's rmse: 1.59577
[215]	valid_0's rmse: 1.5957
[216]	valid_0's rmse: 1.59594
[217]	valid_0's rmse: 1.59596
[218]	valid_0's rmse: 1.59571
[219]	valid_0's rmse: 1.59571
[220]	valid_0's rmse: 1.59581
[221]	valid_0's rmse: 1.59594
[222]	valid_0's rmse: 1.59599
[223]	valid_0's rmse: 1.59593
[224]	valid_0's rmse: 1.59584
[225]	valid_0's rmse: 1.5957
[226]	valid_0's rmse: 1.59558
[227]	valid_0's rmse: 1.59525
[228]	valid_0's rmse: 1.59521
[229]	valid_0's rmse: 1.59512
[230]	valid_0's r

[155]	valid_0's rmse: 1.56953
[156]	valid_0's rmse: 1.56958
[157]	valid_0's rmse: 1.56951
[158]	valid_0's rmse: 1.56948
[159]	valid_0's rmse: 1.56919
[160]	valid_0's rmse: 1.56925
[161]	valid_0's rmse: 1.56896
[162]	valid_0's rmse: 1.56885
[163]	valid_0's rmse: 1.56864
[164]	valid_0's rmse: 1.56859
[165]	valid_0's rmse: 1.56862
[166]	valid_0's rmse: 1.5686
[167]	valid_0's rmse: 1.56896
[168]	valid_0's rmse: 1.56898
[169]	valid_0's rmse: 1.56888
[170]	valid_0's rmse: 1.5689
[171]	valid_0's rmse: 1.56879
[172]	valid_0's rmse: 1.56851
[173]	valid_0's rmse: 1.5685
[174]	valid_0's rmse: 1.56837
[175]	valid_0's rmse: 1.56838
[176]	valid_0's rmse: 1.56826
[177]	valid_0's rmse: 1.5681
[178]	valid_0's rmse: 1.56811
[179]	valid_0's rmse: 1.56776
[180]	valid_0's rmse: 1.56751
[181]	valid_0's rmse: 1.5673
[182]	valid_0's rmse: 1.56711
[183]	valid_0's rmse: 1.56705
[184]	valid_0's rmse: 1.56708
[185]	valid_0's rmse: 1.56692
[186]	valid_0's rmse: 1.56705
[187]	valid_0's rmse: 1.56705
[188]	valid_0's

[12]	valid_0's rmse: 1.06342
[13]	valid_0's rmse: 1.03438
[14]	valid_0's rmse: 1.00684
[15]	valid_0's rmse: 0.981458
[16]	valid_0's rmse: 0.957349
[17]	valid_0's rmse: 0.934598
[18]	valid_0's rmse: 0.912885
[19]	valid_0's rmse: 0.893016
[20]	valid_0's rmse: 0.875599
[21]	valid_0's rmse: 0.857683
[22]	valid_0's rmse: 0.841611
[23]	valid_0's rmse: 0.825797
[24]	valid_0's rmse: 0.811077
[25]	valid_0's rmse: 0.797199
[26]	valid_0's rmse: 0.784145
[27]	valid_0's rmse: 0.772058
[28]	valid_0's rmse: 0.761025
[29]	valid_0's rmse: 0.751009
[30]	valid_0's rmse: 0.74174
[31]	valid_0's rmse: 0.732819
[32]	valid_0's rmse: 0.724557
[33]	valid_0's rmse: 0.71798
[34]	valid_0's rmse: 0.710451
[35]	valid_0's rmse: 0.703372
[36]	valid_0's rmse: 0.697
[37]	valid_0's rmse: 0.691605
[38]	valid_0's rmse: 0.686157
[39]	valid_0's rmse: 0.681534
[40]	valid_0's rmse: 0.676684
[41]	valid_0's rmse: 0.672431
[42]	valid_0's rmse: 0.668614
[43]	valid_0's rmse: 0.664829
[44]	valid_0's rmse: 0.661312
[45]	valid_0's rms

[2]	valid_0's rmse: 2.31438
[3]	valid_0's rmse: 2.24131
[4]	valid_0's rmse: 2.17298
[5]	valid_0's rmse: 2.10965
[6]	valid_0's rmse: 2.05012
[7]	valid_0's rmse: 1.99521
[8]	valid_0's rmse: 1.94469
[9]	valid_0's rmse: 1.89906
[10]	valid_0's rmse: 1.85635
[11]	valid_0's rmse: 1.81689
[12]	valid_0's rmse: 1.77957
[13]	valid_0's rmse: 1.74827
[14]	valid_0's rmse: 1.71784
[15]	valid_0's rmse: 1.69133
[16]	valid_0's rmse: 1.66379
[17]	valid_0's rmse: 1.64144
[18]	valid_0's rmse: 1.61976
[19]	valid_0's rmse: 1.60116
[20]	valid_0's rmse: 1.58312
[21]	valid_0's rmse: 1.56668
[22]	valid_0's rmse: 1.55172
[23]	valid_0's rmse: 1.53961
[24]	valid_0's rmse: 1.52785
[25]	valid_0's rmse: 1.51633
[26]	valid_0's rmse: 1.50603
[27]	valid_0's rmse: 1.49696
[28]	valid_0's rmse: 1.49036
[29]	valid_0's rmse: 1.48295
[30]	valid_0's rmse: 1.47554
[31]	valid_0's rmse: 1.47112
[32]	valid_0's rmse: 1.46581
[33]	valid_0's rmse: 1.46206
[34]	valid_0's rmse: 1.45686
[35]	valid_0's rmse: 1.45228
[36]	valid_0's rmse: 1

In [83]:
for i in range(4):
    print('meter: '+ str(i))
    for model in meter_models[i]:
        print(model.best_score_['valid_0']['rmse'])


meter: 0
0.6242115601306601
0.4706509971006489
0.6363170549341303
0.6710966921475525
0.6286958340356329
meter: 1
1.3207008011473647
1.1805612070685731
1.2784919316782757
1.4589426463573532
1.3177154019442268
meter: 2
1.4822690075873926
1.4684062020689617
1.3856683022636251
1.5916329511269884
1.4270280438149139
meter: 3
1.5223281922916958
1.611970761634611
1.266066964762435
1.5565098428231963
1.6737044325615138


In [84]:
# Importance rank for first model in cross val models
for i in range(4):
    print('meter: '+ str(i))
    imprtc_df = pd.DataFrame()
    imprtc_df['feature'] = sample_train_X.drop('meter_reading_log1p', axis=1).columns   
    imprtc_df['importance'] = meter_models[i][0].feature_importances_
    imprtc_df.sort_values('importance', ascending=False, inplace= True)
    print(imprtc_df)


meter: 0
                            feature  importance
0                       building_id        8991
97                       meter_mean        1046
102                            hour         767
69       air_temperature_mean_lag72         657
78        dew_temperature_max_lag72         621
70        air_temperature_max_lag72         585
71        air_temperature_min_lag72         558
100                       meter_std         550
56     precip_depth_1_hr_std_lag3_c         548
28       precip_depth_1_hr_std_lag3         547
79        dew_temperature_min_lag72         545
93            wind_speed_mean_lag72         507
77       dew_temperature_mean_lag72         499
87     sea_level_pressure_min_lag72         453
2                           site_id         448
7                   dew_temperature         430
86     sea_level_pressure_max_lag72         417
5                   air_temperature         416
89        wind_direction_mean_lag72         372
101                       dayof

[106 rows x 2 columns]
meter: 3
                           feature  importance
0                      building_id        2146
5                  air_temperature         372
97                      meter_mean         225
71       air_temperature_min_lag72         199
102                           hour         190
7                  dew_temperature         159
70       air_temperature_max_lag72         154
12                     s_radiation         144
79       dew_temperature_min_lag72         142
86    sea_level_pressure_max_lag72         125
69      air_temperature_mean_lag72         115
28      precip_depth_1_hr_std_lag3          99
100                      meter_std          94
77      dew_temperature_mean_lag72          84
93           wind_speed_mean_lag72          79
92        wind_direction_std_lag72          78
89       wind_direction_mean_lag72          74
80       dew_temperature_std_lag72          70
78       dew_temperature_max_lag72          70
56    precip_depth_1_hr_std_

In [40]:
# %%time
# ## Single fit single model

# gbm = LGBMRegressor(**gbm_params)
# f_train_X, f_train_y = getInFoldXY(train.index)
# gbm.fit(f_train_X, f_train_y)

In [107]:
test = loadFile('test')
test = merge(test)
test = x_pre_pipes.transform(test)
test = x_fold_pipes.transform(test)
test = test.drop('row_id', axis=1)


#print(test.sample(n=20,  random_state=42))
print(test.shape)
#print(test.dtypes)

(41697600, 106)


In [108]:
l1 =  x_fold_pipes.transform(train).columns.tolist()
l2 =  test.columns.tolist()
print(list(x for x in l1 if x not in l2))
print(list(x for x in l2 if x not in l1))

['meter_reading_log1p']
[]


In [110]:
def predMeters(test_X):
    test_y = test_X[['meter']] 
    test_y['meter_reading_log1p'] = np.nan
    for i in range(4):
        X = test_X[test_X['meter'] == i]
        if X.shape[0] > 0:
            preds = np.expm1(sum([model.predict(X) for model in meter_models[i]])/folds)
            test_y.loc[test_y['meter'] == i, 'meter_reading_log1p'] = preds
    return test_y['meter_reading_log1p'].tolist()
    
print(predMeters(test.sample(n=20,  random_state=42)))    


[148.39616804940331, 8.010313350980178, 61.56420244307233, 1.6883602433219487, 1127.455167176363, 6.0295103191622115, 334.39255343705724, 1176.2921559368137, 364.65831061892624, 39.787114236964534, 44.58237700920967, 756.2727101030689, 1018.8097991406814, 28.48173836101922, 63.17715593225739, 50.41080437075202, 13.099507494373876, 54.539901903991144, 46.17372502150975, 164.60011798577725]


In [111]:
# Predict using cross val models ensemble 
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test.shape[0]/50000)))):
    res.append(predMeters(test.iloc[i:i+step_size]))
    i+=step_size
    gc.collect()


100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [1:07:38<00:00,  5.74s/it]


In [112]:
# Save using cross val models ensemble 
res = np.concatenate(res)
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission_meter.csv.zip', index=False)
submission.shape

41697600


(41697600, 2)

In [None]:
# # Predict single model fit
# i=0
# res=[]
# step_size = 50000
# for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
#    #res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
#    res.append(np.expm1(gbm.predict(test_X.iloc[i:i+step_size])))
#    i+=step_size
    