In [1]:
# toggle to save space
mode = '_mean' if False else '_all'
print(mode)

In [2]:
gbm_params = {
    'n_estimators' : 1500, # 500,  
    'max_depth' : -1,
    'learning_rate': 0.05,
    'bagging_fraction': 0.7, 
    
    'feature_fraction' : 0.9,
    'bagging_freq': 5,
    #'subsample' : 0.1,  # 
    #'subsample_freq' : 1,
    'num_leaves' : 31,
    'metric':'rmse',
    #'lambda_l1' : 1,  # Try defaults
    #'lambda_l2': 1, # Try defaults
    'verbose': 100
}

In [3]:
# TODO: add https://www.kaggle.com/corochann/optuna-tutorial-for-hyperparameter-optimization

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [5]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [6]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float32, 'year_built': np.float16, 'floor_count': np.float16},
}

def loadFile(name):
    for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
        if path.exists(dir_path + name + '.csv'):
            return  ConvertToDatetime().transform(
                pd.read_csv(dir_path + name + '.csv', dtype=file_dtype[name]))
        


In [7]:
building = loadFile('building_metadata')
pre_train = loadFile('train')
#test = loadFile('test')

weather_processed_df = pd.read_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')

In [8]:
def merge(x):
    weather_processed_df = pd.read_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')
    x = x.merge(building, on=['building_id'], how='left')
    gc.collect()
    x = x.merge(weather_processed_df, on=['site_id', 'timestamp'], how='left')
    gc.collect()
    x['site_id'] = x['site_id'].astype('int8');
    x['cloud_coverage'] = x['cloud_coverage'].astype('float16')
    gc.collect()
    return x
        
train = merge(pre_train)  
print(train)
print('!!!! Warning we are missing weather for '+ str(train['air_temperature'].isnull().sum())+' rows')
train = train.dropna(axis=0, subset=['air_temperature'])


          building_id  meter           timestamp  meter_reading  site_id  \
0                   0      0 2016-01-01 00:00:00       0.000000        0   
1                   1      0 2016-01-01 00:00:00       0.000000        0   
2                   2      0 2016-01-01 00:00:00       0.000000        0   
3                   3      0 2016-01-01 00:00:00       0.000000        0   
4                   4      0 2016-01-01 00:00:00       0.000000        0   
5                   5      0 2016-01-01 00:00:00       0.000000        0   
6                   6      0 2016-01-01 00:00:00       0.000000        0   
7                   7      0 2016-01-01 00:00:00       0.000000        0   
8                   8      0 2016-01-01 00:00:00       0.000000        0   
9                   9      0 2016-01-01 00:00:00       0.000000        0   
10                 10      0 2016-01-01 00:00:00       0.000000        0   
11                 11      0 2016-01-01 00:00:00       0.000000        0   
12          

[20216100 rows x 100 columns]


In [9]:
# See holiday notebook to generate, this is optional
holiday_df = None
if path.exists('../input/ashrae-energy-prediction-pickles/holiday_df.pickle'):
    holiday_df = pd.read_pickle('../input/ashrae-energy-prediction-pickles/holiday_df.pickle')
if holiday_df is not None:
    print(holiday_df.sample(20))

          site_id           timestamp  \
46160392       11 2017-07-01 14:00:00   
46147330       11 2017-05-22 21:00:00   
10100788       13 2016-07-04 14:00:00   
23145051        1 2018-03-19 16:00:00   
18138967       10 2016-11-24 19:00:00   
46234228       11 2018-02-19 09:00:00   
17401930       13 2016-11-11 15:00:00   
10545384        5 2016-07-12 12:00:00   
38053104        7 2017-04-14 08:00:00   
989667          4 2016-01-18 23:00:00   
46215720       11 2017-12-25 18:00:00   
57074005       14 2018-11-12 18:00:00   
19918117       15 2016-12-26 17:00:00   
24594390        2 2017-05-29 09:00:00   
177768          5 2016-01-04 05:00:00   
2470420         7 2016-02-15 06:00:00   
4017197        12 2016-03-17 03:00:00   
38516112        7 2018-08-06 09:00:00   
46193292       11 2017-10-09 07:00:00   
39812960        8 2018-11-22 11:00:00   

                                                  holiday  
46160392                                       Canada Day  
46147330          

In [10]:
class MeterReadingLog1p(TransformerMixin):
  
    def transform(self, df, **transform_params):
        if 'meter_reading' in df.columns:
            df['meter_reading_log1p'] = np.log1p(df['meter_reading'])
            df = df.drop('meter_reading', axis=1)
        return df
    
    def fit(self, X, y=None, **fit_params):
        return self
print(train.sample(20, random_state=42))
print(MeterReadingLog1p().transform(train.sample(20, random_state=42)))
gc.collect()

          building_id  meter           timestamp  meter_reading  site_id  \
14245562         1324      1 2016-09-16 16:00:00       0.000000       14   
1282718          1013      0 2016-01-24 06:00:00      32.000099       10   
13883790          229      1 2016-09-10 07:00:00     567.655029        2   
4781820           217      3 2016-04-01 01:00:00       0.000000        2   
10415393         1434      0 2016-07-10 04:00:00      65.750000       15   
1057008          1047      0 2016-01-20 04:00:00      90.983299       12   
4507399           911      1 2016-03-26 20:00:00     295.063995        9   
19478829         1039      0 2016-12-18 23:00:00      16.900000       12   
8955615           265      0 2016-06-14 06:00:00     128.369995        2   
13799839          896      0 2016-09-08 19:00:00     300.000000        9   
15647011          973      0 2016-10-11 11:00:00     247.000000        9   
2524294           813      0 2016-02-16 08:00:00      10.958300        8   
10016102    

[20 rows x 100 columns]
          building_id  meter           timestamp  site_id  \
14245562         1324      1 2016-09-16 16:00:00       14   
1282718          1013      0 2016-01-24 06:00:00       10   
13883790          229      1 2016-09-10 07:00:00        2   
4781820           217      3 2016-04-01 01:00:00        2   
10415393         1434      0 2016-07-10 04:00:00       15   
1057008          1047      0 2016-01-20 04:00:00       12   
4507399           911      1 2016-03-26 20:00:00        9   
19478829         1039      0 2016-12-18 23:00:00       12   
8955615           265      0 2016-06-14 06:00:00        2   
13799839          896      0 2016-09-08 19:00:00        9   
15647011          973      0 2016-10-11 11:00:00        9   
2524294           813      0 2016-02-16 08:00:00        8   
10016102          870      0 2016-07-03 02:00:00        8   
3915750           898      0 2016-03-15 03:00:00        9   
17217526          903      0 2016-11-08 09:00:00        9   


[20 rows x 100 columns]


31

In [11]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        # TODO: try week of year as numerical 
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('uint8') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('uint8')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [12]:
meter_desc_columns={'mean': 'meter_mean', 'max': 'meter_max', 'min': 'meter_min', 'std':'meter_std'}

class CreateMeterDescDF(TransformerMixin):

    def transform(self, df, **transform_params):
        global _building_meter_desc_DF
        print(df.columns)
        if 'meter_reading_log1p' in df.columns:
            cols = ['mean']
            if mode == '_all':
                cols = ['mean','max','min','std']
            group = df.groupby(['building_id','meter'])['meter_reading_log1p']
            desc_DF = group.describe()[cols]
            desc_DF = desc_DF.reset_index()
            _building_meter_desc_DF = desc_DF.rename(columns=meter_desc_columns)
            gc.collect()
        return df 
    def fit(self, X, y=None, **fit_params):
        return self

#if 'meter_mean' not in train.columns:
#    print(building_meter_desc_DF)
#    train = train.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    #test = test.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    del building_meter_desc_DF
CreateMeterDescDF().transform(
    AddTimeFeatures().transform(
        MeterReadingLog1p().transform(
            train.sample(2000, random_state=0)
        )
    )
)    
print(_building_meter_desc_DF)
gc.collect()

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'dew_temperature',
       ...
       'wind_direction_max_lag96', 'wind_direction_min_lag96',
       'wind_direction_std_lag96', 'wind_speed_mean_lag96',
       'wind_speed_max_lag96', 'wind_speed_min_lag96', 'wind_speed_std_lag96',
       'meter_reading_log1p', 'dayofweek', 'hour'],
      dtype='object', length=102)
      building_id  meter  meter_mean  meter_max  meter_min  meter_std
0               1      0    4.921724   4.921724   4.921724        NaN
1               2      0    0.000000   0.000000   0.000000        NaN
2               3      0    5.727568   5.818533   5.636602   0.128644
3               5      0    0.000000   0.000000   0.000000        NaN
4               6      0    0.000000   0.000000   0.000000        NaN
5               8      0    5.930674   5.930674   5.930674        NaN
6               9      1    5.300433   6.8347

21

In [13]:
class MergeMeterDescDF(TransformerMixin):
  
    def transform(self, df, **transform_params):
        # drop any columns to add
        df = df.drop(meter_desc_columns.values(), axis=1, errors='ignore') 
        return df.merge(_building_meter_desc_DF, on=['building_id','meter'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeMeterDescDF().transform(train.sample(2000, random_state=0)))

      building_id  meter           timestamp  meter_reading  site_id  \
0             774      1 2016-08-07 08:00:00      36.128899        6   
1             206      0 2016-10-04 14:00:00     226.270004        2   
2            1269      0 2016-11-29 10:00:00      28.670799       14   
3             951      0 2016-10-10 04:00:00     113.000000        9   
4             656      0 2016-05-01 21:00:00      32.700001        5   
5              36      0 2016-06-05 19:00:00     178.830994        0   
6            1262      0 2016-07-19 05:00:00      73.739998       14   
7              52      0 2016-03-17 14:00:00       0.000000        0   
8            1133      2 2016-07-10 17:00:00     984.375000       13   
9            1123      0 2016-10-02 09:00:00      18.243999       13   
10           1237      0 2016-10-06 18:00:00      85.000000       14   
11            960      1 2016-11-17 10:00:00      90.066498        9   
12             11      0 2016-09-25 21:00:00     472.332001     

[2000 rows x 104 columns]


In [14]:
# "As you can see above, this data looks weired until May 20. It is 
# reported in this discussion by @barnwellguy that All electricity
# meter is 0 until May 20 for site_id == 0. Let's remove these data 
# from training data."
# https://www.kaggle.com/kaushal2896/ashrae-eda-fe-lightgbm-1-13
class RmS0M0(TransformerMixin):
  
    def transform(self, df, **transform_params):
        return df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

    def fit(self, X, y=None, **fit_params):
        return self
    


In [15]:
# TODO: write filter to remove any 0 meter reading that continue more than N days (try 3)
# Also we need to account for this by meter

In [16]:
# TODO: try rolling with power

In [17]:
    
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df['holiday'] = df['holiday'].astype('category')
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self
# Test 
if holiday_df is not None:
    print(holiday_df.columns)
    print(AddHolidays().transform(train.head(20))[['holiday','timestamp']])

Index(['site_id', 'timestamp', 'holiday'], dtype='object')
           holiday  timestamp
0   New Year's Day 2016-01-01
1   New Year's Day 2016-01-01
2   New Year's Day 2016-01-01
3   New Year's Day 2016-01-01
4   New Year's Day 2016-01-01
5   New Year's Day 2016-01-01
6   New Year's Day 2016-01-01
7   New Year's Day 2016-01-01
8   New Year's Day 2016-01-01
9   New Year's Day 2016-01-01
10  New Year's Day 2016-01-01
11  New Year's Day 2016-01-01
12  New Year's Day 2016-01-01
13  New Year's Day 2016-01-01
14  New Year's Day 2016-01-01
15  New Year's Day 2016-01-01
16  New Year's Day 2016-01-01
17  New Year's Day 2016-01-01
18  New Year's Day 2016-01-01
19  New Year's Day 2016-01-01


In [18]:
class RmHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df = df.drop(df[df['holiday'].notnull()].index)
            df = df.drop(['holiday'], axis=1)
            gc.collect()
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self

# Test you should see the new years removed
#print(train.head(100000).merge(building, on='building_id', how='left').columns)
print(RmHolidays().transform(train.head(100000)))

       building_id  meter           timestamp  meter_reading  site_id  \
55121            0      0 2016-01-02 00:00:00       0.000000        0   
55122            1      0 2016-01-02 00:00:00       0.000000        0   
55123            2      0 2016-01-02 00:00:00       0.000000        0   
55124            3      0 2016-01-02 00:00:00       0.000000        0   
55125            4      0 2016-01-02 00:00:00       0.000000        0   
55126            5      0 2016-01-02 00:00:00       0.000000        0   
55127            6      0 2016-01-02 00:00:00       0.000000        0   
55128            7      0 2016-01-02 00:00:00       0.000000        0   
55129            8      0 2016-01-02 00:00:00       0.000000        0   
55130            9      0 2016-01-02 00:00:00       0.000000        0   
55131           10      0 2016-01-02 00:00:00       0.000000        0   
55132           11      0 2016-01-02 00:00:00       0.000000        0   
55133           12      0 2016-01-02 00:00:00      

[41839 rows x 100 columns]


In [19]:
class SetCatTypes(TransformerMixin):
    
    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col]= df[col].astype('category')
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [20]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(building.head(20)['square_feet'])

0       7432.0
1       2720.0
2       5376.0
3      23685.0
4     116607.0
5       8000.0
6      27926.0
7     121074.0
8      60809.0
9      27000.0
10    370773.0
11     49073.0
12     37100.0
13     99380.0
14     86250.0
15     83957.0
16     54644.0
17     15250.0
18    111891.0
19     18717.0
Name: square_feet, dtype: float32


In [21]:
# TODO: Play with scaling cloud coverage

In [22]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [23]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','year_built']]
        year_built_median = tmp_df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        del tmp_df, year_built_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(ImputeYearBuilt().transform(train.sample(20))['building_age'])

6029294      70
15819220     70
4209078      70
12589662     70
15722246     76
155558       70
4805839      70
14461316     70
12366349     70
10483963    105
14002981     70
59024        70
17972980    113
10566078     82
7956456      70
12100536     56
12588882     70
4025477      70
16500201    103
5049416      62
Name: building_age, dtype: uint8


In [24]:
class ImputeFloorCount(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','floor_count']]
        floors_median = tmp_df['floor_count'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['floor_count'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = i_median
            else:
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = floors_median
        del tmp_df, floors_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

print(ImputeFloorCount().transform(train.sample(20))['floor_count'])

13174966    3.0
4184988     5.0
18668002    3.0
15328534    3.0
9618361     3.0
4134506     4.0
20105696    3.0
4554375     3.0
7675290     3.0
13120547    3.0
6774908     3.0
1065773     5.0
5090416     3.0
8774874     3.0
10661519    3.0
2759467     3.0
2129531     1.0
17865092    3.0
1144166     3.0
430987      3.0
Name: floor_count, dtype: float16


In [25]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [26]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [27]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [28]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [29]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [30]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [31]:
%%time

x_pre_pipes = Pipeline(
    steps=[
        ('meterReadingLog1p',MeterReadingLog1p()),
        ('rmS0M0', RmS0M0()),
        ('addTimeFeatures', AddTimeFeatures()),
        ('logSquareFeet', LogSquareFeet()),
        #('rmHolidays', RmHolidays()),
        #('addHolidays', AddHolidays()),
        ('createMeterDescDF', CreateMeterDescDF()), # note declares a globe variable to pass
        ('mergeMeterDescDF', MergeMeterDescDF()), # populates both test and train from global
        ('setCatTypes', SetCatTypes(['building_id', 'site_id', 'meter', 'primary_use'])),
        ('GC', GC())
    ]
)

train = x_pre_pipes.transform(train)
print(train.columns)

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'dew_temperature',
       ...
       'wind_direction_min_lag96', 'wind_direction_std_lag96',
       'wind_speed_mean_lag96', 'wind_speed_max_lag96', 'wind_speed_min_lag96',
       'wind_speed_std_lag96', 'meter_reading_log1p', 'dayofweek', 'hour',
       'log_square_feet'],
      dtype='object', length=103)
Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'dew_temperature',
       ...
       'wind_speed_min_lag96', 'wind_speed_std_lag96', 'meter_reading_log1p',
       'dayofweek', 'hour', 'log_square_feet', 'meter_mean', 'meter_max',
       'meter_min', 'meter_std'],
      dtype='object', length=107)
Wall time: 40.1 s


In [32]:
# pre_a_pipes is for preprocessing that doesn't change impute
# values
x_fold_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        #('convertToDatetime', ConvertToDatetime()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('imputeFloorCount', ImputeFloorCount()),
        ('dropCols', DropCols(['timestamp','square_feet', 'year_built'])),
        ('GC', GC())
    ]
)

sample_train_X = x_fold_pipes.transform(train.sample(20))
print(sample_train_X.columns)
print(sample_train_X.dtypes)

Index(['building_id', 'meter', 'site_id', 'primary_use', 'floor_count',
       'air_temperature', 'dew_temperature', 'cloud_coverage',
       'precip_depth_1_hr', 'wind_direction',
       ...
       'wind_speed_std_lag96', 'meter_reading_log1p', 'dayofweek', 'hour',
       'log_square_feet', 'meter_mean', 'meter_max', 'meter_min', 'meter_std',
       'building_age'],
      dtype='object', length=105)
building_id                      category
meter                            category
site_id                          category
primary_use                      category
floor_count                       float16
air_temperature                   float16
dew_temperature                   float16
cloud_coverage                    float16
precip_depth_1_hr                 float16
wind_direction                    float16
wind_speed                        float16
sea_level_pressure                float16
air_temperature_mean_lag3         float16
air_temperature_max_lag3          float16
air_temp

In [33]:
# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
# trying no shuffle https://www.kaggle.com/c/ashrae-energy-prediction/discussion/115851#latest-666115
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=False, random_state=42)


In [34]:
def cvTrainMeterEnsemble(train, gbm_params):
    meter_models = [ [], [], [], [] ]
    for train_index, val_index in kf.split(train, train['building_id']):
        f_train = x_fold_pipes.transform(train.loc[train_index])
        f_val = x_fold_pipes.transform(train.loc[val_index])
        for i in range(4):
            print(f'training meter {i}')
            f_train_m = f_train[f_train['meter'] == i]
            f_val_m = f_val[f_val['meter'] == i]
            gbm_params_m = gbm_params
            if i == 0:
                gbm_params_m['learning_rate']=0.04
            elif i == 1:
                gbm_params_m['learning_rate']=0.06
                gbm_params_m['bagging_fraction']=0.5
            elif i == 2:
                gbm_params_m['learning_rate']=0.05
                gbm_params_m['bagging_fraction']=0.8
            else:
                gbm_params_m['learning_rate']=0.04
                gbm_params_m['bagging_fraction']=0.9                
            gbm = LGBMRegressor(**gbm_params_m)
            gbm.fit(f_train_m.drop('meter_reading_log1p', axis=1), f_train_m['meter_reading_log1p'],
                eval_set=[(f_val_m.drop('meter_reading_log1p', axis=1), f_val_m['meter_reading_log1p'])],
                # https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114722#latest-660848
                # eval_metric=lbm_rmslee,
                early_stopping_rounds=20)
            meter_models[i].append(gbm)
            del f_train_m, f_val_m, gbm
            gc.collect()
        del f_train, f_val
        gc.collect()
    return meter_models

In [35]:
%%time
meter_models = cvTrainMeterEnsemble(train, gbm_params)

training meter 0
[1]	valid_0's rmse: 1.52662
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.47655
[3]	valid_0's rmse: 1.42897
[4]	valid_0's rmse: 1.38364
[5]	valid_0's rmse: 1.34043
[6]	valid_0's rmse: 1.29936
[7]	valid_0's rmse: 1.26087
[8]	valid_0's rmse: 1.22374
[9]	valid_0's rmse: 1.19131
[10]	valid_0's rmse: 1.15776
[11]	valid_0's rmse: 1.12608
[12]	valid_0's rmse: 1.09602
[13]	valid_0's rmse: 1.06721
[14]	valid_0's rmse: 1.04031
[15]	valid_0's rmse: 1.01495
[16]	valid_0's rmse: 0.990835
[17]	valid_0's rmse: 0.96859
[18]	valid_0's rmse: 0.947267
[19]	valid_0's rmse: 0.927632
[20]	valid_0's rmse: 0.90921
[21]	valid_0's rmse: 0.891672
[22]	valid_0's rmse: 0.874786
[23]	valid_0's rmse: 0.860553
[24]	valid_0's rmse: 0.845852
[25]	valid_0's rmse: 0.831871
[26]	valid_0's rmse: 0.819643
[27]	valid_0's rmse: 0.807562
[28]	valid_0's rmse: 0.796256
[29]	valid_0's rmse: 0.786104
[30]	valid_0's rmse: 0.776896
[31]	valid_0's rmse: 0.768484
[32]	valid_0's rms

[160]	valid_0's rmse: 1.29688
[161]	valid_0's rmse: 1.29694
[162]	valid_0's rmse: 1.29701
[163]	valid_0's rmse: 1.29699
[164]	valid_0's rmse: 1.29667
[165]	valid_0's rmse: 1.29641
[166]	valid_0's rmse: 1.296
[167]	valid_0's rmse: 1.29582
[168]	valid_0's rmse: 1.29563
[169]	valid_0's rmse: 1.29551
[170]	valid_0's rmse: 1.29554
[171]	valid_0's rmse: 1.29578
[172]	valid_0's rmse: 1.29595
[173]	valid_0's rmse: 1.29596
[174]	valid_0's rmse: 1.29578
[175]	valid_0's rmse: 1.29658
[176]	valid_0's rmse: 1.29641
[177]	valid_0's rmse: 1.29614
[178]	valid_0's rmse: 1.29667
[179]	valid_0's rmse: 1.29678
[180]	valid_0's rmse: 1.29687
[181]	valid_0's rmse: 1.2969
[182]	valid_0's rmse: 1.29658
[183]	valid_0's rmse: 1.29671
[184]	valid_0's rmse: 1.29677
[185]	valid_0's rmse: 1.29743
[186]	valid_0's rmse: 1.29728
[187]	valid_0's rmse: 1.29706
[188]	valid_0's rmse: 1.29815
[189]	valid_0's rmse: 1.29846
Early stopping, best iteration is:
[169]	valid_0's rmse: 1.29551
training meter 2
[1]	valid_0's rmse: 2

[243]	valid_0's rmse: 1.44418
[244]	valid_0's rmse: 1.44413
[245]	valid_0's rmse: 1.44312
[246]	valid_0's rmse: 1.44282
[247]	valid_0's rmse: 1.44289
[248]	valid_0's rmse: 1.4421
[249]	valid_0's rmse: 1.44226
[250]	valid_0's rmse: 1.44248
[251]	valid_0's rmse: 1.44225
[252]	valid_0's rmse: 1.44223
[253]	valid_0's rmse: 1.44233
[254]	valid_0's rmse: 1.44183
[255]	valid_0's rmse: 1.44183
[256]	valid_0's rmse: 1.44173
[257]	valid_0's rmse: 1.44171
[258]	valid_0's rmse: 1.44177
[259]	valid_0's rmse: 1.44169
[260]	valid_0's rmse: 1.44167
[261]	valid_0's rmse: 1.442
[262]	valid_0's rmse: 1.44234
[263]	valid_0's rmse: 1.44174
[264]	valid_0's rmse: 1.44174
[265]	valid_0's rmse: 1.44159
[266]	valid_0's rmse: 1.44143
[267]	valid_0's rmse: 1.44119
[268]	valid_0's rmse: 1.44044
[269]	valid_0's rmse: 1.44045
[270]	valid_0's rmse: 1.44062
[271]	valid_0's rmse: 1.4407
[272]	valid_0's rmse: 1.4407
[273]	valid_0's rmse: 1.44049
[274]	valid_0's rmse: 1.4404
[275]	valid_0's rmse: 1.4404
[276]	valid_0's r

[79]	valid_0's rmse: 1.54447
[80]	valid_0's rmse: 1.54361
[81]	valid_0's rmse: 1.54273
[82]	valid_0's rmse: 1.5426
[83]	valid_0's rmse: 1.54202
[84]	valid_0's rmse: 1.54049
[85]	valid_0's rmse: 1.53983
[86]	valid_0's rmse: 1.53939
[87]	valid_0's rmse: 1.53807
[88]	valid_0's rmse: 1.53772
[89]	valid_0's rmse: 1.53749
[90]	valid_0's rmse: 1.53553
[91]	valid_0's rmse: 1.53384
[92]	valid_0's rmse: 1.53226
[93]	valid_0's rmse: 1.53093
[94]	valid_0's rmse: 1.53009
[95]	valid_0's rmse: 1.52911
[96]	valid_0's rmse: 1.52777
[97]	valid_0's rmse: 1.52707
[98]	valid_0's rmse: 1.52598
[99]	valid_0's rmse: 1.52465
[100]	valid_0's rmse: 1.52412
[101]	valid_0's rmse: 1.52314
[102]	valid_0's rmse: 1.52329
[103]	valid_0's rmse: 1.52307
[104]	valid_0's rmse: 1.5224
[105]	valid_0's rmse: 1.52132
[106]	valid_0's rmse: 1.52061
[107]	valid_0's rmse: 1.52018
[108]	valid_0's rmse: 1.51898
[109]	valid_0's rmse: 1.51785
[110]	valid_0's rmse: 1.51744
[111]	valid_0's rmse: 1.51645
[112]	valid_0's rmse: 1.516
[113]

[119]	valid_0's rmse: 0.532274
[120]	valid_0's rmse: 0.531876
Early stopping, best iteration is:
[100]	valid_0's rmse: 0.529704
training meter 1
[1]	valid_0's rmse: 2.36178
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 2.26632
[3]	valid_0's rmse: 2.18031
[4]	valid_0's rmse: 2.10058
[5]	valid_0's rmse: 2.02846
[6]	valid_0's rmse: 1.96086
[7]	valid_0's rmse: 1.9006
[8]	valid_0's rmse: 1.84316
[9]	valid_0's rmse: 1.79159
[10]	valid_0's rmse: 1.74421
[11]	valid_0's rmse: 1.70002
[12]	valid_0's rmse: 1.66033
[13]	valid_0's rmse: 1.62496
[14]	valid_0's rmse: 1.59191
[15]	valid_0's rmse: 1.56233
[16]	valid_0's rmse: 1.53416
[17]	valid_0's rmse: 1.50985
[18]	valid_0's rmse: 1.48946
[19]	valid_0's rmse: 1.46821
[20]	valid_0's rmse: 1.4494
[21]	valid_0's rmse: 1.43225
[22]	valid_0's rmse: 1.41814
[23]	valid_0's rmse: 1.4037
[24]	valid_0's rmse: 1.39184
[25]	valid_0's rmse: 1.38093
[26]	valid_0's rmse: 1.37157
[27]	valid_0's rmse: 1.36105
[28]	valid_0's rmse: 1.

[271]	valid_0's rmse: 1.18248
[272]	valid_0's rmse: 1.18245
[273]	valid_0's rmse: 1.18255
[274]	valid_0's rmse: 1.1825
[275]	valid_0's rmse: 1.18241
[276]	valid_0's rmse: 1.18234
[277]	valid_0's rmse: 1.18226
[278]	valid_0's rmse: 1.18227
[279]	valid_0's rmse: 1.18235
[280]	valid_0's rmse: 1.1812
[281]	valid_0's rmse: 1.18118
[282]	valid_0's rmse: 1.18102
[283]	valid_0's rmse: 1.18104
[284]	valid_0's rmse: 1.18112
[285]	valid_0's rmse: 1.18113
[286]	valid_0's rmse: 1.18105
[287]	valid_0's rmse: 1.18099
[288]	valid_0's rmse: 1.18023
[289]	valid_0's rmse: 1.17997
[290]	valid_0's rmse: 1.1795
[291]	valid_0's rmse: 1.17948
[292]	valid_0's rmse: 1.17943
[293]	valid_0's rmse: 1.17929
[294]	valid_0's rmse: 1.17926
[295]	valid_0's rmse: 1.1787
[296]	valid_0's rmse: 1.17871
[297]	valid_0's rmse: 1.17871
[298]	valid_0's rmse: 1.17843
[299]	valid_0's rmse: 1.17862
[300]	valid_0's rmse: 1.17875
[301]	valid_0's rmse: 1.17865
[302]	valid_0's rmse: 1.17859
[303]	valid_0's rmse: 1.17853
[304]	valid_0'

[163]	valid_0's rmse: 1.46505
[164]	valid_0's rmse: 1.465
[165]	valid_0's rmse: 1.46458
[166]	valid_0's rmse: 1.4622
[167]	valid_0's rmse: 1.46128
[168]	valid_0's rmse: 1.46107
[169]	valid_0's rmse: 1.46058
[170]	valid_0's rmse: 1.46017
[171]	valid_0's rmse: 1.46013
[172]	valid_0's rmse: 1.46035
[173]	valid_0's rmse: 1.46044
[174]	valid_0's rmse: 1.46017
[175]	valid_0's rmse: 1.46003
[176]	valid_0's rmse: 1.46005
[177]	valid_0's rmse: 1.46002
[178]	valid_0's rmse: 1.46017
[179]	valid_0's rmse: 1.45987
[180]	valid_0's rmse: 1.45947
[181]	valid_0's rmse: 1.45935
[182]	valid_0's rmse: 1.45843
[183]	valid_0's rmse: 1.45853
[184]	valid_0's rmse: 1.45626
[185]	valid_0's rmse: 1.4559
[186]	valid_0's rmse: 1.45585
[187]	valid_0's rmse: 1.4556
[188]	valid_0's rmse: 1.45547
[189]	valid_0's rmse: 1.45506
[190]	valid_0's rmse: 1.45491
[191]	valid_0's rmse: 1.4549
[192]	valid_0's rmse: 1.45387
[193]	valid_0's rmse: 1.45322
[194]	valid_0's rmse: 1.45335
[195]	valid_0's rmse: 1.4533
[196]	valid_0's r

[27]	valid_0's rmse: 1.72775
[28]	valid_0's rmse: 1.71852
[29]	valid_0's rmse: 1.70953
[30]	valid_0's rmse: 1.70243
[31]	valid_0's rmse: 1.69544
[32]	valid_0's rmse: 1.68945
[33]	valid_0's rmse: 1.68262
[34]	valid_0's rmse: 1.67751
[35]	valid_0's rmse: 1.67227
[36]	valid_0's rmse: 1.66865
[37]	valid_0's rmse: 1.66347
[38]	valid_0's rmse: 1.65943
[39]	valid_0's rmse: 1.65658
[40]	valid_0's rmse: 1.65401
[41]	valid_0's rmse: 1.65093
[42]	valid_0's rmse: 1.6476
[43]	valid_0's rmse: 1.6446
[44]	valid_0's rmse: 1.64146
[45]	valid_0's rmse: 1.63904
[46]	valid_0's rmse: 1.63698
[47]	valid_0's rmse: 1.63569
[48]	valid_0's rmse: 1.63346
[49]	valid_0's rmse: 1.63278
[50]	valid_0's rmse: 1.6311
[51]	valid_0's rmse: 1.63071
[52]	valid_0's rmse: 1.62952
[53]	valid_0's rmse: 1.62884
[54]	valid_0's rmse: 1.62758
[55]	valid_0's rmse: 1.62647
[56]	valid_0's rmse: 1.62559
[57]	valid_0's rmse: 1.62521
[58]	valid_0's rmse: 1.62499
[59]	valid_0's rmse: 1.62374
[60]	valid_0's rmse: 1.62299
[61]	valid_0's rm

[303]	valid_0's rmse: 1.59196
[304]	valid_0's rmse: 1.59202
[305]	valid_0's rmse: 1.59212
[306]	valid_0's rmse: 1.59205
[307]	valid_0's rmse: 1.59238
[308]	valid_0's rmse: 1.59244
[309]	valid_0's rmse: 1.5923
[310]	valid_0's rmse: 1.59204
[311]	valid_0's rmse: 1.59224
[312]	valid_0's rmse: 1.59216
Early stopping, best iteration is:
[292]	valid_0's rmse: 1.59164
training meter 0
[1]	valid_0's rmse: 1.55633
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.50641
[3]	valid_0's rmse: 1.45877
[4]	valid_0's rmse: 1.41368
[5]	valid_0's rmse: 1.37056
[6]	valid_0's rmse: 1.32938
[7]	valid_0's rmse: 1.29048
[8]	valid_0's rmse: 1.25328
[9]	valid_0's rmse: 1.22076
[10]	valid_0's rmse: 1.18712
[11]	valid_0's rmse: 1.15527
[12]	valid_0's rmse: 1.12534
[13]	valid_0's rmse: 1.09687
[14]	valid_0's rmse: 1.06972
[15]	valid_0's rmse: 1.04408
[16]	valid_0's rmse: 1.01995
[17]	valid_0's rmse: 0.99763
[18]	valid_0's rmse: 0.976083
[19]	valid_0's rmse: 0.955884
[20]	valid_0's

[45]	valid_0's rmse: 1.34442
[46]	valid_0's rmse: 1.34124
[47]	valid_0's rmse: 1.33629
[48]	valid_0's rmse: 1.33317
[49]	valid_0's rmse: 1.33143
[50]	valid_0's rmse: 1.3296
[51]	valid_0's rmse: 1.32524
[52]	valid_0's rmse: 1.3243
[53]	valid_0's rmse: 1.32047
[54]	valid_0's rmse: 1.31812
[55]	valid_0's rmse: 1.31599
[56]	valid_0's rmse: 1.31324
[57]	valid_0's rmse: 1.31076
[58]	valid_0's rmse: 1.30784
[59]	valid_0's rmse: 1.30457
[60]	valid_0's rmse: 1.30423
[61]	valid_0's rmse: 1.30269
[62]	valid_0's rmse: 1.30095
[63]	valid_0's rmse: 1.30065
[64]	valid_0's rmse: 1.29898
[65]	valid_0's rmse: 1.29713
[66]	valid_0's rmse: 1.29426
[67]	valid_0's rmse: 1.29292
[68]	valid_0's rmse: 1.2909
[69]	valid_0's rmse: 1.28969
[70]	valid_0's rmse: 1.28672
[71]	valid_0's rmse: 1.28555
[72]	valid_0's rmse: 1.28216
[73]	valid_0's rmse: 1.27997
[74]	valid_0's rmse: 1.27906
[75]	valid_0's rmse: 1.27899
[76]	valid_0's rmse: 1.2775
[77]	valid_0's rmse: 1.27746
[78]	valid_0's rmse: 1.27667
[79]	valid_0's rms

[321]	valid_0's rmse: 1.22382
[322]	valid_0's rmse: 1.22363
[323]	valid_0's rmse: 1.22337
[324]	valid_0's rmse: 1.22329
[325]	valid_0's rmse: 1.22339
[326]	valid_0's rmse: 1.22345
[327]	valid_0's rmse: 1.22344
[328]	valid_0's rmse: 1.22313
[329]	valid_0's rmse: 1.22319
[330]	valid_0's rmse: 1.22308
[331]	valid_0's rmse: 1.22304
[332]	valid_0's rmse: 1.22308
[333]	valid_0's rmse: 1.22309
[334]	valid_0's rmse: 1.22313
[335]	valid_0's rmse: 1.22308
[336]	valid_0's rmse: 1.2228
[337]	valid_0's rmse: 1.22279
[338]	valid_0's rmse: 1.22278
[339]	valid_0's rmse: 1.22276
[340]	valid_0's rmse: 1.22284
[341]	valid_0's rmse: 1.22282
[342]	valid_0's rmse: 1.22288
[343]	valid_0's rmse: 1.22289
[344]	valid_0's rmse: 1.2229
[345]	valid_0's rmse: 1.22285
[346]	valid_0's rmse: 1.22281
[347]	valid_0's rmse: 1.2227
[348]	valid_0's rmse: 1.22297
[349]	valid_0's rmse: 1.22294
[350]	valid_0's rmse: 1.22293
[351]	valid_0's rmse: 1.22316
[352]	valid_0's rmse: 1.22317
[353]	valid_0's rmse: 1.22313
[354]	valid_0

[57]	valid_0's rmse: 1.46419
[58]	valid_0's rmse: 1.46132
[59]	valid_0's rmse: 1.45842
[60]	valid_0's rmse: 1.45775
[61]	valid_0's rmse: 1.45526
[62]	valid_0's rmse: 1.45313
[63]	valid_0's rmse: 1.45188
[64]	valid_0's rmse: 1.45088
[65]	valid_0's rmse: 1.44848
[66]	valid_0's rmse: 1.44763
[67]	valid_0's rmse: 1.44748
[68]	valid_0's rmse: 1.44496
[69]	valid_0's rmse: 1.44219
[70]	valid_0's rmse: 1.44151
[71]	valid_0's rmse: 1.44084
[72]	valid_0's rmse: 1.4387
[73]	valid_0's rmse: 1.43715
[74]	valid_0's rmse: 1.43553
[75]	valid_0's rmse: 1.43393
[76]	valid_0's rmse: 1.43238
[77]	valid_0's rmse: 1.43205
[78]	valid_0's rmse: 1.43082
[79]	valid_0's rmse: 1.42938
[80]	valid_0's rmse: 1.42876
[81]	valid_0's rmse: 1.42798
[82]	valid_0's rmse: 1.42774
[83]	valid_0's rmse: 1.42789
[84]	valid_0's rmse: 1.4264
[85]	valid_0's rmse: 1.42547
[86]	valid_0's rmse: 1.4244
[87]	valid_0's rmse: 1.42364
[88]	valid_0's rmse: 1.42314
[89]	valid_0's rmse: 1.42161
[90]	valid_0's rmse: 1.42096
[91]	valid_0's rm

[332]	valid_0's rmse: 1.37397
[333]	valid_0's rmse: 1.37387
[334]	valid_0's rmse: 1.37385
[335]	valid_0's rmse: 1.37388
[336]	valid_0's rmse: 1.37376
[337]	valid_0's rmse: 1.37368
[338]	valid_0's rmse: 1.37367
[339]	valid_0's rmse: 1.3733
[340]	valid_0's rmse: 1.37346
[341]	valid_0's rmse: 1.37346
[342]	valid_0's rmse: 1.3731
[343]	valid_0's rmse: 1.37305
[344]	valid_0's rmse: 1.3726
[345]	valid_0's rmse: 1.37254
[346]	valid_0's rmse: 1.37281
[347]	valid_0's rmse: 1.37242
[348]	valid_0's rmse: 1.37215
[349]	valid_0's rmse: 1.37193
[350]	valid_0's rmse: 1.37192
[351]	valid_0's rmse: 1.372
[352]	valid_0's rmse: 1.37203
[353]	valid_0's rmse: 1.372
[354]	valid_0's rmse: 1.37212
[355]	valid_0's rmse: 1.37203
[356]	valid_0's rmse: 1.37185
[357]	valid_0's rmse: 1.37186
[358]	valid_0's rmse: 1.37161
[359]	valid_0's rmse: 1.37156
[360]	valid_0's rmse: 1.37157
[361]	valid_0's rmse: 1.3712
[362]	valid_0's rmse: 1.37115
[363]	valid_0's rmse: 1.37115
[364]	valid_0's rmse: 1.37116
[365]	valid_0's rm

[178]	valid_0's rmse: 1.22058
[179]	valid_0's rmse: 1.22003
[180]	valid_0's rmse: 1.21996
[181]	valid_0's rmse: 1.21996
[182]	valid_0's rmse: 1.21963
[183]	valid_0's rmse: 1.21968
[184]	valid_0's rmse: 1.21953
[185]	valid_0's rmse: 1.21962
[186]	valid_0's rmse: 1.21867
[187]	valid_0's rmse: 1.21871
[188]	valid_0's rmse: 1.21869
[189]	valid_0's rmse: 1.21779
[190]	valid_0's rmse: 1.21771
[191]	valid_0's rmse: 1.21773
[192]	valid_0's rmse: 1.21761
[193]	valid_0's rmse: 1.21762
[194]	valid_0's rmse: 1.21767
[195]	valid_0's rmse: 1.21771
[196]	valid_0's rmse: 1.21761
[197]	valid_0's rmse: 1.21763
[198]	valid_0's rmse: 1.21781
[199]	valid_0's rmse: 1.21786
[200]	valid_0's rmse: 1.21783
[201]	valid_0's rmse: 1.21743
[202]	valid_0's rmse: 1.21756
[203]	valid_0's rmse: 1.21743
[204]	valid_0's rmse: 1.21703
[205]	valid_0's rmse: 1.21716
[206]	valid_0's rmse: 1.21679
[207]	valid_0's rmse: 1.21633
[208]	valid_0's rmse: 1.21643
[209]	valid_0's rmse: 1.21642
[210]	valid_0's rmse: 1.21598
[211]	vali

[117]	valid_0's rmse: 0.696165
[118]	valid_0's rmse: 0.696063
[119]	valid_0's rmse: 0.695759
[120]	valid_0's rmse: 0.695523
[121]	valid_0's rmse: 0.695404
[122]	valid_0's rmse: 0.695202
[123]	valid_0's rmse: 0.694979
[124]	valid_0's rmse: 0.695029
[125]	valid_0's rmse: 0.694776
[126]	valid_0's rmse: 0.694776
[127]	valid_0's rmse: 0.694628
[128]	valid_0's rmse: 0.69446
[129]	valid_0's rmse: 0.694261
[130]	valid_0's rmse: 0.694486
[131]	valid_0's rmse: 0.694291
[132]	valid_0's rmse: 0.694203
[133]	valid_0's rmse: 0.694061
[134]	valid_0's rmse: 0.694216
[135]	valid_0's rmse: 0.694207
[136]	valid_0's rmse: 0.694085
[137]	valid_0's rmse: 0.693862
[138]	valid_0's rmse: 0.693855
[139]	valid_0's rmse: 0.693632
[140]	valid_0's rmse: 0.693572
[141]	valid_0's rmse: 0.693629
[142]	valid_0's rmse: 0.69378
[143]	valid_0's rmse: 0.693569
[144]	valid_0's rmse: 0.693537
[145]	valid_0's rmse: 0.693489
[146]	valid_0's rmse: 0.693323
[147]	valid_0's rmse: 0.693157
[148]	valid_0's rmse: 0.693469
[149]	vali

[48]	valid_0's rmse: 1.51748
[49]	valid_0's rmse: 1.51515
[50]	valid_0's rmse: 1.5125
[51]	valid_0's rmse: 1.51122
[52]	valid_0's rmse: 1.50941
[53]	valid_0's rmse: 1.508
[54]	valid_0's rmse: 1.50684
[55]	valid_0's rmse: 1.5054
[56]	valid_0's rmse: 1.50463
[57]	valid_0's rmse: 1.50274
[58]	valid_0's rmse: 1.50073
[59]	valid_0's rmse: 1.49934
[60]	valid_0's rmse: 1.49877
[61]	valid_0's rmse: 1.49726
[62]	valid_0's rmse: 1.49711
[63]	valid_0's rmse: 1.49549
[64]	valid_0's rmse: 1.49514
[65]	valid_0's rmse: 1.49321
[66]	valid_0's rmse: 1.49234
[67]	valid_0's rmse: 1.4927
[68]	valid_0's rmse: 1.49181
[69]	valid_0's rmse: 1.49033
[70]	valid_0's rmse: 1.48833
[71]	valid_0's rmse: 1.48796
[72]	valid_0's rmse: 1.48595
[73]	valid_0's rmse: 1.48676
[74]	valid_0's rmse: 1.48545
[75]	valid_0's rmse: 1.48414
[76]	valid_0's rmse: 1.48334
[77]	valid_0's rmse: 1.48262
[78]	valid_0's rmse: 1.48249
[79]	valid_0's rmse: 1.48207
[80]	valid_0's rmse: 1.48187
[81]	valid_0's rmse: 1.4805
[82]	valid_0's rmse:

[146]	valid_0's rmse: 1.55675
[147]	valid_0's rmse: 1.55597
[148]	valid_0's rmse: 1.55597
[149]	valid_0's rmse: 1.55588
[150]	valid_0's rmse: 1.55538
[151]	valid_0's rmse: 1.5549
[152]	valid_0's rmse: 1.5545
[153]	valid_0's rmse: 1.55439
[154]	valid_0's rmse: 1.55422
[155]	valid_0's rmse: 1.55411
[156]	valid_0's rmse: 1.55434
[157]	valid_0's rmse: 1.55478
[158]	valid_0's rmse: 1.55459
[159]	valid_0's rmse: 1.55475
[160]	valid_0's rmse: 1.55472
[161]	valid_0's rmse: 1.55488
[162]	valid_0's rmse: 1.55497
[163]	valid_0's rmse: 1.55467
[164]	valid_0's rmse: 1.55442
[165]	valid_0's rmse: 1.55456
[166]	valid_0's rmse: 1.55463
[167]	valid_0's rmse: 1.55449
[168]	valid_0's rmse: 1.55452
[169]	valid_0's rmse: 1.55454
[170]	valid_0's rmse: 1.5543
[171]	valid_0's rmse: 1.5542
[172]	valid_0's rmse: 1.5543
[173]	valid_0's rmse: 1.5542
[174]	valid_0's rmse: 1.55421
[175]	valid_0's rmse: 1.55395
[176]	valid_0's rmse: 1.55351
[177]	valid_0's rmse: 1.55323
[178]	valid_0's rmse: 1.55319
[179]	valid_0's 

[130]	valid_0's rmse: 1.5606
[131]	valid_0's rmse: 1.56036
[132]	valid_0's rmse: 1.55987
[133]	valid_0's rmse: 1.55954
[134]	valid_0's rmse: 1.55909
[135]	valid_0's rmse: 1.5588
[136]	valid_0's rmse: 1.55889
[137]	valid_0's rmse: 1.55933
[138]	valid_0's rmse: 1.55908
[139]	valid_0's rmse: 1.55861
[140]	valid_0's rmse: 1.55852
[141]	valid_0's rmse: 1.55771
[142]	valid_0's rmse: 1.55784
[143]	valid_0's rmse: 1.55752
[144]	valid_0's rmse: 1.55738
[145]	valid_0's rmse: 1.55743
[146]	valid_0's rmse: 1.55739
[147]	valid_0's rmse: 1.5575
[148]	valid_0's rmse: 1.55748
[149]	valid_0's rmse: 1.55718
[150]	valid_0's rmse: 1.557
[151]	valid_0's rmse: 1.55719
[152]	valid_0's rmse: 1.55666
[153]	valid_0's rmse: 1.55616
[154]	valid_0's rmse: 1.55608
[155]	valid_0's rmse: 1.55604
[156]	valid_0's rmse: 1.5564
[157]	valid_0's rmse: 1.5563
[158]	valid_0's rmse: 1.55607
[159]	valid_0's rmse: 1.55598
[160]	valid_0's rmse: 1.5559
[161]	valid_0's rmse: 1.55542
[162]	valid_0's rmse: 1.55499
[163]	valid_0's rm

[16]	valid_0's rmse: 1.56234
[17]	valid_0's rmse: 1.53425
[18]	valid_0's rmse: 1.50793
[19]	valid_0's rmse: 1.48526
[20]	valid_0's rmse: 1.46466
[21]	valid_0's rmse: 1.44639
[22]	valid_0's rmse: 1.43023
[23]	valid_0's rmse: 1.41542
[24]	valid_0's rmse: 1.40276
[25]	valid_0's rmse: 1.39199
[26]	valid_0's rmse: 1.38156
[27]	valid_0's rmse: 1.37275
[28]	valid_0's rmse: 1.36523
[29]	valid_0's rmse: 1.35737
[30]	valid_0's rmse: 1.35236
[31]	valid_0's rmse: 1.34563
[32]	valid_0's rmse: 1.34143
[33]	valid_0's rmse: 1.33689
[34]	valid_0's rmse: 1.33396
[35]	valid_0's rmse: 1.33058
[36]	valid_0's rmse: 1.32673
[37]	valid_0's rmse: 1.32414
[38]	valid_0's rmse: 1.32161
[39]	valid_0's rmse: 1.31956
[40]	valid_0's rmse: 1.31814
[41]	valid_0's rmse: 1.31747
[42]	valid_0's rmse: 1.31458
[43]	valid_0's rmse: 1.31435
[44]	valid_0's rmse: 1.31349
[45]	valid_0's rmse: 1.31283
[46]	valid_0's rmse: 1.31228
[47]	valid_0's rmse: 1.31056
[48]	valid_0's rmse: 1.31058
[49]	valid_0's rmse: 1.31152
[50]	valid_0's

[42]	valid_0's rmse: 1.7058
[43]	valid_0's rmse: 1.7032
[44]	valid_0's rmse: 1.69977
[45]	valid_0's rmse: 1.6966
[46]	valid_0's rmse: 1.69475
[47]	valid_0's rmse: 1.69319
[48]	valid_0's rmse: 1.69186
[49]	valid_0's rmse: 1.69068
[50]	valid_0's rmse: 1.6889
[51]	valid_0's rmse: 1.68714
[52]	valid_0's rmse: 1.68564
[53]	valid_0's rmse: 1.68479
[54]	valid_0's rmse: 1.68408
[55]	valid_0's rmse: 1.683
[56]	valid_0's rmse: 1.6822
[57]	valid_0's rmse: 1.68186
[58]	valid_0's rmse: 1.68124
[59]	valid_0's rmse: 1.68119
[60]	valid_0's rmse: 1.68084
[61]	valid_0's rmse: 1.67994
[62]	valid_0's rmse: 1.67998
[63]	valid_0's rmse: 1.68
[64]	valid_0's rmse: 1.67974
[65]	valid_0's rmse: 1.67892
[66]	valid_0's rmse: 1.6782
[67]	valid_0's rmse: 1.67794
[68]	valid_0's rmse: 1.67801
[69]	valid_0's rmse: 1.67717
[70]	valid_0's rmse: 1.67659
[71]	valid_0's rmse: 1.67598
[72]	valid_0's rmse: 1.67588
[73]	valid_0's rmse: 1.67564
[74]	valid_0's rmse: 1.67542
[75]	valid_0's rmse: 1.67537
[76]	valid_0's rmse: 1.67

In [36]:
for i in range(4):
    print('meter: '+ str(i))
    for model in meter_models[i]:
        print(model.best_score_['valid_0']['rmse'])


meter: 0
0.6598052296883413
0.5297042777440072
0.6357006694103878
0.6865595290785295
0.6487635781545118
meter: 1
1.29550570446236
1.172880143168755
1.2115980440636545
1.4552887862946413
1.3008457525309347
meter: 2
1.4346254221082324
1.431828195836705
1.368464433334204
1.5461106805998535
1.3794871988879216
meter: 3
1.5013518275812192
1.5916436738578947
1.209156241554405
1.547428076965108
1.6751698977656952


In [37]:
# Importance rank for first model in cross val models
for i in range(4):
    print('meter: '+ str(i))
    imprtc_df = pd.DataFrame()
    imprtc_df['feature'] = sample_train_X.drop('meter_reading_log1p', axis=1).columns   
    imprtc_df['importance'] = meter_models[i][0].feature_importances_
    imprtc_df.sort_values('importance', ascending=False, inplace= True)
    print(imprtc_df)


meter: 0
                          feature  importance
0                     building_id         855
99                     meter_mean         633
97                           hour         321
96                      dayofweek         145
100                     meter_max          55
27     precip_depth_1_hr_std_lag3          50
40     air_temperature_mean_lag72          39
102                     meter_std          39
70      air_temperature_min_lag96          38
69      air_temperature_max_lag96          34
47       cloud_coverage_std_lag72          32
61       wind_direction_max_lag72          31
63       wind_direction_std_lag72          31
2                         site_id          29
78      dew_temperature_min_lag96          26
5                 air_temperature          24
77      dew_temperature_max_lag96          22
85   sea_level_pressure_max_lag96          22
14       air_temperature_min_lag3          21
41      air_temperature_max_lag72          19
80   precip_depth_1_hr_me

[104 rows x 2 columns]
meter: 3
                           feature  importance
0                      building_id        2446
97                            hour         254
99                      meter_mean         250
70       air_temperature_min_lag96         204
40      air_temperature_mean_lag72         129
68      air_temperature_mean_lag96         121
14        air_temperature_min_lag3         120
102                      meter_std         113
85    sea_level_pressure_max_lag96         112
69       air_temperature_max_lag96         101
41       air_temperature_max_lag72          99
76      dew_temperature_mean_lag96          91
88       wind_direction_mean_lag96          90
13        air_temperature_max_lag3          90
78       dew_temperature_min_lag96          89
86    sea_level_pressure_min_lag96          77
12       air_temperature_mean_lag3          75
100                      meter_max          75
77       dew_temperature_max_lag96          74
98                 log_squar

In [38]:
# %%time
# ## Single fit single model

# gbm = LGBMRegressor(**gbm_params)
# f_train_X, f_train_y = getInFoldXY(train.index)
# gbm.fit(f_train_X, f_train_y)

In [39]:
train_analysis = x_fold_pipes.transform(train)
train_analysis.to_pickle('train_analysis')
pickle.dump(meter_models, open( "meter_models_analysis.pickle", "wb" ))

In [40]:
test = loadFile('test')
test = merge(test)
test = x_pre_pipes.transform(test)
test = x_fold_pipes.transform(test)
test = test.drop('row_id', axis=1)


#print(test.sample(n=20,  random_state=42))
print(test.shape)
#print(test.dtypes)

Index(['row_id', 'building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       ...
       'wind_direction_max_lag96', 'wind_direction_min_lag96',
       'wind_direction_std_lag96', 'wind_speed_mean_lag96',
       'wind_speed_max_lag96', 'wind_speed_min_lag96', 'wind_speed_std_lag96',
       'dayofweek', 'hour', 'log_square_feet'],
      dtype='object', length=103)
(41697600, 104)


In [41]:
if train_analysis is None:
    train_analysis = pd.read_pickle('train_analysis')
    

In [42]:
l1 =  x_fold_pipes.transform(train).columns.tolist()
l2 =  test.columns.tolist()
print(list(x for x in l1 if x not in l2))
print(list(x for x in l2 if x not in l1))

['meter_reading_log1p']
[]


In [43]:
def predMeters(test_X):
    test_y = test_X[['meter']] 
    test_y['meter_reading_log1p'] = np.nan
    for i in range(4):
        X = test_X[test_X['meter'] == i]
        if X.shape[0] > 0:
            preds = np.expm1(sum([model.predict(X) for model in meter_models[i]])/folds)
            test_y.loc[test_y['meter'] == i, 'meter_reading_log1p'] = preds
    return test_y['meter_reading_log1p'].tolist()
    
print(predMeters(test.sample(n=20,  random_state=42)))    


[144.3108401792618, 8.92558157134522, 53.08056262129931, 2.5943590581062246, 1113.5791559244926, 9.456376486249113, 332.85401213722884, 1331.7775033925836, 319.6899788652901, 85.85424392114157, 52.207169869437024, 678.7977189279279, 1079.1010550351689, 37.35698238338328, 59.01154322758964, 64.81196445735424, 15.77630992909333, 81.1353922242256, 42.41763159817612, 166.31781826730048]


In [44]:
# Predict using cross val models ensemble 
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test.shape[0]/50000)))):
    res.append(predMeters(test.iloc[i:i+step_size]))
    i+=step_size
    gc.collect()


100%|████████████████████████████████████████████████████████████████████████████████| 834/834 [41:09<00:00,  4.14s/it]


In [45]:
# Save using cross val models ensemble 
res = np.concatenate(res)
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission_meter.csv.zip', index=False)
submission.shape

41697600


(41697600, 2)

In [46]:
# # Predict single model fit
# i=0
# res=[]
# step_size = 50000
# for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
#    #res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
#    res.append(np.expm1(gbm.predict(test_X.iloc[i:i+step_size])))
#    i+=step_size
    