In [1]:
# toggle to save space
mode = '_mean' if False else '_all'
print(mode)

_all


In [2]:
gbm_params = {
    'n_estimators' : 1500, # 500,  
    'max_depth' : -1,
    'learning_rate': 0.05,
    'bagging_fraction': 0.7, 
    
    'feature_fraction' : 0.9,
    'bagging_freq': 5,
    #'subsample' : 0.1,  # 
    #'subsample_freq' : 1,
    'num_leaves' : 31,
    'metric':'rmse',
    #'lambda_l1' : 1,  # Try defaults
    #'lambda_l2': 1, # Try defaults
    'verbose': 100
}

In [3]:
# TODO: add https://www.kaggle.com/corochann/optuna-tutorial-for-hyperparameter-optimization

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [5]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [6]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float32, 'year_built': np.float16, 'floor_count': np.float16},
}

def loadFile(name):
    for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
        if path.exists(dir_path + name + '.csv'):
            return  ConvertToDatetime().transform(
                pd.read_csv(dir_path + name + '.csv', dtype=file_dtype[name]))
        


In [7]:
building = loadFile('building_metadata')
pre_train = loadFile('train')
#test = loadFile('test')

In [8]:
def merge(x):
    weather_processed_df = pd.read_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')
    x = x.merge(building, on=['building_id'], how='left')
    gc.collect()
    x = x.merge(weather_processed_df, on=['site_id', 'timestamp'], how='left')
    gc.collect()
    x['site_id'] = x['site_id'].astype('int8');
    x['cloud_coverage'] = x['cloud_coverage'].astype('float16')
    gc.collect()
    return x

# todo see if this drop is rquired
# x = x.dropna(axis=0, subset=['air_temperature'])

train = merge(pre_train) 
print(train.dtypes)

building_id                             int16
meter                                    int8
timestamp                      datetime64[ns]
meter_reading                         float32
site_id                                  int8
primary_use                            object
square_feet                           float32
year_built                            float16
floor_count                           float16
air_temperature                       float16
dew_temperature                       float16
cloud_coverage                        float16
precip_depth_1_hr                     float16
wind_direction                        float16
wind_speed                            float16
sea_level_pressure                    float16
relative_humidity                     float16
air_temperature_rmean_3               float16
air_temperature_rmax_3                float16
air_temperature_rmin_3                float16
air_temperature_rstd_3                float16
cloud_coverage_rmean_3            

In [9]:
train.query('not (building_id == building_id & meter_reading == 0 & timestamp <= "2016-08-01")')

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,wind_direction_rmin_72,wind_direction_rstd_72,wind_speed_rmean_72,wind_speed_rmax_72,wind_speed_rmin_72,wind_speed_rstd_72,relative_humidity_rmean_72,relative_humidity_rmax_72,relative_humidity_rmin_72,relative_humidity_rstd_72
45,46,0,2016-01-01 00:00:00,53.239700,0,Retail,9045.0,2016.0,,19.406250,...,0.0,100.3125,0.683105,2.599609,0.000000,1.114258,90.500,100.0000,73.81250,10.726562
72,74,0,2016-01-01 00:00:00,43.001301,0,Parking,387638.0,1997.0,,19.406250,...,0.0,100.3125,0.683105,2.599609,0.000000,1.114258,90.500,100.0000,73.81250,10.726562
91,93,0,2016-01-01 00:00:00,52.420601,0,Office,33370.0,1982.0,,19.406250,...,0.0,100.3125,0.683105,2.599609,0.000000,1.114258,90.500,100.0000,73.81250,10.726562
103,105,0,2016-01-01 00:00:00,23.303600,1,Education,50623.0,,5.0,3.800781,...,240.0,,3.099609,3.099609,3.099609,,90.625,90.6250,90.62500,
104,106,0,2016-01-01 00:00:00,0.374600,1,Education,5374.0,,4.0,3.800781,...,240.0,,3.099609,3.099609,3.099609,,90.625,90.6250,90.62500,
106,107,0,2016-01-01 00:00:00,175.184006,1,Education,97532.0,2005.0,10.0,3.800781,...,240.0,,3.099609,3.099609,3.099609,,90.625,90.6250,90.62500,
107,108,0,2016-01-01 00:00:00,91.265297,1,Education,81580.0,1913.0,5.0,3.800781,...,240.0,,3.099609,3.099609,3.099609,,90.625,90.6250,90.62500,
108,109,0,2016-01-01 00:00:00,80.930000,1,Education,56995.0,1953.0,6.0,3.800781,...,240.0,,3.099609,3.099609,3.099609,,90.625,90.6250,90.62500,
110,110,0,2016-01-01 00:00:00,86.228302,1,Education,27814.0,2006.0,8.0,3.800781,...,240.0,,3.099609,3.099609,3.099609,,90.625,90.6250,90.62500,
111,111,0,2016-01-01 00:00:00,167.391998,1,Education,118338.0,1909.0,7.0,3.800781,...,240.0,,3.099609,3.099609,3.099609,,90.625,90.6250,90.62500,


In [10]:
# See holiday notebook to generate, this is optional
holiday_df = None
if path.exists('../input/ashrae-energy-prediction-pickles/holiday_df.pickle'):
    holiday_df = pd.read_pickle('../input/ashrae-energy-prediction-pickles/holiday_df.pickle')
if holiday_df is not None:
    print(holiday_df.sample(20))

          site_id           timestamp                               holiday
34211982        4 2017-12-25 22:00:00                         Christmas Day
52074718       13 2018-11-22 05:00:00                          Thanksgiving
980628          6 2016-01-18 19:00:00           Martin Luther King, Jr. Day
35961283        5 2018-03-17 09:00:00  St. Patrick's Day [Northern Ireland]
38737900        8 2017-02-20 13:00:00                 Washington's Birthday
43910634        9 2018-07-04 12:00:00                      Independence Day
15585694       11 2016-10-10 09:00:00                          Thanksgiving
46094726       10 2018-12-25 04:00:00                         Christmas Day
22657998        1 2017-05-01 08:00:00                               May Day
57066517       14 2018-11-11 16:00:00                          Veterans Day
34760530        4 2018-09-03 05:00:00                             Labor Day
37957470        7 2017-01-01 16:00:00                        New Year's Day
19913831    

In [11]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        # TODO: try week of year as numerical 
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('category')#.astype('uint8') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('category') #.astype('uint8')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self
    
print(AddTimeFeatures().transform(train.sample(20)))

          building_id  meter           timestamp  meter_reading  site_id  \
11274513          906      0 2016-07-25 11:00:00      51.000000        9   
2395144           195      0 2016-02-13 18:00:00       4.650000        2   
8303589           548      0 2016-06-02 16:00:00      33.049999        3   
1243597          1089      2 2016-01-23 13:00:00    1312.000000       13   
9408087           195      3 2016-06-22 07:00:00       0.000000        2   
6235             1156      2 2016-01-01 02:00:00   12890.599609       13   
15379493          532      0 2016-10-06 18:00:00      91.830002        3   
9736932           531      0 2016-06-28 03:00:00      11.240000        3   
14437355          865      0 2016-09-20 02:00:00      20.500000        8   
16257739          995      2 2016-10-22 07:00:00      64.050003        9   
8078621           901      2 2016-05-29 16:00:00     100.650002        9   
7463140           989      1 2016-05-18 17:00:00     273.610992        9   
2929645     

[20 rows x 83 columns]


In [12]:
class MeterReadingLog1p(TransformerMixin):
  
    def transform(self, df, **transform_params):
        if 'meter_reading' in df.columns:
            df['meter_reading_log1p'] = np.log1p(df['meter_reading'])
            df = df.drop('meter_reading', axis=1)
        return df
    
    def fit(self, X, y=None, **fit_params):
        return self
print(train.sample(20, random_state=42))
print(MeterReadingLog1p().transform(train.sample(20, random_state=42)))
gc.collect()

          building_id  meter           timestamp  meter_reading  site_id  \
14245562         1324      1 2016-09-16 16:00:00       0.000000       14   
1282718          1013      0 2016-01-24 06:00:00      32.000099       10   
13883790          229      1 2016-09-10 07:00:00     567.655029        2   
4781820           217      3 2016-04-01 01:00:00       0.000000        2   
10415393         1434      0 2016-07-10 04:00:00      65.750000       15   
1057008          1047      0 2016-01-20 04:00:00      90.983299       12   
4507399           911      1 2016-03-26 20:00:00     295.063995        9   
19478829         1039      0 2016-12-18 23:00:00      16.900000       12   
8955615           265      0 2016-06-14 06:00:00     128.369995        2   
13799839          896      0 2016-09-08 19:00:00     300.000000        9   
15647011          973      0 2016-10-11 11:00:00     247.000000        9   
2524294           813      0 2016-02-16 08:00:00      10.958300        8   
10016102    

[20 rows x 81 columns]
          building_id  meter           timestamp  site_id  \
14245562         1324      1 2016-09-16 16:00:00       14   
1282718          1013      0 2016-01-24 06:00:00       10   
13883790          229      1 2016-09-10 07:00:00        2   
4781820           217      3 2016-04-01 01:00:00        2   
10415393         1434      0 2016-07-10 04:00:00       15   
1057008          1047      0 2016-01-20 04:00:00       12   
4507399           911      1 2016-03-26 20:00:00        9   
19478829         1039      0 2016-12-18 23:00:00       12   
8955615           265      0 2016-06-14 06:00:00        2   
13799839          896      0 2016-09-08 19:00:00        9   
15647011          973      0 2016-10-11 11:00:00        9   
2524294           813      0 2016-02-16 08:00:00        8   
10016102          870      0 2016-07-03 02:00:00        8   
3915750           898      0 2016-03-15 03:00:00        9   
17217526          903      0 2016-11-08 09:00:00        9   
1

[20 rows x 81 columns]


113

In [13]:
class CreateMeterDescDF(TransformerMixin):

    def transform(self, df, **transform_params):
        global _building_meter_desc_DF
        if 'meter_reading_log1p' in df.columns:
            cols = ['mean']
            if mode == '_all':
                cols = ['mean','max','min','std']
            group = df.groupby(['building_id','meter'])['meter_reading_log1p']
            desc_DF = group.describe(percentiles=[.05, .25, .5, .75, .95 ])
            desc_DF = desc_DF.reset_index()
            col_dict = {}
            for col in desc_DF.columns:
                if col not in ['building_id', 'meter', 'count']:
                    col_dict[col] = 'meter_' + col
            _building_meter_desc_DF = desc_DF.rename(columns=col_dict).drop('count', axis=1)
            gc.collect()
        return df 
    def fit(self, X, y=None, **fit_params):
        return self

#if 'meter_mean' not in train.columns:
#    print(building_meter_desc_DF)
#    train = train.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    #test = test.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    del building_meter_desc_DF
CreateMeterDescDF().transform(
    AddTimeFeatures().transform(
        MeterReadingLog1p().transform(
            train.sample(2000, random_state=0)
        )
    )
)    
print(_building_meter_desc_DF)
gc.collect()

      building_id  meter  meter_mean  meter_std  meter_min  meter_5%  \
0               1      0    4.921724        NaN   4.921724  4.921724   
1               2      0    0.000000        NaN   0.000000  0.000000   
2               3      0    5.727568   0.128644   5.636602  5.645699   
3               5      0    0.000000        NaN   0.000000  0.000000   
4               6      0    0.000000        NaN   0.000000  0.000000   
5               8      0    5.930674        NaN   5.930674  5.930674   
6               9      1    5.300433   2.169901   3.766081  3.919516   
7              11      0    6.159797        NaN   6.159797  6.159797   
8              12      0    5.636602        NaN   5.636602  5.636602   
9              14      0    0.000000        NaN   0.000000  0.000000   
10             14      1    8.555042        NaN   8.555042  8.555042   
11             15      0    5.232920        NaN   5.232920  5.232920   
12             15      1    7.265242   0.810718   6.452293  6.53

[1349 rows x 11 columns]


35

In [14]:
class MergeMeterDescDF(TransformerMixin):
  
    def transform(self, df, **transform_params):
        # drop any columns to add
        dropCols =  [x for x in _building_meter_desc_DF.columns if x not in ['building_id', 'meter']]
        df = df.drop(dropCols, axis=1, errors='ignore') 
        return df.merge(_building_meter_desc_DF, on=['building_id','meter'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeMeterDescDF().transform(train.sample(2000, random_state=0)))

      building_id  meter           timestamp  meter_reading  site_id  \
0             774      1 2016-08-07 08:00:00      36.128899        6   
1             206      0 2016-10-04 14:00:00     226.270004        2   
2            1269      0 2016-11-29 10:00:00      28.670799       14   
3             951      0 2016-10-10 04:00:00     113.000000        9   
4             656      0 2016-05-01 21:00:00      32.700001        5   
5              36      0 2016-06-05 19:00:00     178.830994        0   
6            1262      0 2016-07-19 05:00:00      73.739998       14   
7              52      0 2016-03-17 14:00:00       0.000000        0   
8            1133      2 2016-07-10 17:00:00     984.375000       13   
9            1123      0 2016-10-02 09:00:00      18.243999       13   
10           1237      0 2016-10-06 18:00:00      85.000000       14   
11            960      1 2016-11-17 10:00:00      90.066498        9   
12             11      0 2016-09-25 21:00:00     472.332001     

[2000 rows x 90 columns]


In [15]:
# "As you can see above, this data looks weired until May 20. It is 
# reported in this discussion by @barnwellguy that All electricity
# meter is 0 until May 20 for site_id == 0. Let's remove these data 
# from training data."
# https://www.kaggle.com/kaushal2896/ashrae-eda-fe-lightgbm-1-13
class RmS0M0(TransformerMixin):
  
    def transform(self, df, **transform_params):
        return df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

    def fit(self, X, y=None, **fit_params):
        return self
    


In [16]:
# following this thread
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/113254#latest-663021
class RmBuilt2017(TransformerMixin):
# TO  
    def transform(self, df, **transform_params):
        # 954 leading zeros
        df = df.query('not (building_id == 954 & timestamp <= "2016-08-12")')
        df = df.query('not (building_id == 954 & timestamp <= "2016-10-12" & timestamp >= "2016-10-18")')
        
        return df

    def fit(self, X, y=None, **fit_params):
        return self
    

In [17]:
# remove big offenders in the error analysis reports
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/113254#latest-663021
class RmEAOffeners(TransformerMixin):
# TO  
    def transform(self, df, **transform_params):
        # 1072 new steam install
        df = df.query('not (building_id == 1072 & timestamp <= "2016-07-27")')
        
        # 783
        df = df.query('not (building_id == 783 & timestamp <= "2016-12-10"")')
        
        # 1264
        # not sure what to do with that one
        
        # 1021
        # drop zeros for hot water
        
        # 799
        df = df.query('not (building_id == 799 & meter == 0 & timestamp <= "2016-09-12")')
        
        # 693
        df = df.query('not (building_id == 693 & meter == 0 & timestamp <= "2016-07-11")')
        
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [18]:
# TODO: try rolling with power

In [19]:
    
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df['holiday'] = df['holiday'].astype('category')
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self
# Test 
if holiday_df is not None:
    print(holiday_df.columns)
    print(AddHolidays().transform(train.head(20))[['holiday','timestamp']])

Index(['site_id', 'timestamp', 'holiday'], dtype='object')
           holiday  timestamp
0   New Year's Day 2016-01-01
1   New Year's Day 2016-01-01
2   New Year's Day 2016-01-01
3   New Year's Day 2016-01-01
4   New Year's Day 2016-01-01
5   New Year's Day 2016-01-01
6   New Year's Day 2016-01-01
7   New Year's Day 2016-01-01
8   New Year's Day 2016-01-01
9   New Year's Day 2016-01-01
10  New Year's Day 2016-01-01
11  New Year's Day 2016-01-01
12  New Year's Day 2016-01-01
13  New Year's Day 2016-01-01
14  New Year's Day 2016-01-01
15  New Year's Day 2016-01-01
16  New Year's Day 2016-01-01
17  New Year's Day 2016-01-01
18  New Year's Day 2016-01-01
19  New Year's Day 2016-01-01


In [20]:
class RmHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df = df.drop(df[df['holiday'].notnull()].index)
            df = df.drop(['holiday'], axis=1)
            gc.collect()
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self

# Test you should see the new years removed
#print(train.head(100000).merge(building, on='building_id', how='left').columns)
print(RmHolidays().transform(train.head(100000)))

       building_id  meter           timestamp  meter_reading  site_id  \
55121            0      0 2016-01-02 00:00:00       0.000000        0   
55122            1      0 2016-01-02 00:00:00       0.000000        0   
55123            2      0 2016-01-02 00:00:00       0.000000        0   
55124            3      0 2016-01-02 00:00:00       0.000000        0   
55125            4      0 2016-01-02 00:00:00       0.000000        0   
55126            5      0 2016-01-02 00:00:00       0.000000        0   
55127            6      0 2016-01-02 00:00:00       0.000000        0   
55128            7      0 2016-01-02 00:00:00       0.000000        0   
55129            8      0 2016-01-02 00:00:00       0.000000        0   
55130            9      0 2016-01-02 00:00:00       0.000000        0   
55131           10      0 2016-01-02 00:00:00       0.000000        0   
55132           11      0 2016-01-02 00:00:00       0.000000        0   
55133           12      0 2016-01-02 00:00:00      

[41839 rows x 81 columns]


In [21]:
class SetCatTypes(TransformerMixin):
    
    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col]= df[col].astype('category')
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [22]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(building.head(20)['square_feet'])

0       7432.0
1       2720.0
2       5376.0
3      23685.0
4     116607.0
5       8000.0
6      27926.0
7     121074.0
8      60809.0
9      27000.0
10    370773.0
11     49073.0
12     37100.0
13     99380.0
14     86250.0
15     83957.0
16     54644.0
17     15250.0
18    111891.0
19     18717.0
Name: square_feet, dtype: float32


In [23]:
# TODO: Play with scaling cloud coverage

In [24]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [25]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','year_built']]
        year_built_median = tmp_df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        del tmp_df, year_built_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(ImputeYearBuilt().transform(train.sample(20))['building_age'])

19025495     70
18857178     70
17460127     60
2166386      70
10079254     70
918498       76
14423897     70
6277764      70
4427943      70
8819801      62
16634771     62
9540643      70
15808244    101
8714145      35
2187100      70
16268693     62
19310002     74
14297558     81
7853909      70
12928267     73
Name: building_age, dtype: uint8


In [26]:
class ImputeFloorCount(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','floor_count']]
        floors_median = tmp_df['floor_count'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['floor_count'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = i_median
            else:
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = floors_median
        del tmp_df, floors_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

print(ImputeFloorCount().transform(train.sample(20))['floor_count'])

16608454    3.0
13139304    3.0
8451209     3.0
7906448     3.0
8938119     3.0
4202679     3.0
19094575    3.0
10857924    3.0
8003319     3.0
1452571     5.0
12078489    3.0
10460628    3.0
3113119     3.0
15673459    3.0
4375816     3.0
6789998     3.0
1027798     3.0
1683816     3.0
3885572     3.0
8120568     2.0
Name: floor_count, dtype: float16


In [27]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [28]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [29]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [30]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [31]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [32]:
class MergeWeatherDescHDOW(TransformerMixin):
    def transform(self, df, **transform_params):
        desc_h_dow_df = pd.read_csv('weather_desc_h_dow.csv', index_col=0, dtype={'building_id': np.int16, 'meter': np.int8, 'meter_h_d_mean':np.float32,
                                                     'meter_h_d_std':np.float32, 'meter_h_d_min':np.float32,
                                                     'meter_h_d_25%':np.float32, 'meter_h_d_50%':np.float32,
                                                     'meter_h_d_75%':np.float32, 'meter_h_d_max':np.float32 })
        mrgCols = ['site_id', 'dayofweek','hour']
        dropCols = list(x for x in desc_h_dow_df.columns if x not in mrgCols)
        df.drop(mrgCols, errors='ignore')
        df = df.merge(desc_h_dow_df, on=mrgCols, how='left')
        
        del desc_h_dow_df
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeWeatherDescHDOW().transform(AddTimeFeatures().transform(ConvertToDatetime().transform(train.sample(20)))))

    building_id  meter           timestamp  meter_reading  site_id  \
0            73      0 2016-08-19 18:00:00      31.534300        0   
1           319      0 2016-06-23 13:00:00      18.450001        3   
2          1407      0 2016-09-13 14:00:00     107.775002       15   
3          1347      1 2016-12-17 11:00:00      30.305099       15   
4           869      0 2016-11-13 13:00:00    1812.670044        8   
5           910      0 2016-11-16 19:00:00      82.000000        9   
6           973      0 2016-04-30 03:00:00     124.000000        9   
7          1159      0 2016-12-03 11:00:00    1270.900024       13   
8          1085      1 2016-12-22 14:00:00       0.000000       13   
9          1220      1 2016-05-19 06:00:00      54.644798       13   
10          933      2 2016-10-21 07:00:00       0.000000        9   
11          229      1 2016-01-30 12:00:00     271.712006        2   
12         1344      0 2016-10-01 08:00:00     302.649994       15   
13          884     

[20 rows x 118 columns]


In [33]:
class MergeDescHDOW(TransformerMixin):
    def transform(self, df, **transform_params):
        desc_h_dow_df = pd.read_csv('desc_h_dow.csv', index_col=0, dtype={'building_id': np.int16, 'meter': np.int8, 'meter_h_d_mean':np.float32,
                                                     'meter_h_d_std':np.float32, 'meter_h_d_min':np.float32,
                                                     'meter_h_d_25%':np.float32, 'meter_h_d_50%':np.float32,
                                                     'meter_h_d_75%':np.float32, 'meter_h_d_max':np.float32 })
        mrgCols = ['building_id', 'meter', 'dayofweek','hour']
        dropCols = list(x for x in desc_h_dow_df.columns if x not in mrgCols)
        df.drop(mrgCols, errors='ignore')
        df = df.merge(desc_h_dow_df, on=mrgCols, how='left')
        
        del desc_h_dow_df
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeDescHDOW().transform(AddTimeFeatures().transform(ConvertToDatetime().transform(train.sample(20)))))

    building_id  meter           timestamp  meter_reading  site_id  \
0          1364      0 2016-10-03 00:00:00      38.375000       15   
1          1134      0 2016-12-16 13:00:00     201.472000       13   
2          1380      1 2016-10-29 09:00:00      28.527500       15   
3           196      0 2016-06-30 12:00:00      50.200001        2   
4          1200      0 2016-01-26 00:00:00     658.963013       13   
5          1312      3 2016-01-29 22:00:00     454.338989       14   
6           868      0 2016-10-07 08:00:00     120.833000        8   
7           296      0 2016-01-19 03:00:00      60.029999        3   
8           987      1 2016-07-26 20:00:00    1114.739990        9   
9          1367      0 2016-09-07 21:00:00     122.375000       15   
10         1135      0 2016-05-16 21:00:00     248.729004       13   
11          777      1 2016-01-31 07:00:00       0.000000        6   
12         1401      1 2016-12-01 17:00:00       8.974200       15   
13          213     

In [34]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def row_rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))


def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [35]:
pd.options.display.max_seq_items = 2000

In [36]:
%%time

x_pre_pipes = Pipeline(
    steps=[
        ('rmS0M0', RmS0M0()),
        ('rmBuilt2017', RmBuilt2017()),
        ('RmEAOffeners', RmEAOffeners()),
        ('meterReadingLog1p',MeterReadingLog1p()),
        #('RmBuilt2017', RmS0M0()),
        #('rmSite9July4th', RmSite9July4th())
        ('addTimeFeatures', AddTimeFeatures()),
        ('logSquareFeet', LogSquareFeet()),
        #('rmHolidays', RmHolidays()),
        #('addHolidays', AddHolidays()),
        #('createMeterDescDF', CreateMeterDescDF()), # note declares a globe variable to pass
        #('mergeMeterDescDF', MergeMeterDescDF()), # populates both test and train from global
        ('mergeDescHDOW', MergeDescHDOW()),
        #('mergeWeatherDescHDOW', MergeWeatherDescHDOW()),
        ('setCatTypes', SetCatTypes(['building_id', 'site_id', 'meter', 'primary_use'])),
        ('GC', GC())
    ]
)

train = x_pre_pipes.transform(merge(pre_train))
print(train.columns)

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'dew_temperature', 'cloud_coverage', 'precip_depth_1_hr',
       'wind_direction', 'wind_speed', 'sea_level_pressure',
       'relative_humidity', 'air_temperature_rmean_3',
       'air_temperature_rmax_3', 'air_temperature_rmin_3',
       'air_temperature_rstd_3', 'cloud_coverage_rmean_3',
       'cloud_coverage_rmax_3', 'cloud_coverage_rmin_3',
       'cloud_coverage_rstd_3', 'dew_temperature_rmean_3',
       'dew_temperature_rmax_3', 'dew_temperature_rmin_3',
       'dew_temperature_rstd_3', 'precip_depth_1_hr_rmean_3',
       'precip_depth_1_hr_rmax_3', 'precip_depth_1_hr_rmin_3',
       'precip_depth_1_hr_rstd_3', 'sea_level_pressure_rmean_3',
       'sea_level_pressure_rmax_3', 'sea_level_pressure_rmin_3',
       'sea_level_pressure_rstd_3', 'wind_direction_rmean_3',
       'wind_direction_rmax_3', 'wind_direction_rmin_3',
       'wind

In [37]:
# pre_a_pipes is for preprocessing that doesn't change impute
# values
x_fold_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        #('convertToDatetime', ConvertToDatetime()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('imputeFloorCount', ImputeFloorCount()),
        ('dropCols', DropCols(['timestamp','square_feet', 'year_built'])),
        ('GC', GC())
    ]
)

sample_train_X = x_fold_pipes.transform(train.sample(20))
print(sample_train_X.columns)
print(sample_train_X.dtypes)
print(sample_train_X)

Index(['building_id', 'meter', 'site_id', 'primary_use', 'floor_count',
       'air_temperature', 'dew_temperature', 'cloud_coverage',
       'precip_depth_1_hr', 'wind_direction', 'wind_speed',
       'sea_level_pressure', 'relative_humidity', 'air_temperature_rmean_3',
       'air_temperature_rmax_3', 'air_temperature_rmin_3',
       'air_temperature_rstd_3', 'cloud_coverage_rmean_3',
       'cloud_coverage_rmax_3', 'cloud_coverage_rmin_3',
       'cloud_coverage_rstd_3', 'dew_temperature_rmean_3',
       'dew_temperature_rmax_3', 'dew_temperature_rmin_3',
       'dew_temperature_rstd_3', 'precip_depth_1_hr_rmean_3',
       'precip_depth_1_hr_rmax_3', 'precip_depth_1_hr_rmin_3',
       'precip_depth_1_hr_rstd_3', 'sea_level_pressure_rmean_3',
       'sea_level_pressure_rmax_3', 'sea_level_pressure_rmin_3',
       'sea_level_pressure_rstd_3', 'wind_direction_rmean_3',
       'wind_direction_rmax_3', 'wind_direction_rmin_3',
       'wind_direction_rstd_3', 'wind_speed_rmean_3', 'wind_s

[20 rows x 89 columns]


In [38]:
# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
# trying no shuffle https://www.kaggle.com/c/ashrae-energy-prediction/discussion/115851#latest-666115
folds = 2
kf = StratifiedKFold(n_splits=folds, shuffle=False, random_state=42)


In [39]:
def cvTrainMeterEnsemble(train, gbm_params):
    meter_models = [ [], [], [], [] ]
    for train_index, val_index in kf.split(train, train['building_id']):
        f_train = x_fold_pipes.transform(train.loc[train_index])
        f_val = x_fold_pipes.transform(train.loc[val_index])
        for i in range(4):
            print(f'training meter {i}')
            f_train_m = f_train[f_train['meter'] == i]
            f_val_m = f_val[f_val['meter'] == i]
            gbm_params_m = gbm_params
            if i == 0:
                gbm_params_m['learning_rate']=0.01
            elif i == 1:
                gbm_params_m['learning_rate']=0.01
                gbm_params_m['bagging_fraction']=0.8
            elif i == 2:
                gbm_params_m['learning_rate']=0.03
                gbm_params_m['bagging_fraction']=0.8
            else:
                gbm_params_m['learning_rate']=0.02
                gbm_params_m['bagging_fraction']=0.9                
            gbm = LGBMRegressor(**gbm_params_m)
            gbm.fit(f_train_m.drop('meter_reading_log1p', axis=1), f_train_m['meter_reading_log1p'],
                eval_set=[(f_val_m.drop('meter_reading_log1p', axis=1), f_val_m['meter_reading_log1p'])],
                # https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114722#latest-660848
                # eval_metric=lbm_rmslee,
                early_stopping_rounds=20)
            meter_models[i].append(gbm)
            del f_train_m, f_val_m, gbm
            gc.collect()
        del f_train, f_val
        gc.collect()
    return meter_models

In [40]:
%%time
meter_models = cvTrainMeterEnsemble(train, gbm_params)

training meter 0
[1]	valid_0's rmse: 1.55802
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.54463
[3]	valid_0's rmse: 1.53143
[4]	valid_0's rmse: 1.5183
[5]	valid_0's rmse: 1.50538
[6]	valid_0's rmse: 1.49263
[7]	valid_0's rmse: 1.48001
[8]	valid_0's rmse: 1.46755
[9]	valid_0's rmse: 1.45521
[10]	valid_0's rmse: 1.44304
[11]	valid_0's rmse: 1.43101
[12]	valid_0's rmse: 1.41913
[13]	valid_0's rmse: 1.40732
[14]	valid_0's rmse: 1.39572
[15]	valid_0's rmse: 1.38427
[16]	valid_0's rmse: 1.37296
[17]	valid_0's rmse: 1.36185
[18]	valid_0's rmse: 1.35075
[19]	valid_0's rmse: 1.33985
[20]	valid_0's rmse: 1.32913
[21]	valid_0's rmse: 1.31846
[22]	valid_0's rmse: 1.30791
[23]	valid_0's rmse: 1.29746
[24]	valid_0's rmse: 1.28708
[25]	valid_0's rmse: 1.27695
[26]	valid_0's rmse: 1.26691
[27]	valid_0's rmse: 1.25702
[28]	valid_0's rmse: 1.24721
[29]	valid_0's rmse: 1.23754
[30]	valid_0's rmse: 1.22797
[31]	valid_0's rmse: 1.21854
[32]	valid_0's rmse: 1.20921
[33]

[269]	valid_0's rmse: 0.627966
[270]	valid_0's rmse: 0.627848
[271]	valid_0's rmse: 0.627744
[272]	valid_0's rmse: 0.627795
[273]	valid_0's rmse: 0.627766
[274]	valid_0's rmse: 0.627658
[275]	valid_0's rmse: 0.62763
[276]	valid_0's rmse: 0.627664
[277]	valid_0's rmse: 0.627768
[278]	valid_0's rmse: 0.627815
[279]	valid_0's rmse: 0.627713
[280]	valid_0's rmse: 0.627704
[281]	valid_0's rmse: 0.627791
[282]	valid_0's rmse: 0.62785
[283]	valid_0's rmse: 0.627957
[284]	valid_0's rmse: 0.627983
[285]	valid_0's rmse: 0.627958
[286]	valid_0's rmse: 0.628105
[287]	valid_0's rmse: 0.628166
[288]	valid_0's rmse: 0.628236
[289]	valid_0's rmse: 0.62828
[290]	valid_0's rmse: 0.628422
[291]	valid_0's rmse: 0.628502
[292]	valid_0's rmse: 0.628602
[293]	valid_0's rmse: 0.628587
[294]	valid_0's rmse: 0.628697
[295]	valid_0's rmse: 0.628742
Early stopping, best iteration is:
[275]	valid_0's rmse: 0.62763
training meter 1
[1]	valid_0's rmse: 2.55226
Training until validation scores don't improve for 20 ro

[246]	valid_0's rmse: 1.27882
[247]	valid_0's rmse: 1.27862
[248]	valid_0's rmse: 1.27825
[249]	valid_0's rmse: 1.27762
[250]	valid_0's rmse: 1.27745
[251]	valid_0's rmse: 1.27724
[252]	valid_0's rmse: 1.27694
[253]	valid_0's rmse: 1.27677
[254]	valid_0's rmse: 1.27614
[255]	valid_0's rmse: 1.27585
[256]	valid_0's rmse: 1.27562
[257]	valid_0's rmse: 1.27545
[258]	valid_0's rmse: 1.27549
[259]	valid_0's rmse: 1.27529
[260]	valid_0's rmse: 1.27519
[261]	valid_0's rmse: 1.27491
[262]	valid_0's rmse: 1.27459
[263]	valid_0's rmse: 1.27433
[264]	valid_0's rmse: 1.27408
[265]	valid_0's rmse: 1.2739
[266]	valid_0's rmse: 1.27386
[267]	valid_0's rmse: 1.27375
[268]	valid_0's rmse: 1.27365
[269]	valid_0's rmse: 1.27342
[270]	valid_0's rmse: 1.27312
[271]	valid_0's rmse: 1.27268
[272]	valid_0's rmse: 1.27254
[273]	valid_0's rmse: 1.27233
[274]	valid_0's rmse: 1.27218
[275]	valid_0's rmse: 1.27199
[276]	valid_0's rmse: 1.27187
[277]	valid_0's rmse: 1.27176
[278]	valid_0's rmse: 1.2716
[279]	valid_

[50]	valid_0's rmse: 1.7507
[51]	valid_0's rmse: 1.74362
[52]	valid_0's rmse: 1.73663
[53]	valid_0's rmse: 1.72996
[54]	valid_0's rmse: 1.72426
[55]	valid_0's rmse: 1.71846
[56]	valid_0's rmse: 1.71274
[57]	valid_0's rmse: 1.70763
[58]	valid_0's rmse: 1.70205
[59]	valid_0's rmse: 1.69711
[60]	valid_0's rmse: 1.69212
[61]	valid_0's rmse: 1.68765
[62]	valid_0's rmse: 1.68314
[63]	valid_0's rmse: 1.67862
[64]	valid_0's rmse: 1.67407
[65]	valid_0's rmse: 1.6703
[66]	valid_0's rmse: 1.66678
[67]	valid_0's rmse: 1.66289
[68]	valid_0's rmse: 1.65882
[69]	valid_0's rmse: 1.65546
[70]	valid_0's rmse: 1.65196
[71]	valid_0's rmse: 1.64861
[72]	valid_0's rmse: 1.64568
[73]	valid_0's rmse: 1.64244
[74]	valid_0's rmse: 1.63987
[75]	valid_0's rmse: 1.63708
[76]	valid_0's rmse: 1.63432
[77]	valid_0's rmse: 1.6322
[78]	valid_0's rmse: 1.6295
[79]	valid_0's rmse: 1.62726
[80]	valid_0's rmse: 1.62465
[81]	valid_0's rmse: 1.62249
[82]	valid_0's rmse: 1.62077
[83]	valid_0's rmse: 1.61841
[84]	valid_0's rms

[152]	valid_0's rmse: 0.726436
[153]	valid_0's rmse: 0.725273
[154]	valid_0's rmse: 0.72415
[155]	valid_0's rmse: 0.722937
[156]	valid_0's rmse: 0.721856
[157]	valid_0's rmse: 0.720832
[158]	valid_0's rmse: 0.719746
[159]	valid_0's rmse: 0.718666
[160]	valid_0's rmse: 0.717567
[161]	valid_0's rmse: 0.716597
[162]	valid_0's rmse: 0.715656
[163]	valid_0's rmse: 0.714726
[164]	valid_0's rmse: 0.713828
[165]	valid_0's rmse: 0.712939
[166]	valid_0's rmse: 0.712117
[167]	valid_0's rmse: 0.711297
[168]	valid_0's rmse: 0.710395
[169]	valid_0's rmse: 0.70955
[170]	valid_0's rmse: 0.708731
[171]	valid_0's rmse: 0.707937
[172]	valid_0's rmse: 0.707194
[173]	valid_0's rmse: 0.706348
[174]	valid_0's rmse: 0.705583
[175]	valid_0's rmse: 0.704838
[176]	valid_0's rmse: 0.70404
[177]	valid_0's rmse: 0.70333
[178]	valid_0's rmse: 0.702658
[179]	valid_0's rmse: 0.701982
[180]	valid_0's rmse: 0.701344
[181]	valid_0's rmse: 0.700616
[182]	valid_0's rmse: 0.70002
[183]	valid_0's rmse: 0.6994
[184]	valid_0's

[61]	valid_0's rmse: 1.80349
[62]	valid_0's rmse: 1.79586
[63]	valid_0's rmse: 1.78845
[64]	valid_0's rmse: 1.78093
[65]	valid_0's rmse: 1.77365
[66]	valid_0's rmse: 1.76645
[67]	valid_0's rmse: 1.75953
[68]	valid_0's rmse: 1.75226
[69]	valid_0's rmse: 1.74568
[70]	valid_0's rmse: 1.7391
[71]	valid_0's rmse: 1.73271
[72]	valid_0's rmse: 1.72611
[73]	valid_0's rmse: 1.7195
[74]	valid_0's rmse: 1.71324
[75]	valid_0's rmse: 1.70701
[76]	valid_0's rmse: 1.70048
[77]	valid_0's rmse: 1.69443
[78]	valid_0's rmse: 1.68854
[79]	valid_0's rmse: 1.68269
[80]	valid_0's rmse: 1.67698
[81]	valid_0's rmse: 1.6711
[82]	valid_0's rmse: 1.66541
[83]	valid_0's rmse: 1.65956
[84]	valid_0's rmse: 1.65401
[85]	valid_0's rmse: 1.64875
[86]	valid_0's rmse: 1.6434
[87]	valid_0's rmse: 1.63828
[88]	valid_0's rmse: 1.63324
[89]	valid_0's rmse: 1.62829
[90]	valid_0's rmse: 1.62331
[91]	valid_0's rmse: 1.61842
[92]	valid_0's rmse: 1.61354
[93]	valid_0's rmse: 1.60865
[94]	valid_0's rmse: 1.60412
[95]	valid_0's rms

[315]	valid_0's rmse: 1.34172
training meter 2
[1]	valid_0's rmse: 2.60939
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 2.55804
[3]	valid_0's rmse: 2.50823
[4]	valid_0's rmse: 2.46103
[5]	valid_0's rmse: 2.41542
[6]	valid_0's rmse: 2.37114
[7]	valid_0's rmse: 2.32846
[8]	valid_0's rmse: 2.28799
[9]	valid_0's rmse: 2.24914
[10]	valid_0's rmse: 2.21206
[11]	valid_0's rmse: 2.17695
[12]	valid_0's rmse: 2.14371
[13]	valid_0's rmse: 2.11117
[14]	valid_0's rmse: 2.07979
[15]	valid_0's rmse: 2.05058
[16]	valid_0's rmse: 2.02264
[17]	valid_0's rmse: 1.99632
[18]	valid_0's rmse: 1.97046
[19]	valid_0's rmse: 1.94772
[20]	valid_0's rmse: 1.92365
[21]	valid_0's rmse: 1.90184
[22]	valid_0's rmse: 1.87997
[23]	valid_0's rmse: 1.85984
[24]	valid_0's rmse: 1.84078
[25]	valid_0's rmse: 1.8229
[26]	valid_0's rmse: 1.80522
[27]	valid_0's rmse: 1.78893
[28]	valid_0's rmse: 1.77303
[29]	valid_0's rmse: 1.75834
[30]	valid_0's rmse: 1.74406
[31]	valid_0's rmse: 1.73025
[32

[138]	valid_0's rmse: 1.6046
[139]	valid_0's rmse: 1.60436
[140]	valid_0's rmse: 1.60449
[141]	valid_0's rmse: 1.60429
[142]	valid_0's rmse: 1.6039
[143]	valid_0's rmse: 1.60358
[144]	valid_0's rmse: 1.60346
[145]	valid_0's rmse: 1.60332
[146]	valid_0's rmse: 1.60287
[147]	valid_0's rmse: 1.60236
[148]	valid_0's rmse: 1.60233
[149]	valid_0's rmse: 1.60194
[150]	valid_0's rmse: 1.60167
[151]	valid_0's rmse: 1.60144
[152]	valid_0's rmse: 1.60126
[153]	valid_0's rmse: 1.60149
[154]	valid_0's rmse: 1.60143
[155]	valid_0's rmse: 1.6012
[156]	valid_0's rmse: 1.60097
[157]	valid_0's rmse: 1.60088
[158]	valid_0's rmse: 1.60073
[159]	valid_0's rmse: 1.60068
[160]	valid_0's rmse: 1.60061
[161]	valid_0's rmse: 1.60041
[162]	valid_0's rmse: 1.60045
[163]	valid_0's rmse: 1.60047
[164]	valid_0's rmse: 1.60059
[165]	valid_0's rmse: 1.60054
[166]	valid_0's rmse: 1.60059
[167]	valid_0's rmse: 1.60069
[168]	valid_0's rmse: 1.60065
[169]	valid_0's rmse: 1.60054
[170]	valid_0's rmse: 1.60047
[171]	valid_0

In [41]:
rmsl_df =  None
for i in range(4):
    m_list = []
    for model in meter_models[i]:
        m_list.append(model.best_score_['valid_0']['rmse'])
    if rmsl_df is None:
        rmsl_df = df = pd.DataFrame(m_list, columns=[i])
    else:
        rmsl_df[i] = m_list

rmsl_df = rmsl_df.describe().transpose()
print(rmsl_df)
rmsl_df['count'] = train.groupby('meter')['meter'].count()
print(rmsl_df['count'])
rmsl_df['m_x_c'] = rmsl_df['mean'] * rmsl_df['count']
print(rmsl_df['m_x_c'].sum()/rmsl_df['count'].sum())


   count      mean       std       min       25%       50%       75%       max
0    2.0  0.652375  0.034995  0.627630  0.640003  0.652375  0.664748  0.677120
1    2.0  1.304261  0.052971  1.266805  1.285533  1.304261  1.322989  1.341718
2    2.0  1.525384  0.037294  1.499013  1.512198  1.525384  1.538569  1.551755
3    2.0  1.588363  0.016341  1.576809  1.582586  1.588363  1.594141  1.599918
0    11698593
1     4177064
2     2702088
3     1264037
Name: count, dtype: int64
0.968125193545988


In [42]:
def createFeature_DF(model):
    imprtc_df = pd.DataFrame()
    imprtc_df['feature'] = sample_train_X.drop('meter_reading_log1p', axis=1).columns   
    imprtc_df['importance'] = model.feature_importances_
    imprtc_df.sort_values('importance', ascending=False, inplace= True)
    imprtc_df.sort_values('importance', ascending=False, inplace= True)
    return imprtc_df

In [43]:
features_df = None
for i in range(4):
    print('meter: '+ str(i))
    for model in meter_models[i]:
        if features_df is None:
            features_df = createFeature_DF(model)
        else:
            features_df = features_df.append(createFeature_DF(model))

features_df = features_df.groupby('feature').sum().sort_values('importance', ascending=False).reset_index()
features_df['feature'] = features_df['feature'].str.pad(features_df['feature'].str.len().max(), side ='right') 

features_df.to_csv("featuress_multi_meter_light_GBM.csv")
print(features_df)
print()            

meter: 0
meter: 1
meter: 2
meter: 3
                        feature  importance
0   building_id                       20784
1   meter_h_d_mean                     6631
2   air_temperature_rmean_72           2925
3   air_temperature_rmin_72            2332
4   meter_h_d_75%                      2029
5   meter_h_d_50%                      1867
6   air_temperature_rmax_72            1548
7   meter_h_d_25%                      1237
8   meter_h_d_std                      1103
9   air_temperature_rmin_3             1091
10  dew_temperature_rmin_72             723
11  dew_temperature_rmean_72            664
12  air_temperature_rmean_3             656
13  air_temperature_rmax_3              627
14  dew_temperature_rmax_72             619
15  sea_level_pressure_rmax_72          602
16  precip_depth_1_hr_rstd_3            562
17  relative_humidity_rmin_72           524
18  site_id                             504
19  cloud_coverage_rstd_72              499
20  wind_direction_rmax_72              

In [44]:
raise("stop")

TypeError: exceptions must derive from BaseException

In [None]:
# %%time
# ## Single fit single model

# gbm = LGBMRegressor(**gbm_params)
# f_train_X, f_train_y = getInFoldXY(train.index)
# gbm.fit(f_train_X, f_train_y)

In [50]:
train_analysis = x_fold_pipes.transform(train)
train_preds = predictInBatchs(train_analysis.drop('meter_reading_log1p', axis=1))


100%|████████████████████████████████████████████████████████████████████████████████| 397/397 [10:02<00:00,  1.70s/it]


In [51]:
train_analysis['prediction'] = train_preds
train_analysis['meter_reading']=np.expm1(train_analysis['meter_reading_log1p'])
train_analysis['rmsle']=((np.log(train_analysis['prediction'] + 1) - np.log(train_analysis['meter_reading'] + 1))**2)**0.5
train_analysis = train_analysis[['building_id','site_id','meter_reading_log1p','meter_reading','prediction','rmsle']]
print(train_analysis[['building_id','site_id','meter_reading_log1p','meter_reading','prediction','rmsle']].head())


  building_id site_id  meter_reading_log1p  meter_reading  prediction  \
0         105       1             3.190624      23.303600   57.941697   
1         106       1             0.318163       0.374600    1.444133   
2         106       1             0.000000       0.000000    2.406300   
3         107       1             5.171529     175.183990   62.766757   
4         108       1             4.524668      91.265312  196.499835   

      rmsle  
0  0.885924  
1  0.575528  
2  1.225627  
3  1.016297  
4  0.761070  


In [52]:
train_analysis = x_fold_pipes.transform(train)
train_analysis['prediction'] = train_preds
train_analysis['meter_reading']=np.expm1(train_analysis['meter_reading_log1p'])
train_analysis['rmsle']=((np.log(train_analysis['prediction'] + 1) - np.log(train_analysis['meter_reading'] + 1))**2)**0.5
ta_group = train_analysis.groupby(['building_id','hour']).mean().reset_index()
print(ta_group.sort_values('rmsle', ascending=False)[['building_id','rmsle','hour']])

      building_id     rmsle  hour
25747        1072  2.766430    19
25746        1072  2.764117    18
25745        1072  2.757586    17
25748        1072  2.753591    20
25744        1072  2.734736    16
25743        1072  2.729428    15
25742        1072  2.721333    14
25749        1072  2.705420    21
25741        1072  2.702479    13
25736        1072  2.696437     8
25735        1072  2.688241     7
25737        1072  2.686710     9
25740        1072  2.683943    12
25750        1072  2.681270    22
25738        1072  2.680252    10
25730        1072  2.675272     2
25739        1072  2.674987    11
25731        1072  2.674659     3
25729        1072  2.669877     1
25751        1072  2.669573    23
25732        1072  2.668254     4
25728        1072  2.663212     0
25734        1072  2.656963     6
25733        1072  2.655483     5
18797         783  2.517353     5
18798         783  2.511741     6
18796         783  2.503926     4
18799         783  2.501646     7
18793         

In [53]:
ta_group = train_analysis.groupby(['site_id']).mean().reset_index()
print(ta_group.sort_values('rmsle', ascending=False)[['site_id','rmsle']])

   site_id     rmsle
7        7  0.822101
14      14  0.782069
10      10  0.586665
6        6  0.556656
9        9  0.511709
11      11  0.478608
13      13  0.476417
1        1  0.418914
2        2  0.410841
0        0  0.385558
5        5  0.366809
15      15  0.311178
8        8  0.307730
3        3  0.272928
12      12  0.230423
4        4  0.145585


In [54]:
if train_analysis is None:
    train_analysis = pd.read_pickle('train_analysis')
    

In [55]:
ta_group = train_analysis.groupby(['building_id']).mean().reset_index()
print(ta_group.sort_values('rmsle', ascending=False)[['building_id','rmsle']])

     building_id     rmsle
1072        1072  2.698344
783          783  2.409086
1264        1264  2.195937
693          693  2.186572
1021        1021  2.185602
1303        1303  1.938516
1116        1116  1.783975
1232        1232  1.763465
1099        1099  1.743051
499          499  1.579405
258          258  1.569772
803          803  1.546004
1195        1195  1.525615
107          107  1.505213
59            59  1.449430
112          112  1.438498
802          802  1.401884
1227        1227  1.378810
1241        1241  1.377834
1319        1319  1.373465
681          681  1.359928
1324        1324  1.343551
1272        1272  1.340150
53            53  1.316453
260          260  1.310332
331          331  1.290664
1113        1113  1.286943
98            98  1.271574
1017        1017  1.270920
60            60  1.268701
...          ...       ...
1365        1365  0.091857
1023        1023  0.090568
1441        1441  0.090300
1439        1439  0.087190
585          585  0.086912
2

In [56]:
#del train
#gc.collect()
test = loadFile('test')
print(test.shape[0])
test = merge(test)
print(test.shape[0])
test = x_pre_pipes.transform(test)
test = x_fold_pipes.transform(test)
test = test.drop('row_id', axis=1)


#print(test.sample(n=20,  random_state=42))
print(test.shape)
#print(test.dtypes)

41697600
41697600
(41697600, 88)


In [46]:
l1 =  x_fold_pipes.transform(train).columns.tolist()
l2 =  test.columns.tolist()
print(list(x for x in l1 if x not in l2))
print(list(x for x in l2 if x not in l1))

['meter_reading_log1p']
[]


In [47]:
def predMeters(test_X):
    test_y = test_X[['meter']] 
    test_y['meter_reading_log1p'] = np.nan
    for i in range(4):
        X = test_X[test_X['meter'] == i]
        if X.shape[0] > 0:
            preds = np.expm1(sum([model.predict(X) for model in meter_models[i]])/folds)
            test_y.loc[test_y['meter'] == i, 'meter_reading_log1p'] = preds
    return test_y['meter_reading_log1p'].tolist()
    
print(predMeters(test.sample(n=20,  random_state=42)))    


[114.47830088036902, 10.894618295414437, 56.49506749327657, 2.1862674503339194, 1140.1085138102048, 9.148837340514952, 295.9267338276026, 1107.1573550540088, 240.5837945317612, 98.9806482753674, 46.40604574454975, 742.0109742603246, 737.1570933056887, 41.96606830295442, 61.28461844498004, 75.5126934223981, 16.42440775700949, 67.0348285059786, 60.50943713748668, 160.38779717405626]


In [48]:
# Predict using cross val models ensemble 
def predictInBatchs(x):
    i=0
    res=[]
    step_size = 50000
    for j in tqdm(range(int(np.ceil(x.shape[0]/50000)))):
        res.append(predMeters(x.iloc[i:i+step_size]))
        i+=step_size
        gc.collect()
    return np.concatenate(res)

res = predictInBatchs(test)

100%|████████████████████████████████████████████████████████████████████████████████| 834/834 [20:09<00:00,  2.16s/it]


In [49]:
# Save using cross val models ensemble 
print(test.shape[0])
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
print(submission.shape[0])
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission_meter.csv.zip', index=False)
submission.shape

41697600
41697600
41697600


(41697600, 2)

In [None]:
# # Predict single model fit
# i=0
# res=[]
# step_size = 50000
# for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
#    #res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
#    res.append(np.expm1(gbm.predict(test_X.iloc[i:i+step_size])))
#    i+=step_size
    