In [1]:
# toggle to save space
mode = '_mean' if True else '_all'
print(mode)

_mean


In [2]:
gbm_params = {
    'n_estimators' : 10, # 500,  
    'max_depth' : 3,
    'learning_rate': 0.9,
    'bagging_fraction': 0.8, # TODO: try 0.9
    
    'feature_fraction' : 0.9,
    'bagging_freq': 5,
    'subsample' : 0.1,  # 
    'subsample_freq' : 1,
    'num_leaves' : 20,
    'metric':'rmse',
    #'lambda_l1' : 1,  # Try defaults
    #'lambda_l2': 1, # Try defaults
    'verbose': 100
}

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [4]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [5]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float32, 'year_built': np.float16, 'floor_count': np.float16},
}

def loadFile(name):
    for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
        if path.exists(dir_path + name + '.csv'):
            return  ConvertToDatetime().transform(
                pd.read_csv(dir_path + name + '.csv', dtype=file_dtype[name]))
        


In [6]:
building = loadFile('building_metadata')
pre_train = loadFile('train')
#test = loadFile('test')

weather_processed_df = pd.read_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')

In [7]:
def merge(x):
    weather_processed_df = pd.read_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')
    x = x.merge(building, on=['building_id'], how='left')
    gc.collect()
    x = x.merge(weather_processed_df, on=['site_id', 'timestamp'], how='left')
    gc.collect()
    x['site_id'] = x['site_id'].astype('int8');
    x['cloud_coverage'] = x['cloud_coverage'].astype('float16')
    gc.collect()
    return x
        
train = merge(pre_train)  
print(train)
print('!!!! Warning we are missing weather for '+ str(train['air_temperature'].isnull().sum())+' rows')
train = train.dropna(axis=0, subset=['air_temperature'])


          building_id  meter           timestamp  meter_reading  site_id  \
0                   0      0 2016-01-01 00:00:00       0.000000        0   
1                   1      0 2016-01-01 00:00:00       0.000000        0   
2                   2      0 2016-01-01 00:00:00       0.000000        0   
3                   3      0 2016-01-01 00:00:00       0.000000        0   
4                   4      0 2016-01-01 00:00:00       0.000000        0   
5                   5      0 2016-01-01 00:00:00       0.000000        0   
6                   6      0 2016-01-01 00:00:00       0.000000        0   
7                   7      0 2016-01-01 00:00:00       0.000000        0   
8                   8      0 2016-01-01 00:00:00       0.000000        0   
9                   9      0 2016-01-01 00:00:00       0.000000        0   
10                 10      0 2016-01-01 00:00:00       0.000000        0   
11                 11      0 2016-01-01 00:00:00       0.000000        0   
12          

[20216100 rows x 31 columns]


In [8]:
# See holiday notebook to generate, this is optional
holiday_df = None
if path.exists('../input/ashrae-energy-prediction-pickles/holiday_df.pickle'):
    holiday_df = pd.read_pickle('../input/ashrae-energy-prediction-pickles/holiday_df.pickle')
if holiday_df is not None:
    print(holiday_df.sample(20))

          site_id           timestamp                      holiday
10080255       15 2016-07-04 05:00:00             Independence Day
8103409         1 2016-05-30 03:00:00          Spring Bank Holiday
19841510        8 2016-12-25 09:00:00                Christmas Day
34332830        4 2018-02-19 08:00:00        Washington's Birthday
24847265        2 2017-07-04 20:00:00             Independence Day
37351484        6 2018-02-19 05:00:00        Washington's Birthday
28275961        2 2018-11-11 04:00:00                 Veterans Day
942134         13 2016-01-18 02:00:00  Martin Luther King, Jr. Day
46055976       10 2018-11-22 08:00:00                 Thanksgiving
57140533       14 2018-11-22 09:00:00                 Thanksgiving
26057           5 2016-01-01 11:00:00               New Year's Day
21186309        0 2017-11-10 09:00:00      Veterans Day (Observed)
46288828       11 2018-08-06 13:00:00                Civic Holiday
46134436       11 2017-04-14 08:00:00                  Good Fr

In [9]:
class MeterReadingLog1p(TransformerMixin):
  
    def transform(self, df, **transform_params):
        if 'meter_reading' in df.columns:
            df['meter_reading_log1p'] = np.log1p(df['meter_reading'])
            df = df.drop('meter_reading', axis=1)
        return df
    
    def fit(self, X, y=None, **fit_params):
        return self
print(train.sample(20, random_state=42))
print(MeterReadingLog1p().transform(train.sample(20, random_state=42)))
gc.collect()

          building_id  meter           timestamp  meter_reading  site_id  \
19993926         1389      1 2016-12-28 01:00:00      18.774500       15   
6530527          1240      3 2016-05-02 03:00:00       0.000000       14   
6484853           464      0 2016-05-01 08:00:00      13.480000        3   
18072271          656      0 2016-11-23 15:00:00      11.100000        5   
15944342          322      0 2016-10-16 18:00:00      24.889999        3   
4563775           643      0 2016-03-27 23:00:00      72.724197        4   
18118279         1278      0 2016-11-24 10:00:00     113.069000       14   
2121957           575      0 2016-02-08 11:00:00      69.300003        4   
18890113          890      2 2016-12-08 14:00:00     631.349976        9   
14306902         1390      2 2016-09-17 18:00:00     134.160004       15   
5056620           702      0 2016-04-05 22:00:00       9.500000        5   
6530049           991      2 2016-05-02 03:00:00       0.000000        9   
5177920     

[20 rows x 31 columns]
          building_id  meter           timestamp  site_id  \
19993926         1389      1 2016-12-28 01:00:00       15   
6530527          1240      3 2016-05-02 03:00:00       14   
6484853           464      0 2016-05-01 08:00:00        3   
18072271          656      0 2016-11-23 15:00:00        5   
15944342          322      0 2016-10-16 18:00:00        3   
4563775           643      0 2016-03-27 23:00:00        4   
18118279         1278      0 2016-11-24 10:00:00       14   
2121957           575      0 2016-02-08 11:00:00        4   
18890113          890      2 2016-12-08 14:00:00        9   
14306902         1390      2 2016-09-17 18:00:00       15   
5056620           702      0 2016-04-05 22:00:00        5   
6530049           991      2 2016-05-02 03:00:00        9   
5177920           224      0 2016-04-08 02:00:00        2   
19974616         1247      0 2016-12-27 17:00:00       14   
9989434            43      1 2016-07-02 15:00:00        0   
1

[20 rows x 31 columns]


31

In [10]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        # TODO: try week of year as numerical 
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('uint8') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = (df['timestamp'].dt.dayofweek * 24) + df['timestamp'].dt.hour
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('uint8')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [11]:
class CreateMeterDDescDF(TransformerMixin):

    def transform(self, df, **transform_params):
        global _m_dow_desc_DF
        if 'meter_reading_log1p' in df.columns:
            group = df[['building_id', 'meter', 'dayofweek', 'meter_reading_log1p']].groupby(['building_id','meter','dayofweek'])['meter_reading_log1p']
            desc_DF = group.describe()
            desc_DF = desc_DF.reset_index()
            col_dict = {}
            for col in desc_DF.columns:
                if col not in ['building_id', 'meter', 'count','dayofweek']:
                    col_dict[col] = 'meter_d_' + col
            _m_dow_desc_DF = desc_DF.rename(columns=col_dict).drop('count', axis=1)
            gc.collect()
        return df 
    def fit(self, X, y=None, **fit_params):
        return self

#if 'meter_mean' not in train.columns:
#    print(building_meter_desc_DF)
#    train = train.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    #test = test.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    del building_meter_desc_DF
CreateMeterDDescDF().transform(
    AddTimeFeatures().transform(
        MeterReadingLog1p().transform(
            train#.sample(2000, random_state=0)
        )
    )
)  
print(_m_dow_desc_DF)

       building_id  meter dayofweek  meter_h_d_mean  meter_h_d_std  \
0                0      0         0        3.346189       2.654752   
1                0      0         1        3.359983       2.663385   
2                0      0         2        3.375220       2.672971   
3                0      0         3        3.351040       2.658930   
4                0      0         4        3.319596       2.666740   
5                0      0         5        3.385256       2.640371   
6                0      0         6        3.356991       2.650592   
7                1      0         0        2.987316       2.316237   
8                1      0         1        2.993209       2.296804   
9                1      0         2        2.999730       2.304983   
10               1      0         3        2.993358       2.304681   
11               1      0         4        2.941351       2.296175   
12               1      0         5        3.004999       2.289748   
13               1  

[16660 rows x 10 columns]


In [12]:
class MergeMeterDDescDF(TransformerMixin):
  
    def transform(self, df, **transform_params):
        # drop any columns to add
        dropCols =  [x for x in _m_dow_desc_DF.columns if x not in ['building_id', 'meter', 'dayofweek','hour']]
        df = df.drop(dropCols, axis=1, errors='ignore') 
        return df.merge(_m_dow_desc_DF, on=['building_id','meter'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeMeterDDescDF().transform(train.sample(2000, random_state=0)))

       building_id  meter           timestamp  meter_reading  site_id  \
0              683      0 2016-08-16 06:00:00       1.700000        5   
1              683      0 2016-08-16 06:00:00       1.700000        5   
2              683      0 2016-08-16 06:00:00       1.700000        5   
3              683      0 2016-08-16 06:00:00       1.700000        5   
4              683      0 2016-08-16 06:00:00       1.700000        5   
5              683      0 2016-08-16 06:00:00       1.700000        5   
6              683      0 2016-08-16 06:00:00       1.700000        5   
7             1265      0 2016-02-17 20:00:00     142.848007       14   
8             1265      0 2016-02-17 20:00:00     142.848007       14   
9             1265      0 2016-02-17 20:00:00     142.848007       14   
10            1265      0 2016-02-17 20:00:00     142.848007       14   
11            1265      0 2016-02-17 20:00:00     142.848007       14   
12            1265      0 2016-02-17 20:00:00     1

[14000 rows x 40 columns]


In [13]:
class CreateMeterDescDF(TransformerMixin):

    def transform(self, df, **transform_params):
        global _building_meter_desc_DF
        if 'meter_reading_log1p' in df.columns:
            cols = ['mean']
            if mode == '_all':
                cols = ['mean','max','min','std']
            group = df.groupby(['building_id','meter'])['meter_reading_log1p']
            desc_DF = group.describe(percentiles=[.05, .25, .5, .75, .95 ])
            desc_DF = desc_DF.reset_index()
            col_dict = {}
            for col in desc_DF.columns:
                if col not in ['building_id', 'meter', 'count']:
                    col_dict[col] = 'meter_' + col
            _building_meter_desc_DF = desc_DF.rename(columns=col_dict).drop('count', axis=1)
            gc.collect()
        return df 
    def fit(self, X, y=None, **fit_params):
        return self

#if 'meter_mean' not in train.columns:
#    print(building_meter_desc_DF)
#    train = train.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    #test = test.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    del building_meter_desc_DF
CreateMeterDescDF().transform(
    AddTimeFeatures().transform(
        MeterReadingLog1p().transform(
            train.sample(2000, random_state=0)
        )
    )
)    
print(_building_meter_desc_DF)
gc.collect()

      building_id  meter  meter_mean  meter_std  meter_min  meter_5%  \
0               1      0    4.911728        NaN   4.911728  4.911728   
1               2      0    1.907095        NaN   1.907095  1.907095   
2               3      0    2.450559   3.465614   0.000000  0.245056   
3               4      0    0.000000        NaN   0.000000  0.000000   
4               5      0    0.000000        NaN   0.000000  0.000000   
5               6      0    4.392713        NaN   4.392713  4.392713   
6               7      1    8.461376        NaN   8.461376  8.461376   
7               9      0    4.949193        NaN   4.949193  4.949193   
8              10      0    3.504745   4.956458   0.000000  0.350475   
9              12      0    5.029588        NaN   5.029588  5.029588   
10             13      0    5.850474        NaN   5.850474  5.850474   
11             14      0    3.081338   4.357670   0.000000  0.308134   
12             16      0    4.905344   4.251092   0.000000  0.71

[1366 rows x 11 columns]


28

In [14]:
class MergeMeterDescDF(TransformerMixin):
  
    def transform(self, df, **transform_params):
        # drop any columns to add
        dropCols =  [x for x in _building_meter_desc_DF.columns if x not in ['building_id', 'meter']]
        df = df.drop(dropCols, axis=1, errors='ignore') 
        return df.merge(_building_meter_desc_DF, on=['building_id','meter'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeMeterDescDF().transform(train.sample(2000, random_state=0)))

      building_id  meter           timestamp  meter_reading  site_id  \
0             683      0 2016-08-16 06:00:00       1.700000        5   
1            1265      0 2016-02-17 20:00:00     142.848007       14   
2            1030      1 2016-11-24 18:00:00       0.000000       11   
3            1240      3 2016-12-25 11:00:00     413.479004       14   
4             857      0 2016-07-12 18:00:00       0.000000        8   
5              18      0 2016-08-16 21:00:00    1876.219971        0   
6             712      0 2016-07-17 01:00:00       1.900000        5   
7             416      0 2016-04-10 06:00:00      11.250000        3   
8             715      0 2016-08-17 23:00:00       4.500000        5   
9             183      1 2016-05-19 06:00:00     289.365997        2   
10            204      0 2016-09-10 01:00:00      34.680000        2   
11            249      1 2016-11-05 04:00:00     605.284973        2   
12            633      0 2016-05-13 18:00:00      27.610001     

[2000 rows x 41 columns]


In [15]:
# "As you can see above, this data looks weired until May 20. It is 
# reported in this discussion by @barnwellguy that All electricity
# meter is 0 until May 20 for site_id == 0. Let's remove these data 
# from training data."
# https://www.kaggle.com/kaushal2896/ashrae-eda-fe-lightgbm-1-13
class RmS0M0(TransformerMixin):
  
    def transform(self, df, **transform_params):
        return df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

    def fit(self, X, y=None, **fit_params):
        return self
    


In [16]:
# TODO: write filter to remove any 0 meter reading that continue more than N days (try 3)
# Also we need to account for this by meter

In [17]:
# TODO: try rolling with power

In [18]:
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            if 'holiday' in df.columns:
                df = df.drop('holiday', axis=1)
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df['holiday'] = df['holiday'].astype('category')
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self
# Test 
if holiday_df is not None:
    print(holiday_df.columns)
    print(AddHolidays().transform(train.head(20))[['holiday','timestamp']])

Index(['site_id', 'timestamp', 'holiday'], dtype='object')
           holiday  timestamp
0   New Year's Day 2016-01-01
1   New Year's Day 2016-01-01
2   New Year's Day 2016-01-01
3   New Year's Day 2016-01-01
4   New Year's Day 2016-01-01
5   New Year's Day 2016-01-01
6   New Year's Day 2016-01-01
7   New Year's Day 2016-01-01
8   New Year's Day 2016-01-01
9   New Year's Day 2016-01-01
10  New Year's Day 2016-01-01
11  New Year's Day 2016-01-01
12  New Year's Day 2016-01-01
13  New Year's Day 2016-01-01
14  New Year's Day 2016-01-01
15  New Year's Day 2016-01-01
16  New Year's Day 2016-01-01
17  New Year's Day 2016-01-01
18  New Year's Day 2016-01-01
19  New Year's Day 2016-01-01


In [19]:
class RmHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df = df.drop(df[df['holiday'].notnull()].index)
            df = df.drop(['holiday'], axis=1)
            gc.collect()
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self

# Test you should see the new years removed
#print(train.head(100000).merge(building, on='building_id', how='left').columns)
print(RmHolidays().transform(train.head(100000)))

       building_id  meter           timestamp  meter_reading  site_id  \
52897            0      0 2016-01-02 00:00:00       0.000000        0   
52898            1      0 2016-01-02 00:00:00       0.000000        0   
52899            2      0 2016-01-02 00:00:00       0.000000        0   
52900            3      0 2016-01-02 00:00:00       0.000000        0   
52901            4      0 2016-01-02 00:00:00       0.000000        0   
52902            5      0 2016-01-02 00:00:00       0.000000        0   
52903            6      0 2016-01-02 00:00:00       0.000000        0   
52904            7      0 2016-01-02 00:00:00       0.000000        0   
52905            8      0 2016-01-02 00:00:00       0.000000        0   
52906            9      0 2016-01-02 00:00:00       0.000000        0   
52907           10      0 2016-01-02 00:00:00       0.000000        0   
52908           11      0 2016-01-02 00:00:00       0.000000        0   
52909           12      0 2016-01-02 00:00:00      

[43848 rows x 32 columns]


In [20]:
class SetCatTypes(TransformerMixin):
    
    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col]= df[col].astype('category')
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [21]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [22]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(building.head(20)['square_feet'])

0       7432.0
1       2720.0
2       5376.0
3      23685.0
4     116607.0
5       8000.0
6      27926.0
7     121074.0
8      60809.0
9      27000.0
10    370773.0
11     49073.0
12     37100.0
13     99380.0
14     86250.0
15     83957.0
16     54644.0
17     15250.0
18    111891.0
19     18717.0
Name: square_feet, dtype: float32


In [23]:
# TODO: Play with scaling cloud coverage

In [24]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','year_built']]
        year_built_median = tmp_df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        del tmp_df, year_built_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(ImputeYearBuilt().transform(train.sample(20))['building_age'])

18511846     70
16003404     70
1928088     111
1864004      70
3931215      70
6990283      70
12074114     84
3037154      86
6910114      70
752795       17
8248958      82
14451460     70
17210074     94
14606639     70
383446       70
11026027     56
16231495     70
1942688      70
3848833      70
16473257    107
Name: building_age, dtype: uint8


In [25]:
class ImputeFloorCount(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','floor_count']]
        floors_median = tmp_df['floor_count'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['floor_count'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = i_median
            else:
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = floors_median
        del tmp_df, floors_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

print(ImputeFloorCount().transform(train.sample(20))['floor_count'])

16890843     1.0
10134103     3.0
2812405      3.0
3125220      8.0
4068107      3.0
17350581     3.0
17454395     3.0
16181345     3.0
13214901     3.0
15113942     3.0
4632609      3.0
17435132     3.0
16768364     3.0
9643393      2.0
2352003      3.0
8876964      3.0
13760481     3.0
11783741    16.0
18421089     3.0
12315873     3.0
Name: floor_count, dtype: float16


In [26]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [27]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [28]:
class FillMean(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].mean())
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [29]:
class FillZeros(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(0)
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [30]:
class FillMedian(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].median())
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [31]:
class FillPopular(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].value_counts()[0])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [32]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [33]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [34]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [35]:
%%time

x_pre_pipes = Pipeline(
    steps=[
        ('meterReadingLog1p',MeterReadingLog1p()),
        ('rmS0M0', RmS0M0()),
        ('addTimeFeatures', AddTimeFeatures()),
        ('logSquareFeet', LogSquareFeet()),
        #('rmHolidays', RmHolidays()),
        #('addHolidays', AddHolidays()),
        ('createMeterDescDF', CreateMeterDescDF()), # note declares a globe variable to pass
        ('mergeMeterDescDF', MergeMeterDescDF()), # populates both test and train from global
        ('createMeterDDescDF', CreateMeterDDescDF()), # note declares a globe variable to pass
        ('mergeMeterDDescDF', MergeMeterDDescDF()), # populates both test and train from global
        ('setCatTypes', SetCatTypes(['building_id', 'site_id', 'meter', 'primary_use'])),
        ('GC', GC())
    ]
)

train = x_pre_pipes.transform(train)
print(train.columns)

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 's_radiation',
       'air_temperature_mean_lag3', 'cloud_coverage_mean_lag3',
       'dew_temperature_mean_lag3', 'precip_depth_1_hr_mean_lag3',
       'sea_level_pressure_mean_lag3', 'wind_direction_mean_lag3',
       'wind_speed_mean_lag3', 'air_temperature_mean_lag72',
       'cloud_coverage_mean_lag72', 'dew_temperature_mean_lag72',
       'precip_depth_1_hr_mean_lag72', 'sea_level_pressure_mean_lag72',
       'wind_direction_mean_lag72', 'wind_speed_mean_lag72',
       'meter_reading_log1p', 'dayofweek_x', 'hour', 'log_square_feet',
       'meter_mean', 'meter_std', 'meter_min', 'meter_5%', 'meter_25%',
       'meter_50%', 'meter_75%', 'meter_95%', 'meter_max', 'dayofweek_y',
       'meter_h_d_mean', 'meter_h_d_std', '

In [36]:
# pre_a_pipes is for preprocessing that doesn't change impute
# values
x_fold_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        #('convertToDatetime', ConvertToDatetime()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('imputeFloorCount', ImputeFloorCount()),
        ('dropCols', DropCols(['timestamp','square_feet', 'year_built','site_id','building_id'])),
        ('GC', GC())
    ]
)

sample_train_X = x_fold_pipes.transform(train.sample(20))
print(sample_train_X.columns)
print(sample_train_X.dtypes)

Index(['meter', 'primary_use', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 's_radiation',
       'air_temperature_mean_lag3', 'cloud_coverage_mean_lag3',
       'dew_temperature_mean_lag3', 'precip_depth_1_hr_mean_lag3',
       'sea_level_pressure_mean_lag3', 'wind_direction_mean_lag3',
       'wind_speed_mean_lag3', 'air_temperature_mean_lag72',
       'cloud_coverage_mean_lag72', 'dew_temperature_mean_lag72',
       'precip_depth_1_hr_mean_lag72', 'sea_level_pressure_mean_lag72',
       'wind_direction_mean_lag72', 'wind_speed_mean_lag72',
       'meter_reading_log1p', 'dayofweek_x', 'hour', 'log_square_feet',
       'meter_mean', 'meter_std', 'meter_min', 'meter_5%', 'meter_25%',
       'meter_50%', 'meter_75%', 'meter_95%', 'meter_max', 'dayofweek_y',
       'meter_h_d_mean', 'meter_h_d_std', 'meter_h_d_min', 'meter_h_d_25%',
       'meter_h_d_50%', 'meter_h_d_75%', '

In [37]:
# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=False, random_state=42)

In [38]:
models = []
best_scores = []
for train_index, val_index in kf.split(train, train['building_id']):
    f_train_X = x_fold_pipes.transform(train.loc[train_index])
    f_val_X = x_fold_pipes.transform(train.loc[val_index])
    gbm = LGBMRegressor(**gbm_params)
    gbm.fit(f_train_X.drop('meter_reading_log1p', axis=1), f_train_X['meter_reading_log1p'],
        eval_set=[(f_val_X.drop('meter_reading_log1p', axis=1), f_val_X['meter_reading_log1p'])],
        # https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114722#latest-660848
        # eval_metric=lbm_rmslee,
        early_stopping_rounds=20)
    models.append(gbm)
    best_scores.append(gbm.best_score_)
    del f_train_X, f_val_X, gbm
    gc.collect()


MemoryError: 

In [None]:
# Scores for cross val
for score in best_scores:
    print(score['valid_0']['rmse'])

In [None]:
def createFeature_DF(model):
    imprtc_df = pd.DataFrame()
    imprtc_df['feature'] = sample_train_X.drop('meter_reading_log1p', axis=1).columns   
    imprtc_df['importance'] = model.feature_importances_
    imprtc_df.sort_values('importance', ascending=False, inplace= True)
    imprtc_df.sort_values('importance', ascending=False, inplace= True)
    return imprtc_df



In [None]:
features_df = None
for model in models:
    if features_df is None:
        features_df = createFeature_DF(model)
    else:
        features_df = features_df.append(createFeature_DF(model))

features_df = features_df.groupby('feature').sum().sort_values('importance', ascending=False).reset_index()
features_df['feature'] = features_df['feature'].str.pad(features_df['feature'].str.len().max(), side ='right') 

features_df.to_csv("featuress_small_multi_meter_light_GBM.csv")
print(features_df)
