In [1]:
# Toggle to save space: `mode` is the suffix that selects which processed
# weather pickle is read and which per-meter statistics are computed later.
use_mean_only = False
mode = '_mean' if use_mean_only else '_all'
print(mode)

_all


In [2]:
# Hyper-parameters unpacked into LGBMRegressor(**gbm_params) in the CV loop.
gbm_params = {
    'n_estimators': 10,  # 500,
    'max_depth': 3,
    'learning_rate': 0.9,
    # Row bagging. LightGBM documents 'subsample' / 'subsample_freq' as aliases
    # of 'bagging_fraction' / 'bagging_freq'. This dict used to set both
    # spellings with conflicting values (0.8 vs 0.1, 5 vs 1); only one of each
    # pair can take effect. The primary names are kept.
    # NOTE(review): LightGBM is expected to prefer the primary name when both
    # are given, which would make this behaviour-preserving — confirm against
    # the installed version.
    'bagging_fraction': 0.8,  # TODO: try 0.9
    'bagging_freq': 5,
    'feature_fraction': 0.9,
    # NOTE(review): with max_depth=3 a tree has at most 2**3 = 8 leaves, so
    # this limit of 20 is never reached.
    'num_leaves': 20,
    'metric': 'rmse',
    #'lambda_l1' : 1,  # Try defaults
    #'lambda_l2': 1, # Try defaults
    'verbose': 100
}

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')
# Apply seaborn's default plot styling.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# label encoding
# NOTE(review): `le` is not referenced by any cell visible here — confirm it is still needed.
le = LabelEncoder()

In [4]:
class ConvertToDatetime(TransformerMixin):
    """Parse the ``timestamp`` column, when present, with ``pd.to_datetime``.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    writes the parsed column back into the frame it was given.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Return ``df`` with ``timestamp`` converted; other frames pass through."""
        if 'timestamp' not in df.columns:
            return df
        parsed = pd.to_datetime(df['timestamp'])
        df['timestamp'] = parsed
        return df

In [5]:
# Compact dtypes applied when reading each raw CSV, keyed by file stem.
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float32, 'year_built': np.float16, 'floor_count': np.float16},
}

def loadFile(name):
    """Read ``<name>.csv`` from the first input directory that contains it.

    Args:
        name: file stem, e.g. ``'train'``. When ``file_dtype`` has an entry for
            it those dtypes are applied; otherwise pandas infers them.

    Returns:
        The CSV as a DataFrame, with ``timestamp`` parsed by ConvertToDatetime
        when that column exists.

    Raises:
        FileNotFoundError: if no candidate directory holds the file. This used
            to return ``None`` silently and fail later with an unrelated error.
    """
    # .get() instead of [] so stems without a dtype entry can still be loaded.
    dtypes = file_dtype.get(name)
    search_dirs = ['../input/ashrae-energy-prediction/', '../input/_ashrae-energy-prediction/']
    for dir_path in search_dirs:
        csv_path = dir_path + name + '.csv'
        if path.exists(csv_path):
            return ConvertToDatetime().transform(pd.read_csv(csv_path, dtype=dtypes))
    raise FileNotFoundError(
        "Could not find '" + name + ".csv' in any of: " + ', '.join(search_dirs))
        


In [6]:
# Raw competition tables; the test set is intentionally left unloaded here.
building, pre_train = loadFile('building_metadata'), loadFile('train')
#test = loadFile('test')

# Pre-processed weather features; `mode` picks which pickle variant is read.
weather_pickle = f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle'
weather_processed_df = pd.read_pickle(weather_pickle)

In [7]:
def merge(x, weather_df=None):
    """Left-join building metadata and processed weather onto meter rows.

    Args:
        x: frame with ``building_id`` and ``timestamp`` columns.
        weather_df: optional weather frame keyed by ``site_id``/``timestamp``.
            When omitted, the module-level ``weather_processed_df`` is reused;
            the pickle is read from disk only if that global does not exist.
            The function used to re-read the pickle on every call even though
            the notebook had already loaded it.

    Returns:
        A new merged frame with ``site_id`` cast to int8 and
        ``cloud_coverage`` to float16. ``x`` itself is not modified.
    """
    if weather_df is None:
        weather_df = globals().get('weather_processed_df')
    if weather_df is None:
        weather_df = pd.read_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')
    x = x.merge(building, on=['building_id'], how='left')
    gc.collect()
    x = x.merge(weather_df, on=['site_id', 'timestamp'], how='left')
    gc.collect()
    # The merges can widen dtypes; cast back down to keep the frame small.
    x['site_id'] = x['site_id'].astype('int8')
    x['cloud_coverage'] = x['cloud_coverage'].astype('float16')
    gc.collect()
    return x
        
# Join metadata and weather onto the raw readings and show the result.
train = merge(pre_train)
print(train)

# Rows whose air_temperature is null are reported as missing weather and dropped.
missing_weather_rows = train['air_temperature'].isnull().sum()
print('!!!! Warning we are missing weather for ' + str(missing_weather_rows) + ' rows')
train = train[train['air_temperature'].notnull()]


          building_id  meter           timestamp  meter_reading  site_id  \
0                   0      0 2016-01-01 00:00:00       0.000000        0   
1                   1      0 2016-01-01 00:00:00       0.000000        0   
2                   2      0 2016-01-01 00:00:00       0.000000        0   
3                   3      0 2016-01-01 00:00:00       0.000000        0   
4                   4      0 2016-01-01 00:00:00       0.000000        0   
5                   5      0 2016-01-01 00:00:00       0.000000        0   
6                   6      0 2016-01-01 00:00:00       0.000000        0   
7                   7      0 2016-01-01 00:00:00       0.000000        0   
8                   8      0 2016-01-01 00:00:00       0.000000        0   
9                   9      0 2016-01-01 00:00:00       0.000000        0   
10                 10      0 2016-01-01 00:00:00       0.000000        0   
11                 11      0 2016-01-01 00:00:00       0.000000        0   
12          

[20216100 rows x 100 columns]


In [8]:
# See holiday notebook to generate, this is optional.
# holiday_df stays None when the pickle is absent; the holiday transformers
# below check for that.
holiday_pickle = '../input/ashrae-energy-prediction-pickles/holiday_df.pickle'
holiday_df = pd.read_pickle(holiday_pickle) if path.exists(holiday_pickle) else None
if holiday_df is not None:
    print(holiday_df.sample(20))

          site_id           timestamp  \
4579148        12 2016-03-28 06:00:00   
38296830        7 2017-12-25 15:00:00   
23517507        1 2018-11-30 10:00:00   
35026249        5 2017-01-02 23:00:00   
17413323       10 2016-11-11 20:00:00   
19903658       14 2016-12-26 11:00:00   
10108424       15 2016-07-04 17:00:00   
35785953        5 2017-12-25 05:00:00   
9932435        11 2016-07-01 14:00:00   
37257644        6 2018-01-01 07:00:00   
35487981        5 2017-08-07 08:00:00   
6562292         5 2016-05-02 17:00:00   
22658250        1 2017-05-01 12:00:00   
2483961        15 2016-02-15 12:00:00   
46246758       11 2018-03-30 02:00:00   
19942128        1 2016-12-27 04:00:00   
37794444        6 2018-10-08 04:00:00   
19837224       12 2016-12-25 07:00:00   
11485           0 2016-01-01 05:00:00   
37258204        6 2018-01-01 14:00:00   

                                                    holiday  
4579148                                       Easter Monday  
38296830      

In [9]:
class MeterReadingLog1p(TransformerMixin):
    """Replace ``meter_reading`` with ``meter_reading_log1p = log1p(meter_reading)``.

    Frames without a ``meter_reading`` column are returned untouched.
    """

    def transform(self, df, **transform_params):
        """Return a frame holding the log1p target instead of the raw reading."""
        if 'meter_reading' not in df.columns:
            return df
        log_reading = np.log1p(df['meter_reading'])
        # Drop first (this already produces a new frame) and attach the log
        # column to that new frame. The column used to be written into the
        # caller's frame before the drop, leaving it with an extra column.
        df = df.drop('meter_reading', axis=1)
        df['meter_reading_log1p'] = log_reading
        return df

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self
# Show the same seeded 20-row sample before and after the log1p transform.
log1p_demo_rows = train.sample(20, random_state=42)
print(log1p_demo_rows)
print(MeterReadingLog1p().transform(log1p_demo_rows))
gc.collect()

          building_id  meter           timestamp  meter_reading  site_id  \
14245562         1324      1 2016-09-16 16:00:00       0.000000       14   
1282718          1013      0 2016-01-24 06:00:00      32.000099       10   
13883790          229      1 2016-09-10 07:00:00     567.655029        2   
4781820           217      3 2016-04-01 01:00:00       0.000000        2   
10415393         1434      0 2016-07-10 04:00:00      65.750000       15   
1057008          1047      0 2016-01-20 04:00:00      90.983299       12   
4507399           911      1 2016-03-26 20:00:00     295.063995        9   
19478829         1039      0 2016-12-18 23:00:00      16.900000       12   
8955615           265      0 2016-06-14 06:00:00     128.369995        2   
13799839          896      0 2016-09-08 19:00:00     300.000000        9   
15647011          973      0 2016-10-11 11:00:00     247.000000        9   
2524294           813      0 2016-02-16 08:00:00      10.958300        8   
10016102    

[20 rows x 100 columns]
          building_id  meter           timestamp  site_id  \
14245562         1324      1 2016-09-16 16:00:00       14   
1282718          1013      0 2016-01-24 06:00:00       10   
13883790          229      1 2016-09-10 07:00:00        2   
4781820           217      3 2016-04-01 01:00:00        2   
10415393         1434      0 2016-07-10 04:00:00       15   
1057008          1047      0 2016-01-20 04:00:00       12   
4507399           911      1 2016-03-26 20:00:00        9   
19478829         1039      0 2016-12-18 23:00:00       12   
8955615           265      0 2016-06-14 06:00:00        2   
13799839          896      0 2016-09-08 19:00:00        9   
15647011          973      0 2016-10-11 11:00:00        9   
2524294           813      0 2016-02-16 08:00:00        8   
10016102          870      0 2016-07-03 02:00:00        8   
3915750           898      0 2016-03-15 03:00:00        9   
17217526          903      0 2016-11-08 09:00:00        9   


[20 rows x 100 columns]


31

In [10]:
class AddTimeFeatures(TransformerMixin):
    """Derive calendar features from ``timestamp``.

    Adds ``dayofweek`` (category) and ``hour`` (uint8) to the frame passed in
    and returns that same frame.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df_a, **transform_params):
        """Write the two time columns into ``df_a`` and return it."""
        # TODO: try week of year as numerical
        stamp = df_a['timestamp'].dt
        df_a['dayofweek'] = stamp.dayofweek.astype('category')  # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = (df['timestamp'].dt.dayofweek * 24) + df['timestamp'].dt.hour
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df_a['hour'] = stamp.hour.astype('uint8')
        return df_a

In [11]:
# Maps describe() statistic names to the feature column names they become.
meter_desc_columns = {'mean': 'meter_mean', 'max': 'meter_max', 'min': 'meter_min', 'std': 'meter_std'}

class CreateMeterDescDF(TransformerMixin):
    """Compute per-(building_id, meter) statistics of ``meter_reading_log1p``.

    The table is stored in the module-level ``_building_meter_desc_DF`` for
    MergeMeterDescDF to use; the input frame is returned unchanged. Frames
    without the log1p column leave the global as it was.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Refresh the global statistics table when the target column exists."""
        global _building_meter_desc_DF
        print(df.columns)
        if 'meter_reading_log1p' not in df.columns:
            return df
        # '_all' mode keeps every statistic; any other mode keeps only the mean.
        wanted_stats = ['mean', 'max', 'min', 'std'] if mode == '_all' else ['mean']
        per_meter = df.groupby(['building_id', 'meter'])['meter_reading_log1p'].describe()
        stats_table = per_meter[wanted_stats].reset_index()
        _building_meter_desc_DF = stats_table.rename(columns=meter_desc_columns)
        gc.collect()
        return df

#if 'meter_mean' not in train.columns:
#    print(building_meter_desc_DF)
#    train = train.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    #test = test.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    del building_meter_desc_DF
# Smoke test on a seeded 2000-row sample: log1p target -> time features ->
# statistics table. Note that this sets the global _building_meter_desc_DF.
meter_desc_demo = MeterReadingLog1p().transform(train.sample(2000, random_state=0))
meter_desc_demo = AddTimeFeatures().transform(meter_desc_demo)
CreateMeterDescDF().transform(meter_desc_demo)
print(_building_meter_desc_DF)
gc.collect()

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'dew_temperature',
       ...
       'wind_direction_max_lag96', 'wind_direction_min_lag96',
       'wind_direction_std_lag96', 'wind_speed_mean_lag96',
       'wind_speed_max_lag96', 'wind_speed_min_lag96', 'wind_speed_std_lag96',
       'meter_reading_log1p', 'dayofweek', 'hour'],
      dtype='object', length=102)
      building_id  meter  meter_mean  meter_max  meter_min  meter_std
0               1      0    4.921724   4.921724   4.921724        NaN
1               2      0    0.000000   0.000000   0.000000        NaN
2               3      0    5.727568   5.818533   5.636602   0.128644
3               5      0    0.000000   0.000000   0.000000        NaN
4               6      0    0.000000   0.000000   0.000000        NaN
5               8      0    5.930674   5.930674   5.930674        NaN
6               9      1    5.300433   6.8347

28

In [12]:
class MergeMeterDescDF(TransformerMixin):
    """Attach the per-meter statistics held in ``_building_meter_desc_DF``."""

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Left-join the statistics on ``building_id``/``meter`` and return the result."""
        # Remove statistic columns that are already present so the merge
        # re-adds them under the same names.
        stat_columns = meter_desc_columns.values()
        without_stats = df.drop(stat_columns, axis=1, errors='ignore')
        return without_stats.merge(
            _building_meter_desc_DF, on=['building_id', 'meter'], how='left')

# Smoke test: merge the statistics onto a seeded 2000-row sample.
merge_desc_demo = train.sample(2000, random_state=0)
print(MergeMeterDescDF().transform(merge_desc_demo))

      building_id  meter           timestamp  meter_reading  site_id  \
0             774      1 2016-08-07 08:00:00      36.128899        6   
1             206      0 2016-10-04 14:00:00     226.270004        2   
2            1269      0 2016-11-29 10:00:00      28.670799       14   
3             951      0 2016-10-10 04:00:00     113.000000        9   
4             656      0 2016-05-01 21:00:00      32.700001        5   
5              36      0 2016-06-05 19:00:00     178.830994        0   
6            1262      0 2016-07-19 05:00:00      73.739998       14   
7              52      0 2016-03-17 14:00:00       0.000000        0   
8            1133      2 2016-07-10 17:00:00     984.375000       13   
9            1123      0 2016-10-02 09:00:00      18.243999       13   
10           1237      0 2016-10-06 18:00:00      85.000000       14   
11            960      1 2016-11-17 10:00:00      90.066498        9   
12             11      0 2016-09-25 21:00:00     472.332001     

[2000 rows x 104 columns]


In [13]:
# "As you can see above, this data looks weired until May 20. It is 
# reported in this discussion by @barnwellguy that All electricity
# meter is 0 until May 20 for site_id == 0. Let's remove these data 
# from training data."
# https://www.kaggle.com/kaushal2896/ashrae-eda-fe-lightgbm-1-13
class RmS0M0(TransformerMixin):
    """Drop meter-0 rows of buildings 0..104 stamped on or before 2016-05-20.

    Implements the clean-up described in the comment above this class.
    """

    # Rows matching the parenthesised condition are the ones removed.
    _KEEP_EXPR = 'not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")'

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Return the rows of ``df`` that satisfy the keep expression."""
        kept_rows = df.query(self._KEEP_EXPR)
        return kept_rows
    


In [14]:
# TODO: write filter to remove any 0 meter reading that continue more than N days (try 3)
# Also we need to account for this by meter

In [15]:
# TODO: try rolling with power

In [16]:
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class AddHolidays(TransformerMixin):
    """Attach a categorical ``holiday`` column looked up in ``holiday_df``.

    When ``holiday_df`` is None a warning is printed and the frame is
    returned unchanged.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Left-join holidays on ``timestamp``/``site_id`` and return the result."""
        if holiday_df is None:
            print("Warning: Holiday DF is missing")
            return df
        # Discard a previously added holiday column before merging again.
        if 'holiday' in df.columns:
            df = df.drop('holiday', axis=1)
        with_holidays = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
        with_holidays['holiday'] = with_holidays['holiday'].astype('category')
        return with_holidays
# Test: show the holiday table's columns and the holidays attached to the
# first 20 training rows.
if holiday_df is not None:
    print(holiday_df.columns)
    add_holidays_demo = AddHolidays().transform(train.head(20))
    print(add_holidays_demo[['holiday','timestamp']])

Index(['site_id', 'timestamp', 'holiday'], dtype='object')
           holiday  timestamp
0   New Year's Day 2016-01-01
1   New Year's Day 2016-01-01
2   New Year's Day 2016-01-01
3   New Year's Day 2016-01-01
4   New Year's Day 2016-01-01
5   New Year's Day 2016-01-01
6   New Year's Day 2016-01-01
7   New Year's Day 2016-01-01
8   New Year's Day 2016-01-01
9   New Year's Day 2016-01-01
10  New Year's Day 2016-01-01
11  New Year's Day 2016-01-01
12  New Year's Day 2016-01-01
13  New Year's Day 2016-01-01
14  New Year's Day 2016-01-01
15  New Year's Day 2016-01-01
16  New Year's Day 2016-01-01
17  New Year's Day 2016-01-01
18  New Year's Day 2016-01-01
19  New Year's Day 2016-01-01


In [17]:
class RmHolidays(TransformerMixin):
    """Remove every row that falls on a holiday listed in ``holiday_df``.

    When ``holiday_df`` is None a warning is printed and the frame is
    returned unchanged. The returned frame has no ``holiday`` column.
    """

    def transform(self, df, **transform_params):
        """Return the non-holiday rows of ``df``."""
        if holiday_df is None:
            print("Warning: Holiday DF is missing")
            return df
        # Same guard as AddHolidays: without it, a frame that already has a
        # 'holiday' column is merged into 'holiday_x'/'holiday_y' and the
        # lookup below raises KeyError.
        if 'holiday' in df.columns:
            df = df.drop('holiday', axis=1)
        df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
        # Keep only rows with no matching holiday, then drop the helper column.
        df = df[df['holiday'].isnull()].drop(['holiday'], axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

# Test: the New Year's Day rows should be missing from the output.
#print(train.head(100000).merge(building, on='building_id', how='left').columns)
rm_holidays_demo = RmHolidays().transform(train.head(100000))
print(rm_holidays_demo)

       building_id  meter           timestamp  meter_reading  site_id  \
55121            0      0 2016-01-02 00:00:00       0.000000        0   
55122            1      0 2016-01-02 00:00:00       0.000000        0   
55123            2      0 2016-01-02 00:00:00       0.000000        0   
55124            3      0 2016-01-02 00:00:00       0.000000        0   
55125            4      0 2016-01-02 00:00:00       0.000000        0   
55126            5      0 2016-01-02 00:00:00       0.000000        0   
55127            6      0 2016-01-02 00:00:00       0.000000        0   
55128            7      0 2016-01-02 00:00:00       0.000000        0   
55129            8      0 2016-01-02 00:00:00       0.000000        0   
55130            9      0 2016-01-02 00:00:00       0.000000        0   
55131           10      0 2016-01-02 00:00:00       0.000000        0   
55132           11      0 2016-01-02 00:00:00       0.000000        0   
55133           12      0 2016-01-02 00:00:00      

[41839 rows x 100 columns]


In [18]:
class SetCatTypes(TransformerMixin):
    """Cast the configured columns to pandas ``category`` dtype."""

    def __init__(self, cols):
        """Remember ``cols``, the column names to convert."""
        self._cols = cols

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Convert each configured column of ``df`` in place and return ``df``."""
        for name in self._cols:
            as_category = df[name].astype('category')
            df[name] = as_category
        gc.collect()
        return df

In [19]:
class DropCols(TransformerMixin):
    """Remove a fixed list of columns from the frame."""

    def __init__(self, drop_cols):
        """Remember ``drop_cols``, the column names to remove."""
        self._drop_cols = drop_cols

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Return a frame without the configured columns."""
        trimmed = df.drop(columns=self._drop_cols)
        gc.collect()
        return trimmed

In [20]:
class LogSquareFeet(TransformerMixin):
    """Add ``log_square_feet``: the natural log of ``square_feet`` as float16."""

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Write the new column into ``df`` and return it."""
        log_area = np.log(df['square_feet'])
        df['log_square_feet'] = log_area.astype(np.float16)
        return df
# Peek at the raw square_feet values of the first 20 buildings.
print(building['square_feet'].head(20))

0       7432.0
1       2720.0
2       5376.0
3      23685.0
4     116607.0
5       8000.0
6      27926.0
7     121074.0
8      60809.0
9      27000.0
10    370773.0
11     49073.0
12     37100.0
13     99380.0
14     86250.0
15     83957.0
16     54644.0
17     15250.0
18    111891.0
19     18717.0
Name: square_feet, dtype: float32


In [21]:
# TODO: Play with scaling cloud coverage

In [22]:
class ImputeYearBuilt(TransformerMixin):
    """Fill missing ``year_built`` with site medians and add ``building_age``.

    Medians come from the module-level ``train`` frame, de-duplicated to one
    row per (site_id, building_id). Sites whose median is NaN fall back to
    the median over all those rows. ``building_age`` is ``year_built - 1900``
    cast to uint8. The frame is edited in place and returned.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Impute ``year_built`` per site, then derive ``building_age``."""
        # revisit the choice of median vs anything else
        per_building = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','year_built']]
        overall_median = per_building['year_built'].median()
        site_medians = per_building.groupby(['site_id'])['year_built'].median()
        for site, site_median in site_medians.items():
            # Use the site median when it exists, otherwise the overall median.
            fill_value = overall_median if np.isnan(site_median) else site_median
            needs_fill = (df['year_built'].isnull()) & (df['site_id'] == site)
            df.loc[needs_fill, 'year_built'] = fill_value
        df['building_age'] = np.uint8(df['year_built'] - 1900)
        del per_building, overall_median
        gc.collect()
        return df
# Smoke test on an unseeded 20-row sample (the output varies between runs).
year_built_demo = ImputeYearBuilt().transform(train.sample(20))
print(year_built_demo['building_age'])

2422709      19
3609453      70
6999489      70
5574910      70
16835572     75
18387841    106
13389938     76
10510114      6
19810725     70
15854942     86
9323071      70
9675846      12
5537345      70
8781554      33
13367430     70
1943112      70
5167775      70
12990413     78
15617063    108
4121552      62
Name: building_age, dtype: uint8


In [23]:
class ImputeFloorCount(TransformerMixin):
    """Fill missing ``floor_count`` with site medians.

    Medians come from the module-level ``train`` frame, de-duplicated to one
    row per (site_id, building_id). Sites whose median is NaN fall back to
    the median over all those rows. The frame is edited in place and returned.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Impute ``floor_count`` site by site."""
        # revisit the choice of median vs anything else
        per_building = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','floor_count']]
        overall_median = per_building['floor_count'].median()
        site_medians = per_building.groupby(['site_id'])['floor_count'].median()
        for site, site_median in site_medians.items():
            # Use the site median when it exists, otherwise the overall median.
            fill_value = overall_median if np.isnan(site_median) else site_median
            needs_fill = (df['floor_count'].isnull()) & (df['site_id'] == site)
            df.loc[needs_fill, 'floor_count'] = fill_value
        del per_building, overall_median
        gc.collect()
        return df

# Smoke test on an unseeded 20-row sample (the output varies between runs).
floor_count_demo = ImputeFloorCount().transform(train.sample(20))
print(floor_count_demo['floor_count'])

15713474     3.0
11425893     3.0
17865707     3.0
15798423     3.0
2859273      3.0
11698952     3.0
4459410      3.0
9528745      3.0
19596635     3.0
4812989      1.0
18504290     3.0
6091130     16.0
4300915      4.0
16392846     3.0
17301810     3.0
17071418     3.0
11791809     3.0
17096027     2.0
6472285      3.0
6669473      4.0
Name: floor_count, dtype: float16


In [24]:
class AddMeterDummies(TransformerMixin):
    """Flag, per row, which meter types (0..3) the row's building has in ``train``.

    Adds boolean columns ``_meter_0`` .. ``_meter_3`` to the frame passed in
    and returns that same frame.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df_a, **transform_params):
        """Write the four indicator columns into ``df_a``."""
        for meter_id in range(4):
            # Buildings that have at least one training row for this meter.
            buildings_with_meter = train.loc[train['meter'] == meter_id].building_id.unique()
            df_a['_meter_' + str(meter_id)] = df_a['building_id'].isin(buildings_with_meter)
        return df_a

In [25]:
class AddRelativeHumidity(TransformerMixin):
    """Placeholder transformer: currently returns the frame unchanged."""

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df_a, **transform_params):
        """Return ``df_a`` as is; the humidity feature is not implemented yet."""
        # code here
        return df_a

In [26]:
class FillMean(TransformerMixin):
    """Fill NaNs in the configured columns with each column's own mean."""

    def __init__(self, cols):
        """Remember ``cols``, the column names to fill."""
        self._cols = cols

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Fill each configured column of ``df`` in place and return ``df``."""
        for name in self._cols:
            column = df[name]
            df[name] = column.fillna(column.mean())
        return df

In [27]:
class FillZeros(TransformerMixin):
    """Fill NaNs in the configured columns with 0."""

    def __init__(self, cols):
        """Remember ``cols``, the column names to fill."""
        self._cols = cols

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Fill each configured column of ``df`` in place and return ``df``."""
        for name in self._cols:
            filled = df[name].fillna(0)
            df[name] = filled
        return df

In [28]:
class FillMedian(TransformerMixin):
    """Fill NaNs in the configured columns with each column's own median."""

    def __init__(self, cols):
        """Remember ``cols``, the column names to fill."""
        self._cols = cols

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Fill each configured column of ``df`` in place and return ``df``."""
        for name in self._cols:
            column = df[name]
            df[name] = column.fillna(column.median())
        return df


In [29]:
class FillPopular(TransformerMixin):
    """Fill NaNs in the configured columns with each column's most frequent value."""

    def __init__(self, cols):
        """Remember ``cols``, the column names to fill."""
        self._cols = cols

    def transform(self, df, **transform_params):
        """Fill each configured column of ``df`` in place and return ``df``.

        Columns with no non-null value are left untouched.
        """
        for col in self._cols:
            # value_counts() is indexed by the values and holds their counts, so
            # value_counts()[0] returned a count (or looked up the label 0)
            # instead of the most common value. mode() returns the value itself.
            most_common = df[col].mode()
            if most_common.empty:
                continue
            df[col] = df[col].fillna(most_common.iloc[0])
        return df

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

In [30]:
class MarkNaNs(TransformerMixin):
    """Add a boolean ``_<col>_nan`` indicator for every column containing NaN."""

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Write the indicator columns into ``df`` and return it."""
        # The column list is fixed before the loop, so the indicator columns
        # added below are not themselves inspected.
        has_nan = df.isna().any()
        for name in df.columns[has_nan].tolist():
            df['_' + name + '_nan'] = df[name].isnull()
        return df

In [31]:
class GC(TransformerMixin):
    """Pipeline step that only triggers a garbage-collection pass."""

    def fit(self, X, y=None, **fit_params):
        """No-op; returns ``self``."""
        return self

    def transform(self, df, **transform_params):
        """Run ``gc.collect()`` and hand ``df`` back unchanged."""
        gc.collect()
        return df

In [32]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    """Root mean squared logarithmic error of ``y_pred`` against ``y``.

    Negative predictions are clipped to 0 first (a hack so the logarithmic
    error can be computed).
    """
    non_negative_pred = y_pred.clip(0)
    msle = mean_squared_log_error(y, non_negative_pred)
    return np.sqrt(msle)

def rmse(y, y_pred):
    """Root mean squared error of ``y_pred`` against ``y``.

    Negative predictions are clipped to 0 first (hack to prevent negative
    numbers). The square root was missing before, so the function returned
    the MSE despite its name.
    """
    return np.sqrt(mean_squared_error(y, y_pred.clip(0)))

def rmsee(y, y_pred):
    """RMSLE computed after mapping both inputs back with ``expm1``.

    Both inputs are clipped to 0 before the ``expm1`` (hack to prevent
    negative numbers).
    """
    true_values = np.expm1(y.clip(0))
    predicted_values = np.expm1(y_pred.clip(0))
    return np.sqrt(mean_squared_log_error(true_values, predicted_values))
    
# scikit-learn scorers for the error metrics above. greater_is_better=False
# marks them as losses.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

# This scorer used to wrap rmsle (copy-paste slip), so it reported the same
# numbers as rmsle_scorer.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

rmsee_scorer = make_scorer(rmsee, greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    """Eval metric returning ``('RMSLE', value, False)``.

    ``value`` is the root mean squared difference of ``log1p(y_pred)`` and
    ``log1p(y_true)``.
    """
    log_error = np.log1p(y_pred) - np.log1p(y_true)
    mean_squared = np.mean(np.power(log_error, 2))
    return 'RMSLE', np.sqrt(mean_squared), False

# rob's custom function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    """Eval metric returning ``('RMSLEE', value, False)``.

    ``value`` is the plain root mean squared difference of the inputs, with
    no log transform applied here.
    """
    error = y_pred - y_true
    mean_squared = np.mean(np.power(error, 2))
    return 'RMSLEE', np.sqrt(mean_squared), False



In [33]:
%%time

# Steps applied once to the whole training frame.
pre_steps = [
    ('meterReadingLog1p', MeterReadingLog1p()),
    ('rmS0M0', RmS0M0()),
    ('addTimeFeatures', AddTimeFeatures()),
    ('logSquareFeet', LogSquareFeet()),
    #('rmHolidays', RmHolidays()),
    #('addHolidays', AddHolidays()),
    ('createMeterDescDF', CreateMeterDescDF()),  # note: declares a global variable to pass
    ('mergeMeterDescDF', MergeMeterDescDF()),  # populates both test and train from the global
    ('setCatTypes', SetCatTypes(['building_id', 'site_id', 'meter', 'primary_use'])),
    ('GC', GC()),
]
x_pre_pipes = Pipeline(steps=pre_steps)

train = x_pre_pipes.transform(train)
print(train.columns)

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'dew_temperature',
       ...
       'wind_direction_min_lag96', 'wind_direction_std_lag96',
       'wind_speed_mean_lag96', 'wind_speed_max_lag96', 'wind_speed_min_lag96',
       'wind_speed_std_lag96', 'meter_reading_log1p', 'dayofweek', 'hour',
       'log_square_feet'],
      dtype='object', length=103)
Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'dew_temperature',
       ...
       'wind_speed_min_lag96', 'wind_speed_std_lag96', 'meter_reading_log1p',
       'dayofweek', 'hour', 'log_square_feet', 'meter_mean', 'meter_max',
       'meter_min', 'meter_std'],
      dtype='object', length=107)
Wall time: 41.6 s


In [34]:
# x_fold_pipes holds the steps that are re-applied to each frame handed to
# the model (every CV fold below): imputation, then dropping raw columns.
fold_steps = [
    #('markNans',MarkNaNs()),
    #('convertToDatetime', ConvertToDatetime()),
    ('imputeYearBuilt', ImputeYearBuilt()),
    ('imputeFloorCount', ImputeFloorCount()),
    ('dropCols', DropCols(['timestamp','square_feet', 'year_built'])),
    ('GC', GC()),
]
x_fold_pipes = Pipeline(steps=fold_steps)

# Smoke test on an unseeded 20-row sample; sample_train_X is reused further
# down to name the features in the importance table.
sample_train_X = x_fold_pipes.transform(train.sample(20))
print(sample_train_X.columns)
print(sample_train_X.dtypes)

Index(['building_id', 'meter', 'site_id', 'primary_use', 'floor_count',
       'air_temperature', 'dew_temperature', 'cloud_coverage',
       'precip_depth_1_hr', 'wind_direction',
       ...
       'wind_speed_std_lag96', 'meter_reading_log1p', 'dayofweek', 'hour',
       'log_square_feet', 'meter_mean', 'meter_max', 'meter_min', 'meter_std',
       'building_age'],
      dtype='object', length=105)
building_id                      category
meter                            category
site_id                          category
primary_use                      category
floor_count                       float16
air_temperature                   float16
dew_temperature                   float16
cloud_coverage                    float16
precip_depth_1_hr                 float16
wind_direction                    float16
wind_speed                        float16
sea_level_pressure                float16
air_temperature_mean_lag3         float16
air_temperature_max_lag3          float16
air_temp

In [35]:
# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
folds = 5
# With shuffle=False the split is already deterministic, so random_state has
# no effect; scikit-learn documents that it should be left as None in that
# case, and newer releases raise ValueError when it is set. It is therefore
# dropped and the folds stay unshuffled, as before.
kf = StratifiedKFold(n_splits=folds, shuffle=False)

In [36]:
# Train one LightGBM model per fold, keeping each model and its best score.
target_col = 'meter_reading_log1p'
models = []
best_scores = []
for train_index, val_index in kf.split(train, train['building_id']):
    # kf.split yields positional row numbers, so select with .iloc. The old
    # .loc lookup only worked while train's index labels happened to be
    # exactly 0..n-1.
    f_train_X = x_fold_pipes.transform(train.iloc[train_index])
    f_val_X = x_fold_pipes.transform(train.iloc[val_index])
    gbm = LGBMRegressor(**gbm_params)
    # NOTE(review): recent LightGBM releases take early stopping through
    # callbacks=[lgb.early_stopping(20)] rather than this fit() argument —
    # confirm against the installed version.
    gbm.fit(f_train_X.drop(target_col, axis=1), f_train_X[target_col],
        eval_set=[(f_val_X.drop(target_col, axis=1), f_val_X[target_col])],
        # https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114722#latest-660848
        # eval_metric=lbm_rmslee,
        early_stopping_rounds=20)
    models.append(gbm)
    best_scores.append(gbm.best_score_)
    # Release the fold frames before the next iteration.
    del f_train_X, f_val_X, gbm
    gc.collect()


KeyboardInterrupt: 

In [None]:
# Scores for cross val: one validation RMSE per fold.
for fold_score in best_scores:
    validation_metrics = fold_score['valid_0']
    print(validation_metrics['rmse'])

In [None]:
# Importance rank for first model in cross val models.
# Feature names come from the sample frame built above, minus the target.
feature_names = sample_train_X.drop('meter_reading_log1p', axis=1).columns
imprtc_df = pd.DataFrame({
    'feature': feature_names,
    'importance': models[0].feature_importances_,
})
imprtc_df = imprtc_df.sort_values('importance', ascending=False)
print(imprtc_df)
print(sample_train_X.columns)