In [1]:
# toggle to save space
mode = '_mean' if False else '_all'
print(mode)

_all


In [2]:
# TODO: add https://www.kaggle.com/corochann/optuna-tutorial-for-hyperparameter-optimization

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
from datetime import date 
import holidays


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [4]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [5]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float32, 'year_built': np.float16, 'floor_count': np.float16},
}

def loadFile(name):
    for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
        if path.exists(dir_path + name + '.csv'):
            return  ConvertToDatetime().transform(
                pd.read_csv(dir_path + name + '.csv', dtype=file_dtype[name]))
        


In [6]:
building = loadFile('building_metadata')
pre_train = loadFile('train')
#test = loadFile('test')

In [7]:
def merge(x):
    weather_processed_df = pd.read_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')
    weather_processed_df = weather_processed_df.dropna(axis=1)
    x = x.merge(building, on=['building_id'], how='left')
    gc.collect()
    x = x.merge(weather_processed_df, on=['site_id', 'timestamp'], how='left')
    gc.collect()
    x['site_id'] = x['site_id'].astype('int8');
    x['cloud_coverage'] = x['cloud_coverage'].astype('float16')
    x = x.dropna(axis=0, subset=['air_temperature'])
    gc.collect()
    return x
        
train = merge(pre_train) 
gc.collect()
print(train)
print('!!!! Warning we are missing weather for '+ str(train['air_temperature'].isnull().sum())+' rows')

print(train.dtypes)

          building_id  meter           timestamp  meter_reading  site_id  \
0                   0      0 2016-01-01 00:00:00       0.000000        0   
1                   1      0 2016-01-01 00:00:00       0.000000        0   
2                   2      0 2016-01-01 00:00:00       0.000000        0   
3                   3      0 2016-01-01 00:00:00       0.000000        0   
4                   4      0 2016-01-01 00:00:00       0.000000        0   
5                   5      0 2016-01-01 00:00:00       0.000000        0   
6                   6      0 2016-01-01 00:00:00       0.000000        0   
7                   7      0 2016-01-01 00:00:00       0.000000        0   
8                   8      0 2016-01-01 00:00:00       0.000000        0   
9                   9      0 2016-01-01 00:00:00       0.000000        0   
10                 10      0 2016-01-01 00:00:00       0.000000        0   
11                 11      0 2016-01-01 00:00:00       0.000000        0   
12          

[20216100 rows x 65 columns]
building_id                             int16
meter                                    int8
timestamp                      datetime64[ns]
meter_reading                         float32
site_id                                  int8
primary_use                            object
square_feet                           float32
year_built                            float16
floor_count                           float16
air_temperature                       float16
dew_temperature                       float16
cloud_coverage                        float16
precip_depth_1_hr                     float16
wind_direction                        float16
wind_speed                            float16
sea_level_pressure                    float16
relative_humidity                     float16
air_temperature_rmean_3               float16
air_temperature_rmax_3                float16
air_temperature_rmin_3                float16
cloud_coverage_rmean_3                float16
cloud

In [8]:
# See holiday notebook to generate, this is optional
holiday_df = None
if path.exists('../input/ashrae-energy-prediction-pickles/holiday_df.pickle'):
    holiday_df = pd.read_pickle('../input/ashrae-energy-prediction-pickles/holiday_df.pickle')
if holiday_df is not None:
    print(holiday_df.sample(20))

          site_id           timestamp                      holiday
10089887        0 2016-07-04 10:00:00             Independence Day
35788267        5 2017-12-26 07:00:00                   Boxing Day
19902577        4 2016-12-26 11:00:00     Christmas Day (Observed)
17411434       14 2016-11-11 19:00:00                 Veterans Day
51484219       13 2018-09-03 14:00:00                    Labor Day
54312949       14 2017-10-09 04:00:00                 Columbus Day
52005811       13 2018-11-12 22:00:00      Veterans Day (Observed)
49375294       13 2017-11-23 05:00:00                 Thanksgiving
15601814        8 2016-10-10 16:00:00                 Columbus Day
4424954         5 2016-03-25 05:00:00                  Good Friday
57139669       14 2018-11-22 06:00:00                 Thanksgiving
19976565       12 2016-12-27 18:00:00     Christmas Day (Observed)
35802418        5 2018-01-01 23:00:00               New Year's Day
21390258        0 2018-01-15 06:00:00  Martin Luther King, Jr.

In [9]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        # TODO: try week of year as numerical 
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('uint8') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('uint8')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [10]:
class MeterReadingLog1p(TransformerMixin):
  
    def transform(self, df, **transform_params):
        if 'meter_reading' in df.columns:
            df['meter_reading_log1p'] = np.log1p(df['meter_reading'])
            df = df.drop('meter_reading', axis=1)
        return df
    
    def fit(self, X, y=None, **fit_params):
        return self
print(train.sample(20, random_state=42))
print(MeterReadingLog1p().transform(train.sample(20, random_state=42)))
gc.collect()

          building_id  meter           timestamp  meter_reading  site_id  \
14245562         1324      1 2016-09-16 16:00:00       0.000000       14   
1282718          1013      0 2016-01-24 06:00:00      32.000099       10   
13883790          229      1 2016-09-10 07:00:00     567.655029        2   
4781820           217      3 2016-04-01 01:00:00       0.000000        2   
10415393         1434      0 2016-07-10 04:00:00      65.750000       15   
1057008          1047      0 2016-01-20 04:00:00      90.983299       12   
4507399           911      1 2016-03-26 20:00:00     295.063995        9   
19478829         1039      0 2016-12-18 23:00:00      16.900000       12   
8955615           265      0 2016-06-14 06:00:00     128.369995        2   
13799839          896      0 2016-09-08 19:00:00     300.000000        9   
15647011          973      0 2016-10-11 11:00:00     247.000000        9   
2524294           813      0 2016-02-16 08:00:00      10.958300        8   
10016102    

[20 rows x 65 columns]
          building_id  meter           timestamp  site_id  \
14245562         1324      1 2016-09-16 16:00:00       14   
1282718          1013      0 2016-01-24 06:00:00       10   
13883790          229      1 2016-09-10 07:00:00        2   
4781820           217      3 2016-04-01 01:00:00        2   
10415393         1434      0 2016-07-10 04:00:00       15   
1057008          1047      0 2016-01-20 04:00:00       12   
4507399           911      1 2016-03-26 20:00:00        9   
19478829         1039      0 2016-12-18 23:00:00       12   
8955615           265      0 2016-06-14 06:00:00        2   
13799839          896      0 2016-09-08 19:00:00        9   
15647011          973      0 2016-10-11 11:00:00        9   
2524294           813      0 2016-02-16 08:00:00        8   
10016102          870      0 2016-07-03 02:00:00        8   
3915750           898      0 2016-03-15 03:00:00        9   
17217526          903      0 2016-11-08 09:00:00        9   
1

[20 rows x 65 columns]


34

In [11]:
class CreateMeterDescDF(TransformerMixin):

    def transform(self, df, **transform_params):
        global _building_meter_desc_DF
        if 'meter_reading_log1p' in df.columns:
            cols = ['mean']
            if mode == '_all':
                cols = ['mean','max','min','std']
            group = df.groupby(['building_id','meter'])['meter_reading_log1p']
            desc_DF = group.describe(percentiles=[.05, .25, .5, .75, .95 ])
            desc_DF = desc_DF.reset_index()
            col_dict = {}
            for col in desc_DF.columns:
                if col not in ['building_id', 'meter', 'count']:
                    col_dict[col] = 'meter_' + col
            _building_meter_desc_DF = desc_DF.rename(columns=col_dict).drop('count', axis=1)
            gc.collect()
        return df 
    def fit(self, X, y=None, **fit_params):
        return self

#if 'meter_mean' not in train.columns:
#    print(building_meter_desc_DF)
#    train = train.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    #test = test.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    del building_meter_desc_DF
CreateMeterDescDF().transform(
    AddTimeFeatures().transform(
        MeterReadingLog1p().transform(
            train.sample(2000, random_state=0)
        )
    )
)    
print(_building_meter_desc_DF)
gc.collect()

      building_id  meter  meter_mean  meter_std  meter_min  meter_5%  \
0               1      0    4.921724        NaN   4.921724  4.921724   
1               2      0    0.000000        NaN   0.000000  0.000000   
2               3      0    5.727568   0.128644   5.636602  5.645699   
3               5      0    0.000000        NaN   0.000000  0.000000   
4               6      0    0.000000        NaN   0.000000  0.000000   
5               8      0    5.930674        NaN   5.930674  5.930674   
6               9      1    5.300433   2.169901   3.766081  3.919516   
7              11      0    6.159797        NaN   6.159797  6.159797   
8              12      0    5.636602        NaN   5.636602  5.636602   
9              14      0    0.000000        NaN   0.000000  0.000000   
10             14      1    8.555042        NaN   8.555042  8.555042   
11             15      0    5.232920        NaN   5.232920  5.232920   
12             15      1    7.265242   0.810718   6.452293  6.53

[1349 rows x 11 columns]


21

In [12]:
class MergeMeterDescDF(TransformerMixin):
  
    def transform(self, df, **transform_params):
        # drop any columns to add
        dropCols =  [x for x in _building_meter_desc_DF.columns if x not in ['building_id', 'meter']]
        df = df.drop(dropCols, axis=1, errors='ignore') 
        return df.merge(_building_meter_desc_DF, on=['building_id','meter'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeMeterDescDF().transform(train.sample(2000, random_state=0)))

      building_id  meter           timestamp  meter_reading  site_id  \
0             774      1 2016-08-07 08:00:00      36.128899        6   
1             206      0 2016-10-04 14:00:00     226.270004        2   
2            1269      0 2016-11-29 10:00:00      28.670799       14   
3             951      0 2016-10-10 04:00:00     113.000000        9   
4             656      0 2016-05-01 21:00:00      32.700001        5   
5              36      0 2016-06-05 19:00:00     178.830994        0   
6            1262      0 2016-07-19 05:00:00      73.739998       14   
7              52      0 2016-03-17 14:00:00       0.000000        0   
8            1133      2 2016-07-10 17:00:00     984.375000       13   
9            1123      0 2016-10-02 09:00:00      18.243999       13   
10           1237      0 2016-10-06 18:00:00      85.000000       14   
11            960      1 2016-11-17 10:00:00      90.066498        9   
12             11      0 2016-09-25 21:00:00     472.332001     

[2000 rows x 74 columns]


In [13]:
# "As you can see above, this data looks weired until May 20. It is 
# reported in this discussion by @barnwellguy that All electricity
# meter is 0 until May 20 for site_id == 0. Let's remove these data 
# from training data."
# https://www.kaggle.com/kaushal2896/ashrae-eda-fe-lightgbm-1-13
class RmS0M0(TransformerMixin):
  
    def transform(self, df, **transform_params):
        return df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

    def fit(self, X, y=None, **fit_params):
        return self
    


In [14]:
# following this thread
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/113254#latest-663021
class RmBuilt2017(TransformerMixin):
  
    def transform(self, df, **transform_params):
        df = df.query('not (building_id == 363 & meter == 0 & timestamp >= "2016-06-30")')
        df = df.query('not (building_id == 409 & meter == 0 & timestamp <= "2016-07-60")')
        return df

    def fit(self, X, y=None, **fit_params):
        return self
    


In [15]:
# following this thread
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/116633#latest-669738
class RmSite9JUly4th(TransformerMixin):
  
    def transform(self, df, **transform_params):
        df = df.query('not (building_id == 363 & meter == 0 & timestamp <= "2016-08-01")')
        df = df.query('not (site_id == 9 & meter == 0 & timestamp <= "2016-05-20")')
        return df

    def fit(self, X, y=None, **fit_params):
        return self
    


In [16]:
# TODO: write filter to remove any 0 meter reading that continue more than N days (try 3)
# Also we need to account for this by meter

In [17]:
# TODO: try rolling with power

In [18]:
    
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df['holiday'] = df['holiday'].astype('category')
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self
# Test 
if holiday_df is not None:
    print(holiday_df.columns)
    print(AddHolidays().transform(train.head(20))[['holiday','timestamp']])

Index(['site_id', 'timestamp', 'holiday'], dtype='object')
           holiday  timestamp
0   New Year's Day 2016-01-01
1   New Year's Day 2016-01-01
2   New Year's Day 2016-01-01
3   New Year's Day 2016-01-01
4   New Year's Day 2016-01-01
5   New Year's Day 2016-01-01
6   New Year's Day 2016-01-01
7   New Year's Day 2016-01-01
8   New Year's Day 2016-01-01
9   New Year's Day 2016-01-01
10  New Year's Day 2016-01-01
11  New Year's Day 2016-01-01
12  New Year's Day 2016-01-01
13  New Year's Day 2016-01-01
14  New Year's Day 2016-01-01
15  New Year's Day 2016-01-01
16  New Year's Day 2016-01-01
17  New Year's Day 2016-01-01
18  New Year's Day 2016-01-01
19  New Year's Day 2016-01-01


In [19]:
class RmHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['timestamp','site_id'], how='left')
            df = df.drop(df[df['holiday'].notnull()].index)
            df = df.drop(['holiday'], axis=1)
            gc.collect()
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self

# Test you should see the new years removed
#print(train.head(100000).merge(building, on='building_id', how='left').columns)
print(RmHolidays().transform(train.head(100000)))

       building_id  meter           timestamp  meter_reading  site_id  \
55121            0      0 2016-01-02 00:00:00       0.000000        0   
55122            1      0 2016-01-02 00:00:00       0.000000        0   
55123            2      0 2016-01-02 00:00:00       0.000000        0   
55124            3      0 2016-01-02 00:00:00       0.000000        0   
55125            4      0 2016-01-02 00:00:00       0.000000        0   
55126            5      0 2016-01-02 00:00:00       0.000000        0   
55127            6      0 2016-01-02 00:00:00       0.000000        0   
55128            7      0 2016-01-02 00:00:00       0.000000        0   
55129            8      0 2016-01-02 00:00:00       0.000000        0   
55130            9      0 2016-01-02 00:00:00       0.000000        0   
55131           10      0 2016-01-02 00:00:00       0.000000        0   
55132           11      0 2016-01-02 00:00:00       0.000000        0   
55133           12      0 2016-01-02 00:00:00      

[41839 rows x 65 columns]


In [20]:
class SetCatTypes(TransformerMixin):
    
    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col]= df[col].astype('category')
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [21]:
class GetDummies(TransformerMixin):
    
    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        df = pd.get_dummies(df, columns=self._cols )
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [22]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(building.head(20)['square_feet'])

0       7432.0
1       2720.0
2       5376.0
3      23685.0
4     116607.0
5       8000.0
6      27926.0
7     121074.0
8      60809.0
9      27000.0
10    370773.0
11     49073.0
12     37100.0
13     99380.0
14     86250.0
15     83957.0
16     54644.0
17     15250.0
18    111891.0
19     18717.0
Name: square_feet, dtype: float32


In [23]:
# TODO: Play with scaling cloud coverage

In [24]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [25]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','year_built']]
        year_built_median = tmp_df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        del tmp_df, year_built_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(ImputeYearBuilt().transform(train.sample(20))['building_age'])

19126651    70
17890209    11
8313628     70
5885827     19
18366900    82
8337544     70
12153859    70
6456798      3
7631332     62
14232992    70
9992584     76
51783       70
11822423    70
19335946    74
9214938     70
4532659     70
10368128    70
3098791     70
13409536    70
11520019    69
Name: building_age, dtype: uint8


In [26]:
class ImputeFloorCount(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        tmp_df = train.drop_duplicates(['site_id','building_id'])[['site_id','building_id','floor_count']]
        floors_median = tmp_df['floor_count'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in tmp_df.groupby(['site_id'])['floor_count'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = i_median
            else:
                df.loc[(df['floor_count'].isnull()) & (df['site_id'] == i), 'floor_count'] = floors_median
        del tmp_df, floors_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

print(ImputeFloorCount().transform(train.sample(20))['floor_count'])

8611825     3.0
11479388    3.0
13459693    3.0
14472989    3.0
15313721    1.0
14713796    8.0
13908092    1.0
13300533    3.0
1218616     3.0
8552468     3.0
4230609     3.0
11022048    4.0
16437706    3.0
8096965     3.0
7346580     3.0
6378632     3.0
2653099     3.0
10313176    1.0
2867363     3.0
18051600    2.0
Name: floor_count, dtype: float16


In [27]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [28]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [29]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [30]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [31]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [32]:
class MergeWeatherDescHDOW(TransformerMixin):
    def transform(self, df, **transform_params):
        desc_h_dow_df = pd.read_csv('weather_desc_h_dow.csv', index_col=0, dtype={'building_id': np.int16, 'meter': np.int8, 'meter_h_d_mean':np.float32,
                                                     'meter_h_d_std':np.float32, 'meter_h_d_min':np.float32,
                                                     'meter_h_d_25%':np.float32, 'meter_h_d_50%':np.float32,
                                                     'meter_h_d_75%':np.float32, 'meter_h_d_max':np.float32 })
        mrgCols = ['site_id', 'dayofweek','hour']
        dropCols = list(x for x in desc_h_dow_df.columns if x not in mrgCols)
        df.drop(mrgCols, errors='ignore')
        df = df.merge(desc_h_dow_df, on=mrgCols, how='left')
        
        del desc_h_dow_df
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeWeatherDescHDOW().transform(AddTimeFeatures().transform(ConvertToDatetime().transform(train.sample(20)))))

    building_id  meter           timestamp  meter_reading  site_id  \
0          1092      1 2016-09-18 07:00:00     746.867981       13   
1            52      0 2016-05-21 04:00:00     347.423004        0   
2          1336      0 2016-05-19 05:00:00      46.849998       15   
3          1130      0 2016-01-21 08:00:00     719.900024       13   
4           837      0 2016-05-21 14:00:00      25.833300        8   
5          1220      1 2016-02-04 22:00:00      22.657600       13   
6           406      0 2016-04-12 10:00:00       8.390000        3   
7          1363      2 2016-04-19 12:00:00       0.000000       15   
8           379      0 2016-01-20 19:00:00       9.050000        3   
9          1322      3 2016-11-09 09:00:00    1137.349976       14   
10          860      0 2016-11-13 17:00:00       1.833300        8   
11          233      0 2016-09-12 07:00:00      34.779999        2   
12          876      2 2016-07-03 01:00:00     146.399994        9   
13         1290     

[20 rows x 102 columns]


In [33]:
class MergeDescHDOW(TransformerMixin):
    def transform(self, df, **transform_params):
        desc_h_dow_df = pd.read_csv('desc_h_dow.csv', index_col=0, dtype={'building_id': np.int16, 'meter': np.int8, 'meter_h_d_mean':np.float32,
                                                     'meter_h_d_std':np.float32, 'meter_h_d_min':np.float32,
                                                     'meter_h_d_25%':np.float32, 'meter_h_d_50%':np.float32,
                                                     'meter_h_d_75%':np.float32, 'meter_h_d_max':np.float32 })
        desc_h_dow_df = desc_h_dow_df.dropna(axis=1)
        mrgCols = ['building_id', 'meter', 'dayofweek','hour']
        dropCols = list(x for x in desc_h_dow_df.columns if x not in mrgCols)
        df.drop(mrgCols, errors='ignore')
        df = df.merge(desc_h_dow_df, on=mrgCols, how='left')
        
        del desc_h_dow_df
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

print(MergeDescHDOW().transform(AddTimeFeatures().transform(ConvertToDatetime().transform(train.sample(20)))))

    building_id  meter           timestamp  meter_reading  site_id  \
0          1409      1 2016-05-06 03:00:00      37.218300       15   
1           910      1 2016-11-26 19:00:00      38.931499        9   
2          1227      0 2016-07-10 11:00:00       0.000000       14   
3          1047      0 2016-09-14 00:00:00      77.347504       12   
4           640      0 2016-11-03 22:00:00     225.750000        4   
5            82      1 2016-12-15 20:00:00     168.841003        0   
6          1438      0 2016-09-29 03:00:00     112.849998       15   
7          1224      0 2016-10-10 13:00:00      24.000000       14   
8          1102      2 2016-12-25 11:00:00    3050.780029       13   
9          1104      1 2016-04-27 16:00:00     171.931000       13   
10          886      0 2016-02-07 04:00:00     174.000000        9   
11         1233      1 2016-05-12 12:00:00     184.210007       14   
12         1233      1 2016-02-14 09:00:00       0.579300       14   
13          433     

In [34]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [35]:
pd.options.display.max_seq_items = 2000

In [36]:
%%time
x_pre_pipes = Pipeline(
    steps=[
        ('meterReadingLog1p',MeterReadingLog1p()),
        ('rmS0M0', RmS0M0()),
        ('addTimeFeatures', AddTimeFeatures()),
        ('logSquareFeet', LogSquareFeet()),
        #('rmHolidays', RmHolidays()),
        #('addHolidays', AddHolidays()),
        #('createMeterDescDF', CreateMeterDescDF()), # note declares a globe variable to pass
        #('mergeMeterDescDF', MergeMeterDescDF()), # populates both test and train from global
        ('mergeDescHDOW', MergeDescHDOW()),
        #('mergeWeatherDescHDOW', MergeWeatherDescHDOW()),
        ('GC', GC())
    ]
)

train = x_pre_pipes.transform(merge(pre_train))
print(train.columns)

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'dew_temperature', 'cloud_coverage', 'precip_depth_1_hr',
       'wind_direction', 'wind_speed', 'sea_level_pressure',
       'relative_humidity', 'air_temperature_rmean_3',
       'air_temperature_rmax_3', 'air_temperature_rmin_3',
       'cloud_coverage_rmean_3', 'cloud_coverage_rmax_3',
       'cloud_coverage_rmin_3', 'dew_temperature_rmean_3',
       'dew_temperature_rmax_3', 'dew_temperature_rmin_3',
       'precip_depth_1_hr_rmean_3', 'precip_depth_1_hr_rmax_3',
       'precip_depth_1_hr_rmin_3', 'sea_level_pressure_rmean_3',
       'sea_level_pressure_rmax_3', 'sea_level_pressure_rmin_3',
       'wind_direction_rmean_3', 'wind_direction_rmax_3',
       'wind_direction_rmin_3', 'wind_speed_rmean_3', 'wind_speed_rmax_3',
       'wind_speed_rmin_3', 'relative_humidity_rmean_3',
       'relative_humidity_rmax_3', 'relative_humidity_rmin_3

In [37]:
 train.isna().any()

building_id                    False
meter                          False
timestamp                      False
site_id                        False
primary_use                    False
square_feet                    False
year_built                      True
floor_count                     True
air_temperature                False
dew_temperature                False
cloud_coverage                 False
precip_depth_1_hr              False
wind_direction                 False
wind_speed                     False
sea_level_pressure             False
relative_humidity              False
air_temperature_rmean_3        False
air_temperature_rmax_3         False
air_temperature_rmin_3         False
cloud_coverage_rmean_3         False
cloud_coverage_rmax_3          False
cloud_coverage_rmin_3          False
dew_temperature_rmean_3        False
dew_temperature_rmax_3         False
dew_temperature_rmin_3         False
precip_depth_1_hr_rmean_3      False
precip_depth_1_hr_rmax_3       False
p

In [38]:
# pre_a_pipes is for preprocessing that doesn't change impute
# values
cat_cols = ['site_id', 'meter', 'primary_use']

x_fold_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        #('convertToDatetime', ConvertToDatetime()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('imputeFloorCount', ImputeFloorCount()),
        ('dropCols', DropCols(['timestamp','square_feet', 'year_built'])),
        ('getDummies', GetDummies(cat_cols)),
        ('GC', GC())
    ]
)

sample_train_X = x_fold_pipes.transform(train.sample(20))
print(sample_train_X.columns)
print(sample_train_X.shape)
print(sample_train_X.dtypes)
nans = sample_train_X.isna().any()
print(nans[nans==True].index.tolist())

Index(['building_id', 'floor_count', 'air_temperature', 'dew_temperature',
       'cloud_coverage', 'precip_depth_1_hr', 'wind_direction', 'wind_speed',
       'sea_level_pressure', 'relative_humidity', 'air_temperature_rmean_3',
       'air_temperature_rmax_3', 'air_temperature_rmin_3',
       'cloud_coverage_rmean_3', 'cloud_coverage_rmax_3',
       'cloud_coverage_rmin_3', 'dew_temperature_rmean_3',
       'dew_temperature_rmax_3', 'dew_temperature_rmin_3',
       'precip_depth_1_hr_rmean_3', 'precip_depth_1_hr_rmax_3',
       'precip_depth_1_hr_rmin_3', 'sea_level_pressure_rmean_3',
       'sea_level_pressure_rmax_3', 'sea_level_pressure_rmin_3',
       'wind_direction_rmean_3', 'wind_direction_rmax_3',
       'wind_direction_rmin_3', 'wind_speed_rmean_3', 'wind_speed_rmax_3',
       'wind_speed_rmin_3', 'relative_humidity_rmean_3',
       'relative_humidity_rmax_3', 'relative_humidity_rmin_3',
       'air_temperature_rmean_72', 'air_temperature_rmax_72',
       'air_temperature_rm

In [39]:
# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
# trying no shuffle https://www.kaggle.com/c/ashrae-energy-prediction/discussion/115851#latest-666115
folds = 2
kf = StratifiedKFold(n_splits=folds, shuffle=False, random_state=42)


In [40]:
models = [ ]
scores = []

def cvFitScore(train):
    for train_index, val_index in kf.split(train, train['building_id']):
        print('starting fold')
#        print(train.loc[train_index].groupby(['building_id']).size())
        f_train = x_fold_pipes.transform(train.loc[train_index].drop('building_id', axis=1))
        f_train_x = f_train.drop('meter_reading_log1p', axis=1)
        f_train_y = f_train['meter_reading_log1p']
        f_val = x_fold_pipes.transform(train.loc[val_index].drop('building_id', axis=1))
        f_val_x = f_val.drop('meter_reading_log1p', axis=1)
        f_val_y = f_val['meter_reading_log1p']
        
        reg = RandomForestRegressor(n_estimators= 10,
               max_features=60,
               max_depth= 12)
        reg.fit(f_train_x, f_train_y)
        
        scores.append(rmse(f_val_y, reg.predict(f_val_x)))
        
        models.append(reg)
        
        del f_train, f_val
        gc.collect()
    return models

In [41]:
%%time
models = cvFitScore(train)

starting fold
starting fold
Wall time: 1h 2min 42s


In [42]:
print(scores)
print(np.sum(scores)/len(scores))



[1.1026091682234087, 1.1745162570562393]
1.138562712639824


In [44]:
def createFeature_DF(model):
    feature_importances = pd.DataFrame(model.feature_importances_,
                                       index = sample_train_X.columns,
                                       columns=['importance']).sort_values('importance',
                                                                           ascending=False)

'''
    print(feature_importances)
    
    imprtc_df = pd.DataFrame()
    imprtc_df['feature'] = sample_train_X.drop('meter_reading_log1p', axis=1).columns   
    #print(imprtc_df['feature'])
    print(model._feature_importances_)
    imprtc_df['importance'] = model.feature_importances_
    imprtc_df.sort_values('importance', ascending=False, inplace= True)
    imprtc_df.sort_values('importance', ascending=False, inplace= True)
    return imprtc_df
'''

"\n    print(feature_importances)\n    \n    imprtc_df = pd.DataFrame()\n    imprtc_df['feature'] = sample_train_X.drop('meter_reading_log1p', axis=1).columns   \n    #print(imprtc_df['feature'])\n    print(model._feature_importances_)\n    imprtc_df['importance'] = model.feature_importances_\n    imprtc_df.sort_values('importance', ascending=False, inplace= True)\n    imprtc_df.sort_values('importance', ascending=False, inplace= True)\n    return imprtc_df\n"

In [46]:
print(len(sample_train_X.columns))
for model in models:
    #print(model.feature_importances_)
    print(np.argsort(model.feature_importances_)[::-1])
'''
feature_importances = pd.DataFrame(model.feature_importances_,
                                       index = sample_train_X.columns,
                                       columns=['importance']).sort_values('importance',
                                                                          ascending=False)

print(features_df)
'''       

90
[ 60  63  64  65  85  33  61  35  66  59  41  11  34  86   9  84  10  39
  49  87  42  40  54  37  46  77  67  51  56  15  44  36  55  16  82  47
  70   1  81  48   0  94  91  62  52  88  78  17  97  45   2  75  43  26
  74  50  20  24  57  71  68  89  69  92  22  83  32  95  23  79  18  25
  27  21  58   7  93  30   5 100  19  13  31  12  53  29  28   8 101   4
  73  38   6   3  14 103  99  76  96  98  90  72 102  80]
[ 60  64  63  65  62  85  35  33  61  11   9  34  66  86  59  10  84  87
  39  41  82  40  77  56  67  91  15   1  70  81  36  54  16  55  48   0
  88  17  46  92  42  43  51  68  94  47  83  75  44  74  37  45  52   2
  97  89  20  49  22  78  58 100   7  21  23  79  50  30  31  99  24  71
  69  32  12  57  93  18  95  53  27  26   8  28  25  19 101  14  38   3
   6   5  90   4  29 103  13  96  73  76  98  80  72 102]


"\nfeature_importances = pd.DataFrame(model.feature_importances_,\n                                       index = sample_train_X.columns,\n                                       columns=['importance']).sort_values('importance',\n                                                                          ascending=False)\n\nprint(features_df)\n"

In [47]:
# %%time
# ## Single fit single model

# gbm = LGBMRegressor(**gbm_params)
# f_train_X, f_train_y = getInFoldXY(train.index)
# gbm.fit(f_train_X, f_train_y)

In [48]:
#del train
#gc.collect()
test = loadFile('test')
test = merge(test)
test = x_pre_pipes.transform(test)
test = x_fold_pipes.transform(test)
test = test.drop('row_id', axis=1)


#print(test.sample(n=20,  random_state=42))
print(test.shape)
#print(test.dtypes)

(41682883, 105)


In [49]:
l1 =  x_fold_pipes.transform(train).columns.tolist()
l2 =  test.columns.tolist()
print(list(x for x in l1 if x not in l2))
print(list(x for x in l2 if x not in l1))
nans = test.isna().any()
print(nans[nans==True].index.tolist())

['meter_reading_log1p']
[]
[]


In [53]:
def predMeters(test_X): 
    preds = np.expm1(sum([model.predict(test_X.drop('building_id', axis=1)) for model in models])/folds)
    #test_y['meter_reading_log1p'] = preds
    return preds.tolist()
    
print(predMeters(test.sample(n=20,  random_state=42)))    


[1.5653586758368823, 100.26819701095523, 44.2173119623791, 122.95182665452566, 17.586494639911525, 58.707075240491754, 9.859791261172527, 98.22340711437536, 3.545350568884719, 30.37218059498823, 436.1995028941272, 99.82492593307597, 403.8876071007004, 247.26120908479655, 13.151739712081135, 22.958750462255438, 1.3358602490118465, 265.8427739513252, 224.35248210370884, 28.392580182865157]


In [None]:
# Predict using cross val models ensemble 
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test.shape[0]/50000)))):
    res.append(predMeters(test.iloc[i:i+step_size]))
    i+=step_size
    gc.collect()


  4%|███                                                                              | 31/834 [00:07<03:22,  3.96it/s]

In [None]:
# Save using cross val models ensemble 
res = np.concatenate(res)
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission_meter.csv.zip', index=False)
submission.shape

In [None]:
# # Predict single model fit
# i=0
# res=[]
# step_size = 50000
# for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
#    #res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
#    res.append(np.expm1(gbm.predict(test_X.iloc[i:i+step_size])))
#    i+=step_size
    