In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [2]:
# note you must run weather
site_time_offsets_df = pd.read_pickle('../input/ashrae-energy-prediction/site_time_offsets_df.pickle')

In [3]:
# note you must run create-holiday-df notebook first to create the pickle
holiday_df = pd.read_pickle('../input/ashrae-energy-prediction/holiday_df.pickle')

In [4]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [5]:
class AlignTime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['offset'] = df.site_id.map(site_time_offsets_df)
        df['timestamp_aligned'] = (df.timestamp - pd.to_timedelta(df.offset, unit='H'))
        df['timestamp'] = df['timestamp_aligned']
        del df['timestamp_aligned'], df['offset']
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self
    

In [6]:
weather_pre_pipes = Pipeline(
    steps=[
        ('convertToDatetime', ConvertToDatetime()),
        ('alignTime', AlignTime()),
    ]
)

In [7]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float32, 'year_built': np.float16, 'floor_count': np.float16},
    'weather' : {'site_id': np.int8, 'air_temperature': np.float16, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16}
}

file_loc = {}    
for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
    for name in ['building_metadata','weather_train','weather_test','train','test']:
        if path.exists(dir_path + name + '.csv'):
            file_loc[name]= dir_path + name + '.csv'
    
    building = pd.read_csv(file_loc['building_metadata'], dtype=file_dtype['building_metadata'])
    weather_train = weather_pre_pipes.transform(pd.read_csv(file_loc['weather_train'], dtype=file_dtype['weather']))
    weather_test = weather_pre_pipes.transform(pd.read_csv(file_loc['weather_test'], dtype=file_dtype['weather']))
    train = ConvertToDatetime().transform(pd.read_csv(file_loc['train'], dtype=file_dtype['train']))
    test = ConvertToDatetime().transform(pd.read_csv(file_loc['test'], dtype=file_dtype['test']))


In [8]:
# "As you can see above, this data looks weired until May 20. It is 
# reported in this discussion by @barnwellguy that All electricity
# meter is 0 until May 20 for site_id == 0. Let's remove these data 
# from training data."
# https://www.kaggle.com/kaushal2896/ashrae-eda-fe-lightgbm-1-13
class RmS0M0(TransformerMixin):
  
    def transform(self, df, **transform_params):
        return df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

    def fit(self, X, y=None, **fit_params):
        return self
    


In [9]:
class ImputeWeather(TransformerMixin):

    def __init__(self, method:str='linear', gap_limit:int=None, limit_direction:str='forward'):
        self._method = method
        self._gap_limit = gap_limit
        self._limit_direction = limit_direction
        
    def transform(self, weather_df, **transform_params):
        grouped_weather_df = weather_df.groupby('site_id').apply(lambda group: group.interpolate(method=self._method, limit=self._gap_limit, limit_direction=self._limit_direction))
        if 'cloud_coverage' in grouped_weather_df.columns:
            grouped_weather_df['cloud_coverage'] = grouped_weather_df['cloud_coverage'].round(decimals=0).clip(0,8)
        grouped_weather_df.reset_index(inplace=True)
        weather_df = grouped_weather_df.drop(['index','Unnamed: 0'], axis=1)
        gc.collect()
        return weather_df

    def fit(self, X, y=None, **fit_params):
        return self
print(weather_test.head(20))
print(ImputeWeather().transform(weather_test.head(20)))

    Unnamed: 0  site_id           timestamp  air_temperature  cloud_coverage  \
0            0        0 2016-12-31 19:00:00        17.796875             4.0   
1            1        0 2016-12-31 20:00:00        17.796875             2.0   
2            2        0 2016-12-31 21:00:00        16.093750             0.0   
3            3        0 2016-12-31 22:00:00        17.203125             0.0   
4            4        0 2016-12-31 23:00:00        16.703125             2.0   
5            5        0 2017-01-01 00:00:00        15.601562             2.0   
6            6        0 2017-01-01 01:00:00        15.000000             0.0   
7            7        0 2017-01-01 02:00:00        15.000000             2.0   
8            8        0 2017-01-01 03:00:00        13.296875             0.0   
9            9        0 2017-01-01 04:00:00        12.203125             4.0   
10          10        0 2017-01-01 05:00:00        13.898438             2.0   
11          11        0 2017-01-01 06:00

In [10]:
class AddWeatherLags(TransformerMixin):
    
    def __init__(self, window):
        self._window = window
        
    def transform(self, weather_df, **transform_params):
        group_df = weather_df.groupby(['site_id'])
        cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
        rolled = group_df[cols].rolling(window=self._window, min_periods=0)
        lag_mean = rolled.mean().reset_index().astype(np.float16)
        lag_max = rolled.max().reset_index().astype(np.float16)
        lag_min = rolled.min().reset_index().astype(np.float16)
        lag_std = rolled.std().reset_index().astype(np.float16)
        for col in cols:
            weather_df[f'{col}_mean_lag{self._window}'] = lag_mean[col]
            weather_df[f'{col}_max_lag{self._window}'] = lag_max[col]
            weather_df[f'{col}_min_lag{self._window}'] = lag_min[col]
            weather_df[f'{col}_std_lag{self._window}'] = lag_std[col]
        del group_df, rolled
        gc.collect()
        return weather_df

    def fit(self, X, y=None, **fit_params):
        return self
    
print(AddWeatherLags(72).transform(weather_train.head(20)))

    Unnamed: 0  site_id           timestamp  air_temperature  cloud_coverage  \
0            0        0 2015-12-31 19:00:00        25.000000             6.0   
1            1        0 2015-12-31 20:00:00        24.406250             NaN   
2            2        0 2015-12-31 21:00:00        22.796875             2.0   
3            3        0 2015-12-31 22:00:00        21.093750             2.0   
4            4        0 2015-12-31 23:00:00        20.000000             2.0   
5            5        0 2016-01-01 00:00:00        19.406250             NaN   
6            6        0 2016-01-01 01:00:00        21.093750             6.0   
7            7        0 2016-01-01 02:00:00        21.093750             NaN   
8            8        0 2016-01-01 03:00:00        20.593750             NaN   
9            9        0 2016-01-01 04:00:00        21.093750             NaN   
10          10        0 2016-01-01 05:00:00        21.093750             NaN   
11          11        0 2016-01-01 06:00

[20 rows x 38 columns]


In [11]:
class AddWeather(TransformerMixin):

    def __init__(self, weather_df):
        self._b_df = weather_df
        
    def transform(self, df, **transform_params):
        return df.merge(weather_test, on=['site_id', 'timestamp'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self

In [12]:
class AddBuilding(TransformerMixin):

    def __init__(self, building_df):
        self._b_df = building_df
        
    def transform(self, df, **transform_params):
        return df.merge(_b_df, on='building_id', how='left')

    def fit(self, X, y=None, **fit_params):
        return self

In [13]:
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        df = df.merge(holiday_df, on=['building_id','meter','timestamp','site_id'], how='left')
        return df

    def fit(self, X, y=None, **fit_params):
        return self
# Test 
addHolidays = AddHolidays()
print(addHolidays.transform(train.head(2000).merge(building, on='building_id', how='left'))[['building_id','timestamp','holiday']])

      building_id  timestamp         holiday
0               0 2016-01-01  New Year's Day
1               1 2016-01-01  New Year's Day
2               2 2016-01-01  New Year's Day
3               3 2016-01-01  New Year's Day
4               4 2016-01-01  New Year's Day
5               5 2016-01-01  New Year's Day
6               6 2016-01-01  New Year's Day
7               7 2016-01-01  New Year's Day
8               8 2016-01-01  New Year's Day
9               9 2016-01-01  New Year's Day
10             10 2016-01-01  New Year's Day
11             11 2016-01-01  New Year's Day
12             12 2016-01-01  New Year's Day
13             13 2016-01-01  New Year's Day
14             14 2016-01-01  New Year's Day
15             15 2016-01-01  New Year's Day
16             16 2016-01-01  New Year's Day
17             17 2016-01-01  New Year's Day
18             18 2016-01-01  New Year's Day
19             19 2016-01-01  New Year's Day
20             20 2016-01-01  New Year's Day
21        

In [14]:
class RmHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        df = df.merge(holiday_df, on=['building_id','meter','timestamp','site_id'], how='left')
        df = df.drop(df[df['holiday'].notnull()].index)
        df = df.drop(['holiday'], axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

# Test you should see the new years removed
rmHolidays = RmHolidays()
print(rmHolidays.transform(train.head(100000).merge(building, on='building_id', how='left')))

       building_id  meter           timestamp  meter_reading  site_id  \
55121            0      0 2016-01-02 00:00:00       0.000000        0   
55122            1      0 2016-01-02 00:00:00       0.000000        0   
55123            2      0 2016-01-02 00:00:00       0.000000        0   
55124            3      0 2016-01-02 00:00:00       0.000000        0   
55125            4      0 2016-01-02 00:00:00       0.000000        0   
55126            5      0 2016-01-02 00:00:00       0.000000        0   
55127            6      0 2016-01-02 00:00:00       0.000000        0   
55128            7      0 2016-01-02 00:00:00       0.000000        0   
55129            8      0 2016-01-02 00:00:00       0.000000        0   
55130            9      0 2016-01-02 00:00:00       0.000000        0   
55131           10      0 2016-01-02 00:00:00       0.000000        0   
55132           11      0 2016-01-02 00:00:00       0.000000        0   
55133           12      0 2016-01-02 00:00:00      

[41839 rows x 9 columns]


In [15]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(building.head(20)['square_feet'])

0       7432.0
1       2720.0
2       5376.0
3      23685.0
4     116607.0
5       8000.0
6      27926.0
7     121074.0
8      60809.0
9      27000.0
10    370773.0
11     49073.0
12     37100.0
13     99380.0
14     86250.0
15     83957.0
16     54644.0
17     15250.0
18    111891.0
19     18717.0
Name: square_feet, dtype: float32


In [16]:
class SetCatTypes(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['primary_use']= df['primary_use'].astype('category')
        df['meter'] = df["meter"].astype('category')
        df['site_id'] = df["site_id"].astype('category')
        df['building_id'] = df['building_id'].astype('category')
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [17]:
class ImputeCloudCoverage(TransformerMixin):
        
    def transform(self, df, **transform_params):
        # set age of building to mediam of site_id
        # else if set ot overall median
        median = df['cloud_coverage'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['cloud_coverage'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = i_median
            else:
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = median
        df['cloud_coverage'] = np.uint8(df['cloud_coverage'])
        df['cloud_coverage'] = df['cloud_coverage']
        del median
        gc.collect()
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [18]:
class CloudTimeCat(TransformerMixin):
        
    def transform(self, df, **transform_params):
        tempDf = df[['cloud_coverage', 'hour']].astype('int')
        tempDf['cloud_coverage'] = (tempDf['cloud_coverage']).astype('int')
        tempDf['hour'] = (tempDf['hour']).astype('int')
        tempDf = tempDf.astype('str')
        df['cloud_time_cat'] = 'c' + tempDf['cloud_coverage'] + 't' + tempDf['hour']
        df['cloud_time_cat'] = df['cloud_time_cat'].astype('category')
        del tempDf
        gc.collect()
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [19]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [20]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        year_built_median = df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        del year_built_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [21]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [22]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('category') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = (df['timestamp'].dt.dayofweek * 24) + df['timestamp'].dt.hour
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('category')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [23]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [24]:
class FillMean(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].mean())
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [25]:
class FillZeros(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(0)
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [26]:
class FillMedian(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].median())
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [27]:
class FillPopular(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].value_counts()[0])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [28]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [29]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [30]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [31]:
weather_pipes = Pipeline(
    steps=[
        #('convertToDatetime', ConvertToDatetime()),
        ('imputeWeather', ImputeWeather()),
        ('fillMean',FillMean(['air_temperature','dew_temperature'
                              , 'precip_depth_1_hr', 'sea_level_pressure'])),
        ('imputeCloudCoverage', ImputeCloudCoverage()),
        ('addWeatherLags3', AddWeatherLags(3)),
        ('addWeatherLags72', AddWeatherLags(72)),
    ]
)

building_pipes = Pipeline(
    steps=[
        ('logSquareFeet', LogSquareFeet()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('fillMean',FillMean(['floor_count'])),
        ('dropClos', DropCols(['square_feet', 'year_built'])),
    ]
)


# pre_a_pipes is for preprocessing that doesn't change impute
# values
x_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        #('convertToDatetime', ConvertToDatetime()),
        ('rmS0M0', RmS0M0()),
        #('addHolidays', AddHolidays()),
        ('rmHolidays', RmHolidays()),
        ('addRelativeHumidity',AddRelativeHumidity()),
        ('addTimeFeatures', AddTimeFeatures()),
        ('setCatTypes', SetCatTypes()),
        ('fillMean',FillMean([])),
        ('fillZeros',FillZeros([])),
        ('dropClos', DropCols(['timestamp'])),
        ('GC', GC())
    ]
)

In [32]:
test_X = x_pipes.transform(
    test
        .merge(building_pipes.transform(building), on='building_id', how='left').drop(['row_id'], axis=1)
        .merge(weather_pipes.transform(weather_test), on=['site_id', 'timestamp'], how='left')
    )

print(test_X.sample(n=20,  random_state=42))
print(test_X.shape)

         building_id meter site_id                    primary_use  \
21108577         895     1       9                      Education   
4063882          169     1       2                      Education   
23643600         945     2       9                         Office   
38594617        1344     1      15                      Education   
35906878        1305     0      14            Lodging/residential   
19367120         804     0       8                         Office   
24336200         982     0       9                       Services   
8393798          221     1       2                      Education   
4116452          288     0       2  Entertainment/public assembly   
4141951          186     0       2              Religious worship   
26913518        1139     2      13                         Office   
39654538        1426     2      15  Entertainment/public assembly   
24405013         983     2       9  Entertainment/public assembly   
6560502          277     0       2

[20 rows x 72 columns]
(40368192, 72)


In [33]:
def getOutsideFoldXY(train_index):
    X = train.iloc[train_index].drop('meter_reading', axis=1)
    X_buildings = building[building['building_id'].isin(X['building_id'].unique())]
    X_weather = building[building['building_id'].isin(X['building_id'].unique())]
    X = x_pipes.transform(
        X
            .merge(building_pipes.transform(X_buildings), on='building_id', how='left')
            .merge(weather_train_trans, on=['site_id', 'timestamp'], how='left')
        )
    f_train_y = np.log1p(train.iloc[train_index]['meter_reading'])
    print(X.columns)
    return X,f_train_y



def getInFoldXY(train_index):
    X = train.iloc[train_index]
    X_buildings = building[building['building_id'].isin(X['building_id'].unique())]
    X = X.merge(building_pipes.transform(X_buildings), on='building_id', how='left')
    X_weather = weather_train[
        (weather_train['site_id'].isin(X['site_id'].unique())) 
         & (weather_train['timestamp'].isin(X['timestamp'].unique())) 
    ]
    X = x_pipes.transform(
        rmHolidays.transform(
            X.merge(weather_pipes.transform(X_weather), how='left')))
    return X.drop('meter_reading', axis=1),  np.log1p(X['meter_reading'])


print(getInFoldXY(train.head(10).index))

(Empty DataFrame
Columns: [primary_use, floor_count, log_square_feet, building_age, air_temperature, cloud_coverage, dew_temperature, precip_depth_1_hr, sea_level_pressure, wind_direction, wind_speed, air_temperature_mean_lag3, air_temperature_max_lag3, air_temperature_min_lag3, air_temperature_std_lag3, cloud_coverage_mean_lag3, cloud_coverage_max_lag3, cloud_coverage_min_lag3, cloud_coverage_std_lag3, dew_temperature_mean_lag3, dew_temperature_max_lag3, dew_temperature_min_lag3, dew_temperature_std_lag3, precip_depth_1_hr_mean_lag3, precip_depth_1_hr_max_lag3, precip_depth_1_hr_min_lag3, precip_depth_1_hr_std_lag3, sea_level_pressure_mean_lag3, sea_level_pressure_max_lag3, sea_level_pressure_min_lag3, sea_level_pressure_std_lag3, wind_direction_mean_lag3, wind_direction_max_lag3, wind_direction_min_lag3, wind_direction_std_lag3, wind_speed_mean_lag3, wind_speed_max_lag3, wind_speed_min_lag3, wind_speed_std_lag3, air_temperature_mean_lag72, air_temperature_max_lag72, air_temperature_m

In [34]:
gbm_params = {
    'n_estimators' : 500, # for accuracy use large numbers like 6000 
    'learning_rate': 0.4,
    'feature_fraction' : 0.9,
    'subsample' : 0.1,  # 
    'subsample_freq' : 1,
    'num_leaves' : 20,
    'max_depth' : 10,
    'metric':'rmse',
    'lambda_l1' : 1,  
    'lambda_l2': 1,
    'verbose': 100
}

In [None]:
%%time

folds = 5

# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

models = []
best_scores = []
for train_index, val_index in kf.split(train, train['building_id']):
    f_train_X, f_train_y = getInFoldXY(train_index)
    f_val_X, f_val_y = getInFoldXY(val_index)
    gbm = LGBMRegressor(**gbm_params)
    gbm.fit(f_train_X, f_train_y,
        eval_set=[(f_val_X, f_val_y)],
        # https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114722#latest-660848
        # eval_metric=lbm_rmslee,
        early_stopping_rounds=20)
    models.append(gbm)
    #y_pred = gbm.predict(f_val_X, num_iteration=gbm.best_iteration_)
    # eval
    #rmsle_score = lbm_rmslee(f_val_X, y_pred)[1]
    best_scores.append(gbm.best_score_)
    del f_train_X, f_train_y, f_val_X, f_val_y, gbm
    gc.collect()


[1]	valid_0's rmse: 1.78692
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.63559
[3]	valid_0's rmse: 1.55846
[4]	valid_0's rmse: 1.52245
[5]	valid_0's rmse: 1.33708
[6]	valid_0's rmse: 1.26372
[7]	valid_0's rmse: 1.2389
[8]	valid_0's rmse: 1.22071
[9]	valid_0's rmse: 1.21187
[10]	valid_0's rmse: 1.16863
[11]	valid_0's rmse: 1.13592
[12]	valid_0's rmse: 1.11741
[13]	valid_0's rmse: 1.11102
[14]	valid_0's rmse: 1.10158
[15]	valid_0's rmse: 1.09741
[16]	valid_0's rmse: 1.09359
[17]	valid_0's rmse: 1.08839
[18]	valid_0's rmse: 1.07055
[19]	valid_0's rmse: 1.06519
[20]	valid_0's rmse: 1.05773
[21]	valid_0's rmse: 1.04278
[22]	valid_0's rmse: 1.03719
[23]	valid_0's rmse: 1.03195
[24]	valid_0's rmse: 1.02964
[25]	valid_0's rmse: 1.02595
[26]	valid_0's rmse: 1.02363
[27]	valid_0's rmse: 1.02105
[28]	valid_0's rmse: 1.01904
[29]	valid_0's rmse: 1.01667
[30]	valid_0's rmse: 1.01336
[31]	valid_0's rmse: 1.00494
[32]	valid_0's rmse: 1.00228
[33]	valid_0's rmse: 

[268]	valid_0's rmse: 0.825824
[269]	valid_0's rmse: 0.82569
[270]	valid_0's rmse: 0.825539
[271]	valid_0's rmse: 0.825218
[272]	valid_0's rmse: 0.825033
[273]	valid_0's rmse: 0.824916
[274]	valid_0's rmse: 0.824821
[275]	valid_0's rmse: 0.824772
[276]	valid_0's rmse: 0.824018
[277]	valid_0's rmse: 0.823942
[278]	valid_0's rmse: 0.823863
[279]	valid_0's rmse: 0.823605
[280]	valid_0's rmse: 0.823365
[281]	valid_0's rmse: 0.822849
[282]	valid_0's rmse: 0.822594
[283]	valid_0's rmse: 0.822381
[284]	valid_0's rmse: 0.822169
[285]	valid_0's rmse: 0.821921
[286]	valid_0's rmse: 0.821809
[287]	valid_0's rmse: 0.821544
[288]	valid_0's rmse: 0.821442
[289]	valid_0's rmse: 0.821151
[290]	valid_0's rmse: 0.821032
[291]	valid_0's rmse: 0.820845
[292]	valid_0's rmse: 0.820639
[293]	valid_0's rmse: 0.820404
[294]	valid_0's rmse: 0.820344
[295]	valid_0's rmse: 0.820246
[296]	valid_0's rmse: 0.820073
[297]	valid_0's rmse: 0.819902
[298]	valid_0's rmse: 0.819638
[299]	valid_0's rmse: 0.819457
[300]	val

[31]	valid_0's rmse: 1.02334
[32]	valid_0's rmse: 1.01866
[33]	valid_0's rmse: 1.01135
[34]	valid_0's rmse: 0.998046
[35]	valid_0's rmse: 0.991462
[36]	valid_0's rmse: 0.989155
[37]	valid_0's rmse: 0.986709
[38]	valid_0's rmse: 0.984503
[39]	valid_0's rmse: 0.972183
[40]	valid_0's rmse: 0.970781
[41]	valid_0's rmse: 0.965428
[42]	valid_0's rmse: 0.963815
[43]	valid_0's rmse: 0.962184
[44]	valid_0's rmse: 0.958798
[45]	valid_0's rmse: 0.954182
[46]	valid_0's rmse: 0.952131
[47]	valid_0's rmse: 0.950104
[48]	valid_0's rmse: 0.949155
[49]	valid_0's rmse: 0.948113
[50]	valid_0's rmse: 0.946175
[51]	valid_0's rmse: 0.9442
[52]	valid_0's rmse: 0.942462
[53]	valid_0's rmse: 0.941584
[54]	valid_0's rmse: 0.940449
[55]	valid_0's rmse: 0.938597
[56]	valid_0's rmse: 0.937662
[57]	valid_0's rmse: 0.935031
[58]	valid_0's rmse: 0.934219
[59]	valid_0's rmse: 0.933538
[60]	valid_0's rmse: 0.932507
[61]	valid_0's rmse: 0.931255
[62]	valid_0's rmse: 0.930752
[63]	valid_0's rmse: 0.929446
[64]	valid_0's 

[298]	valid_0's rmse: 0.817956
[299]	valid_0's rmse: 0.817754
[300]	valid_0's rmse: 0.817648
[301]	valid_0's rmse: 0.817415
[302]	valid_0's rmse: 0.817336
[303]	valid_0's rmse: 0.817043
[304]	valid_0's rmse: 0.816574
[305]	valid_0's rmse: 0.816481
[306]	valid_0's rmse: 0.816292
[307]	valid_0's rmse: 0.815946
[308]	valid_0's rmse: 0.815766
[309]	valid_0's rmse: 0.815644
[310]	valid_0's rmse: 0.815397
[311]	valid_0's rmse: 0.815297
[312]	valid_0's rmse: 0.815053
[313]	valid_0's rmse: 0.814775
[314]	valid_0's rmse: 0.814726
[315]	valid_0's rmse: 0.814622
[316]	valid_0's rmse: 0.814558
[317]	valid_0's rmse: 0.814431
[318]	valid_0's rmse: 0.814298
[319]	valid_0's rmse: 0.814212
[320]	valid_0's rmse: 0.814203
[321]	valid_0's rmse: 0.81403
[322]	valid_0's rmse: 0.813833
[323]	valid_0's rmse: 0.813746
[324]	valid_0's rmse: 0.81355
[325]	valid_0's rmse: 0.813202
[326]	valid_0's rmse: 0.813019
[327]	valid_0's rmse: 0.812953
[328]	valid_0's rmse: 0.812854
[329]	valid_0's rmse: 0.812806
[330]	vali

In [None]:
for score in best_scores:
    print(score['valid_0']['rmse'])

In [None]:
#
#
#
#
#
# Cross val models ensemble 
#
#
#
#
#

In [None]:
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
    res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
    #res.append(np.expm1(gbm.predict(test_X.iloc[i:i+step_size])))
    i+=step_size
    
    
    


In [None]:
imprtc_df = pd.DataFrame()
imprtc_df['feature'] = test_X.columns   
imprtc_df['importance'] = models[0].feature_importances_
imprtc_df.sort_values('importance', ascending=False, inplace= True)
print(imprtc_df)
print(test_X.columns)

In [None]:
res = np.concatenate(res)
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission.csv.zip', index=False)
submission.shape

In [None]:
#
#
#
#
#
# Single model fit
#
#
#
#
#

In [None]:
%%time

gbm = LGBMRegressor(**gbm_params)
f_train_X, f_train_y = getInFoldXY(train.index)
gbm.fit(f_train_X, f_train_y)

In [None]:
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
    #res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
    res.append(np.expm1(gbm.predict(test_X.iloc[i:i+step_size])))
    i+=step_size
    

In [None]:
res = np.concatenate(res)
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission.csv.zip', index=False)
submission.shape