In [1]:
gbm_params = {
    'n_estimators' : 500, # 500,  
    'max_depth' : 8,
    'learning_rate': 0.1,
    'bagging_fraction': 0.1, # TODO: try 0.9
    
    'feature_fraction' : 0.9,
    'bagging_freq': 5,
    'subsample' : 0.1,  # 
    'subsample_freq' : 1,
    'num_leaves' : 20,
    'metric':'rmse',
    #'lambda_l1' : 1,  # Try defaults
    #'lambda_l2': 1, # Try defaults
    'verbose': 100
}

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [3]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [4]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float32, 'year_built': np.float16, 'floor_count': np.float16},
    'weather_test' : {'site_id': np.int8, 'air_temperature': np.float16, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16},
    'weather_train' : {'site_id': np.int8, 'air_temperature': np.float16, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16}
}

def loadFile(name):
    for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
        if path.exists(dir_path + name + '.csv'):
            return  ConvertToDatetime().transform(
                pd.read_csv(dir_path + name + '.csv', dtype=file_dtype[name]))
        


In [5]:
building = loadFile('building_metadata')
weather_train = loadFile('weather_train')
weather_test = loadFile('weather_test')
train = loadFile('train')
test = loadFile('test')



In [6]:
# align weather
def weatherSiteOffsets():
    weather = pd.concat([weather_train,weather_test],ignore_index=True)
    weather['timestamp'] = pd.to_datetime(weather['timestamp'])
    weather_key = ['site_id', 'timestamp']

    temp_skeleton = weather[weather_key + ['air_temperature']].drop_duplicates(subset=weather_key).sort_values(by=weather_key).copy()

    # calculate ranks of hourly temperatures within date/site_id chunks
    temp_skeleton['temp_rank'] = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.date])['air_temperature'].rank('average')
    
    # create a dataframe of site_ids (0-16) x mean hour rank of temperature within day (0-23)
    df_2d = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.hour])['temp_rank'].mean().unstack(level=1)

    # Subtract the columnID of temperature peak by 14, getting the timestamp alignment gap.
    site_ids_offsets = pd.Series(df_2d.values.argmax(axis=1) - 14)
    site_ids_offsets.index.name = 'site_id'
    return site_ids_offsets

site_time_offsets_df = weatherSiteOffsets()

def alignWeather(df):
    df['offset'] = df.site_id.map(site_time_offsets_df)
    df['timestamp_aligned'] = (df.timestamp - pd.to_timedelta(df.offset, unit='H'))
    df['timestamp'] = df['timestamp_aligned']
    del df['timestamp_aligned'], df['offset']
    gc.collect()
    return df

weather_train = alignWeather(weather_train)
weather_test = alignWeather(weather_test)

del site_time_offsets_df
gc.collect()

7

In [7]:
# TODO: do something with race precipitation is coded as a “-1” value
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/113103#latest-664978
# Test out below
def cleanPrecipDepth(df):
    df.loc[df['precip_depth_1_hr'] == -1, 'precip_depth_1_hr'] = 0.25
    return df

#weather_train = cleanPrecipDepth(weather_train)
#weather_test = cleanPrecipDepth(weather_test)

#print(weather_train['precip_depth_1_hr'].value_counts().sort_index())

In [8]:
# See holiday notebook to generate
holiday_df = None
if path.exists('../input/holiday-pickle/holiday_df.pickle'):
    holiday_df = pd.read_pickle('../input/holiday-pickle/holiday_df.pickle')
elif path.exists('../input/ashrae-energy-prediction/holiday_df.pickle'):
    holiday_df = pd.read_pickle('../input/ashrae-energy-prediction/holiday_df.pickle')

In [9]:
class MeterReadingLog1p(TransformerMixin):
  
    def transform(self, df, **transform_params):
        if 'meter_reading' in df.columns:
            df['meter_reading_log1p'] = np.log1p(df['meter_reading'])
            df = df.drop('meter_reading', axis=1)
        return df
    
    def fit(self, X, y=None, **fit_params):
        return self
print(train.sample(20, random_state=42))
print(MeterReadingLog1p().transform(train.sample(20, random_state=42)))
gc.collect()

          building_id  meter           timestamp  meter_reading
14245562         1324      1 2016-09-16 16:00:00       0.000000
1282718          1013      0 2016-01-24 06:00:00      32.000099
13883790          229      1 2016-09-10 07:00:00     567.655029
4781820           217      3 2016-04-01 01:00:00       0.000000
10415393         1434      0 2016-07-10 04:00:00      65.750000
1057008          1047      0 2016-01-20 04:00:00      90.983299
4507399           911      1 2016-03-26 20:00:00     295.063995
19478829         1039      0 2016-12-18 23:00:00      16.900000
8955615           265      0 2016-06-14 06:00:00     128.369995
13799839          896      0 2016-09-08 19:00:00     300.000000
15647011          973      0 2016-10-11 11:00:00     247.000000
2524294           813      0 2016-02-16 08:00:00      10.958300
10016102          870      0 2016-07-03 02:00:00       4.166700
3915750           898      0 2016-03-15 03:00:00      40.000000
17217526          903      0 2016-11-08 

24

In [10]:
class BuildingMeterDescDF(TransformerMixin):

    def transform(self, df, **transform_params):
        building_df = pd.DataFrame(data={'building_id':train['building_id'].unique()})
        for i in range(4):
            group = df[train['meter'] == i].groupby(['building_id'])['meter_reading_log1p']
            #print(group.head())
            building_mean = group.mean().astype(np.float16)
            building_median = group.median().astype(np.float16)
            building_min = group.min().astype(np.float16)
            building_max = group.max().astype(np.float16)
            building_std = group.std().astype(np.float16)
            building_df['meter_' + str(i) +'_mean'] = building_df['building_id'].map(building_mean).fillna(0)
            building_df['meter_' + str(i) +'_median'] = building_df['building_id'].map(building_median).fillna(0)
            building_df['meter_' + str(i) +'_min'] = building_df['building_id'].map(building_min).fillna(0)
            building_df['meter_' + str(i) +'_max'] = building_df['building_id'].map(building_max).fillna(0)
            building_df['meter_' + str(i) +'_std'] = building_df['building_id'].map(building_std).fillna(0)
            del group
            gc.collect()
        return building_df 
    def fit(self, X, y=None, **fit_params):
        return self

if 'meter_0_mean' not in test.columns:
    building_meter_desc_DF = BuildingMeterDescDF().transform(MeterReadingLog1p().transform(train))
    train = train.merge(building_meter_desc_DF, on='building_id', how='left')
    test = test.merge(building_meter_desc_DF, on='building_id', how='left')

print(test.sample(20, random_state=42))
del building_meter_desc_DF
gc.collect()

            row_id  building_id  meter           timestamp  meter_0_mean  \
3573457    3573457          173      0 2017-02-02 08:00:00      4.832031   
8315486    8315486          222      1 2018-12-18 00:00:00      2.617188   
40305643  40305643         1354      2 2018-06-14 12:00:00      0.000000   
16083617  16083617          712      0 2018-08-23 23:00:00      1.302734   
37204119  37204119         1344      2 2017-01-02 00:00:00      5.789062   
32144852  32144852         1119      1 2018-12-30 19:00:00      4.425781   
5105044    5105044          249      0 2017-09-11 03:00:00      5.921875   
36982844  36982844         1303      1 2018-11-30 19:00:00      5.328125   
20487823  20487823          945      2 2017-04-23 08:00:00      5.406250   
8404196    8404196          217      1 2018-12-30 19:00:00      5.562500   
6889602    6889602          241      0 2018-05-26 10:00:00      3.972656   
16963616  16963616          784      0 2017-11-21 15:00:00      6.503906   
39666699  39

73

In [11]:
# "As you can see above, this data looks weired until May 20. It is 
# reported in this discussion by @barnwellguy that All electricity
# meter is 0 until May 20 for site_id == 0. Let's remove these data 
# from training data."
# https://www.kaggle.com/kaushal2896/ashrae-eda-fe-lightgbm-1-13
class RmS0M0(TransformerMixin):
  
    def transform(self, df, **transform_params):
        return df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

    def fit(self, X, y=None, **fit_params):
        return self
    


In [12]:
# TODO: write filter to remove any 0 meter reading that continue more than N days (try 3)
# Also we need to account for this by meter

In [13]:
# TODO: try both for direction
class ImputeWeather(TransformerMixin):

    def __init__(self, method:str='linear', gap_limit:int=None, limit_direction:str='forward'):
        self._method = method
        self._gap_limit = gap_limit
        self._limit_direction = limit_direction
        
    def transform(self, weather_df, **transform_params):
        grouped_weather_df = weather_df.groupby('site_id').apply(lambda group: group.interpolate(method=self._method, limit=self._gap_limit, limit_direction=self._limit_direction))
        if 'cloud_coverage' in grouped_weather_df.columns:
            grouped_weather_df['cloud_coverage'] = grouped_weather_df['cloud_coverage'].round(decimals=0).clip(0,8)
        grouped_weather_df.reset_index(inplace=True)
        weather_df = grouped_weather_df.drop(['index'], axis=1)
        gc.collect()
        return weather_df

    def fit(self, X, y=None, **fit_params):
        return self
print(weather_train.head(20))
print(ImputeWeather().transform(weather_train.head(20)))

    site_id           timestamp  air_temperature  cloud_coverage  \
0         0 2015-12-31 19:00:00        25.000000             6.0   
1         0 2015-12-31 20:00:00        24.406250             NaN   
2         0 2015-12-31 21:00:00        22.796875             2.0   
3         0 2015-12-31 22:00:00        21.093750             2.0   
4         0 2015-12-31 23:00:00        20.000000             2.0   
5         0 2016-01-01 00:00:00        19.406250             NaN   
6         0 2016-01-01 01:00:00        21.093750             6.0   
7         0 2016-01-01 02:00:00        21.093750             NaN   
8         0 2016-01-01 03:00:00        20.593750             NaN   
9         0 2016-01-01 04:00:00        21.093750             NaN   
10        0 2016-01-01 05:00:00        21.093750             NaN   
11        0 2016-01-01 06:00:00        20.593750             NaN   
12        0 2016-01-01 07:00:00        18.906250             6.0   
13        0 2016-01-01 08:00:00        20.000000

In [14]:
# TODO: rename to rolling
class AddWeatherLags(TransformerMixin):
    
    def __init__(self, window):
        self._window = window
        
    def transform(self, weather_df, **transform_params):
        group_df = weather_df.groupby(['site_id'])
        cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
        rolled = group_df[cols].rolling(window=self._window, min_periods=0)
        lag_mean = rolled.mean().reset_index().astype(np.float16)
        lag_max = rolled.max().reset_index().astype(np.float16)
        lag_min = rolled.min().reset_index().astype(np.float16)
        lag_std = rolled.std().reset_index().astype(np.float16)
        for col in cols:
            weather_df[f'{col}_mean_lag{self._window}'] = lag_mean[col]
            weather_df[f'{col}_max_lag{self._window}'] = lag_max[col]
            weather_df[f'{col}_min_lag{self._window}'] = lag_min[col]
            weather_df[f'{col}_std_lag{self._window}'] = lag_std[col]
        del group_df, rolled
        gc.collect()
        return weather_df

    def fit(self, X, y=None, **fit_params):
        return self
    
print(AddWeatherLags(72).transform(weather_train.head(20)))

    site_id           timestamp  air_temperature  cloud_coverage  \
0         0 2015-12-31 19:00:00        25.000000             6.0   
1         0 2015-12-31 20:00:00        24.406250             NaN   
2         0 2015-12-31 21:00:00        22.796875             2.0   
3         0 2015-12-31 22:00:00        21.093750             2.0   
4         0 2015-12-31 23:00:00        20.000000             2.0   
5         0 2016-01-01 00:00:00        19.406250             NaN   
6         0 2016-01-01 01:00:00        21.093750             6.0   
7         0 2016-01-01 02:00:00        21.093750             NaN   
8         0 2016-01-01 03:00:00        20.593750             NaN   
9         0 2016-01-01 04:00:00        21.093750             NaN   
10        0 2016-01-01 05:00:00        21.093750             NaN   
11        0 2016-01-01 06:00:00        20.593750             NaN   
12        0 2016-01-01 07:00:00        18.906250             6.0   
13        0 2016-01-01 08:00:00        20.000000

[20 rows x 37 columns]


In [15]:
class AddBuilding(TransformerMixin):

    def __init__(self, building_df):
        self._b_df = building_df
        
    def transform(self, df, **transform_params):
        return df.merge(_b_df, on='building_id', how='left')

    def fit(self, X, y=None, **fit_params):
        return self

In [16]:
# TODO: try rolling with power

In [17]:
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['building_id','meter','timestamp','site_id'], how='left')
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self
# Test 
print(AddHolidays().transform(train.head(2000).merge(building, on='building_id', how='left'))[['building_id','timestamp','holiday']])

      building_id  timestamp         holiday
0               0 2016-01-01  New Year's Day
1               1 2016-01-01  New Year's Day
2               2 2016-01-01  New Year's Day
3               3 2016-01-01  New Year's Day
4               4 2016-01-01  New Year's Day
5               5 2016-01-01  New Year's Day
6               6 2016-01-01  New Year's Day
7               7 2016-01-01  New Year's Day
8               8 2016-01-01  New Year's Day
9               9 2016-01-01  New Year's Day
10             10 2016-01-01  New Year's Day
11             11 2016-01-01  New Year's Day
12             12 2016-01-01  New Year's Day
13             13 2016-01-01  New Year's Day
14             14 2016-01-01  New Year's Day
15             15 2016-01-01  New Year's Day
16             16 2016-01-01  New Year's Day
17             17 2016-01-01  New Year's Day
18             18 2016-01-01  New Year's Day
19             19 2016-01-01  New Year's Day
20             20 2016-01-01  New Year's Day
21        

In [18]:
class RmHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        if holiday_df is not None:
            df = df.merge(holiday_df, on=['building_id','meter','timestamp','site_id'], how='left')
            df = df.drop(df[df['holiday'].notnull()].index)
            df = df.drop(['holiday'], axis=1)
            gc.collect()
        else:
            print("Warning: Holiday DF is missing")
        return df

    def fit(self, X, y=None, **fit_params):
        return self

# Test you should see the new years removed
print(RmHolidays().transform(train.head(100000).merge(building, on='building_id', how='left')))

       building_id  meter           timestamp  meter_reading  \
55121            0      0 2016-01-02 00:00:00       0.000000   
55122            1      0 2016-01-02 00:00:00       0.000000   
55123            2      0 2016-01-02 00:00:00       0.000000   
55124            3      0 2016-01-02 00:00:00       0.000000   
55125            4      0 2016-01-02 00:00:00       0.000000   
55126            5      0 2016-01-02 00:00:00       0.000000   
55127            6      0 2016-01-02 00:00:00       0.000000   
55128            7      0 2016-01-02 00:00:00       0.000000   
55129            8      0 2016-01-02 00:00:00       0.000000   
55130            9      0 2016-01-02 00:00:00       0.000000   
55131           10      0 2016-01-02 00:00:00       0.000000   
55132           11      0 2016-01-02 00:00:00       0.000000   
55133           12      0 2016-01-02 00:00:00       0.000000   
55134           13      0 2016-01-02 00:00:00       0.000000   
55135           14      0 2016-01-02 00:

[41839 rows x 30 columns]


In [19]:
class SetCatTypes(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['primary_use']= df['primary_use'].astype('category')
        df['meter'] = df["meter"].astype('category')
        df['site_id'] = df["site_id"].astype('category')
        df['building_id'] = df['building_id'].astype('category')
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [20]:
class ImputeCloudCoverage(TransformerMixin):
        
    def transform(self, df, **transform_params):
        # set age of building to mediam of site_id
        # else if set ot overall median
        median = df['cloud_coverage'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['cloud_coverage'].median().items():
            # TODO add in +9 as a NAN
            if not np.isnan(i_median):
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = i_median
            else:
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = median
        df['cloud_coverage'] = np.uint8(df['cloud_coverage'])
        df['cloud_coverage'] = df['cloud_coverage']
        del median
        gc.collect()
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [21]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self
print(building.head(20)['square_feet'])

0       7432.0
1       2720.0
2       5376.0
3      23685.0
4     116607.0
5       8000.0
6      27926.0
7     121074.0
8      60809.0
9      27000.0
10    370773.0
11     49073.0
12     37100.0
13     99380.0
14     86250.0
15     83957.0
16     54644.0
17     15250.0
18    111891.0
19     18717.0
Name: square_feet, dtype: float32


In [22]:
# TODO: Play with scaling cloud coverage

In [23]:
class CloudTimeCat(TransformerMixin):
        
    def transform(self, df, **transform_params):
        tempDf = df[['cloud_coverage', 'hour']].astype('int')
        tempDf['cloud_coverage'] = (tempDf['cloud_coverage']).astype('int')
        tempDf['hour'] = (tempDf['hour']).astype('int')
        tempDf = tempDf.astype('str')
        df['cloud_time_cat'] = 'c' + tempDf['cloud_coverage'] + 't' + tempDf['hour']
        df['cloud_time_cat'] = df['cloud_time_cat'].astype('category')
        del tempDf
        gc.collect()
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [24]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        df = df.drop(self._drop_cols, axis=1)
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [25]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        # revisit the choice of median vs anything else
        year_built_median = df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        del year_built_median
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [26]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [27]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        # TODO: try week of year as numerical 
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('category') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = (df['timestamp'].dt.dayofweek * 24) + df['timestamp'].dt.hour
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('uint8')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [28]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [29]:
class FillMean(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].mean())
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [30]:
class FillZeros(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(0)
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [31]:
class FillMedian(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].median())
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [32]:
class FillPopular(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].value_counts()[0])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [33]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [34]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [35]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [36]:
weather_pipes = Pipeline(
    steps=[
        #('convertToDatetime', ConvertToDatetime()),
        ('imputeWeather', ImputeWeather()),
        ('fillMean',FillMean(['air_temperature','dew_temperature'
                              , 'precip_depth_1_hr', 'sea_level_pressure'])),
        ('imputeCloudCoverage', ImputeCloudCoverage()),
        ('addWeatherLags3', AddWeatherLags(3)),
        ('addWeatherLags72', AddWeatherLags(72)),
    ]
)

building_pipes = Pipeline(
    steps=[
        ('logSquareFeet', LogSquareFeet()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('fillMean',FillMean(['floor_count'])),
        ('dropClos', DropCols(['square_feet', 'year_built'])),
    ]
)


# pre_a_pipes is for preprocessing that doesn't change impute
# values
x_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        #('convertToDatetime', ConvertToDatetime()),
        ('meterReadingLog1p',MeterReadingLog1p()),
        ('rmS0M0', RmS0M0()),
        #('addHolidays', AddHolidays()),
        #('rmHolidays', RmHolidays()), called manually in fold
        ('addRelativeHumidity',AddRelativeHumidity()),
        ('addTimeFeatures', AddTimeFeatures()),
        ('setCatTypes', SetCatTypes()),
        ('fillMean',FillMean([])),
        ('fillZeros',FillZeros([])),
        ('dropCols', DropCols(['timestamp'])),
        ('GC', GC())
    ]
)

In [37]:
def getInFoldXY(train_index):
    X = train.iloc[train_index]
    X_buildings = building[building['building_id'].isin(X['building_id'].unique())]
    X = X.merge(building_pipes.transform(X_buildings), on='building_id', how='left')
    X_weather = weather_train[
        (weather_train['site_id'].isin(X['site_id'].unique())) 
         & (weather_train['timestamp'].isin(X['timestamp'].unique())) 
    ]
    X = x_pipes.transform(
        RmHolidays().transform(
            X.merge(weather_pipes.transform(X_weather), how='left')))
    return X

sample_train_X = getInFoldXY(train.sample(n=20,  random_state=42).index)
print(sample_train_X)

   meter  meter_reading_log1p  meter_0_mean  meter_0_median  meter_0_min  \
0      1             0.000000      3.050781        2.708984     0.000000   
1      0             3.496511      4.003906        3.798828     1.286133   
2      1             6.343274      5.656250        5.742188     3.658203   
3      3             0.000000      5.562500        5.515625     3.964844   
4      0             4.200954      4.355469        4.332031     3.572266   
5      0             4.521607      4.914062        5.015625     2.908203   
6      1             5.690576      5.546875        5.699219     0.000000   
7      0             2.884801      3.289062        3.130859     1.785156   
8      0             4.862677      4.785156        4.734375     2.351562   
9      0             5.707110      5.402344        5.562500     0.000000   
10     0             5.513429      4.976562        5.035156     0.000000   
11     0             2.481426      2.427734        2.492188     0.040863   
12     0    

In [38]:
# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)


In [39]:
## cross val multipule meter models
def cvTrainMeterEnsemble(train, gbm_params):
    meter_models = [ [], [], [], [] ]
    for train_index, val_index in kf.split(train, train['building_id']):
        f_train = getInFoldXY(train_index)
        f_val = getInFoldXY(val_index)
        for i in range(4):
            f_train_m = f_train[f_train['meter'] == i]
            f_val_m = f_val[f_val['meter'] == i]
            gbm_params_m = gbm_params
            if i == 0:
                gbm_params_m['n_estimators']=500
            elif i == 1:
                gbm_params_m['learning_rate']=0.05
                gbm_params_m['bagging_fraction']=0.05
            elif i == 2:
                gbm_params_m['learning_rate']=0.05
                gbm_params_m['bagging_fraction']=0.08
            else:
                gbm_params_m['learning_rate']=0.03
                gbm_params_m['bagging_fraction']=0.09                
            gbm = LGBMRegressor(**gbm_params_m)
            gbm.fit(f_train_m.drop('meter_reading_log1p', axis=1), f_train_m['meter_reading_log1p'],
                eval_set=[(f_val_m.drop('meter_reading_log1p', axis=1), f_val_m['meter_reading_log1p'])],
                # https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114722#latest-660848
                # eval_metric=lbm_rmslee,
                early_stopping_rounds=20)
            meter_models[i].append(gbm)
            del f_train_m, f_val_m, gbm
            gc.collect()
        del f_train, f_val
        gc.collect()
    return meter_models


In [40]:
## cross single mdoels

def cvTrainSingleEnsemble(train, gbm_params):
    meter_models = [ [] ] # same to use same print out code later
    for train_index, val_index in kf.split(train, train['building_id']):
        f_train = getInFoldXY(train_index)
        f_val = getInFoldXY(val_index)
        gbm.fit(f_train.drop('meter_reading_log1p', axis=1), f_train['meter_reading_log1p'],
            eval_set=[(f_val.drop('meter_reading_log1p', axis=1), f_val['meter_reading_log1p'])],
            # https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114722#latest-660848
            # eval_metric=lbm_rmslee,
            early_stopping_rounds=20)
        meter_models[i].append(gbm)
        del f_train, f_val
        gc.collect()
    return meter_models


In [41]:
%%time
meter_models = cvTrainMeterEnsemble(train, gbm_params)

[1]	valid_0's rmse: 1.45579
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.34088
[3]	valid_0's rmse: 1.23993
[4]	valid_0's rmse: 1.15162
[5]	valid_0's rmse: 1.07429
[6]	valid_0's rmse: 1.0068
[7]	valid_0's rmse: 0.948554
[8]	valid_0's rmse: 0.898202
[9]	valid_0's rmse: 0.854936
[10]	valid_0's rmse: 0.817849
[11]	valid_0's rmse: 0.78636
[12]	valid_0's rmse: 0.759394
[13]	valid_0's rmse: 0.736478
[14]	valid_0's rmse: 0.717275
[15]	valid_0's rmse: 0.700858
[16]	valid_0's rmse: 0.68637
[17]	valid_0's rmse: 0.674047
[18]	valid_0's rmse: 0.664009
[19]	valid_0's rmse: 0.655558
[20]	valid_0's rmse: 0.648173
[21]	valid_0's rmse: 0.641576
[22]	valid_0's rmse: 0.635922
[23]	valid_0's rmse: 0.630718
[24]	valid_0's rmse: 0.625392
[25]	valid_0's rmse: 0.621656
[26]	valid_0's rmse: 0.61826
[27]	valid_0's rmse: 0.615413
[28]	valid_0's rmse: 0.612613
[29]	valid_0's rmse: 0.609414
[30]	valid_0's rmse: 0.606147
[31]	valid_0's rmse: 0.603453
[32]	valid_0's rmse: 0.60137

[267]	valid_0's rmse: 0.476321
[268]	valid_0's rmse: 0.476152
[269]	valid_0's rmse: 0.475921
[270]	valid_0's rmse: 0.475781
[271]	valid_0's rmse: 0.475491
[272]	valid_0's rmse: 0.475324
[273]	valid_0's rmse: 0.475243
[274]	valid_0's rmse: 0.475018
[275]	valid_0's rmse: 0.474922
[276]	valid_0's rmse: 0.474534
[277]	valid_0's rmse: 0.474323
[278]	valid_0's rmse: 0.474126
[279]	valid_0's rmse: 0.473676
[280]	valid_0's rmse: 0.473464
[281]	valid_0's rmse: 0.473389
[282]	valid_0's rmse: 0.473095
[283]	valid_0's rmse: 0.472969
[284]	valid_0's rmse: 0.472769
[285]	valid_0's rmse: 0.47261
[286]	valid_0's rmse: 0.472517
[287]	valid_0's rmse: 0.472421
[288]	valid_0's rmse: 0.472354
[289]	valid_0's rmse: 0.472098
[290]	valid_0's rmse: 0.471892
[291]	valid_0's rmse: 0.471778
[292]	valid_0's rmse: 0.471434
[293]	valid_0's rmse: 0.471218
[294]	valid_0's rmse: 0.471097
[295]	valid_0's rmse: 0.470933
[296]	valid_0's rmse: 0.470733
[297]	valid_0's rmse: 0.470554
[298]	valid_0's rmse: 0.47032
[299]	vali

[29]	valid_0's rmse: 1.46334
[30]	valid_0's rmse: 1.45283
[31]	valid_0's rmse: 1.44313
[32]	valid_0's rmse: 1.43351
[33]	valid_0's rmse: 1.4251
[34]	valid_0's rmse: 1.41764
[35]	valid_0's rmse: 1.40996
[36]	valid_0's rmse: 1.40223
[37]	valid_0's rmse: 1.39517
[38]	valid_0's rmse: 1.38897
[39]	valid_0's rmse: 1.38263
[40]	valid_0's rmse: 1.37716
[41]	valid_0's rmse: 1.37177
[42]	valid_0's rmse: 1.36664
[43]	valid_0's rmse: 1.36186
[44]	valid_0's rmse: 1.35673
[45]	valid_0's rmse: 1.35166
[46]	valid_0's rmse: 1.34744
[47]	valid_0's rmse: 1.34357
[48]	valid_0's rmse: 1.34017
[49]	valid_0's rmse: 1.33645
[50]	valid_0's rmse: 1.33295
[51]	valid_0's rmse: 1.32928
[52]	valid_0's rmse: 1.32612
[53]	valid_0's rmse: 1.32325
[54]	valid_0's rmse: 1.32039
[55]	valid_0's rmse: 1.31781
[56]	valid_0's rmse: 1.31413
[57]	valid_0's rmse: 1.31021
[58]	valid_0's rmse: 1.30775
[59]	valid_0's rmse: 1.30491
[60]	valid_0's rmse: 1.30236
[61]	valid_0's rmse: 1.29925
[62]	valid_0's rmse: 1.29699
[63]	valid_0's 

[305]	valid_0's rmse: 1.13064
[306]	valid_0's rmse: 1.13038
[307]	valid_0's rmse: 1.13026
[308]	valid_0's rmse: 1.13
[309]	valid_0's rmse: 1.12985
[310]	valid_0's rmse: 1.12945
[311]	valid_0's rmse: 1.12896
[312]	valid_0's rmse: 1.12846
[313]	valid_0's rmse: 1.12811
[314]	valid_0's rmse: 1.12773
[315]	valid_0's rmse: 1.12742
[316]	valid_0's rmse: 1.12714
[317]	valid_0's rmse: 1.12683
[318]	valid_0's rmse: 1.12661
[319]	valid_0's rmse: 1.12643
[320]	valid_0's rmse: 1.12625
[321]	valid_0's rmse: 1.12609
[322]	valid_0's rmse: 1.12589
[323]	valid_0's rmse: 1.1255
[324]	valid_0's rmse: 1.12521
[325]	valid_0's rmse: 1.12501
[326]	valid_0's rmse: 1.12436
[327]	valid_0's rmse: 1.12357
[328]	valid_0's rmse: 1.12314
[329]	valid_0's rmse: 1.12298
[330]	valid_0's rmse: 1.12249
[331]	valid_0's rmse: 1.12227
[332]	valid_0's rmse: 1.12207
[333]	valid_0's rmse: 1.12186
[334]	valid_0's rmse: 1.12123
[335]	valid_0's rmse: 1.1207
[336]	valid_0's rmse: 1.12029
[337]	valid_0's rmse: 1.1199
[338]	valid_0's 

[77]	valid_0's rmse: 1.35036
[78]	valid_0's rmse: 1.34822
[79]	valid_0's rmse: 1.3472
[80]	valid_0's rmse: 1.34567
[81]	valid_0's rmse: 1.34388
[82]	valid_0's rmse: 1.34214
[83]	valid_0's rmse: 1.34067
[84]	valid_0's rmse: 1.33952
[85]	valid_0's rmse: 1.3384
[86]	valid_0's rmse: 1.33716
[87]	valid_0's rmse: 1.33575
[88]	valid_0's rmse: 1.33422
[89]	valid_0's rmse: 1.3331
[90]	valid_0's rmse: 1.33206
[91]	valid_0's rmse: 1.33056
[92]	valid_0's rmse: 1.32964
[93]	valid_0's rmse: 1.32838
[94]	valid_0's rmse: 1.32688
[95]	valid_0's rmse: 1.32581
[96]	valid_0's rmse: 1.32399
[97]	valid_0's rmse: 1.3232
[98]	valid_0's rmse: 1.32132
[99]	valid_0's rmse: 1.32032
[100]	valid_0's rmse: 1.31891
[101]	valid_0's rmse: 1.31801
[102]	valid_0's rmse: 1.31722
[103]	valid_0's rmse: 1.31658
[104]	valid_0's rmse: 1.31576
[105]	valid_0's rmse: 1.31488
[106]	valid_0's rmse: 1.31378
[107]	valid_0's rmse: 1.31325
[108]	valid_0's rmse: 1.31231
[109]	valid_0's rmse: 1.31138
[110]	valid_0's rmse: 1.31047
[111]	v

[351]	valid_0's rmse: 1.20031
[352]	valid_0's rmse: 1.20016
[353]	valid_0's rmse: 1.20008
[354]	valid_0's rmse: 1.19999
[355]	valid_0's rmse: 1.19981
[356]	valid_0's rmse: 1.19971
[357]	valid_0's rmse: 1.1996
[358]	valid_0's rmse: 1.19951
[359]	valid_0's rmse: 1.19944
[360]	valid_0's rmse: 1.19912
[361]	valid_0's rmse: 1.1989
[362]	valid_0's rmse: 1.19866
[363]	valid_0's rmse: 1.19846
[364]	valid_0's rmse: 1.19785
[365]	valid_0's rmse: 1.19761
[366]	valid_0's rmse: 1.19729
[367]	valid_0's rmse: 1.19699
[368]	valid_0's rmse: 1.19666
[369]	valid_0's rmse: 1.19635
[370]	valid_0's rmse: 1.19608
[371]	valid_0's rmse: 1.19588
[372]	valid_0's rmse: 1.19537
[373]	valid_0's rmse: 1.1952
[374]	valid_0's rmse: 1.19496
[375]	valid_0's rmse: 1.19481
[376]	valid_0's rmse: 1.1948
[377]	valid_0's rmse: 1.19474
[378]	valid_0's rmse: 1.19474
[379]	valid_0's rmse: 1.19436
[380]	valid_0's rmse: 1.19431
[381]	valid_0's rmse: 1.19364
[382]	valid_0's rmse: 1.19306
[383]	valid_0's rmse: 1.19252
[384]	valid_0'

[124]	valid_0's rmse: 1.40755
[125]	valid_0's rmse: 1.4071
[126]	valid_0's rmse: 1.4063
[127]	valid_0's rmse: 1.40541
[128]	valid_0's rmse: 1.40466
[129]	valid_0's rmse: 1.404
[130]	valid_0's rmse: 1.40342
[131]	valid_0's rmse: 1.4026
[132]	valid_0's rmse: 1.40152
[133]	valid_0's rmse: 1.40078
[134]	valid_0's rmse: 1.39984
[135]	valid_0's rmse: 1.39919
[136]	valid_0's rmse: 1.39866
[137]	valid_0's rmse: 1.3982
[138]	valid_0's rmse: 1.39762
[139]	valid_0's rmse: 1.39713
[140]	valid_0's rmse: 1.39653
[141]	valid_0's rmse: 1.39565
[142]	valid_0's rmse: 1.39498
[143]	valid_0's rmse: 1.39409
[144]	valid_0's rmse: 1.39346
[145]	valid_0's rmse: 1.39272
[146]	valid_0's rmse: 1.39196
[147]	valid_0's rmse: 1.3913
[148]	valid_0's rmse: 1.39063
[149]	valid_0's rmse: 1.38999
[150]	valid_0's rmse: 1.38924
[151]	valid_0's rmse: 1.38864
[152]	valid_0's rmse: 1.38812
[153]	valid_0's rmse: 1.38761
[154]	valid_0's rmse: 1.38697
[155]	valid_0's rmse: 1.38648
[156]	valid_0's rmse: 1.38577
[157]	valid_0's r

[398]	valid_0's rmse: 1.31655
[399]	valid_0's rmse: 1.31633
[400]	valid_0's rmse: 1.31612
[401]	valid_0's rmse: 1.316
[402]	valid_0's rmse: 1.31587
[403]	valid_0's rmse: 1.31575
[404]	valid_0's rmse: 1.31564
[405]	valid_0's rmse: 1.31551
[406]	valid_0's rmse: 1.31536
[407]	valid_0's rmse: 1.31523
[408]	valid_0's rmse: 1.31513
[409]	valid_0's rmse: 1.31497
[410]	valid_0's rmse: 1.31488
[411]	valid_0's rmse: 1.31476
[412]	valid_0's rmse: 1.31462
[413]	valid_0's rmse: 1.31452
[414]	valid_0's rmse: 1.31436
[415]	valid_0's rmse: 1.31425
[416]	valid_0's rmse: 1.31399
[417]	valid_0's rmse: 1.31372
[418]	valid_0's rmse: 1.31363
[419]	valid_0's rmse: 1.31344
[420]	valid_0's rmse: 1.3132
[421]	valid_0's rmse: 1.31285
[422]	valid_0's rmse: 1.31254
[423]	valid_0's rmse: 1.31223
[424]	valid_0's rmse: 1.3119
[425]	valid_0's rmse: 1.31158
[426]	valid_0's rmse: 1.31149
[427]	valid_0's rmse: 1.31141
[428]	valid_0's rmse: 1.31138
[429]	valid_0's rmse: 1.31122
[430]	valid_0's rmse: 1.31114
[431]	valid_0'

[166]	valid_0's rmse: 0.566366
[167]	valid_0's rmse: 0.566113
[168]	valid_0's rmse: 0.565724
[169]	valid_0's rmse: 0.565535
[170]	valid_0's rmse: 0.565139
[171]	valid_0's rmse: 0.56468
[172]	valid_0's rmse: 0.564276
[173]	valid_0's rmse: 0.56374
[174]	valid_0's rmse: 0.563131
[175]	valid_0's rmse: 0.56289
[176]	valid_0's rmse: 0.56244
[177]	valid_0's rmse: 0.561977
[178]	valid_0's rmse: 0.561606
[179]	valid_0's rmse: 0.560979
[180]	valid_0's rmse: 0.560534
[181]	valid_0's rmse: 0.560196
[182]	valid_0's rmse: 0.559956
[183]	valid_0's rmse: 0.559525
[184]	valid_0's rmse: 0.559283
[185]	valid_0's rmse: 0.558813
[186]	valid_0's rmse: 0.558339
[187]	valid_0's rmse: 0.557802
[188]	valid_0's rmse: 0.557407
[189]	valid_0's rmse: 0.557006
[190]	valid_0's rmse: 0.556755
[191]	valid_0's rmse: 0.556417
[192]	valid_0's rmse: 0.556026
[193]	valid_0's rmse: 0.555665
[194]	valid_0's rmse: 0.555478
[195]	valid_0's rmse: 0.555204
[196]	valid_0's rmse: 0.554877
[197]	valid_0's rmse: 0.554589
[198]	valid_

[431]	valid_0's rmse: 0.512079
[432]	valid_0's rmse: 0.511839
[433]	valid_0's rmse: 0.511679
[434]	valid_0's rmse: 0.511515
[435]	valid_0's rmse: 0.51135
[436]	valid_0's rmse: 0.511049
[437]	valid_0's rmse: 0.510845
[438]	valid_0's rmse: 0.510693
[439]	valid_0's rmse: 0.510504
[440]	valid_0's rmse: 0.510364
[441]	valid_0's rmse: 0.510318
[442]	valid_0's rmse: 0.510273
[443]	valid_0's rmse: 0.51023
[444]	valid_0's rmse: 0.510021
[445]	valid_0's rmse: 0.50999
[446]	valid_0's rmse: 0.509843
[447]	valid_0's rmse: 0.509721
[448]	valid_0's rmse: 0.509627
[449]	valid_0's rmse: 0.509491
[450]	valid_0's rmse: 0.509458
[451]	valid_0's rmse: 0.509305
[452]	valid_0's rmse: 0.509206
[453]	valid_0's rmse: 0.509043
[454]	valid_0's rmse: 0.508903
[455]	valid_0's rmse: 0.508779
[456]	valid_0's rmse: 0.508648
[457]	valid_0's rmse: 0.508446
[458]	valid_0's rmse: 0.508328
[459]	valid_0's rmse: 0.508207
[460]	valid_0's rmse: 0.508069
[461]	valid_0's rmse: 0.507986
[462]	valid_0's rmse: 0.507804
[463]	valid

[201]	valid_0's rmse: 1.16782
[202]	valid_0's rmse: 1.16747
[203]	valid_0's rmse: 1.16717
[204]	valid_0's rmse: 1.16693
[205]	valid_0's rmse: 1.16655
[206]	valid_0's rmse: 1.16604
[207]	valid_0's rmse: 1.16552
[208]	valid_0's rmse: 1.16503
[209]	valid_0's rmse: 1.16477
[210]	valid_0's rmse: 1.16441
[211]	valid_0's rmse: 1.16407
[212]	valid_0's rmse: 1.16388
[213]	valid_0's rmse: 1.16362
[214]	valid_0's rmse: 1.16338
[215]	valid_0's rmse: 1.16261
[216]	valid_0's rmse: 1.16243
[217]	valid_0's rmse: 1.16219
[218]	valid_0's rmse: 1.16207
[219]	valid_0's rmse: 1.16196
[220]	valid_0's rmse: 1.16183
[221]	valid_0's rmse: 1.16108
[222]	valid_0's rmse: 1.16058
[223]	valid_0's rmse: 1.15989
[224]	valid_0's rmse: 1.1595
[225]	valid_0's rmse: 1.15891
[226]	valid_0's rmse: 1.15859
[227]	valid_0's rmse: 1.15831
[228]	valid_0's rmse: 1.15808
[229]	valid_0's rmse: 1.15774
[230]	valid_0's rmse: 1.15737
[231]	valid_0's rmse: 1.15709
[232]	valid_0's rmse: 1.15685
[233]	valid_0's rmse: 1.15663
[234]	valid

[474]	valid_0's rmse: 1.08871
[475]	valid_0's rmse: 1.08829
[476]	valid_0's rmse: 1.08822
[477]	valid_0's rmse: 1.08811
[478]	valid_0's rmse: 1.08789
[479]	valid_0's rmse: 1.08784
[480]	valid_0's rmse: 1.08777
[481]	valid_0's rmse: 1.08743
[482]	valid_0's rmse: 1.08704
[483]	valid_0's rmse: 1.08676
[484]	valid_0's rmse: 1.08654
[485]	valid_0's rmse: 1.08627
[486]	valid_0's rmse: 1.086
[487]	valid_0's rmse: 1.08589
[488]	valid_0's rmse: 1.08566
[489]	valid_0's rmse: 1.08548
[490]	valid_0's rmse: 1.08539
[491]	valid_0's rmse: 1.08519
[492]	valid_0's rmse: 1.08502
[493]	valid_0's rmse: 1.08483
[494]	valid_0's rmse: 1.08472
[495]	valid_0's rmse: 1.08437
[496]	valid_0's rmse: 1.0843
[497]	valid_0's rmse: 1.08422
[498]	valid_0's rmse: 1.08415
[499]	valid_0's rmse: 1.08409
[500]	valid_0's rmse: 1.08401
Did not meet early stopping. Best iteration is:
[500]	valid_0's rmse: 1.08401
[1]	valid_0's rmse: 2.55024
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 2.4710

[246]	valid_0's rmse: 1.24041
[247]	valid_0's rmse: 1.23991
[248]	valid_0's rmse: 1.23941
[249]	valid_0's rmse: 1.23901
[250]	valid_0's rmse: 1.23871
[251]	valid_0's rmse: 1.2385
[252]	valid_0's rmse: 1.23803
[253]	valid_0's rmse: 1.23754
[254]	valid_0's rmse: 1.23732
[255]	valid_0's rmse: 1.23693
[256]	valid_0's rmse: 1.23631
[257]	valid_0's rmse: 1.23593
[258]	valid_0's rmse: 1.2356
[259]	valid_0's rmse: 1.23482
[260]	valid_0's rmse: 1.23409
[261]	valid_0's rmse: 1.23364
[262]	valid_0's rmse: 1.23331
[263]	valid_0's rmse: 1.23282
[264]	valid_0's rmse: 1.23243
[265]	valid_0's rmse: 1.23228
[266]	valid_0's rmse: 1.23199
[267]	valid_0's rmse: 1.23165
[268]	valid_0's rmse: 1.2314
[269]	valid_0's rmse: 1.23118
[270]	valid_0's rmse: 1.23088
[271]	valid_0's rmse: 1.23041
[272]	valid_0's rmse: 1.23009
[273]	valid_0's rmse: 1.22996
[274]	valid_0's rmse: 1.2299
[275]	valid_0's rmse: 1.22953
[276]	valid_0's rmse: 1.22914
[277]	valid_0's rmse: 1.22874
[278]	valid_0's rmse: 1.2279
[279]	valid_0's

[16]	valid_0's rmse: 2.01584
[17]	valid_0's rmse: 1.9907
[18]	valid_0's rmse: 1.96627
[19]	valid_0's rmse: 1.94314
[20]	valid_0's rmse: 1.92118
[21]	valid_0's rmse: 1.89992
[22]	valid_0's rmse: 1.87983
[23]	valid_0's rmse: 1.86064
[24]	valid_0's rmse: 1.84208
[25]	valid_0's rmse: 1.82422
[26]	valid_0's rmse: 1.80716
[27]	valid_0's rmse: 1.79085
[28]	valid_0's rmse: 1.77522
[29]	valid_0's rmse: 1.76043
[30]	valid_0's rmse: 1.74631
[31]	valid_0's rmse: 1.73285
[32]	valid_0's rmse: 1.72018
[33]	valid_0's rmse: 1.70771
[34]	valid_0's rmse: 1.69613
[35]	valid_0's rmse: 1.68556
[36]	valid_0's rmse: 1.67476
[37]	valid_0's rmse: 1.66438
[38]	valid_0's rmse: 1.65436
[39]	valid_0's rmse: 1.64498
[40]	valid_0's rmse: 1.63592
[41]	valid_0's rmse: 1.62689
[42]	valid_0's rmse: 1.61841
[43]	valid_0's rmse: 1.6102
[44]	valid_0's rmse: 1.60265
[45]	valid_0's rmse: 1.59545
[46]	valid_0's rmse: 1.58848
[47]	valid_0's rmse: 1.58205
[48]	valid_0's rmse: 1.57575
[49]	valid_0's rmse: 1.56993
[50]	valid_0's r

[292]	valid_0's rmse: 1.34055
[293]	valid_0's rmse: 1.34025
[294]	valid_0's rmse: 1.34004
[295]	valid_0's rmse: 1.33976
[296]	valid_0's rmse: 1.33958
[297]	valid_0's rmse: 1.33909
[298]	valid_0's rmse: 1.33883
[299]	valid_0's rmse: 1.33873
[300]	valid_0's rmse: 1.33842
[301]	valid_0's rmse: 1.33797
[302]	valid_0's rmse: 1.33761
[303]	valid_0's rmse: 1.33746
[304]	valid_0's rmse: 1.3373
[305]	valid_0's rmse: 1.337
[306]	valid_0's rmse: 1.33673
[307]	valid_0's rmse: 1.33628
[308]	valid_0's rmse: 1.33581
[309]	valid_0's rmse: 1.33539
[310]	valid_0's rmse: 1.33516
[311]	valid_0's rmse: 1.33489
[312]	valid_0's rmse: 1.33469
[313]	valid_0's rmse: 1.33447
[314]	valid_0's rmse: 1.33425
[315]	valid_0's rmse: 1.33394
[316]	valid_0's rmse: 1.33375
[317]	valid_0's rmse: 1.33356
[318]	valid_0's rmse: 1.33336
[319]	valid_0's rmse: 1.33323
[320]	valid_0's rmse: 1.33308
[321]	valid_0's rmse: 1.33295
[322]	valid_0's rmse: 1.33283
[323]	valid_0's rmse: 1.33268
[324]	valid_0's rmse: 1.33255
[325]	valid_0

[62]	valid_0's rmse: 0.664435
[63]	valid_0's rmse: 0.661875
[64]	valid_0's rmse: 0.659181
[65]	valid_0's rmse: 0.656692
[66]	valid_0's rmse: 0.654444
[67]	valid_0's rmse: 0.65226
[68]	valid_0's rmse: 0.650184
[69]	valid_0's rmse: 0.648202
[70]	valid_0's rmse: 0.646187
[71]	valid_0's rmse: 0.644352
[72]	valid_0's rmse: 0.642451
[73]	valid_0's rmse: 0.640752
[74]	valid_0's rmse: 0.639131
[75]	valid_0's rmse: 0.637556
[76]	valid_0's rmse: 0.635885
[77]	valid_0's rmse: 0.634384
[78]	valid_0's rmse: 0.632814
[79]	valid_0's rmse: 0.631427
[80]	valid_0's rmse: 0.629983
[81]	valid_0's rmse: 0.628773
[82]	valid_0's rmse: 0.627555
[83]	valid_0's rmse: 0.626373
[84]	valid_0's rmse: 0.625227
[85]	valid_0's rmse: 0.623903
[86]	valid_0's rmse: 0.622803
[87]	valid_0's rmse: 0.62158
[88]	valid_0's rmse: 0.620581
[89]	valid_0's rmse: 0.619656
[90]	valid_0's rmse: 0.61877
[91]	valid_0's rmse: 0.617857
[92]	valid_0's rmse: 0.616793
[93]	valid_0's rmse: 0.615747
[94]	valid_0's rmse: 0.614866
[95]	valid_0'

[328]	valid_0's rmse: 0.526021
[329]	valid_0's rmse: 0.525857
[330]	valid_0's rmse: 0.525705
[331]	valid_0's rmse: 0.525574
[332]	valid_0's rmse: 0.52545
[333]	valid_0's rmse: 0.525349
[334]	valid_0's rmse: 0.525228
[335]	valid_0's rmse: 0.524891
[336]	valid_0's rmse: 0.524655
[337]	valid_0's rmse: 0.524423
[338]	valid_0's rmse: 0.524266
[339]	valid_0's rmse: 0.524119
[340]	valid_0's rmse: 0.523865
[341]	valid_0's rmse: 0.523696
[342]	valid_0's rmse: 0.523541
[343]	valid_0's rmse: 0.523385
[344]	valid_0's rmse: 0.523258
[345]	valid_0's rmse: 0.523114
[346]	valid_0's rmse: 0.52297
[347]	valid_0's rmse: 0.522798
[348]	valid_0's rmse: 0.522554
[349]	valid_0's rmse: 0.522394
[350]	valid_0's rmse: 0.522311
[351]	valid_0's rmse: 0.522223
[352]	valid_0's rmse: 0.521993
[353]	valid_0's rmse: 0.52189
[354]	valid_0's rmse: 0.521686
[355]	valid_0's rmse: 0.521456
[356]	valid_0's rmse: 0.521295
[357]	valid_0's rmse: 0.521087
[358]	valid_0's rmse: 0.521018
[359]	valid_0's rmse: 0.52095
[360]	valid_

[95]	valid_0's rmse: 1.23671
[96]	valid_0's rmse: 1.23544
[97]	valid_0's rmse: 1.23426
[98]	valid_0's rmse: 1.2335
[99]	valid_0's rmse: 1.23211
[100]	valid_0's rmse: 1.23139
[101]	valid_0's rmse: 1.23022
[102]	valid_0's rmse: 1.22905
[103]	valid_0's rmse: 1.22797
[104]	valid_0's rmse: 1.2271
[105]	valid_0's rmse: 1.22633
[106]	valid_0's rmse: 1.22498
[107]	valid_0's rmse: 1.22397
[108]	valid_0's rmse: 1.22301
[109]	valid_0's rmse: 1.2222
[110]	valid_0's rmse: 1.22138
[111]	valid_0's rmse: 1.22051
[112]	valid_0's rmse: 1.21958
[113]	valid_0's rmse: 1.21881
[114]	valid_0's rmse: 1.21769
[115]	valid_0's rmse: 1.217
[116]	valid_0's rmse: 1.21606
[117]	valid_0's rmse: 1.21529
[118]	valid_0's rmse: 1.21437
[119]	valid_0's rmse: 1.21369
[120]	valid_0's rmse: 1.213
[121]	valid_0's rmse: 1.21183
[122]	valid_0's rmse: 1.21101
[123]	valid_0's rmse: 1.21036
[124]	valid_0's rmse: 1.20961
[125]	valid_0's rmse: 1.20896
[126]	valid_0's rmse: 1.20742
[127]	valid_0's rmse: 1.20627
[128]	valid_0's rmse: 

[369]	valid_0's rmse: 1.11198
[370]	valid_0's rmse: 1.11174
[371]	valid_0's rmse: 1.11137
[372]	valid_0's rmse: 1.111
[373]	valid_0's rmse: 1.11088
[374]	valid_0's rmse: 1.11071
[375]	valid_0's rmse: 1.11056
[376]	valid_0's rmse: 1.11031
[377]	valid_0's rmse: 1.11013
[378]	valid_0's rmse: 1.11
[379]	valid_0's rmse: 1.10969
[380]	valid_0's rmse: 1.1094
[381]	valid_0's rmse: 1.10922
[382]	valid_0's rmse: 1.10909
[383]	valid_0's rmse: 1.10888
[384]	valid_0's rmse: 1.10878
[385]	valid_0's rmse: 1.10866
[386]	valid_0's rmse: 1.1086
[387]	valid_0's rmse: 1.10823
[388]	valid_0's rmse: 1.10804
[389]	valid_0's rmse: 1.10787
[390]	valid_0's rmse: 1.1078
[391]	valid_0's rmse: 1.10723
[392]	valid_0's rmse: 1.10648
[393]	valid_0's rmse: 1.10607
[394]	valid_0's rmse: 1.10566
[395]	valid_0's rmse: 1.10557
[396]	valid_0's rmse: 1.10545
[397]	valid_0's rmse: 1.10535
[398]	valid_0's rmse: 1.10526
[399]	valid_0's rmse: 1.10516
[400]	valid_0's rmse: 1.10503
[401]	valid_0's rmse: 1.10471
[402]	valid_0's rm

[142]	valid_0's rmse: 1.28345
[143]	valid_0's rmse: 1.28284
[144]	valid_0's rmse: 1.28241
[145]	valid_0's rmse: 1.28165
[146]	valid_0's rmse: 1.28122
[147]	valid_0's rmse: 1.28092
[148]	valid_0's rmse: 1.28067
[149]	valid_0's rmse: 1.2803
[150]	valid_0's rmse: 1.27939
[151]	valid_0's rmse: 1.27871
[152]	valid_0's rmse: 1.27796
[153]	valid_0's rmse: 1.27729
[154]	valid_0's rmse: 1.27659
[155]	valid_0's rmse: 1.27616
[156]	valid_0's rmse: 1.27547
[157]	valid_0's rmse: 1.27507
[158]	valid_0's rmse: 1.27458
[159]	valid_0's rmse: 1.2739
[160]	valid_0's rmse: 1.27365
[161]	valid_0's rmse: 1.27328
[162]	valid_0's rmse: 1.27273
[163]	valid_0's rmse: 1.27224
[164]	valid_0's rmse: 1.27198
[165]	valid_0's rmse: 1.27152
[166]	valid_0's rmse: 1.27088
[167]	valid_0's rmse: 1.27017
[168]	valid_0's rmse: 1.26929
[169]	valid_0's rmse: 1.26869
[170]	valid_0's rmse: 1.26784
[171]	valid_0's rmse: 1.26748
[172]	valid_0's rmse: 1.26714
[173]	valid_0's rmse: 1.26671
[174]	valid_0's rmse: 1.26636
[175]	valid_

[416]	valid_0's rmse: 1.1872
[417]	valid_0's rmse: 1.18694
[418]	valid_0's rmse: 1.18639
[419]	valid_0's rmse: 1.18625
[420]	valid_0's rmse: 1.18575
[421]	valid_0's rmse: 1.18554
[422]	valid_0's rmse: 1.18525
[423]	valid_0's rmse: 1.18496
[424]	valid_0's rmse: 1.18465
[425]	valid_0's rmse: 1.18443
[426]	valid_0's rmse: 1.184
[427]	valid_0's rmse: 1.18359
[428]	valid_0's rmse: 1.18323
[429]	valid_0's rmse: 1.18298
[430]	valid_0's rmse: 1.18279
[431]	valid_0's rmse: 1.18251
[432]	valid_0's rmse: 1.18203
[433]	valid_0's rmse: 1.18163
[434]	valid_0's rmse: 1.18135
[435]	valid_0's rmse: 1.18125
[436]	valid_0's rmse: 1.18099
[437]	valid_0's rmse: 1.1808
[438]	valid_0's rmse: 1.18054
[439]	valid_0's rmse: 1.1803
[440]	valid_0's rmse: 1.18008
[441]	valid_0's rmse: 1.17973
[442]	valid_0's rmse: 1.17956
[443]	valid_0's rmse: 1.17915
[444]	valid_0's rmse: 1.17872
[445]	valid_0's rmse: 1.1784
[446]	valid_0's rmse: 1.17824
[447]	valid_0's rmse: 1.17819
[448]	valid_0's rmse: 1.1779
[449]	valid_0's r

[189]	valid_0's rmse: 1.3704
[190]	valid_0's rmse: 1.37018
[191]	valid_0's rmse: 1.36994
[192]	valid_0's rmse: 1.36971
[193]	valid_0's rmse: 1.3695
[194]	valid_0's rmse: 1.36917
[195]	valid_0's rmse: 1.36896
[196]	valid_0's rmse: 1.36853
[197]	valid_0's rmse: 1.36802
[198]	valid_0's rmse: 1.36757
[199]	valid_0's rmse: 1.36726
[200]	valid_0's rmse: 1.36696
[201]	valid_0's rmse: 1.36656
[202]	valid_0's rmse: 1.36625
[203]	valid_0's rmse: 1.36613
[204]	valid_0's rmse: 1.36595
[205]	valid_0's rmse: 1.3656
[206]	valid_0's rmse: 1.36542
[207]	valid_0's rmse: 1.36526
[208]	valid_0's rmse: 1.36499
[209]	valid_0's rmse: 1.36479
[210]	valid_0's rmse: 1.36445
[211]	valid_0's rmse: 1.36419
[212]	valid_0's rmse: 1.36388
[213]	valid_0's rmse: 1.36366
[214]	valid_0's rmse: 1.36341
[215]	valid_0's rmse: 1.36323
[216]	valid_0's rmse: 1.36284
[217]	valid_0's rmse: 1.36249
[218]	valid_0's rmse: 1.36213
[219]	valid_0's rmse: 1.36178
[220]	valid_0's rmse: 1.36143
[221]	valid_0's rmse: 1.36097
[222]	valid_0

[463]	valid_0's rmse: 1.30753
[464]	valid_0's rmse: 1.30732
[465]	valid_0's rmse: 1.30714
[466]	valid_0's rmse: 1.3069
[467]	valid_0's rmse: 1.30678
[468]	valid_0's rmse: 1.30656
[469]	valid_0's rmse: 1.30643
[470]	valid_0's rmse: 1.30631
[471]	valid_0's rmse: 1.30615
[472]	valid_0's rmse: 1.30601
[473]	valid_0's rmse: 1.30588
[474]	valid_0's rmse: 1.30571
[475]	valid_0's rmse: 1.3056
[476]	valid_0's rmse: 1.30553
[477]	valid_0's rmse: 1.3055
[478]	valid_0's rmse: 1.30541
[479]	valid_0's rmse: 1.30535
[480]	valid_0's rmse: 1.30524
[481]	valid_0's rmse: 1.30493
[482]	valid_0's rmse: 1.30465
[483]	valid_0's rmse: 1.30431
[484]	valid_0's rmse: 1.30404
[485]	valid_0's rmse: 1.30374
[486]	valid_0's rmse: 1.30345
[487]	valid_0's rmse: 1.3032
[488]	valid_0's rmse: 1.30302
[489]	valid_0's rmse: 1.30272
[490]	valid_0's rmse: 1.30251
[491]	valid_0's rmse: 1.3022
[492]	valid_0's rmse: 1.30194
[493]	valid_0's rmse: 1.30172
[494]	valid_0's rmse: 1.30146
[495]	valid_0's rmse: 1.30125
[496]	valid_0's

[229]	valid_0's rmse: 0.545953
[230]	valid_0's rmse: 0.545817
[231]	valid_0's rmse: 0.545663
[232]	valid_0's rmse: 0.545433
[233]	valid_0's rmse: 0.545298
[234]	valid_0's rmse: 0.545064
[235]	valid_0's rmse: 0.544876
[236]	valid_0's rmse: 0.544714
[237]	valid_0's rmse: 0.544556
[238]	valid_0's rmse: 0.544386
[239]	valid_0's rmse: 0.544242
[240]	valid_0's rmse: 0.544084
[241]	valid_0's rmse: 0.54372
[242]	valid_0's rmse: 0.543373
[243]	valid_0's rmse: 0.543111
[244]	valid_0's rmse: 0.542844
[245]	valid_0's rmse: 0.542644
[246]	valid_0's rmse: 0.542473
[247]	valid_0's rmse: 0.542255
[248]	valid_0's rmse: 0.542037
[249]	valid_0's rmse: 0.541841
[250]	valid_0's rmse: 0.541645
[251]	valid_0's rmse: 0.541367
[252]	valid_0's rmse: 0.541212
[253]	valid_0's rmse: 0.540982
[254]	valid_0's rmse: 0.540879
[255]	valid_0's rmse: 0.540675
[256]	valid_0's rmse: 0.540284
[257]	valid_0's rmse: 0.539904
[258]	valid_0's rmse: 0.539555
[259]	valid_0's rmse: 0.538972
[260]	valid_0's rmse: 0.53877
[261]	vali

[493]	valid_0's rmse: 0.506307
[494]	valid_0's rmse: 0.506144
[495]	valid_0's rmse: 0.506058
[496]	valid_0's rmse: 0.506019
[497]	valid_0's rmse: 0.505829
[498]	valid_0's rmse: 0.50577
[499]	valid_0's rmse: 0.505724
[500]	valid_0's rmse: 0.505654
Did not meet early stopping. Best iteration is:
[500]	valid_0's rmse: 0.505654
[1]	valid_0's rmse: 2.44837
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 2.37167
[3]	valid_0's rmse: 2.29973
[4]	valid_0's rmse: 2.23236
[5]	valid_0's rmse: 2.17112
[6]	valid_0's rmse: 2.1122
[7]	valid_0's rmse: 2.05715
[8]	valid_0's rmse: 2.0062
[9]	valid_0's rmse: 1.95843
[10]	valid_0's rmse: 1.91359
[11]	valid_0's rmse: 1.87165
[12]	valid_0's rmse: 1.83268
[13]	valid_0's rmse: 1.79724
[14]	valid_0's rmse: 1.76305
[15]	valid_0's rmse: 1.73168
[16]	valid_0's rmse: 1.70247
[17]	valid_0's rmse: 1.67653
[18]	valid_0's rmse: 1.65138
[19]	valid_0's rmse: 1.628
[20]	valid_0's rmse: 1.60608
[21]	valid_0's rmse: 1.58563
[22]	valid_0's rm

[265]	valid_0's rmse: 1.14627
[266]	valid_0's rmse: 1.1458
[267]	valid_0's rmse: 1.14547
[268]	valid_0's rmse: 1.14503
[269]	valid_0's rmse: 1.14456
[270]	valid_0's rmse: 1.1441
[271]	valid_0's rmse: 1.14356
[272]	valid_0's rmse: 1.1433
[273]	valid_0's rmse: 1.14298
[274]	valid_0's rmse: 1.1427
[275]	valid_0's rmse: 1.14236
[276]	valid_0's rmse: 1.14195
[277]	valid_0's rmse: 1.14122
[278]	valid_0's rmse: 1.14098
[279]	valid_0's rmse: 1.14072
[280]	valid_0's rmse: 1.14054
[281]	valid_0's rmse: 1.14012
[282]	valid_0's rmse: 1.13981
[283]	valid_0's rmse: 1.13941
[284]	valid_0's rmse: 1.13884
[285]	valid_0's rmse: 1.13859
[286]	valid_0's rmse: 1.13826
[287]	valid_0's rmse: 1.13819
[288]	valid_0's rmse: 1.13768
[289]	valid_0's rmse: 1.13734
[290]	valid_0's rmse: 1.13689
[291]	valid_0's rmse: 1.13609
[292]	valid_0's rmse: 1.1358
[293]	valid_0's rmse: 1.13556
[294]	valid_0's rmse: 1.13503
[295]	valid_0's rmse: 1.1345
[296]	valid_0's rmse: 1.13415
[297]	valid_0's rmse: 1.13388
[298]	valid_0's 

[35]	valid_0's rmse: 1.47931
[36]	valid_0's rmse: 1.47215
[37]	valid_0's rmse: 1.46549
[38]	valid_0's rmse: 1.45913
[39]	valid_0's rmse: 1.4531
[40]	valid_0's rmse: 1.4477
[41]	valid_0's rmse: 1.44262
[42]	valid_0's rmse: 1.43768
[43]	valid_0's rmse: 1.43257
[44]	valid_0's rmse: 1.42809
[45]	valid_0's rmse: 1.42379
[46]	valid_0's rmse: 1.4194
[47]	valid_0's rmse: 1.41576
[48]	valid_0's rmse: 1.41097
[49]	valid_0's rmse: 1.40677
[50]	valid_0's rmse: 1.40352
[51]	valid_0's rmse: 1.40031
[52]	valid_0's rmse: 1.39735
[53]	valid_0's rmse: 1.39422
[54]	valid_0's rmse: 1.39128
[55]	valid_0's rmse: 1.38845
[56]	valid_0's rmse: 1.38605
[57]	valid_0's rmse: 1.38366
[58]	valid_0's rmse: 1.38111
[59]	valid_0's rmse: 1.3788
[60]	valid_0's rmse: 1.37673
[61]	valid_0's rmse: 1.37447
[62]	valid_0's rmse: 1.37236
[63]	valid_0's rmse: 1.37012
[64]	valid_0's rmse: 1.36806
[65]	valid_0's rmse: 1.36621
[66]	valid_0's rmse: 1.36444
[67]	valid_0's rmse: 1.36272
[68]	valid_0's rmse: 1.36125
[69]	valid_0's rms

[311]	valid_0's rmse: 1.20602
[312]	valid_0's rmse: 1.20578
[313]	valid_0's rmse: 1.20557
[314]	valid_0's rmse: 1.20536
[315]	valid_0's rmse: 1.20519
[316]	valid_0's rmse: 1.20481
[317]	valid_0's rmse: 1.20462
[318]	valid_0's rmse: 1.20431
[319]	valid_0's rmse: 1.20413
[320]	valid_0's rmse: 1.20397
[321]	valid_0's rmse: 1.20361
[322]	valid_0's rmse: 1.2035
[323]	valid_0's rmse: 1.20309
[324]	valid_0's rmse: 1.20299
[325]	valid_0's rmse: 1.20267
[326]	valid_0's rmse: 1.20218
[327]	valid_0's rmse: 1.20169
[328]	valid_0's rmse: 1.20138
[329]	valid_0's rmse: 1.20099
[330]	valid_0's rmse: 1.20066
[331]	valid_0's rmse: 1.20049
[332]	valid_0's rmse: 1.20024
[333]	valid_0's rmse: 1.19989
[334]	valid_0's rmse: 1.19981
[335]	valid_0's rmse: 1.19949
[336]	valid_0's rmse: 1.19926
[337]	valid_0's rmse: 1.19921
[338]	valid_0's rmse: 1.19902
[339]	valid_0's rmse: 1.19888
[340]	valid_0's rmse: 1.19865
[341]	valid_0's rmse: 1.19854
[342]	valid_0's rmse: 1.19819
[343]	valid_0's rmse: 1.198
[344]	valid_0

[83]	valid_0's rmse: 1.45594
[84]	valid_0's rmse: 1.45413
[85]	valid_0's rmse: 1.45237
[86]	valid_0's rmse: 1.4509
[87]	valid_0's rmse: 1.44949
[88]	valid_0's rmse: 1.448
[89]	valid_0's rmse: 1.44658
[90]	valid_0's rmse: 1.44506
[91]	valid_0's rmse: 1.4437
[92]	valid_0's rmse: 1.44243
[93]	valid_0's rmse: 1.44088
[94]	valid_0's rmse: 1.43923
[95]	valid_0's rmse: 1.43779
[96]	valid_0's rmse: 1.43637
[97]	valid_0's rmse: 1.43494
[98]	valid_0's rmse: 1.43362
[99]	valid_0's rmse: 1.43225
[100]	valid_0's rmse: 1.43106
[101]	valid_0's rmse: 1.43003
[102]	valid_0's rmse: 1.42902
[103]	valid_0's rmse: 1.42761
[104]	valid_0's rmse: 1.42664
[105]	valid_0's rmse: 1.42572
[106]	valid_0's rmse: 1.42487
[107]	valid_0's rmse: 1.42394
[108]	valid_0's rmse: 1.42313
[109]	valid_0's rmse: 1.42225
[110]	valid_0's rmse: 1.42124
[111]	valid_0's rmse: 1.42011
[112]	valid_0's rmse: 1.41912
[113]	valid_0's rmse: 1.4181
[114]	valid_0's rmse: 1.41717
[115]	valid_0's rmse: 1.41627
[116]	valid_0's rmse: 1.4154
[11

[357]	valid_0's rmse: 1.32564
[358]	valid_0's rmse: 1.32503
[359]	valid_0's rmse: 1.3248
[360]	valid_0's rmse: 1.32431
[361]	valid_0's rmse: 1.32425
[362]	valid_0's rmse: 1.32419
[363]	valid_0's rmse: 1.32417
[364]	valid_0's rmse: 1.32411
[365]	valid_0's rmse: 1.3239
[366]	valid_0's rmse: 1.32362
[367]	valid_0's rmse: 1.32342
[368]	valid_0's rmse: 1.32303
[369]	valid_0's rmse: 1.32273
[370]	valid_0's rmse: 1.32241
[371]	valid_0's rmse: 1.32217
[372]	valid_0's rmse: 1.32196
[373]	valid_0's rmse: 1.32172
[374]	valid_0's rmse: 1.32159
[375]	valid_0's rmse: 1.3212
[376]	valid_0's rmse: 1.32078
[377]	valid_0's rmse: 1.32062
[378]	valid_0's rmse: 1.32016
[379]	valid_0's rmse: 1.31973
[380]	valid_0's rmse: 1.31942
[381]	valid_0's rmse: 1.31926
[382]	valid_0's rmse: 1.31912
[383]	valid_0's rmse: 1.31904
[384]	valid_0's rmse: 1.31899
[385]	valid_0's rmse: 1.31896
[386]	valid_0's rmse: 1.31875
[387]	valid_0's rmse: 1.31867
[388]	valid_0's rmse: 1.31851
[389]	valid_0's rmse: 1.31842
[390]	valid_0

[126]	valid_0's rmse: 0.588017
[127]	valid_0's rmse: 0.5873
[128]	valid_0's rmse: 0.586466
[129]	valid_0's rmse: 0.585901
[130]	valid_0's rmse: 0.585476
[131]	valid_0's rmse: 0.58491
[132]	valid_0's rmse: 0.584378
[133]	valid_0's rmse: 0.583843
[134]	valid_0's rmse: 0.58314
[135]	valid_0's rmse: 0.582685
[136]	valid_0's rmse: 0.582327
[137]	valid_0's rmse: 0.581795
[138]	valid_0's rmse: 0.581195
[139]	valid_0's rmse: 0.580661
[140]	valid_0's rmse: 0.580149
[141]	valid_0's rmse: 0.579651
[142]	valid_0's rmse: 0.578829
[143]	valid_0's rmse: 0.578193
[144]	valid_0's rmse: 0.57753
[145]	valid_0's rmse: 0.576965
[146]	valid_0's rmse: 0.576428
[147]	valid_0's rmse: 0.57585
[148]	valid_0's rmse: 0.575275
[149]	valid_0's rmse: 0.574837
[150]	valid_0's rmse: 0.574213
[151]	valid_0's rmse: 0.573843
[152]	valid_0's rmse: 0.573563
[153]	valid_0's rmse: 0.573266
[154]	valid_0's rmse: 0.572761
[155]	valid_0's rmse: 0.572113
[156]	valid_0's rmse: 0.571517
[157]	valid_0's rmse: 0.570912
[158]	valid_0'

[391]	valid_0's rmse: 0.517878
[392]	valid_0's rmse: 0.517782
[393]	valid_0's rmse: 0.517479
[394]	valid_0's rmse: 0.517395
[395]	valid_0's rmse: 0.517205
[396]	valid_0's rmse: 0.51702
[397]	valid_0's rmse: 0.516823
[398]	valid_0's rmse: 0.516759
[399]	valid_0's rmse: 0.516592
[400]	valid_0's rmse: 0.51652
[401]	valid_0's rmse: 0.516453
[402]	valid_0's rmse: 0.516401
[403]	valid_0's rmse: 0.516327
[404]	valid_0's rmse: 0.516235
[405]	valid_0's rmse: 0.516188
[406]	valid_0's rmse: 0.5161
[407]	valid_0's rmse: 0.516002
[408]	valid_0's rmse: 0.515879
[409]	valid_0's rmse: 0.515745
[410]	valid_0's rmse: 0.515621
[411]	valid_0's rmse: 0.515487
[412]	valid_0's rmse: 0.515351
[413]	valid_0's rmse: 0.515249
[414]	valid_0's rmse: 0.515122
[415]	valid_0's rmse: 0.514998
[416]	valid_0's rmse: 0.514827
[417]	valid_0's rmse: 0.514695
[418]	valid_0's rmse: 0.514559
[419]	valid_0's rmse: 0.514404
[420]	valid_0's rmse: 0.514256
[421]	valid_0's rmse: 0.514124
[422]	valid_0's rmse: 0.514079
[423]	valid_

[160]	valid_0's rmse: 1.18808
[161]	valid_0's rmse: 1.1871
[162]	valid_0's rmse: 1.18648
[163]	valid_0's rmse: 1.18582
[164]	valid_0's rmse: 1.18513
[165]	valid_0's rmse: 1.18481
[166]	valid_0's rmse: 1.18388
[167]	valid_0's rmse: 1.18334
[168]	valid_0's rmse: 1.1828
[169]	valid_0's rmse: 1.18245
[170]	valid_0's rmse: 1.18165
[171]	valid_0's rmse: 1.18137
[172]	valid_0's rmse: 1.18084
[173]	valid_0's rmse: 1.18018
[174]	valid_0's rmse: 1.17969
[175]	valid_0's rmse: 1.17929
[176]	valid_0's rmse: 1.17886
[177]	valid_0's rmse: 1.17826
[178]	valid_0's rmse: 1.17787
[179]	valid_0's rmse: 1.17768
[180]	valid_0's rmse: 1.17732
[181]	valid_0's rmse: 1.17689
[182]	valid_0's rmse: 1.17644
[183]	valid_0's rmse: 1.17606
[184]	valid_0's rmse: 1.17575
[185]	valid_0's rmse: 1.17531
[186]	valid_0's rmse: 1.17475
[187]	valid_0's rmse: 1.17401
[188]	valid_0's rmse: 1.17358
[189]	valid_0's rmse: 1.1729
[190]	valid_0's rmse: 1.17259
[191]	valid_0's rmse: 1.17197
[192]	valid_0's rmse: 1.17155
[193]	valid_0

[434]	valid_0's rmse: 1.09657
[435]	valid_0's rmse: 1.09646
[436]	valid_0's rmse: 1.09619
[437]	valid_0's rmse: 1.09596
[438]	valid_0's rmse: 1.09577
[439]	valid_0's rmse: 1.0956
[440]	valid_0's rmse: 1.09518
[441]	valid_0's rmse: 1.09485
[442]	valid_0's rmse: 1.09444
[443]	valid_0's rmse: 1.09419
[444]	valid_0's rmse: 1.09384
[445]	valid_0's rmse: 1.09358
[446]	valid_0's rmse: 1.09339
[447]	valid_0's rmse: 1.09324
[448]	valid_0's rmse: 1.09293
[449]	valid_0's rmse: 1.09267
[450]	valid_0's rmse: 1.09252
[451]	valid_0's rmse: 1.09216
[452]	valid_0's rmse: 1.09194
[453]	valid_0's rmse: 1.09172
[454]	valid_0's rmse: 1.09152
[455]	valid_0's rmse: 1.09134
[456]	valid_0's rmse: 1.09122
[457]	valid_0's rmse: 1.09111
[458]	valid_0's rmse: 1.09098
[459]	valid_0's rmse: 1.09084
[460]	valid_0's rmse: 1.09068
[461]	valid_0's rmse: 1.09034
[462]	valid_0's rmse: 1.09006
[463]	valid_0's rmse: 1.08991
[464]	valid_0's rmse: 1.08986
[465]	valid_0's rmse: 1.08961
[466]	valid_0's rmse: 1.08924
[467]	valid

[207]	valid_0's rmse: 1.25827
[208]	valid_0's rmse: 1.25772
[209]	valid_0's rmse: 1.25731
[210]	valid_0's rmse: 1.25705
[211]	valid_0's rmse: 1.25614
[212]	valid_0's rmse: 1.25546
[213]	valid_0's rmse: 1.25484
[214]	valid_0's rmse: 1.25426
[215]	valid_0's rmse: 1.25366
[216]	valid_0's rmse: 1.25337
[217]	valid_0's rmse: 1.25299
[218]	valid_0's rmse: 1.2527
[219]	valid_0's rmse: 1.2523
[220]	valid_0's rmse: 1.25204
[221]	valid_0's rmse: 1.25149
[222]	valid_0's rmse: 1.25083
[223]	valid_0's rmse: 1.25044
[224]	valid_0's rmse: 1.2499
[225]	valid_0's rmse: 1.24935
[226]	valid_0's rmse: 1.24912
[227]	valid_0's rmse: 1.24886
[228]	valid_0's rmse: 1.24849
[229]	valid_0's rmse: 1.24816
[230]	valid_0's rmse: 1.24707
[231]	valid_0's rmse: 1.24653
[232]	valid_0's rmse: 1.24618
[233]	valid_0's rmse: 1.24568
[234]	valid_0's rmse: 1.24523
[235]	valid_0's rmse: 1.24507
[236]	valid_0's rmse: 1.24464
[237]	valid_0's rmse: 1.24409
[238]	valid_0's rmse: 1.24375
[239]	valid_0's rmse: 1.24348
[240]	valid_0

[480]	valid_0's rmse: 1.17376
[481]	valid_0's rmse: 1.17356
[482]	valid_0's rmse: 1.17343
[483]	valid_0's rmse: 1.17325
[484]	valid_0's rmse: 1.17314
[485]	valid_0's rmse: 1.17287
[486]	valid_0's rmse: 1.17258
[487]	valid_0's rmse: 1.17224
[488]	valid_0's rmse: 1.17189
[489]	valid_0's rmse: 1.17152
[490]	valid_0's rmse: 1.17117
[491]	valid_0's rmse: 1.17107
[492]	valid_0's rmse: 1.17102
[493]	valid_0's rmse: 1.17095
[494]	valid_0's rmse: 1.17088
[495]	valid_0's rmse: 1.17078
[496]	valid_0's rmse: 1.17074
[497]	valid_0's rmse: 1.17042
[498]	valid_0's rmse: 1.17014
[499]	valid_0's rmse: 1.16989
[500]	valid_0's rmse: 1.16976
Did not meet early stopping. Best iteration is:
[500]	valid_0's rmse: 1.16976
[1]	valid_0's rmse: 2.55483
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 2.50786
[3]	valid_0's rmse: 2.46204
[4]	valid_0's rmse: 2.41899
[5]	valid_0's rmse: 2.37671
[6]	valid_0's rmse: 2.33686
[7]	valid_0's rmse: 2.29787
[8]	valid_0's rmse: 2.26059
[9]	val

[252]	valid_0's rmse: 1.34815
[253]	valid_0's rmse: 1.34779
[254]	valid_0's rmse: 1.34743
[255]	valid_0's rmse: 1.34711
[256]	valid_0's rmse: 1.34669
[257]	valid_0's rmse: 1.34649
[258]	valid_0's rmse: 1.34629
[259]	valid_0's rmse: 1.34605
[260]	valid_0's rmse: 1.34572
[261]	valid_0's rmse: 1.34551
[262]	valid_0's rmse: 1.34529
[263]	valid_0's rmse: 1.3451
[264]	valid_0's rmse: 1.34488
[265]	valid_0's rmse: 1.34458
[266]	valid_0's rmse: 1.34416
[267]	valid_0's rmse: 1.34372
[268]	valid_0's rmse: 1.3434
[269]	valid_0's rmse: 1.34307
[270]	valid_0's rmse: 1.34271
[271]	valid_0's rmse: 1.34242
[272]	valid_0's rmse: 1.34211
[273]	valid_0's rmse: 1.34186
[274]	valid_0's rmse: 1.34167
[275]	valid_0's rmse: 1.34138
[276]	valid_0's rmse: 1.34119
[277]	valid_0's rmse: 1.34103
[278]	valid_0's rmse: 1.34087
[279]	valid_0's rmse: 1.3407
[280]	valid_0's rmse: 1.34053
[281]	valid_0's rmse: 1.34017
[282]	valid_0's rmse: 1.33992
[283]	valid_0's rmse: 1.33956
[284]	valid_0's rmse: 1.33926
[285]	valid_0

In [42]:
for i in range(4):
    print('meter: '+ str(i))
    for model in meter_models[i]:
        print(model.best_score_['valid_0']['rmse'])


meter: 0
0.44402443647211
0.5037557146083765
0.5034573194220682
0.5056544663624474
0.5044811636157097
meter: 1
1.0805589828015705
1.0840114848170186
1.082725101682771
1.0857007964247567
1.083169370910824
meter: 2
1.1646970352062893
1.1689685395173655
1.167609959186031
1.1618060312661231
1.1697564843681851
meter: 3
1.298707785158492
1.2984848672246652
1.3002476563829886
1.298004684128228
1.2990536544240658


In [43]:
# Importance rank for first model in cross val models
for i in range(4):
    print('meter: '+ str(i))
    imprtc_df = pd.DataFrame()
    imprtc_df['feature'] = sample_train_X.drop('meter_reading_log1p', axis=1).columns   
    imprtc_df['importance'] = meter_models[i][0].feature_importances_
    imprtc_df.sort_values('importance', ascending=False, inplace= True)
    print(imprtc_df)


meter: 0
                         feature  importance
5                    meter_0_std         860
1                   meter_0_mean         663
2                 meter_0_median         489
21                       site_id         481
90                          hour         400
48    precip_depth_1_hr_std_lag3         373
22                   primary_use         288
61    air_temperature_mean_lag72         269
4                    meter_0_max         245
63     air_temperature_min_lag72         236
62     air_temperature_max_lag72         234
71     dew_temperature_min_lag72         233
24               log_square_feet         213
79  sea_level_pressure_min_lag72         187
78  sea_level_pressure_max_lag72         180
69    dew_temperature_mean_lag72         177
70     dew_temperature_max_lag72         175
73  precip_depth_1_hr_mean_lag72         166
85         wind_speed_mean_lag72         153
89                     dayofweek         150
84      wind_direction_std_lag72         145
8

[91 rows x 2 columns]
meter: 3
                          feature  importance
20                    meter_3_std         649
16                   meter_3_mean         649
61     air_temperature_mean_lag72         388
19                    meter_3_max         370
5                     meter_0_std         338
17                 meter_3_median         336
90                           hour         322
63      air_temperature_min_lag72         319
62      air_temperature_max_lag72         237
1                    meter_0_mean         234
71      dew_temperature_min_lag72         218
69     dew_temperature_mean_lag72         183
81      wind_direction_mean_lag72         182
6                    meter_1_mean         178
48     precip_depth_1_hr_std_lag3         170
78   sea_level_pressure_max_lag72         169
35       air_temperature_min_lag3         162
9                     meter_1_max         160
21                        site_id         151
64      air_temperature_std_lag72         147
88 

In [44]:
# %%time
# ## Single fit single model

# gbm = LGBMRegressor(**gbm_params)
# f_train_X, f_train_y = getInFoldXY(train.index)
# gbm.fit(f_train_X, f_train_y)

In [45]:
# Generate test_X
test_X = x_pipes.transform(
    test
        .merge(building_pipes.transform(building), on='building_id', how='left').drop(['row_id'], axis=1)
        .merge(weather_pipes.transform(weather_test), on=['site_id', 'timestamp'], how='left')
    )

print(test_X.sample(n=20,  random_state=42))
print(test_X.shape)
print(test_X.dtypes)

         meter  meter_0_mean  meter_0_median  meter_0_min  meter_0_max  \
3573457      0      4.832031        4.785156     3.177734     5.453125   
8315486      1      2.617188        2.638672     1.054688     3.656250   
40305643     2      0.000000        0.000000     0.000000     0.000000   
16083617     0      1.302734        1.029297     0.000000     2.785156   
37204119     2      5.789062        5.761719     5.511719     6.179688   
32144852     1      4.425781        4.429688     0.000000     5.359375   
5105044      0      5.921875        5.910156     4.453125     6.140625   
36982844     1      5.328125        6.355469     0.000000     6.632812   
20487823     2      5.406250        5.531250     0.000000     8.062500   
8404196      1      5.562500        5.515625     3.964844     6.082031   
6889602      0      3.972656        3.908203     0.000000     5.429688   
16963616     0      6.503906        6.460938     6.082031     6.972656   
39666699     2      4.281250        4.

[20 rows x 91 columns]
(41697600, 91)
meter                            category
meter_0_mean                      float16
meter_0_median                    float16
meter_0_min                       float16
meter_0_max                       float16
meter_0_std                       float16
meter_1_mean                      float16
meter_1_median                    float16
meter_1_min                       float16
meter_1_max                       float16
meter_1_std                       float16
meter_2_mean                      float16
meter_2_median                    float16
meter_2_min                       float16
meter_2_max                       float16
meter_2_std                       float16
meter_3_mean                      float16
meter_3_median                    float16
meter_3_min                       float16
meter_3_max                       float16
meter_3_std                       float16
site_id                          category
primary_use                      categ

In [46]:
def predMeters(test_X):
    test_y = test_X[['meter']] 
    test_y['meter_reading_log1p'] = np.nan
    for i in range(4):
        X = test_X[test_X['meter'] == i]
        if X.shape[0] > 0:
            preds = np.expm1(sum([model.predict(X) for model in meter_models[i]])/folds)
            test_y.loc[test_y['meter'] == i, 'meter_reading_log1p'] = preds
    return test_y['meter_reading_log1p'].tolist()
    
print(predMeters(test_X.sample(n=20,  random_state=42)))    


[138.3789650113943, 9.587109971107767, 74.71525386468431, 1.8335104109099847, 1197.5802655883576, 8.352500391811056, 344.6668009083402, 1234.49578715777, 340.2959327776593, 103.4015938306154, 42.710519111500545, 722.6858687481617, 1152.189154998877, 36.802299267520205, 64.23895492762244, 58.14238222819031, 15.389941928000397, 73.6037674398394, 49.066690589135916, 177.20253032660617]


In [47]:
# Predict using cross val models ensemble 
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
    res.append(predMeters(test_X.iloc[i:i+step_size]))
    i+=step_size
    gc.collect()


100%|████████████████████████████████████████████████████████████████████████████████| 834/834 [48:21<00:00,  3.94s/it]


In [48]:
# Save using cross val models ensemble 
res = np.concatenate(res)
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission_meter.csv.zip', index=False)
submission.shape

41697600


(41697600, 2)

In [49]:
# # Predict single model fit
# i=0
# res=[]
# step_size = 50000
# for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
#    #res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
#    res.append(np.expm1(gbm.predict(test_X.iloc[i:i+step_size])))
#    i+=step_size
    