In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [2]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float16, 'year_built': np.float16, 'floor_count': np.float16},
    'weather' : {'site_id': np.int8, 'air_temperature': np.float16, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16}
}

file_loc = {}    
for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
    for name in ['building_metadata','weather_train','weather_test','train','test']:
        if path.exists(dir_path + name + '.csv'):
            file_loc[name]= dir_path + name + '.csv'
    
    building = pd.read_csv(file_loc['building_metadata'], dtype=file_dtype['building_metadata'])
    weather_train = pd.read_csv(file_loc['weather_train'], dtype=file_dtype['weather'])
    weather_test = pd.read_csv(file_loc['weather_test'], dtype=file_dtype['weather'])
    train = pd.read_csv(file_loc['train'], dtype=file_dtype['train'])
    test = pd.read_csv(file_loc['test'], dtype=file_dtype['test'])

#print(weather_train.isna().sum())    
#weather_train = imputeWeather(weather_train, limit_direction='both')
#add_lag_feature(weather_train, window=3)
#add_lag_feature(weather_train, window=24)
#add_lag_feature(weather_train, window=72)
#print(weather_train.isna().sum())
#weather_test = imputeWeather(weather_test, limit_direction='both')
#add_lag_feature(weather_test, window=3)
#add_lag_feature(weather_test, window=24)
#add_lag_feature(weather_test, window=72)
    
#train = train.merge(building, on='building_id', how='left')
#test = test.merge(building, on='building_id', how='left')
#train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
#test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')

#del weather_train, weather_test
#gc.collect()

In [3]:
class ImputeWeather(TransformerMixin):

    def __init__(self, method:str='linear', gap_limit:int=None, limit_direction:str='forward'):
        self._method = method
        self._gap_limit = gap_limit
        self._limit_direction = limit_direction
        
    def transform(self, weather_df, **transform_params):
        grouped_weather_df = weather_df.groupby('site_id').apply(lambda group: group.interpolate(method=self._method, limit=self._gap_limit, limit_direction=self._limit_direction))
        if 'cloud_coverage' in grouped_weather_df.columns:
            grouped_weather_df['cloud_coverage'] = grouped_weather_df['cloud_coverage'].round(decimals=0).clip(0,8)
        grouped_weather_df.reset_index(inplace=True)
        return grouped_weather_df.drop('index', axis=1)

    def fit(self, X, y=None, **fit_params):
        return self

In [4]:
class AddWeatherLags(TransformerMixin):
    
    def __init__(self, window):
        self._window = window
        
    def transform(self, weather_df, **transform_params):
        group_df = weather_df.groupby(['site_id'])
        cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
        rolled = group_df[cols].rolling(window=self._window, min_periods=0)
        lag_mean = rolled.mean().reset_index().astype(np.float16)
        lag_max = rolled.max().reset_index().astype(np.float16)
        lag_min = rolled.min().reset_index().astype(np.float16)
        lag_std = rolled.std().reset_index().astype(np.float16)
        for col in cols:
            weather_df[f'{col}_mean_lag{self._window}'] = lag_mean[col]
            weather_df[f'{col}_max_lag{self._window}'] = lag_max[col]
            weather_df[f'{col}_min_lag{self._window}'] = lag_min[col]
            weather_df[f'{col}_std_lag{self._window}'] = lag_std[col]
        return weather_df

    def fit(self, X, y=None, **fit_params):
        return self

In [5]:
class AddWeather(TransformerMixin):

    def __init__(self, weather_df):
        self._b_df = weather_df
        
    def transform(self, df, **transform_params):
        
        return df.merge(weather_test, on=['site_id', 'timestamp'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self

In [6]:
class AddBuilding(TransformerMixin):

    def __init__(self, building_df):
        self._b_df = building_df
        
    def transform(self, df, **transform_params):
        
        return df.merge(_b_df, on='building_id', how='left')

    def fit(self, X, y=None, **fit_params):
        return self

In [7]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [8]:
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        #north_america = holidays.CA() + holidays.US() + holidays.MX()
        us =  holidays.US();
        df['holiday'] = df['timestamp'].apply(lambda x: us.get(x))
        df['holiday'] = df['holiday'].astype('category')
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [9]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [10]:
class SetCatTypes(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['primary_use']= df['primary_use'].astype('category')
        df['meter'] = df["meter"].astype('category')
        df['site_id'] = df["site_id"].astype('category')
        df['building_id'] = df['building_id'].astype('category')
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [11]:
class ImputeCloudCoverage(TransformerMixin):
        
    def transform(self, df, **transform_params):
        # set age of building to mediam of site_id
        # else if set ot overall median
        median = df['cloud_coverage'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['cloud_coverage'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = i_median
            else:
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = median
        df['cloud_coverage'] = np.uint8(df['cloud_coverage'])
        df['cloud_coverage'] = df['cloud_coverage']
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [12]:
class CloudTimeCat(TransformerMixin):
        
    def transform(self, df, **transform_params):
        tempDf = df[['cloud_coverage', 'hour']].astype('int')
        tempDf['cloud_coverage'] = (tempDf['cloud_coverage']).astype('int')
        tempDf['hour'] = (tempDf['hour']).astype('int')
        tempDf = tempDf.astype('str')
        df['cloud_time_cat'] = 'c' + tempDf['cloud_coverage'] + 't' + tempDf['hour']
        df['cloud_time_cat'] = df['cloud_time_cat'].astype('category')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [13]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        return df.drop(self._drop_cols, axis=1)

    def fit(self, X, y=None, **fit_params):
        return self

In [14]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        year_built_median = df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [15]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [16]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('category') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = (df['timestamp'].dt.dayofweek * 24) + df['timestamp'].dt.hour
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('category')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [17]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [18]:
class FillMean(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].mean())
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [19]:
class FillZeros(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(0)
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [20]:
class FillMedian(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].median())
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [21]:
class FillPopular(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].value_counts()[0])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [22]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [23]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [24]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [25]:
weather_pipes = Pipeline(
    steps=[
        ('imputeWeather', ImputeWeather()),
        ('fillMean',FillMean(['air_temperature','dew_temperature'
                              , 'precip_depth_1_hr', 'sea_level_pressure'])),
        ('imputeCloudCoverage', ImputeCloudCoverage()),
        ('addWeatherLags3', AddWeatherLags(3)),
        ('addWeatherLags72', AddWeatherLags(72)),
    ]
)

building_pipes = Pipeline(
    steps=[
        ('logSquareFeet', LogSquareFeet()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('fillMean',FillMean(['floor_count'])),
        ('dropClos', DropCols(['square_feet', 'year_built'])),
    ]
)


# pre_a_pipes is for preprocessing that doesn't change impute
# values
x_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        ('convertToDatetime', ConvertToDatetime()),
        ('AddHolidays', AddHolidays()),
        ('addRelativeHumidity',AddRelativeHumidity()),
        ('addTimeFeatures', AddTimeFeatures()),
        ('setCatTypes', SetCatTypes()),
        ('fillMean',FillMean([])),
        ('fillZeros',FillZeros([])),
        ('dropClos', DropCols(['timestamp'])),
        ('GC', GC())
    ]
)

In [26]:
building_trans= building_pipes.transform(building)
weather_train_trans = weather_pipes.transform(weather_train)
weather_test_trans = weather_pipes.transform(weather_test)

test_X = x_pipes.transform(
    test
        .merge(building_trans, on='building_id', how='left').drop(['row_id'], axis=1)
        .merge(weather_test_trans, on=['site_id', 'timestamp'], how='left')
    )

#train_X = x_pipes.transform(
#    train
#        .merge(building_trans, on='building_id', how='left')
#        .merge(weather_train_trans, on=['site_id', 'timestamp'], how='left')
#    )

#print(test_X.shape)
#print(train_X.sample(n=20,  random_state=42))

In [27]:
print(test_X.shape)

(41697600, 73)


In [28]:
def getOutsideFoldXY(train_index):
    X = train.iloc[train_index].drop('meter_reading', axis=1)
    X_buildings = building[building['building_id'].isin(X['building_id'].unique())]
    X_weather = building[building['building_id'].isin(X['building_id'].unique())]
    X = x_pipes.transform(
        X
            .merge(building_pipes.transform(X_buildings), on='building_id', how='left')
            .merge(weather_train_trans, on=['site_id', 'timestamp'], how='left')
        )
    f_train_y = np.log1p(train.iloc[train_index]['meter_reading'])
    print(X.columns)
    return X,f_train_y



def getInFoldXY(train_index):
    X = train.iloc[train_index].drop('meter_reading', axis=1)
    X_buildings = building[building['building_id'].isin(X['building_id'].unique())]
    X = X.merge(building_pipes.transform(X_buildings), on='building_id', how='left')
    X_weather = weather_train[
        (weather_train['site_id'].isin(X['site_id'].unique())) 
         & (weather_train['timestamp'].isin(X['timestamp'].unique())) 
    ]
    X = x_pipes.transform(
        X.merge(weather_pipes.transform(X_weather), how='left'))
    f_train_y = np.log1p(train.iloc[train_index]['meter_reading'])
    return X, f_train_y


print(getInFoldXY(train.head(10).index))

(  building_id meter site_id          primary_use  floor_count  \
0           0     0       0            Education     3.740234   
1           1     0       0            Education     3.740234   
2           2     0       0            Education     3.740234   
3           3     0       0            Education     3.740234   
4           4     0       0            Education     3.740234   
5           5     0       0            Education     3.740234   
6           6     0       0  Lodging/residential     3.740234   
7           7     0       0            Education     3.740234   
8           8     0       0            Education     3.740234   
9           9     0       0               Office     3.740234   

   log_square_feet  building_age  air_temperature  cloud_coverage  \
0         8.914062           108             25.0               6   
1         7.910156           104             25.0               6   
2         8.585938            91             25.0               6   
3      

In [29]:
%%time

folds = 5

# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

models = []
best_scores = []
for train_index, val_index in kf.split(train, train['building_id']):
    f_train_X, f_train_y = getInFoldXY(train_index)
    f_val_X, f_val_y = getInFoldXY(val_index)
    gbm = LGBMRegressor(n_estimators=500, # for accuracy use large numbers like 6000 
                  learning_rate=0.4,
                  feature_fraction=0.9,
                  subsample=0.1,  # 
                  subsample_freq=1,
                  num_leaves=20,
                  max_depth=10,
                  metric='rmse',
                  lambda_l1= 1,  
                  lambda_l2= 1,
                  verbose= 100)
    gbm.fit(f_train_X, f_train_y,
        eval_set=[(f_val_X, f_val_y)],
        eval_metric=lbm_rmslee,
        early_stopping_rounds=20)
    models.append(gbm)
    #y_pred = gbm.predict(f_val_X, num_iteration=gbm.best_iteration_)
    # eval
    #rmsle_score = lbm_rmslee(f_val_X, y_pred)[1]
    best_scores.append(gbm.best_score_)
    gc.collect()


[1]	valid_0's rmse: 1.86924	valid_0's RMSLEE: 1.86924
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.73784	valid_0's RMSLEE: 1.73784
[3]	valid_0's rmse: 1.60845	valid_0's RMSLEE: 1.60845
[4]	valid_0's rmse: 1.43966	valid_0's RMSLEE: 1.43966
[5]	valid_0's rmse: 1.40451	valid_0's RMSLEE: 1.40451
[6]	valid_0's rmse: 1.36456	valid_0's RMSLEE: 1.36456
[7]	valid_0's rmse: 1.29051	valid_0's RMSLEE: 1.29051
[8]	valid_0's rmse: 1.27562	valid_0's RMSLEE: 1.27562
[9]	valid_0's rmse: 1.26495	valid_0's RMSLEE: 1.26495
[10]	valid_0's rmse: 1.2209	valid_0's RMSLEE: 1.2209
[11]	valid_0's rmse: 1.2109	valid_0's RMSLEE: 1.2109
[12]	valid_0's rmse: 1.19104	valid_0's RMSLEE: 1.19104
[13]	valid_0's rmse: 1.16822	valid_0's RMSLEE: 1.16822
[14]	valid_0's rmse: 1.15101	valid_0's RMSLEE: 1.15101
[15]	valid_0's rmse: 1.14088	valid_0's RMSLEE: 1.14088
[16]	valid_0's rmse: 1.1301	valid_0's RMSLEE: 1.1301
[17]	valid_0's rmse: 1.12117	valid_0's RMSLEE: 1.12117
[18]	valid_0's rmse

[145]	valid_0's rmse: 0.897442	valid_0's RMSLEE: 0.897442
[146]	valid_0's rmse: 0.896833	valid_0's RMSLEE: 0.896833
[147]	valid_0's rmse: 0.896558	valid_0's RMSLEE: 0.896558
[148]	valid_0's rmse: 0.896291	valid_0's RMSLEE: 0.896291
[149]	valid_0's rmse: 0.895628	valid_0's RMSLEE: 0.895628
[150]	valid_0's rmse: 0.894901	valid_0's RMSLEE: 0.894901
[151]	valid_0's rmse: 0.894377	valid_0's RMSLEE: 0.894377
[152]	valid_0's rmse: 0.893881	valid_0's RMSLEE: 0.893881
[153]	valid_0's rmse: 0.893377	valid_0's RMSLEE: 0.893377
[154]	valid_0's rmse: 0.893155	valid_0's RMSLEE: 0.893155
[155]	valid_0's rmse: 0.892896	valid_0's RMSLEE: 0.892896
[156]	valid_0's rmse: 0.892597	valid_0's RMSLEE: 0.892597
[157]	valid_0's rmse: 0.892236	valid_0's RMSLEE: 0.892236
[158]	valid_0's rmse: 0.89184	valid_0's RMSLEE: 0.89184
[159]	valid_0's rmse: 0.891198	valid_0's RMSLEE: 0.891198
[160]	valid_0's rmse: 0.889446	valid_0's RMSLEE: 0.889446
[161]	valid_0's rmse: 0.889162	valid_0's RMSLEE: 0.889162
[162]	valid_0's 

[286]	valid_0's rmse: 0.848409	valid_0's RMSLEE: 0.848409
[287]	valid_0's rmse: 0.847976	valid_0's RMSLEE: 0.847976
[288]	valid_0's rmse: 0.847821	valid_0's RMSLEE: 0.847821
[289]	valid_0's rmse: 0.847706	valid_0's RMSLEE: 0.847706
[290]	valid_0's rmse: 0.846751	valid_0's RMSLEE: 0.846751
[291]	valid_0's rmse: 0.846555	valid_0's RMSLEE: 0.846555
[292]	valid_0's rmse: 0.846459	valid_0's RMSLEE: 0.846459
[293]	valid_0's rmse: 0.846267	valid_0's RMSLEE: 0.846267
[294]	valid_0's rmse: 0.846064	valid_0's RMSLEE: 0.846064
[295]	valid_0's rmse: 0.845919	valid_0's RMSLEE: 0.845919
[296]	valid_0's rmse: 0.845825	valid_0's RMSLEE: 0.845825
[297]	valid_0's rmse: 0.845427	valid_0's RMSLEE: 0.845427
[298]	valid_0's rmse: 0.845152	valid_0's RMSLEE: 0.845152
[299]	valid_0's rmse: 0.845002	valid_0's RMSLEE: 0.845002
[300]	valid_0's rmse: 0.844412	valid_0's RMSLEE: 0.844412
[301]	valid_0's rmse: 0.844351	valid_0's RMSLEE: 0.844351
[302]	valid_0's rmse: 0.844256	valid_0's RMSLEE: 0.844256
[303]	valid_0'

[427]	valid_0's rmse: 0.818764	valid_0's RMSLEE: 0.818764
[428]	valid_0's rmse: 0.818655	valid_0's RMSLEE: 0.818655
[429]	valid_0's rmse: 0.818558	valid_0's RMSLEE: 0.818558
[430]	valid_0's rmse: 0.818468	valid_0's RMSLEE: 0.818468
[431]	valid_0's rmse: 0.818413	valid_0's RMSLEE: 0.818413
[432]	valid_0's rmse: 0.818404	valid_0's RMSLEE: 0.818404
[433]	valid_0's rmse: 0.818255	valid_0's RMSLEE: 0.818255
[434]	valid_0's rmse: 0.818154	valid_0's RMSLEE: 0.818154
[435]	valid_0's rmse: 0.818064	valid_0's RMSLEE: 0.818064
[436]	valid_0's rmse: 0.81798	valid_0's RMSLEE: 0.81798
[437]	valid_0's rmse: 0.817903	valid_0's RMSLEE: 0.817903
[438]	valid_0's rmse: 0.817906	valid_0's RMSLEE: 0.817906
[439]	valid_0's rmse: 0.817812	valid_0's RMSLEE: 0.817812
[440]	valid_0's rmse: 0.81765	valid_0's RMSLEE: 0.81765
[441]	valid_0's rmse: 0.817528	valid_0's RMSLEE: 0.817528
[442]	valid_0's rmse: 0.817454	valid_0's RMSLEE: 0.817454
[443]	valid_0's rmse: 0.817395	valid_0's RMSLEE: 0.817395
[444]	valid_0's rm

[68]	valid_0's rmse: 0.967563	valid_0's RMSLEE: 0.967563
[69]	valid_0's rmse: 0.966716	valid_0's RMSLEE: 0.966716
[70]	valid_0's rmse: 0.964845	valid_0's RMSLEE: 0.964845
[71]	valid_0's rmse: 0.963993	valid_0's RMSLEE: 0.963993
[72]	valid_0's rmse: 0.961735	valid_0's RMSLEE: 0.961735
[73]	valid_0's rmse: 0.96088	valid_0's RMSLEE: 0.96088
[74]	valid_0's rmse: 0.959687	valid_0's RMSLEE: 0.959687
[75]	valid_0's rmse: 0.958849	valid_0's RMSLEE: 0.958849
[76]	valid_0's rmse: 0.957912	valid_0's RMSLEE: 0.957912
[77]	valid_0's rmse: 0.957053	valid_0's RMSLEE: 0.957053
[78]	valid_0's rmse: 0.953565	valid_0's RMSLEE: 0.953565
[79]	valid_0's rmse: 0.952844	valid_0's RMSLEE: 0.952844
[80]	valid_0's rmse: 0.951793	valid_0's RMSLEE: 0.951793
[81]	valid_0's rmse: 0.950789	valid_0's RMSLEE: 0.950789
[82]	valid_0's rmse: 0.95005	valid_0's RMSLEE: 0.95005
[83]	valid_0's rmse: 0.948676	valid_0's RMSLEE: 0.948676
[84]	valid_0's rmse: 0.947931	valid_0's RMSLEE: 0.947931
[85]	valid_0's rmse: 0.947024	valid

[210]	valid_0's rmse: 0.862582	valid_0's RMSLEE: 0.862582
[211]	valid_0's rmse: 0.861327	valid_0's RMSLEE: 0.861327
[212]	valid_0's rmse: 0.860536	valid_0's RMSLEE: 0.860536
[213]	valid_0's rmse: 0.860222	valid_0's RMSLEE: 0.860222
[214]	valid_0's rmse: 0.85979	valid_0's RMSLEE: 0.85979
[215]	valid_0's rmse: 0.859518	valid_0's RMSLEE: 0.859518
[216]	valid_0's rmse: 0.859364	valid_0's RMSLEE: 0.859364
[217]	valid_0's rmse: 0.859146	valid_0's RMSLEE: 0.859146
[218]	valid_0's rmse: 0.859013	valid_0's RMSLEE: 0.859013
[219]	valid_0's rmse: 0.858621	valid_0's RMSLEE: 0.858621
[220]	valid_0's rmse: 0.858399	valid_0's RMSLEE: 0.858399
[221]	valid_0's rmse: 0.858186	valid_0's RMSLEE: 0.858186
[222]	valid_0's rmse: 0.857902	valid_0's RMSLEE: 0.857902
[223]	valid_0's rmse: 0.857658	valid_0's RMSLEE: 0.857658
[224]	valid_0's rmse: 0.857482	valid_0's RMSLEE: 0.857482
[225]	valid_0's rmse: 0.857207	valid_0's RMSLEE: 0.857207
[226]	valid_0's rmse: 0.85678	valid_0's RMSLEE: 0.85678
[227]	valid_0's rm

[351]	valid_0's rmse: 0.823929	valid_0's RMSLEE: 0.823929
[352]	valid_0's rmse: 0.823839	valid_0's RMSLEE: 0.823839
[353]	valid_0's rmse: 0.823681	valid_0's RMSLEE: 0.823681
[354]	valid_0's rmse: 0.82351	valid_0's RMSLEE: 0.82351
[355]	valid_0's rmse: 0.82339	valid_0's RMSLEE: 0.82339
[356]	valid_0's rmse: 0.82321	valid_0's RMSLEE: 0.82321
[357]	valid_0's rmse: 0.823049	valid_0's RMSLEE: 0.823049
[358]	valid_0's rmse: 0.822559	valid_0's RMSLEE: 0.822559
[359]	valid_0's rmse: 0.822351	valid_0's RMSLEE: 0.822351
[360]	valid_0's rmse: 0.822304	valid_0's RMSLEE: 0.822304
[361]	valid_0's rmse: 0.822042	valid_0's RMSLEE: 0.822042
[362]	valid_0's rmse: 0.821947	valid_0's RMSLEE: 0.821947
[363]	valid_0's rmse: 0.821886	valid_0's RMSLEE: 0.821886
[364]	valid_0's rmse: 0.821696	valid_0's RMSLEE: 0.821696
[365]	valid_0's rmse: 0.821566	valid_0's RMSLEE: 0.821566
[366]	valid_0's rmse: 0.821487	valid_0's RMSLEE: 0.821487
[367]	valid_0's rmse: 0.821345	valid_0's RMSLEE: 0.821345
[368]	valid_0's rmse

[492]	valid_0's rmse: 0.803285	valid_0's RMSLEE: 0.803285
[493]	valid_0's rmse: 0.803188	valid_0's RMSLEE: 0.803188
[494]	valid_0's rmse: 0.803068	valid_0's RMSLEE: 0.803068
[495]	valid_0's rmse: 0.802944	valid_0's RMSLEE: 0.802944
[496]	valid_0's rmse: 0.802932	valid_0's RMSLEE: 0.802932
[497]	valid_0's rmse: 0.802549	valid_0's RMSLEE: 0.802549
[498]	valid_0's rmse: 0.802271	valid_0's RMSLEE: 0.802271
[499]	valid_0's rmse: 0.802195	valid_0's RMSLEE: 0.802195
[500]	valid_0's rmse: 0.802089	valid_0's RMSLEE: 0.802089
Did not meet early stopping. Best iteration is:
[500]	valid_0's rmse: 0.802089	valid_0's RMSLEE: 0.802089
[1]	valid_0's rmse: 1.86753	valid_0's RMSLEE: 1.86753
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.73419	valid_0's RMSLEE: 1.73419
[3]	valid_0's rmse: 1.60041	valid_0's RMSLEE: 1.60041
[4]	valid_0's rmse: 1.54326	valid_0's RMSLEE: 1.54326
[5]	valid_0's rmse: 1.3978	valid_0's RMSLEE: 1.3978
[6]	valid_0's rmse: 1.36929	valid_0's RMSLE

[134]	valid_0's rmse: 0.909086	valid_0's RMSLEE: 0.909086
[135]	valid_0's rmse: 0.90801	valid_0's RMSLEE: 0.90801
[136]	valid_0's rmse: 0.90711	valid_0's RMSLEE: 0.90711
[137]	valid_0's rmse: 0.906513	valid_0's RMSLEE: 0.906513
[138]	valid_0's rmse: 0.906069	valid_0's RMSLEE: 0.906069
[139]	valid_0's rmse: 0.905387	valid_0's RMSLEE: 0.905387
[140]	valid_0's rmse: 0.904693	valid_0's RMSLEE: 0.904693
[141]	valid_0's rmse: 0.903937	valid_0's RMSLEE: 0.903937
[142]	valid_0's rmse: 0.902338	valid_0's RMSLEE: 0.902338
[143]	valid_0's rmse: 0.901163	valid_0's RMSLEE: 0.901163
[144]	valid_0's rmse: 0.900865	valid_0's RMSLEE: 0.900865
[145]	valid_0's rmse: 0.900578	valid_0's RMSLEE: 0.900578
[146]	valid_0's rmse: 0.900197	valid_0's RMSLEE: 0.900197
[147]	valid_0's rmse: 0.899593	valid_0's RMSLEE: 0.899593
[148]	valid_0's rmse: 0.89916	valid_0's RMSLEE: 0.89916
[149]	valid_0's rmse: 0.898678	valid_0's RMSLEE: 0.898678
[150]	valid_0's rmse: 0.898285	valid_0's RMSLEE: 0.898285
[151]	valid_0's rmse

[275]	valid_0's rmse: 0.848563	valid_0's RMSLEE: 0.848563
[276]	valid_0's rmse: 0.84844	valid_0's RMSLEE: 0.84844
[277]	valid_0's rmse: 0.848243	valid_0's RMSLEE: 0.848243
[278]	valid_0's rmse: 0.848066	valid_0's RMSLEE: 0.848066
[279]	valid_0's rmse: 0.847854	valid_0's RMSLEE: 0.847854
[280]	valid_0's rmse: 0.847665	valid_0's RMSLEE: 0.847665
[281]	valid_0's rmse: 0.846614	valid_0's RMSLEE: 0.846614
[282]	valid_0's rmse: 0.846454	valid_0's RMSLEE: 0.846454
[283]	valid_0's rmse: 0.846289	valid_0's RMSLEE: 0.846289
[284]	valid_0's rmse: 0.846149	valid_0's RMSLEE: 0.846149
[285]	valid_0's rmse: 0.846075	valid_0's RMSLEE: 0.846075
[286]	valid_0's rmse: 0.846003	valid_0's RMSLEE: 0.846003
[287]	valid_0's rmse: 0.845318	valid_0's RMSLEE: 0.845318
[288]	valid_0's rmse: 0.844761	valid_0's RMSLEE: 0.844761
[289]	valid_0's rmse: 0.844569	valid_0's RMSLEE: 0.844569
[290]	valid_0's rmse: 0.844265	valid_0's RMSLEE: 0.844265
[291]	valid_0's rmse: 0.843982	valid_0's RMSLEE: 0.843982
[292]	valid_0's 

[416]	valid_0's rmse: 0.8225	valid_0's RMSLEE: 0.8225
[417]	valid_0's rmse: 0.822428	valid_0's RMSLEE: 0.822428
[418]	valid_0's rmse: 0.822357	valid_0's RMSLEE: 0.822357
[419]	valid_0's rmse: 0.822273	valid_0's RMSLEE: 0.822273
[420]	valid_0's rmse: 0.822119	valid_0's RMSLEE: 0.822119
[421]	valid_0's rmse: 0.821966	valid_0's RMSLEE: 0.821966
[422]	valid_0's rmse: 0.821871	valid_0's RMSLEE: 0.821871
[423]	valid_0's rmse: 0.821841	valid_0's RMSLEE: 0.821841
[424]	valid_0's rmse: 0.821732	valid_0's RMSLEE: 0.821732
[425]	valid_0's rmse: 0.821654	valid_0's RMSLEE: 0.821654
[426]	valid_0's rmse: 0.820741	valid_0's RMSLEE: 0.820741
[427]	valid_0's rmse: 0.820635	valid_0's RMSLEE: 0.820635
[428]	valid_0's rmse: 0.820573	valid_0's RMSLEE: 0.820573
[429]	valid_0's rmse: 0.820432	valid_0's RMSLEE: 0.820432
[430]	valid_0's rmse: 0.820393	valid_0's RMSLEE: 0.820393
[431]	valid_0's rmse: 0.820239	valid_0's RMSLEE: 0.820239
[432]	valid_0's rmse: 0.82015	valid_0's RMSLEE: 0.82015
[433]	valid_0's rmse

[57]	valid_0's rmse: 0.986157	valid_0's RMSLEE: 0.986157
[58]	valid_0's rmse: 0.985348	valid_0's RMSLEE: 0.985348
[59]	valid_0's rmse: 0.984679	valid_0's RMSLEE: 0.984679
[60]	valid_0's rmse: 0.982979	valid_0's RMSLEE: 0.982979
[61]	valid_0's rmse: 0.981981	valid_0's RMSLEE: 0.981981
[62]	valid_0's rmse: 0.980905	valid_0's RMSLEE: 0.980905
[63]	valid_0's rmse: 0.976836	valid_0's RMSLEE: 0.976836
[64]	valid_0's rmse: 0.976437	valid_0's RMSLEE: 0.976437
[65]	valid_0's rmse: 0.975118	valid_0's RMSLEE: 0.975118
[66]	valid_0's rmse: 0.974328	valid_0's RMSLEE: 0.974328
[67]	valid_0's rmse: 0.973266	valid_0's RMSLEE: 0.973266
[68]	valid_0's rmse: 0.972071	valid_0's RMSLEE: 0.972071
[69]	valid_0's rmse: 0.970996	valid_0's RMSLEE: 0.970996
[70]	valid_0's rmse: 0.970637	valid_0's RMSLEE: 0.970637
[71]	valid_0's rmse: 0.97002	valid_0's RMSLEE: 0.97002
[72]	valid_0's rmse: 0.967722	valid_0's RMSLEE: 0.967722
[73]	valid_0's rmse: 0.966659	valid_0's RMSLEE: 0.966659
[74]	valid_0's rmse: 0.965979	val

[199]	valid_0's rmse: 0.871593	valid_0's RMSLEE: 0.871593
[200]	valid_0's rmse: 0.871261	valid_0's RMSLEE: 0.871261
[201]	valid_0's rmse: 0.870641	valid_0's RMSLEE: 0.870641
[202]	valid_0's rmse: 0.870223	valid_0's RMSLEE: 0.870223
[203]	valid_0's rmse: 0.87019	valid_0's RMSLEE: 0.87019
[204]	valid_0's rmse: 0.869902	valid_0's RMSLEE: 0.869902
[205]	valid_0's rmse: 0.869816	valid_0's RMSLEE: 0.869816
[206]	valid_0's rmse: 0.86967	valid_0's RMSLEE: 0.86967
[207]	valid_0's rmse: 0.869397	valid_0's RMSLEE: 0.869397
[208]	valid_0's rmse: 0.869111	valid_0's RMSLEE: 0.869111
[209]	valid_0's rmse: 0.86896	valid_0's RMSLEE: 0.86896
[210]	valid_0's rmse: 0.868795	valid_0's RMSLEE: 0.868795
[211]	valid_0's rmse: 0.867908	valid_0's RMSLEE: 0.867908
[212]	valid_0's rmse: 0.867605	valid_0's RMSLEE: 0.867605
[213]	valid_0's rmse: 0.867343	valid_0's RMSLEE: 0.867343
[214]	valid_0's rmse: 0.866958	valid_0's RMSLEE: 0.866958
[215]	valid_0's rmse: 0.866426	valid_0's RMSLEE: 0.866426
[216]	valid_0's rmse

[340]	valid_0's rmse: 0.835977	valid_0's RMSLEE: 0.835977
[341]	valid_0's rmse: 0.835833	valid_0's RMSLEE: 0.835833
[342]	valid_0's rmse: 0.83564	valid_0's RMSLEE: 0.83564
[343]	valid_0's rmse: 0.835463	valid_0's RMSLEE: 0.835463
[344]	valid_0's rmse: 0.835303	valid_0's RMSLEE: 0.835303
[345]	valid_0's rmse: 0.835167	valid_0's RMSLEE: 0.835167
[346]	valid_0's rmse: 0.83502	valid_0's RMSLEE: 0.83502
[347]	valid_0's rmse: 0.834361	valid_0's RMSLEE: 0.834361
[348]	valid_0's rmse: 0.834196	valid_0's RMSLEE: 0.834196
[349]	valid_0's rmse: 0.834122	valid_0's RMSLEE: 0.834122
[350]	valid_0's rmse: 0.83405	valid_0's RMSLEE: 0.83405
[351]	valid_0's rmse: 0.833919	valid_0's RMSLEE: 0.833919
[352]	valid_0's rmse: 0.833691	valid_0's RMSLEE: 0.833691
[353]	valid_0's rmse: 0.833544	valid_0's RMSLEE: 0.833544
[354]	valid_0's rmse: 0.83345	valid_0's RMSLEE: 0.83345
[355]	valid_0's rmse: 0.833464	valid_0's RMSLEE: 0.833464
[356]	valid_0's rmse: 0.833332	valid_0's RMSLEE: 0.833332
[357]	valid_0's rmse: 

[481]	valid_0's rmse: 0.815252	valid_0's RMSLEE: 0.815252
[482]	valid_0's rmse: 0.815231	valid_0's RMSLEE: 0.815231
[483]	valid_0's rmse: 0.815129	valid_0's RMSLEE: 0.815129
[484]	valid_0's rmse: 0.815141	valid_0's RMSLEE: 0.815141
[485]	valid_0's rmse: 0.815114	valid_0's RMSLEE: 0.815114
[486]	valid_0's rmse: 0.81505	valid_0's RMSLEE: 0.81505
[487]	valid_0's rmse: 0.814899	valid_0's RMSLEE: 0.814899
[488]	valid_0's rmse: 0.81479	valid_0's RMSLEE: 0.81479
[489]	valid_0's rmse: 0.814625	valid_0's RMSLEE: 0.814625
[490]	valid_0's rmse: 0.814468	valid_0's RMSLEE: 0.814468
[491]	valid_0's rmse: 0.814385	valid_0's RMSLEE: 0.814385
[492]	valid_0's rmse: 0.814014	valid_0's RMSLEE: 0.814014
[493]	valid_0's rmse: 0.8138	valid_0's RMSLEE: 0.8138
[494]	valid_0's rmse: 0.813687	valid_0's RMSLEE: 0.813687
[495]	valid_0's rmse: 0.813533	valid_0's RMSLEE: 0.813533
[496]	valid_0's rmse: 0.813448	valid_0's RMSLEE: 0.813448
[497]	valid_0's rmse: 0.813358	valid_0's RMSLEE: 0.813358
[498]	valid_0's rmse: 

[123]	valid_0's rmse: 0.917979	valid_0's RMSLEE: 0.917979
[124]	valid_0's rmse: 0.917579	valid_0's RMSLEE: 0.917579
[125]	valid_0's rmse: 0.916817	valid_0's RMSLEE: 0.916817
[126]	valid_0's rmse: 0.916197	valid_0's RMSLEE: 0.916197
[127]	valid_0's rmse: 0.915604	valid_0's RMSLEE: 0.915604
[128]	valid_0's rmse: 0.915086	valid_0's RMSLEE: 0.915086
[129]	valid_0's rmse: 0.914605	valid_0's RMSLEE: 0.914605
[130]	valid_0's rmse: 0.914212	valid_0's RMSLEE: 0.914212
[131]	valid_0's rmse: 0.913622	valid_0's RMSLEE: 0.913622
[132]	valid_0's rmse: 0.91322	valid_0's RMSLEE: 0.91322
[133]	valid_0's rmse: 0.91234	valid_0's RMSLEE: 0.91234
[134]	valid_0's rmse: 0.91182	valid_0's RMSLEE: 0.91182
[135]	valid_0's rmse: 0.910675	valid_0's RMSLEE: 0.910675
[136]	valid_0's rmse: 0.910324	valid_0's RMSLEE: 0.910324
[137]	valid_0's rmse: 0.910004	valid_0's RMSLEE: 0.910004
[138]	valid_0's rmse: 0.909335	valid_0's RMSLEE: 0.909335
[139]	valid_0's rmse: 0.908969	valid_0's RMSLEE: 0.908969
[140]	valid_0's rmse

[264]	valid_0's rmse: 0.853374	valid_0's RMSLEE: 0.853374
[265]	valid_0's rmse: 0.853243	valid_0's RMSLEE: 0.853243
[266]	valid_0's rmse: 0.853102	valid_0's RMSLEE: 0.853102
[267]	valid_0's rmse: 0.852894	valid_0's RMSLEE: 0.852894
[268]	valid_0's rmse: 0.852634	valid_0's RMSLEE: 0.852634
[269]	valid_0's rmse: 0.852267	valid_0's RMSLEE: 0.852267
[270]	valid_0's rmse: 0.852144	valid_0's RMSLEE: 0.852144
[271]	valid_0's rmse: 0.85193	valid_0's RMSLEE: 0.85193
[272]	valid_0's rmse: 0.851747	valid_0's RMSLEE: 0.851747
[273]	valid_0's rmse: 0.851635	valid_0's RMSLEE: 0.851635
[274]	valid_0's rmse: 0.851398	valid_0's RMSLEE: 0.851398
[275]	valid_0's rmse: 0.851088	valid_0's RMSLEE: 0.851088
[276]	valid_0's rmse: 0.850811	valid_0's RMSLEE: 0.850811
[277]	valid_0's rmse: 0.850471	valid_0's RMSLEE: 0.850471
[278]	valid_0's rmse: 0.850261	valid_0's RMSLEE: 0.850261
[279]	valid_0's rmse: 0.849937	valid_0's RMSLEE: 0.849937
[280]	valid_0's rmse: 0.84979	valid_0's RMSLEE: 0.84979
[281]	valid_0's rm

[406]	valid_0's rmse: 0.82282	valid_0's RMSLEE: 0.82282
[407]	valid_0's rmse: 0.822599	valid_0's RMSLEE: 0.822599
[408]	valid_0's rmse: 0.822325	valid_0's RMSLEE: 0.822325
[409]	valid_0's rmse: 0.822154	valid_0's RMSLEE: 0.822154
[410]	valid_0's rmse: 0.821672	valid_0's RMSLEE: 0.821672
[411]	valid_0's rmse: 0.821611	valid_0's RMSLEE: 0.821611
[412]	valid_0's rmse: 0.821361	valid_0's RMSLEE: 0.821361
[413]	valid_0's rmse: 0.821258	valid_0's RMSLEE: 0.821258
[414]	valid_0's rmse: 0.82106	valid_0's RMSLEE: 0.82106
[415]	valid_0's rmse: 0.820899	valid_0's RMSLEE: 0.820899
[416]	valid_0's rmse: 0.820795	valid_0's RMSLEE: 0.820795
[417]	valid_0's rmse: 0.820599	valid_0's RMSLEE: 0.820599
[418]	valid_0's rmse: 0.820532	valid_0's RMSLEE: 0.820532
[419]	valid_0's rmse: 0.820351	valid_0's RMSLEE: 0.820351
[420]	valid_0's rmse: 0.820165	valid_0's RMSLEE: 0.820165
[421]	valid_0's rmse: 0.820062	valid_0's RMSLEE: 0.820062
[422]	valid_0's rmse: 0.819982	valid_0's RMSLEE: 0.819982
[423]	valid_0's rm

In [30]:
for score in best_scores:
    print(score['valid_0']['RMSLEE'])

0.8097856451393137
0.8020886662879281
0.8098382565588232
0.813121048944298
0.809304454884437


In [31]:
imprtc_df = pd.DataFrame()
imprtc_df['feature'] = test_X.columns   
imprtc_df['importance'] = models[0].feature_importances_
imprtc_df.sort_values('importance', ascending=False, inplace= True)
print(imprtc_df)


                          feature  importance
0                     building_id        3548
1                           meter         897
72                           hour         662
29     precip_depth_1_hr_std_lag3         336
42     air_temperature_mean_lag72         280
44      air_temperature_min_lag72         258
43      air_temperature_max_lag72         231
52      dew_temperature_min_lag72         189
59   sea_level_pressure_max_lag72         176
50     dew_temperature_mean_lag72         139
60   sea_level_pressure_min_lag72         137
51      dew_temperature_max_lag72         130
2                         site_id         128
5                 log_square_feet         122
62      wind_direction_mean_lag72         115
66          wind_speed_mean_lag72         108
54   precip_depth_1_hr_mean_lag72         105
7                 air_temperature          98
49       cloud_coverage_std_lag72          98
65       wind_direction_std_lag72          97
45      air_temperature_std_lag72 

In [32]:
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
    res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
    i+=step_size
    
    
    


100%|████████████████████████████████████████████████████████████████████████████████| 834/834 [32:49<00:00,  2.26s/it]


In [33]:
res = np.concatenate(res)
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission.csv', index=False)
submission

41697600


Unnamed: 0,row_id,meter_reading
0,0,1.655565
1,1,1.937839
2,2,0.193740
3,3,3.292613
4,4,0.863788
5,5,0.163876
6,6,2.212237
7,7,1.682488
8,8,48.827745
9,9,1.020517


In [None]:
#
#
#
#
#
# everything below is older stuff - please ignore
#
#
#
#
#

In [226]:

gbm=LGBMRegressor(n_estimators=500, # for accuracy use large numbers like 6000 
                  learning_rate=0.1,
                  feature_fraction=0.9,
                  subsample=0.1,  # batches of 20% of the data
                  subsample_freq=1,
                  num_leaves=160,
                  max_depth=10,
                  metric='rmse',
                  verbose= 100)


In [None]:
# Cross val testing - can be skipped
scores = cross_val_score(gbm, train_X, train_y, cv=5, 
                        scoring=rmsee_scorer)
print('rmsee scores:\n', scores)


In [84]:
del scores
gc.collect()

87

In [96]:
# Grid param search - can be skpped
grid_param = {
    'gbm__n_estimators': [500],
    'gbm__subsample': [0.1],
    'gbm__learning_rate': [0.1],
    'gbm__num_leaves': [80, 160],
    'gbm__max_depth': [11]
}

gd_sr = GridSearchCV(pipe,
                     param_grid=grid_param,
                     scoring=rmsee_scorer,
                     cv=4,
                     n_jobs=-1)

gd_sr.fit(train_X, train_y)

best_parameters = gd_sr.best_params_
print(best_parameters)
best_result = gd_sr.best_score_
print(best_result)


KeyboardInterrupt: 

In [27]:
sorted(gd_sr.cv_results_.keys())
for key in gd_sr.cv_results_.keys():
    print(str(key) + "  " + str(gd_sr.cv_results_[key]))


mean_fit_time  [607.31617959 748.329434  ]
std_fit_time  [11.42404894  7.16639039]
mean_score_time  [545.60431012 743.97350645]
std_score_time  [15.76616037 10.19180404]
param_gbm__learning_rate  [0.1 0.1]
param_gbm__max_depth  [11 11]
param_gbm__n_estimators  [500 500]
param_gbm__num_leaves  [80 160]
param_gbm__subsample  [0.1 0.1]
params  [{'gbm__learning_rate': 0.1, 'gbm__max_depth': 11, 'gbm__n_estimators': 500, 'gbm__num_leaves': 80, 'gbm__subsample': 0.1}, {'gbm__learning_rate': 0.1, 'gbm__max_depth': 11, 'gbm__n_estimators': 500, 'gbm__num_leaves': 160, 'gbm__subsample': 0.1}]
split0_test_score  [-1.50000092 -1.49220935]
split1_test_score  [-1.11290672 -1.10616635]
split2_test_score  [-1.46453371 -1.4708326 ]
mean_test_score  [-1.35914711 -1.35640277]
std_test_score  [0.17471926 0.17715895]
rank_test_score  [2 1]


In [28]:
del gd_sr
gc.collect

<function gc.collect(generation=2)>

In [60]:
# fit on all the data
pipe.fit(train_X, train_y, 
         gbm__eval_metric=rmsee, gbm__verbose=100)


Pipeline(memory=None,
         steps=[('pre_b_pipes',
                 Pipeline(memory=None,
                          steps=[('fillMean',
                                  <__main__.FillMean object at 0x0000010B91471EF0>),
                                 ('fillZeros',
                                  <__main__.FillZeros object at 0x0000010B96C7D128>),
                                 ('imputeCloudCoverage',
                                  <__main__.ImputeCloudCoverage object at 0x0000010B96C7D0B8>),
                                 ('imputeYearBuilt',
                                  <__main__.ImputeYearBuilt object at 0x0000010B96C7D6A0>),
                                 ('dro...
                               colsample_bytree=1.0, feature_fraction=0.9,
                               importance_type='split', learning_rate=0.1,
                               max_depth=11, metric='rmse',
                               min_child_samples=20, min_child_weight=0.001,
                

In [61]:
# Get features list for fit - can be skipped
imprtc_df = pd.DataFrame()
imprtc_df['feature'] = pre_b_pipes.transform(train_X).columns   
imprtc_df['importance'] = pipe.named_steps['gbm'].feature_importances_
imprtc_df.sort_values('importance', ascending=False, inplace= True)
print(imprtc_df)
print(imprtc_df.nsmallest(7,'importance')['feature'].tolist())



                          feature  importance
0                     building_id       16616
1                           meter        7207
69                           hour        4903
25     precip_depth_1_hr_std_lag3        2708
40      air_temperature_min_lag72        2258
38     air_temperature_mean_lag72        2243
39      air_temperature_max_lag72        2015
48      dew_temperature_min_lag72        1727
55   sea_level_pressure_max_lag72        1605
58      wind_direction_mean_lag72        1582
42      cloud_coverage_mean_lag72        1531
47      dew_temperature_max_lag72        1483
46     dew_temperature_mean_lag72        1477
61       wind_direction_std_lag72        1472
56   sea_level_pressure_min_lag72        1446
62          wind_speed_mean_lag72        1438
45       cloud_coverage_std_lag72        1378
41      air_temperature_std_lag72        1268
53    precip_depth_1_hr_std_lag72        1232
50   precip_depth_1_hr_mean_lag72        1205
65           wind_speed_std_lag72 

In [62]:
set_size = len(test_X)
iterations = 100
batch_size = set_size // iterations

print(set_size, iterations, batch_size)
assert set_size == iterations * batch_size

meter_reading = []
for i in tqdm(range(iterations)):
    pos = i*batch_size
    batch = np.expm1(pipe.predict(test_X.iloc[pos : pos+batch_size]).clip(0))
    meter_reading.extend(batch)

print(len(meter_reading))
assert len(meter_reading) == set_size

41697600 100 416976


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [16:11<00:00, 12.78s/it]


41697600


In [63]:
sub = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
# hack to prevent negative numbers
print(sub.shape)
sub['meter_reading'] = meter_reading
sub.to_csv('submission.csv', index = False)

(41697600, 2)
