In [282]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [283]:
def imputeWeather(weather_df, method:str='linear', gap_limit:int=None, limit_direction:str='forward'):
    grouped_weather_df = weather_df.groupby('site_id').apply(lambda group: group.interpolate(method=method, limit=gap_limit, limit_direction=limit_direction))
    
    if 'cloud_coverage' in grouped_weather_df.columns:
        grouped_weather_df['cloud_coverage'] = grouped_weather_df['cloud_coverage'].round(decimals=0).clip(0,8)
    grouped_weather_df.reset_index(inplace=True)
    return grouped_weather_df.drop('index', axis=1)
        

In [284]:
def add_lag_feature(weather_df, window=3):
    # https://www.kaggle.com/corochann/ashrae-training-lgbm-by-meter-type
    group_df = weather_df.groupby('site_id')
    cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
    rolled = group_df[cols].rolling(window=window, min_periods=0)
    lag_mean = rolled.mean().reset_index().astype(np.float16)
    lag_max = rolled.max().reset_index().astype(np.float16)
    lag_min = rolled.min().reset_index().astype(np.float16)
    lag_std = rolled.std().reset_index().astype(np.float16)
    for col in cols:
        weather_df[f'{col}_mean_lag{window}'] = lag_mean[col]
        weather_df[f'{col}_max_lag{window}'] = lag_max[col]
        weather_df[f'{col}_min_lag{window}'] = lag_min[col]
        weather_df[f'{col}_std_lag{window}'] = lag_std[col]

In [285]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float16, 'year_built': np.float16, 'floor_count': np.float16},
    'weather' : {'site_id': np.int8, 'air_temperature': np.float16, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16}
}

file_loc = {}    
for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
    for name in ['building_metadata','weather_train','weather_test','train','test']:
        if path.exists(dir_path + name + '.csv'):
            file_loc[name]= dir_path + name + '.csv'
    
    building = pd.read_csv(file_loc['building_metadata'], dtype=file_dtype['building_metadata'])
    weather_train = pd.read_csv(file_loc['weather_train'], dtype=file_dtype['weather'])
    weather_test = pd.read_csv(file_loc['weather_test'], dtype=file_dtype['weather'])
    train = pd.read_csv(file_loc['train'], dtype=file_dtype['train'])
    test = pd.read_csv(file_loc['test'], dtype=file_dtype['test'])

print(weather_train.isna().sum())    
weather_train = imputeWeather(weather_train, limit_direction='both')
add_lag_feature(weather_train, window=3)
#add_lag_feature(weather_train, window=24)
add_lag_feature(weather_train, window=72)
print(weather_train.isna().sum())
weather_test = imputeWeather(weather_test, limit_direction='both')
add_lag_feature(weather_test, window=3)
#add_lag_feature(weather_test, window=24)
add_lag_feature(weather_test, window=72)
    
train = train.merge(building, on='building_id', how='left')
test = test.merge(building, on='building_id', how='left')
train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')

del weather_train, weather_test
gc.collect()

site_id                   0
timestamp                 0
air_temperature          55
cloud_coverage        69173
dew_temperature         113
precip_depth_1_hr     50289
sea_level_pressure    10618
wind_direction         6268
wind_speed              304
dtype: int64
site_id                              0
timestamp                            0
air_temperature                      0
cloud_coverage                   17228
dew_temperature                      0
precip_depth_1_hr                26273
sea_level_pressure                8755
wind_direction                       0
wind_speed                           0
air_temperature_mean_lag3            0
air_temperature_max_lag3             0
air_temperature_min_lag3             0
air_temperature_std_lag3            16
cloud_coverage_mean_lag3         17228
cloud_coverage_max_lag3          17228
cloud_coverage_min_lag3          17228
cloud_coverage_std_lag3          17242
dew_temperature_mean_lag3            0
dew_temperature_max_lag3         

917

In [286]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [287]:
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        #north_america = holidays.CA() + holidays.US() + holidays.MX()
        us =  holidays.US();
        df['holiday'] = df['timestamp'].apply(lambda x: us.get(x))
        df['holiday'] = df['holiday'].astype('category')
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [288]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [289]:
class SetCatTypes(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['primary_use']= df['primary_use'].astype('category')
        df['meter'] = df["meter"].astype('category')
        df['site_id'] = df["site_id"].astype('category')
        df['building_id'] = df['building_id'].astype('category')
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [290]:
class ImputeCloudCoverage(TransformerMixin):
        
    def transform(self, df, **transform_params):
        # set age of building to mediam of site_id
        # else if set ot overall median
        median = df['cloud_coverage'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['cloud_coverage'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = i_median
            else:
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = median
        df['cloud_coverage'] = np.uint8(df['cloud_coverage'])
        df['cloud_coverage'] = df['cloud_coverage']
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [291]:
class CloudTimeCat(TransformerMixin):
        
    def transform(self, df, **transform_params):
        tempDf = df[['cloud_coverage', 'hour']].astype('int')
        tempDf['cloud_coverage'] = (tempDf['cloud_coverage']).astype('int')
        tempDf['hour'] = (tempDf['hour']).astype('int')
        tempDf = tempDf.astype('str')
        df['cloud_time_cat'] = 'c' + tempDf['cloud_coverage'] + 't' + tempDf['hour']
        df['cloud_time_cat'] = df['cloud_time_cat'].astype('category')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [292]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        return df.drop(self._drop_cols, axis=1)

    def fit(self, X, y=None, **fit_params):
        return self

In [293]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        year_built_median = df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [294]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [295]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('category') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = (df['timestamp'].dt.dayofweek * 24) + df['timestamp'].dt.hour
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('category')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [296]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [297]:
class FillMean(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].mean())
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [298]:
class FillZeros(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(0)
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [299]:
class FillMedian(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].median())
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [300]:
class FillPopular(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].value_counts()[0])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [301]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [302]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [303]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [304]:
# pre_a_pipes is for preprocessing that doesn't change impute
# values
pre_a_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        ('convertToDatetime', ConvertToDatetime()),
        ('AddHolidays', AddHolidays()),
        ('addRelativeHumidity',AddRelativeHumidity()),
        ('logSquareFeet', LogSquareFeet()),
        ('addTimeFeatures', AddTimeFeatures()),
        ('setCatTypes', SetCatTypes()),
    ]
)

In [305]:
# pre_b_pipes is for imputed values and final
# drops before modeling
drop_cols = ['timestamp', 'square_feet', 
             'year_built' ]
fill_w_mean = ['floor_count','air_temperature','dew_temperature', 
              'precip_depth_1_hr', 'sea_level_pressure']
fill_w_zero = []


pre_b_pipes = Pipeline(
    steps=[
        (('pre_a_pipe'),pre_a_pipes),
        ('fillMean',FillMean(fill_w_mean)),
        ('fillZeros',FillZeros(fill_w_zero)),
        ('imputeCloudCoverage', ImputeCloudCoverage()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('dropClos', DropCols(drop_cols)),
        ('GC', GC())
    ]
)

pipe = Pipeline(
    steps=[
        ('pre_b_pipes',pre_b_pipes),
        ('gbm', gbm)]
)

train_X = pre_b_pipes.transform(train.drop('meter_reading', axis=1))
test_X = pre_b_pipes.transform(test.drop(['row_id'], axis=1))
train_y = np.log1p(train['meter_reading'])

print(train_X.shape)
print(test_X.shape)
print(train_X.sample(n=20,  random_state=42))
print(train_X.dtypes)
print(train_y.shape)

del train, test
gc.collect()


(20216100, 73)
(41697600, 73)
         building_id meter site_id                    primary_use  \
14245562        1324     1      14  Entertainment/public assembly   
1282718         1013     0      10                      Education   
13883790         229     1       2                      Education   
4781820          217     3       2                      Education   
10415393        1434     0      15                      Education   
1057008         1047     0      12                Public services   
4507399          911     1       9                      Education   
19478829        1039     0      12                      Education   
8955615          265     0       2                         Office   
13799839         896     0       9                      Education   
15647011         973     0       9                         Office   
2524294          813     0       8                Public services   
10016102         870     0       8  Entertainment/public assembly   
3915

[20 rows x 73 columns]
building_id                      category
meter                            category
site_id                          category
primary_use                      category
floor_count                       float16
air_temperature                   float16
cloud_coverage                      uint8
dew_temperature                   float16
precip_depth_1_hr                 float16
sea_level_pressure                float16
wind_direction                    float16
wind_speed                        float16
air_temperature_mean_lag3         float16
air_temperature_max_lag3          float16
air_temperature_min_lag3          float16
air_temperature_std_lag3          float16
cloud_coverage_mean_lag3          float16
cloud_coverage_max_lag3           float16
cloud_coverage_min_lag3           float16
cloud_coverage_std_lag3           float16
dew_temperature_mean_lag3         float16
dew_temperature_max_lag3          float16
dew_temperature_min_lag3          float16
dew_tempera

101

In [306]:
print(test_X.shape)

(41697600, 73)


In [271]:
%%time

folds = 4

# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

models = []
best_scores = []
for train_index, val_index in kf.split(train_X, train_X['building_id']):
    f_train_X = train_X.iloc[train_index]
    f_train_y = train_y.iloc[train_index]
    lgb_train = lgb.Dataset(f_train_X,f_train_y)
    f_val_X = train_X.iloc[val_index]
    f_val_y = train_y.iloc[val_index]
    lgb_val = lgb.Dataset(f_val_X,f_val_y)
    gbm = LGBMRegressor(n_estimators=500, # for accuracy use large numbers like 6000 
                  learning_rate=0.4,
                  feature_fraction=0.9,
                  subsample=0.1,  # batches of 20% of the data
                  subsample_freq=1,
                  num_leaves=20,
                  max_depth=10,
                  metric='rmse',
                  verbose= 100)
    gbm.fit(f_train_X, f_train_y,
        eval_set=[(f_val_X, f_val_y)],
        eval_metric=lbm_rmslee,
        early_stopping_rounds=20)
    models.append(gbm)
    #y_pred = gbm.predict(f_val_X, num_iteration=gbm.best_iteration_)
    # eval
    #rmsle_score = lbm_rmslee(f_val_X, y_pred)[1]
    best_scores.append(gbm.best_score_)

[1]	valid_0's rmse: 1.86773	valid_0's RMSLEE: 1.86773
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.73568	valid_0's RMSLEE: 1.73568
[3]	valid_0's rmse: 1.57852	valid_0's RMSLEE: 1.57852
[4]	valid_0's rmse: 1.45606	valid_0's RMSLEE: 1.45606
[5]	valid_0's rmse: 1.35844	valid_0's RMSLEE: 1.35844
[6]	valid_0's rmse: 1.31915	valid_0's RMSLEE: 1.31915
[7]	valid_0's rmse: 1.29617	valid_0's RMSLEE: 1.29617
[8]	valid_0's rmse: 1.27799	valid_0's RMSLEE: 1.27799
[9]	valid_0's rmse: 1.25482	valid_0's RMSLEE: 1.25482
[10]	valid_0's rmse: 1.21649	valid_0's RMSLEE: 1.21649
[11]	valid_0's rmse: 1.20205	valid_0's RMSLEE: 1.20205
[12]	valid_0's rmse: 1.1753	valid_0's RMSLEE: 1.1753
[13]	valid_0's rmse: 1.16974	valid_0's RMSLEE: 1.16974
[14]	valid_0's rmse: 1.15	valid_0's RMSLEE: 1.15
[15]	valid_0's rmse: 1.14106	valid_0's RMSLEE: 1.14106
[16]	valid_0's rmse: 1.13701	valid_0's RMSLEE: 1.13701
[17]	valid_0's rmse: 1.10811	valid_0's RMSLEE: 1.10811
[18]	valid_0's rmse: 

[145]	valid_0's rmse: 0.894462	valid_0's RMSLEE: 0.894462
[146]	valid_0's rmse: 0.89383	valid_0's RMSLEE: 0.89383
[147]	valid_0's rmse: 0.893458	valid_0's RMSLEE: 0.893458
[148]	valid_0's rmse: 0.8932	valid_0's RMSLEE: 0.8932
[149]	valid_0's rmse: 0.892797	valid_0's RMSLEE: 0.892797
[150]	valid_0's rmse: 0.892721	valid_0's RMSLEE: 0.892721
[151]	valid_0's rmse: 0.892288	valid_0's RMSLEE: 0.892288
[152]	valid_0's rmse: 0.891849	valid_0's RMSLEE: 0.891849
[153]	valid_0's rmse: 0.89145	valid_0's RMSLEE: 0.89145
[154]	valid_0's rmse: 0.891105	valid_0's RMSLEE: 0.891105
[155]	valid_0's rmse: 0.890689	valid_0's RMSLEE: 0.890689
[156]	valid_0's rmse: 0.890148	valid_0's RMSLEE: 0.890148
[157]	valid_0's rmse: 0.88979	valid_0's RMSLEE: 0.88979
[158]	valid_0's rmse: 0.889518	valid_0's RMSLEE: 0.889518
[159]	valid_0's rmse: 0.889346	valid_0's RMSLEE: 0.889346
[160]	valid_0's rmse: 0.888963	valid_0's RMSLEE: 0.888963
[161]	valid_0's rmse: 0.88847	valid_0's RMSLEE: 0.88847
[162]	valid_0's rmse: 0.88

[286]	valid_0's rmse: 0.845811	valid_0's RMSLEE: 0.845811
[287]	valid_0's rmse: 0.84548	valid_0's RMSLEE: 0.84548
[288]	valid_0's rmse: 0.845187	valid_0's RMSLEE: 0.845187
[289]	valid_0's rmse: 0.844836	valid_0's RMSLEE: 0.844836
[290]	valid_0's rmse: 0.844159	valid_0's RMSLEE: 0.844159
[291]	valid_0's rmse: 0.843825	valid_0's RMSLEE: 0.843825
[292]	valid_0's rmse: 0.843664	valid_0's RMSLEE: 0.843664
[293]	valid_0's rmse: 0.843445	valid_0's RMSLEE: 0.843445
[294]	valid_0's rmse: 0.843338	valid_0's RMSLEE: 0.843338
[295]	valid_0's rmse: 0.843129	valid_0's RMSLEE: 0.843129
[296]	valid_0's rmse: 0.842911	valid_0's RMSLEE: 0.842911
[297]	valid_0's rmse: 0.842657	valid_0's RMSLEE: 0.842657
[298]	valid_0's rmse: 0.84238	valid_0's RMSLEE: 0.84238
[299]	valid_0's rmse: 0.842173	valid_0's RMSLEE: 0.842173
[300]	valid_0's rmse: 0.841789	valid_0's RMSLEE: 0.841789
[301]	valid_0's rmse: 0.841596	valid_0's RMSLEE: 0.841596
[302]	valid_0's rmse: 0.841522	valid_0's RMSLEE: 0.841522
[303]	valid_0's rm

[427]	valid_0's rmse: 0.819405	valid_0's RMSLEE: 0.819405
[428]	valid_0's rmse: 0.819351	valid_0's RMSLEE: 0.819351
[429]	valid_0's rmse: 0.819359	valid_0's RMSLEE: 0.819359
[430]	valid_0's rmse: 0.81923	valid_0's RMSLEE: 0.81923
[431]	valid_0's rmse: 0.819122	valid_0's RMSLEE: 0.819122
[432]	valid_0's rmse: 0.818879	valid_0's RMSLEE: 0.818879
[433]	valid_0's rmse: 0.818356	valid_0's RMSLEE: 0.818356
[434]	valid_0's rmse: 0.818271	valid_0's RMSLEE: 0.818271
[435]	valid_0's rmse: 0.818086	valid_0's RMSLEE: 0.818086
[436]	valid_0's rmse: 0.818014	valid_0's RMSLEE: 0.818014
[437]	valid_0's rmse: 0.817962	valid_0's RMSLEE: 0.817962
[438]	valid_0's rmse: 0.817775	valid_0's RMSLEE: 0.817775
[439]	valid_0's rmse: 0.817669	valid_0's RMSLEE: 0.817669
[440]	valid_0's rmse: 0.817599	valid_0's RMSLEE: 0.817599
[441]	valid_0's rmse: 0.817472	valid_0's RMSLEE: 0.817472
[442]	valid_0's rmse: 0.817418	valid_0's RMSLEE: 0.817418
[443]	valid_0's rmse: 0.817055	valid_0's RMSLEE: 0.817055
[444]	valid_0's 

[68]	valid_0's rmse: 0.962997	valid_0's RMSLEE: 0.962997
[69]	valid_0's rmse: 0.962071	valid_0's RMSLEE: 0.962071
[70]	valid_0's rmse: 0.961528	valid_0's RMSLEE: 0.961528
[71]	valid_0's rmse: 0.960713	valid_0's RMSLEE: 0.960713
[72]	valid_0's rmse: 0.957943	valid_0's RMSLEE: 0.957943
[73]	valid_0's rmse: 0.957195	valid_0's RMSLEE: 0.957195
[74]	valid_0's rmse: 0.956416	valid_0's RMSLEE: 0.956416
[75]	valid_0's rmse: 0.955947	valid_0's RMSLEE: 0.955947
[76]	valid_0's rmse: 0.955128	valid_0's RMSLEE: 0.955128
[77]	valid_0's rmse: 0.954491	valid_0's RMSLEE: 0.954491
[78]	valid_0's rmse: 0.953644	valid_0's RMSLEE: 0.953644
[79]	valid_0's rmse: 0.952998	valid_0's RMSLEE: 0.952998
[80]	valid_0's rmse: 0.952271	valid_0's RMSLEE: 0.952271
[81]	valid_0's rmse: 0.951484	valid_0's RMSLEE: 0.951484
[82]	valid_0's rmse: 0.950438	valid_0's RMSLEE: 0.950438
[83]	valid_0's rmse: 0.94957	valid_0's RMSLEE: 0.94957
[84]	valid_0's rmse: 0.949247	valid_0's RMSLEE: 0.949247
[85]	valid_0's rmse: 0.944776	val

[210]	valid_0's rmse: 0.870723	valid_0's RMSLEE: 0.870723
[211]	valid_0's rmse: 0.870042	valid_0's RMSLEE: 0.870042
[212]	valid_0's rmse: 0.869683	valid_0's RMSLEE: 0.869683
[213]	valid_0's rmse: 0.869345	valid_0's RMSLEE: 0.869345
[214]	valid_0's rmse: 0.86907	valid_0's RMSLEE: 0.86907
[215]	valid_0's rmse: 0.869027	valid_0's RMSLEE: 0.869027
[216]	valid_0's rmse: 0.868886	valid_0's RMSLEE: 0.868886
[217]	valid_0's rmse: 0.868462	valid_0's RMSLEE: 0.868462
[218]	valid_0's rmse: 0.868007	valid_0's RMSLEE: 0.868007
[219]	valid_0's rmse: 0.867683	valid_0's RMSLEE: 0.867683
[220]	valid_0's rmse: 0.867432	valid_0's RMSLEE: 0.867432
[221]	valid_0's rmse: 0.867219	valid_0's RMSLEE: 0.867219
[222]	valid_0's rmse: 0.866899	valid_0's RMSLEE: 0.866899
[223]	valid_0's rmse: 0.866502	valid_0's RMSLEE: 0.866502
[224]	valid_0's rmse: 0.866211	valid_0's RMSLEE: 0.866211
[225]	valid_0's rmse: 0.865747	valid_0's RMSLEE: 0.865747
[226]	valid_0's rmse: 0.865657	valid_0's RMSLEE: 0.865657
[227]	valid_0's 

[351]	valid_0's rmse: 0.834338	valid_0's RMSLEE: 0.834338
[352]	valid_0's rmse: 0.834252	valid_0's RMSLEE: 0.834252
[353]	valid_0's rmse: 0.834115	valid_0's RMSLEE: 0.834115
[354]	valid_0's rmse: 0.834062	valid_0's RMSLEE: 0.834062
[355]	valid_0's rmse: 0.833988	valid_0's RMSLEE: 0.833988
[356]	valid_0's rmse: 0.833908	valid_0's RMSLEE: 0.833908
[357]	valid_0's rmse: 0.833751	valid_0's RMSLEE: 0.833751
[358]	valid_0's rmse: 0.833319	valid_0's RMSLEE: 0.833319
[359]	valid_0's rmse: 0.833171	valid_0's RMSLEE: 0.833171
[360]	valid_0's rmse: 0.832996	valid_0's RMSLEE: 0.832996
[361]	valid_0's rmse: 0.832778	valid_0's RMSLEE: 0.832778
[362]	valid_0's rmse: 0.832389	valid_0's RMSLEE: 0.832389
[363]	valid_0's rmse: 0.832272	valid_0's RMSLEE: 0.832272
[364]	valid_0's rmse: 0.832043	valid_0's RMSLEE: 0.832043
[365]	valid_0's rmse: 0.831938	valid_0's RMSLEE: 0.831938
[366]	valid_0's rmse: 0.831838	valid_0's RMSLEE: 0.831838
[367]	valid_0's rmse: 0.831654	valid_0's RMSLEE: 0.831654
[368]	valid_0'

[492]	valid_0's rmse: 0.814068	valid_0's RMSLEE: 0.814068
[493]	valid_0's rmse: 0.814023	valid_0's RMSLEE: 0.814023
[494]	valid_0's rmse: 0.813926	valid_0's RMSLEE: 0.813926
[495]	valid_0's rmse: 0.813911	valid_0's RMSLEE: 0.813911
[496]	valid_0's rmse: 0.813814	valid_0's RMSLEE: 0.813814
[497]	valid_0's rmse: 0.813601	valid_0's RMSLEE: 0.813601
[498]	valid_0's rmse: 0.813538	valid_0's RMSLEE: 0.813538
[499]	valid_0's rmse: 0.813448	valid_0's RMSLEE: 0.813448
[500]	valid_0's rmse: 0.81339	valid_0's RMSLEE: 0.81339
Did not meet early stopping. Best iteration is:
[500]	valid_0's rmse: 0.81339	valid_0's RMSLEE: 0.81339
[1]	valid_0's rmse: 1.86797	valid_0's RMSLEE: 1.86797
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.73031	valid_0's RMSLEE: 1.73031
[3]	valid_0's rmse: 1.58188	valid_0's RMSLEE: 1.58188
[4]	valid_0's rmse: 1.47036	valid_0's RMSLEE: 1.47036
[5]	valid_0's rmse: 1.36584	valid_0's RMSLEE: 1.36584
[6]	valid_0's rmse: 1.32698	valid_0's RMSLEE:

[134]	valid_0's rmse: 0.914896	valid_0's RMSLEE: 0.914896
[135]	valid_0's rmse: 0.914064	valid_0's RMSLEE: 0.914064
[136]	valid_0's rmse: 0.913848	valid_0's RMSLEE: 0.913848
[137]	valid_0's rmse: 0.913298	valid_0's RMSLEE: 0.913298
[138]	valid_0's rmse: 0.912542	valid_0's RMSLEE: 0.912542
[139]	valid_0's rmse: 0.911798	valid_0's RMSLEE: 0.911798
[140]	valid_0's rmse: 0.911159	valid_0's RMSLEE: 0.911159
[141]	valid_0's rmse: 0.910673	valid_0's RMSLEE: 0.910673
[142]	valid_0's rmse: 0.910218	valid_0's RMSLEE: 0.910218
[143]	valid_0's rmse: 0.909551	valid_0's RMSLEE: 0.909551
[144]	valid_0's rmse: 0.909203	valid_0's RMSLEE: 0.909203
[145]	valid_0's rmse: 0.908759	valid_0's RMSLEE: 0.908759
[146]	valid_0's rmse: 0.908289	valid_0's RMSLEE: 0.908289
[147]	valid_0's rmse: 0.907686	valid_0's RMSLEE: 0.907686
[148]	valid_0's rmse: 0.907099	valid_0's RMSLEE: 0.907099
[149]	valid_0's rmse: 0.906589	valid_0's RMSLEE: 0.906589
[150]	valid_0's rmse: 0.906067	valid_0's RMSLEE: 0.906067
[151]	valid_0'

[275]	valid_0's rmse: 0.850783	valid_0's RMSLEE: 0.850783
[276]	valid_0's rmse: 0.850631	valid_0's RMSLEE: 0.850631
[277]	valid_0's rmse: 0.850579	valid_0's RMSLEE: 0.850579
[278]	valid_0's rmse: 0.850343	valid_0's RMSLEE: 0.850343
[279]	valid_0's rmse: 0.8502	valid_0's RMSLEE: 0.8502
[280]	valid_0's rmse: 0.850079	valid_0's RMSLEE: 0.850079
[281]	valid_0's rmse: 0.849771	valid_0's RMSLEE: 0.849771
[282]	valid_0's rmse: 0.849608	valid_0's RMSLEE: 0.849608
[283]	valid_0's rmse: 0.849423	valid_0's RMSLEE: 0.849423
[284]	valid_0's rmse: 0.849324	valid_0's RMSLEE: 0.849324
[285]	valid_0's rmse: 0.849086	valid_0's RMSLEE: 0.849086
[286]	valid_0's rmse: 0.848945	valid_0's RMSLEE: 0.848945
[287]	valid_0's rmse: 0.848504	valid_0's RMSLEE: 0.848504
[288]	valid_0's rmse: 0.848289	valid_0's RMSLEE: 0.848289
[289]	valid_0's rmse: 0.848149	valid_0's RMSLEE: 0.848149
[290]	valid_0's rmse: 0.84774	valid_0's RMSLEE: 0.84774
[291]	valid_0's rmse: 0.847435	valid_0's RMSLEE: 0.847435
[292]	valid_0's rmse

[416]	valid_0's rmse: 0.823518	valid_0's RMSLEE: 0.823518
[417]	valid_0's rmse: 0.823329	valid_0's RMSLEE: 0.823329
[418]	valid_0's rmse: 0.823254	valid_0's RMSLEE: 0.823254
[419]	valid_0's rmse: 0.823169	valid_0's RMSLEE: 0.823169
[420]	valid_0's rmse: 0.822774	valid_0's RMSLEE: 0.822774
[421]	valid_0's rmse: 0.82269	valid_0's RMSLEE: 0.82269
[422]	valid_0's rmse: 0.82269	valid_0's RMSLEE: 0.82269
[423]	valid_0's rmse: 0.822605	valid_0's RMSLEE: 0.822605
[424]	valid_0's rmse: 0.822543	valid_0's RMSLEE: 0.822543
[425]	valid_0's rmse: 0.822456	valid_0's RMSLEE: 0.822456
[426]	valid_0's rmse: 0.82174	valid_0's RMSLEE: 0.82174
[427]	valid_0's rmse: 0.821592	valid_0's RMSLEE: 0.821592
[428]	valid_0's rmse: 0.821494	valid_0's RMSLEE: 0.821494
[429]	valid_0's rmse: 0.821453	valid_0's RMSLEE: 0.821453
[430]	valid_0's rmse: 0.821243	valid_0's RMSLEE: 0.821243
[431]	valid_0's rmse: 0.821174	valid_0's RMSLEE: 0.821174
[432]	valid_0's rmse: 0.821143	valid_0's RMSLEE: 0.821143
[433]	valid_0's rmse

[57]	valid_0's rmse: 0.971821	valid_0's RMSLEE: 0.971821
[58]	valid_0's rmse: 0.970476	valid_0's RMSLEE: 0.970476
[59]	valid_0's rmse: 0.969615	valid_0's RMSLEE: 0.969615
[60]	valid_0's rmse: 0.968955	valid_0's RMSLEE: 0.968955
[61]	valid_0's rmse: 0.967953	valid_0's RMSLEE: 0.967953
[62]	valid_0's rmse: 0.966994	valid_0's RMSLEE: 0.966994
[63]	valid_0's rmse: 0.966091	valid_0's RMSLEE: 0.966091
[64]	valid_0's rmse: 0.964945	valid_0's RMSLEE: 0.964945
[65]	valid_0's rmse: 0.96427	valid_0's RMSLEE: 0.96427
[66]	valid_0's rmse: 0.962426	valid_0's RMSLEE: 0.962426
[67]	valid_0's rmse: 0.961567	valid_0's RMSLEE: 0.961567
[68]	valid_0's rmse: 0.960583	valid_0's RMSLEE: 0.960583
[69]	valid_0's rmse: 0.959705	valid_0's RMSLEE: 0.959705
[70]	valid_0's rmse: 0.959119	valid_0's RMSLEE: 0.959119
[71]	valid_0's rmse: 0.958141	valid_0's RMSLEE: 0.958141
[72]	valid_0's rmse: 0.955376	valid_0's RMSLEE: 0.955376
[73]	valid_0's rmse: 0.953901	valid_0's RMSLEE: 0.953901
[74]	valid_0's rmse: 0.953218	val

[199]	valid_0's rmse: 0.871327	valid_0's RMSLEE: 0.871327
[200]	valid_0's rmse: 0.871026	valid_0's RMSLEE: 0.871026
[201]	valid_0's rmse: 0.870718	valid_0's RMSLEE: 0.870718
[202]	valid_0's rmse: 0.870516	valid_0's RMSLEE: 0.870516
[203]	valid_0's rmse: 0.870097	valid_0's RMSLEE: 0.870097
[204]	valid_0's rmse: 0.869737	valid_0's RMSLEE: 0.869737
[205]	valid_0's rmse: 0.86944	valid_0's RMSLEE: 0.86944
[206]	valid_0's rmse: 0.869153	valid_0's RMSLEE: 0.869153
[207]	valid_0's rmse: 0.868823	valid_0's RMSLEE: 0.868823
[208]	valid_0's rmse: 0.868551	valid_0's RMSLEE: 0.868551
[209]	valid_0's rmse: 0.86837	valid_0's RMSLEE: 0.86837
[210]	valid_0's rmse: 0.868195	valid_0's RMSLEE: 0.868195
[211]	valid_0's rmse: 0.867656	valid_0's RMSLEE: 0.867656
[212]	valid_0's rmse: 0.867542	valid_0's RMSLEE: 0.867542
[213]	valid_0's rmse: 0.867385	valid_0's RMSLEE: 0.867385
[214]	valid_0's rmse: 0.867144	valid_0's RMSLEE: 0.867144
[215]	valid_0's rmse: 0.866876	valid_0's RMSLEE: 0.866876
[216]	valid_0's rm

[340]	valid_0's rmse: 0.83387	valid_0's RMSLEE: 0.83387
[341]	valid_0's rmse: 0.83336	valid_0's RMSLEE: 0.83336
[342]	valid_0's rmse: 0.833041	valid_0's RMSLEE: 0.833041
[343]	valid_0's rmse: 0.832872	valid_0's RMSLEE: 0.832872
[344]	valid_0's rmse: 0.83284	valid_0's RMSLEE: 0.83284
[345]	valid_0's rmse: 0.832562	valid_0's RMSLEE: 0.832562
[346]	valid_0's rmse: 0.832274	valid_0's RMSLEE: 0.832274
[347]	valid_0's rmse: 0.831896	valid_0's RMSLEE: 0.831896
[348]	valid_0's rmse: 0.831861	valid_0's RMSLEE: 0.831861
[349]	valid_0's rmse: 0.831732	valid_0's RMSLEE: 0.831732
[350]	valid_0's rmse: 0.831652	valid_0's RMSLEE: 0.831652
[351]	valid_0's rmse: 0.831569	valid_0's RMSLEE: 0.831569
[352]	valid_0's rmse: 0.831484	valid_0's RMSLEE: 0.831484
[353]	valid_0's rmse: 0.831179	valid_0's RMSLEE: 0.831179
[354]	valid_0's rmse: 0.831089	valid_0's RMSLEE: 0.831089
[355]	valid_0's rmse: 0.830945	valid_0's RMSLEE: 0.830945
[356]	valid_0's rmse: 0.830638	valid_0's RMSLEE: 0.830638
[357]	valid_0's rmse

[481]	valid_0's rmse: 0.812735	valid_0's RMSLEE: 0.812735
[482]	valid_0's rmse: 0.812642	valid_0's RMSLEE: 0.812642
[483]	valid_0's rmse: 0.81264	valid_0's RMSLEE: 0.81264
[484]	valid_0's rmse: 0.812413	valid_0's RMSLEE: 0.812413
[485]	valid_0's rmse: 0.812244	valid_0's RMSLEE: 0.812244
[486]	valid_0's rmse: 0.812153	valid_0's RMSLEE: 0.812153
[487]	valid_0's rmse: 0.812077	valid_0's RMSLEE: 0.812077
[488]	valid_0's rmse: 0.811856	valid_0's RMSLEE: 0.811856
[489]	valid_0's rmse: 0.811647	valid_0's RMSLEE: 0.811647
[490]	valid_0's rmse: 0.811421	valid_0's RMSLEE: 0.811421
[491]	valid_0's rmse: 0.811392	valid_0's RMSLEE: 0.811392
[492]	valid_0's rmse: 0.811137	valid_0's RMSLEE: 0.811137
[493]	valid_0's rmse: 0.811047	valid_0's RMSLEE: 0.811047
[494]	valid_0's rmse: 0.810971	valid_0's RMSLEE: 0.810971
[495]	valid_0's rmse: 0.810822	valid_0's RMSLEE: 0.810822
[496]	valid_0's rmse: 0.810758	valid_0's RMSLEE: 0.810758
[497]	valid_0's rmse: 0.810681	valid_0's RMSLEE: 0.810681
[498]	valid_0's 

In [275]:
for score in best_scores:
    print(score['valid_0']['RMSLEE'])

0.8092493467418131
0.8133896469589785
0.8121420331264276
0.8103245467523503


In [310]:
imprtc_df = pd.DataFrame()
imprtc_df['feature'] = train_X.columns   
imprtc_df['importance'] = models[0].feature_importances_
imprtc_df.sort_values('importance', ascending=False, inplace= True)
print(imprtc_df)


                          feature  importance
0                     building_id        3605
1                           meter         862
71                           hour         642
27     precip_depth_1_hr_std_lag3         334
42      air_temperature_min_lag72         234
41      air_temperature_max_lag72         230
40     air_temperature_mean_lag72         218
50      dew_temperature_min_lag72         190
57   sea_level_pressure_max_lag72         154
2                         site_id         136
48     dew_temperature_mean_lag72         136
49      dew_temperature_max_lag72         133
69                log_square_feet         131
58   sea_level_pressure_min_lag72         128
63       wind_direction_std_lag72         116
60      wind_direction_mean_lag72         111
64          wind_speed_mean_lag72         108
47       cloud_coverage_std_lag72         106
44      cloud_coverage_mean_lag72         104
43      air_temperature_std_lag72         103
55    precip_depth_1_hr_std_lag72 

In [308]:
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
    res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
    i+=step_size
    
    
    


100%|████████████████████████████████████████████████████████████████████████████████| 834/834 [27:42<00:00,  1.96s/it]


In [309]:
res = np.concatenate(res)
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission.csv', index=False)
submission

  building_id meter site_id primary_use  floor_count  air_temperature  \
0           0     0       0   Education          NaN        17.796875   
1           1     0       0   Education          NaN        17.796875   
2           2     0       0   Education          NaN        17.796875   
3           3     0       0   Education          NaN        17.796875   
4           4     0       0   Education          NaN        17.796875   

   cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
0               4        11.703125                0.0              1021.5   
1               4        11.703125                0.0              1021.5   
2               4        11.703125                0.0              1021.5   
3               4        11.703125                0.0              1021.5   
4               4        11.703125                0.0              1021.5   

   ...  wind_direction_std_lag72  wind_speed_mean_lag72  wind_speed_max_lag72  \
0  ...           

Unnamed: 0,row_id,meter_reading
0,0,2.737759
1,1,4.791736
2,2,0.771552
3,3,2.262538
4,4,5.799311
5,5,1.012726
6,6,2.739228
7,7,0.723243
8,8,15.980293
9,9,1.684215


In [None]:
#
#
#
#
#
# everything below is older stuff - please ignore
#
#
#
#
#

In [226]:

gbm=LGBMRegressor(n_estimators=500, # for accuracy use large numbers like 6000 
                  learning_rate=0.1,
                  feature_fraction=0.9,
                  subsample=0.1,  # batches of 20% of the data
                  subsample_freq=1,
                  num_leaves=160,
                  max_depth=10,
                  metric='rmse',
                  verbose= 100)


In [None]:
# Cross val testing - can be skipped
scores = cross_val_score(gbm, train_X, train_y, cv=5, 
                        scoring=rmsee_scorer)
print('rmsee scores:\n', scores)


In [84]:
del scores
gc.collect()

87

In [96]:
# Grid param search - can be skpped
grid_param = {
    'gbm__n_estimators': [500],
    'gbm__subsample': [0.1],
    'gbm__learning_rate': [0.1],
    'gbm__num_leaves': [80, 160],
    'gbm__max_depth': [11]
}

gd_sr = GridSearchCV(pipe,
                     param_grid=grid_param,
                     scoring=rmsee_scorer,
                     cv=4,
                     n_jobs=-1)

gd_sr.fit(train_X, train_y)

best_parameters = gd_sr.best_params_
print(best_parameters)
best_result = gd_sr.best_score_
print(best_result)


KeyboardInterrupt: 

In [27]:
sorted(gd_sr.cv_results_.keys())
for key in gd_sr.cv_results_.keys():
    print(str(key) + "  " + str(gd_sr.cv_results_[key]))


mean_fit_time  [607.31617959 748.329434  ]
std_fit_time  [11.42404894  7.16639039]
mean_score_time  [545.60431012 743.97350645]
std_score_time  [15.76616037 10.19180404]
param_gbm__learning_rate  [0.1 0.1]
param_gbm__max_depth  [11 11]
param_gbm__n_estimators  [500 500]
param_gbm__num_leaves  [80 160]
param_gbm__subsample  [0.1 0.1]
params  [{'gbm__learning_rate': 0.1, 'gbm__max_depth': 11, 'gbm__n_estimators': 500, 'gbm__num_leaves': 80, 'gbm__subsample': 0.1}, {'gbm__learning_rate': 0.1, 'gbm__max_depth': 11, 'gbm__n_estimators': 500, 'gbm__num_leaves': 160, 'gbm__subsample': 0.1}]
split0_test_score  [-1.50000092 -1.49220935]
split1_test_score  [-1.11290672 -1.10616635]
split2_test_score  [-1.46453371 -1.4708326 ]
mean_test_score  [-1.35914711 -1.35640277]
std_test_score  [0.17471926 0.17715895]
rank_test_score  [2 1]


In [28]:
del gd_sr
gc.collect

<function gc.collect(generation=2)>

In [60]:
# fit on all the data
pipe.fit(train_X, train_y, 
         gbm__eval_metric=rmsee, gbm__verbose=100)


Pipeline(memory=None,
         steps=[('pre_b_pipes',
                 Pipeline(memory=None,
                          steps=[('fillMean',
                                  <__main__.FillMean object at 0x0000010B91471EF0>),
                                 ('fillZeros',
                                  <__main__.FillZeros object at 0x0000010B96C7D128>),
                                 ('imputeCloudCoverage',
                                  <__main__.ImputeCloudCoverage object at 0x0000010B96C7D0B8>),
                                 ('imputeYearBuilt',
                                  <__main__.ImputeYearBuilt object at 0x0000010B96C7D6A0>),
                                 ('dro...
                               colsample_bytree=1.0, feature_fraction=0.9,
                               importance_type='split', learning_rate=0.1,
                               max_depth=11, metric='rmse',
                               min_child_samples=20, min_child_weight=0.001,
                

In [61]:
# Get features list for fit - can be skipped
imprtc_df = pd.DataFrame()
imprtc_df['feature'] = pre_b_pipes.transform(train_X).columns   
imprtc_df['importance'] = pipe.named_steps['gbm'].feature_importances_
imprtc_df.sort_values('importance', ascending=False, inplace= True)
print(imprtc_df)
print(imprtc_df.nsmallest(7,'importance')['feature'].tolist())



                          feature  importance
0                     building_id       16616
1                           meter        7207
69                           hour        4903
25     precip_depth_1_hr_std_lag3        2708
40      air_temperature_min_lag72        2258
38     air_temperature_mean_lag72        2243
39      air_temperature_max_lag72        2015
48      dew_temperature_min_lag72        1727
55   sea_level_pressure_max_lag72        1605
58      wind_direction_mean_lag72        1582
42      cloud_coverage_mean_lag72        1531
47      dew_temperature_max_lag72        1483
46     dew_temperature_mean_lag72        1477
61       wind_direction_std_lag72        1472
56   sea_level_pressure_min_lag72        1446
62          wind_speed_mean_lag72        1438
45       cloud_coverage_std_lag72        1378
41      air_temperature_std_lag72        1268
53    precip_depth_1_hr_std_lag72        1232
50   precip_depth_1_hr_mean_lag72        1205
65           wind_speed_std_lag72 

In [62]:
set_size = len(test_X)
iterations = 100
batch_size = set_size // iterations

print(set_size, iterations, batch_size)
assert set_size == iterations * batch_size

meter_reading = []
for i in tqdm(range(iterations)):
    pos = i*batch_size
    batch = np.expm1(pipe.predict(test_X.iloc[pos : pos+batch_size]).clip(0))
    meter_reading.extend(batch)

print(len(meter_reading))
assert len(meter_reading) == set_size

41697600 100 416976


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [16:11<00:00, 12.78s/it]


41697600


In [63]:
sub = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
# hack to prevent negative numbers
print(sub.shape)
sub['meter_reading'] = meter_reading
sub.to_csv('submission.csv', index = False)

(41697600, 2)
