In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

# label encoding
le = LabelEncoder()

In [2]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float16, 'year_built': np.float16, 'floor_count': np.float16},
    'weather' : {'site_id': np.int8, 'air_temperature': np.float16, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16}
}

file_loc = {}    
for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
    for name in ['building_metadata','weather_train','weather_test','train','test']:
        if path.exists(dir_path + name + '.csv'):
            file_loc[name]= dir_path + name + '.csv'
    
    building = pd.read_csv(file_loc['building_metadata'], dtype=file_dtype['building_metadata'])
    weather_train = pd.read_csv(file_loc['weather_train'], dtype=file_dtype['weather'])
    weather_test = pd.read_csv(file_loc['weather_test'], dtype=file_dtype['weather'])
    train = pd.read_csv(file_loc['train'], dtype=file_dtype['train'])
    test = pd.read_csv(file_loc['test'], dtype=file_dtype['test'])


In [3]:
class ImputeWeather(TransformerMixin):

    def __init__(self, method:str='linear', gap_limit:int=None, limit_direction:str='forward'):
        self._method = method
        self._gap_limit = gap_limit
        self._limit_direction = limit_direction
        
    def transform(self, weather_df, **transform_params):
        grouped_weather_df = weather_df.groupby('site_id').apply(lambda group: group.interpolate(method=self._method, limit=self._gap_limit, limit_direction=self._limit_direction))
        if 'cloud_coverage' in grouped_weather_df.columns:
            grouped_weather_df['cloud_coverage'] = grouped_weather_df['cloud_coverage'].round(decimals=0).clip(0,8)
        grouped_weather_df.reset_index(inplace=True)
        return grouped_weather_df.drop('index', axis=1)

    def fit(self, X, y=None, **fit_params):
        return self

In [4]:
class AddWeatherLags(TransformerMixin):
    
    def __init__(self, window):
        self._window = window
        
    def transform(self, weather_df, **transform_params):
        group_df = weather_df.groupby(['site_id'])
        cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
        rolled = group_df[cols].rolling(window=self._window, min_periods=0)
        lag_mean = rolled.mean().reset_index().astype(np.float16)
        lag_max = rolled.max().reset_index().astype(np.float16)
        lag_min = rolled.min().reset_index().astype(np.float16)
        lag_std = rolled.std().reset_index().astype(np.float16)
        for col in cols:
            weather_df[f'{col}_mean_lag{self._window}'] = lag_mean[col]
            weather_df[f'{col}_max_lag{self._window}'] = lag_max[col]
            weather_df[f'{col}_min_lag{self._window}'] = lag_min[col]
            weather_df[f'{col}_std_lag{self._window}'] = lag_std[col]
        return weather_df

    def fit(self, X, y=None, **fit_params):
        return self

In [5]:
class AddWeather(TransformerMixin):

    def __init__(self, weather_df):
        self._b_df = weather_df
        
    def transform(self, df, **transform_params):
        
        return df.merge(weather_test, on=['site_id', 'timestamp'], how='left')

    def fit(self, X, y=None, **fit_params):
        return self

In [6]:
class AddBuilding(TransformerMixin):

    def __init__(self, building_df):
        self._b_df = building_df
        
    def transform(self, df, **transform_params):
        
        return df.merge(_b_df, on='building_id', how='left')

    def fit(self, X, y=None, **fit_params):
        return self

In [7]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [32]:
in_us = [0,2,3,4,5,6,8,9,10,13,14]
in_ca = [7,11]
in_uk = [1,5]
in_ie = [12]
us_cal =  holidays.US()
ca_cal = holidays.CA()
ie_cal = holidays.IE()
uk_cal = holidays.UK()

def holidayName(timestamp, site_id):
    if site_id in in_ca:
        return ca_cal.get(timestamp)
    elif site_id in in_uk:
        return uk_cal.get(timestamp)
    elif site_id in in_ie:
        return ie_cal.get(timestamp)
    else:
        return us_cal.get(timestamp)
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class AddHolidays(TransformerMixin):
    def transform(self, df, **transform_params):
        temp_df = df[['site_id','timestamp']]
        df['holiday'] = temp_df.apply(lambda x: holidayName(x.timestamp, x.site_id), axis=1)
        #df['holiday'] = df.apply(lambda x: all_holidays.get(x))
 #       df['holiday'] = df.apply(lambda x: holidayName(x.timestamp, x.site_id))
        df['holiday'] = df['holiday'].astype('category')
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [None]:
class RmHolidays(TransformerMixin):
    def transform(self, df, **transform_params):

        temp_df = df[['site_id','timestamp']]
        temp_df['holiday'] = temp_df.apply(lambda x: holidayName(x.timestamp, x.site_id), axis=1)
        #df['holiday'] = df.apply(lambda x: all_holidays.get(x))
 #       df['holiday'] = df.apply(lambda x: holidayName(x.timestamp, x.site_id))
        df = df.drop(temp_df['holiday'].notnull())
    return df

    def fit(self, X, y=None, **fit_params):
        return self

In [33]:
class LogSquareFeet(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['log_square_feet'] = np.float16(np.log(df['square_feet']))
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [34]:
class SetCatTypes(TransformerMixin):
        
    def transform(self, df, **transform_params):
        df['primary_use']= df['primary_use'].astype('category')
        df['meter'] = df["meter"].astype('category')
        df['site_id'] = df["site_id"].astype('category')
        df['building_id'] = df['building_id'].astype('category')
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [35]:
class ImputeCloudCoverage(TransformerMixin):
        
    def transform(self, df, **transform_params):
        # set age of building to mediam of site_id
        # else if set ot overall median
        median = df['cloud_coverage'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['cloud_coverage'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = i_median
            else:
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = median
        df['cloud_coverage'] = np.uint8(df['cloud_coverage'])
        df['cloud_coverage'] = df['cloud_coverage']
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [36]:
class CloudTimeCat(TransformerMixin):
        
    def transform(self, df, **transform_params):
        tempDf = df[['cloud_coverage', 'hour']].astype('int')
        tempDf['cloud_coverage'] = (tempDf['cloud_coverage']).astype('int')
        tempDf['hour'] = (tempDf['hour']).astype('int')
        tempDf = tempDf.astype('str')
        df['cloud_time_cat'] = 'c' + tempDf['cloud_coverage'] + 't' + tempDf['hour']
        df['cloud_time_cat'] = df['cloud_time_cat'].astype('category')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self


In [37]:
class DropCols(TransformerMixin):

    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
        
    def transform(self, df, **transform_params):
        return df.drop(self._drop_cols, axis=1)

    def fit(self, X, y=None, **fit_params):
        return self

In [38]:
class ImputeYearBuilt(TransformerMixin):

    def transform(self, df, **transform_params):
        year_built_median = df['year_built'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['year_built'].median().items():
            if not np.isnan(i_median):
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = i_median
            else:
                df.loc[(df['year_built'].isnull()) & (df['site_id'] == i), 'year_built'] = year_built_median
        df['building_age'] = np.uint8(df['year_built']-1900)
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [39]:
class AddMeterDummies(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        for i in range(4):
            df['_meter_'+str(i)] = (df['building_id'].isin(
                train.loc[train['meter'] == i].building_id.unique()))
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [40]:
class AddTimeFeatures(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        df['dayofweek'] = df['timestamp'].dt.dayofweek.astype('category') # vs weekend?
        #df['weekday'] = df['timestamp'].dt.weekday.astype('category')
        #df['dayofweek_hour'] = (df['timestamp'].dt.dayofweek * 24) + df['timestamp'].dt.hour
        #df['dayofweek_hour'] = df['dayofweek_hour'].astype('category')
        #df['week'] = df['timestamp'].dt.week.astype('category')
        df['hour'] = df['timestamp'].dt.hour.astype('category')
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [41]:
class AddRelativeHumidity(TransformerMixin):
        
    def transform(self, df_a, **transform_params):
        df = df_a
        # code here
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [42]:
class FillMean(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].mean())
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [43]:
class FillZeros(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(0)
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [44]:
class FillMedian(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].median())
        return df

    def fit(self, X, y=None, **fit_params):
        return self


In [45]:
class FillPopular(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].value_counts()[0])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [46]:
class MarkNaNs(TransformerMixin):
        
    def transform(self, df, **transform_params):
        for col in  df.columns[df.isna().any()].tolist():
            df['_' + col + '_nan' ] = df[col].isnull()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [47]:
class GC(TransformerMixin):
        
    def transform(self, df, **transform_params):
        gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [48]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

def rmsee(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(np.expm1(y.clip(0)), np.expm1(y_pred.clip(0))))
    
rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmsee_scorer = make_scorer(
    lambda y_true, y_pred : rmsee(y_true, y_pred), 
    greater_is_better=False)

def lbm_rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# rob's custome function to do RMSLE while in the log1p space
def lbm_rmslee(y_true, y_pred):
    return 'RMSLEE', np.sqrt(np.mean(np.power(y_pred - y_true, 2))), False



In [63]:
weather_pipes = Pipeline(
    steps=[
        ('imputeWeather', ImputeWeather()),
        ('fillMean',FillMean(['air_temperature','dew_temperature'
                              , 'precip_depth_1_hr', 'sea_level_pressure'])),
        ('imputeCloudCoverage', ImputeCloudCoverage()),
        ('addWeatherLags3', AddWeatherLags(3)),
        ('addWeatherLags72', AddWeatherLags(72)),
    ]
)

building_pipes = Pipeline(
    steps=[
        ('logSquareFeet', LogSquareFeet()),
        ('imputeYearBuilt', ImputeYearBuilt()),
        ('fillMean',FillMean(['floor_count'])),
        ('dropClos', DropCols(['square_feet', 'year_built'])),
    ]
)


# pre_a_pipes is for preprocessing that doesn't change impute
# values
x_pipes = Pipeline(
    steps=[
        #('markNans',MarkNaNs()),
        ('convertToDatetime', ConvertToDatetime()),
        ('AddHolidays', AddHolidays()),
        ('addRelativeHumidity',AddRelativeHumidity()),
        ('addTimeFeatures', AddTimeFeatures()),
        ('setCatTypes', SetCatTypes()),
        ('fillMean',FillMean([])),
        ('fillZeros',FillZeros([])),
        ('dropClos', DropCols(['timestamp'])),
        ('GC', GC())
    ]
)

In [64]:
#building_trans= building_pipes.transform(building)
#weather_train_trans = weather_pipes.transform(weather_train)
#weather_test_trans = weather_pipes.transform(weather_test)

test_X = x_pipes.transform(
    test
        .merge(building_pipes.transform(building), on='building_id', how='left').drop(['row_id'], axis=1)
        .merge(weather_pipes.transform(weather_test), on=['site_id', 'timestamp'], how='left')
    )

#train_X = x_pipes.transform(
#    train
#        .merge(building_trans, on='building_id', how='left')
#        .merge(weather_train_trans, on=['site_id', 'timestamp'], how='left')
#    )

#print(test_X.shape)
print(test_X.sample(n=20,  random_state=42))

         building_id meter site_id                    primary_use  \
3573457          173     0       2                      Education   
8315486          222     1       2  Entertainment/public assembly   
40305643        1354     2      15                      Education   
16083617         712     0       5                      Education   
37204119        1344     2      15                      Education   
32144852        1119     1      13                         Office   
5105044          249     0       2  Entertainment/public assembly   
36982844        1303     1      14                     Healthcare   
20487823         945     2       9                         Office   
8404196          217     1       2                      Education   
6889602          241     0       2  Entertainment/public assembly   
16963616         784     0       6                      Education   
39666699        1381     2      15                         Office   
26802058        1179     1      13

[20 rows x 73 columns]


In [65]:
print(test_X.shape)
print(test_X.sample(n=20,  random_state=42))

(41697600, 73)
         building_id meter site_id                    primary_use  \
3573457          173     0       2                      Education   
8315486          222     1       2  Entertainment/public assembly   
40305643        1354     2      15                      Education   
16083617         712     0       5                      Education   
37204119        1344     2      15                      Education   
32144852        1119     1      13                         Office   
5105044          249     0       2  Entertainment/public assembly   
36982844        1303     1      14                     Healthcare   
20487823         945     2       9                         Office   
8404196          217     1       2                      Education   
6889602          241     0       2  Entertainment/public assembly   
16963616         784     0       6                      Education   
39666699        1381     2      15                         Office   
26802058        117

[20 rows x 73 columns]


In [66]:
def getOutsideFoldXY(train_index):
    X = train.iloc[train_index].drop('meter_reading', axis=1)
    X_buildings = building[building['building_id'].isin(X['building_id'].unique())]
    X_weather = building[building['building_id'].isin(X['building_id'].unique())]
    X = x_pipes.transform(
        X
            .merge(building_pipes.transform(X_buildings), on='building_id', how='left')
            .merge(weather_train_trans, on=['site_id', 'timestamp'], how='left')
        )
    f_train_y = np.log1p(train.iloc[train_index]['meter_reading'])
    print(X.columns)
    return X,f_train_y



def getInFoldXY(train_index):
    X = train.iloc[train_index].drop('meter_reading', axis=1)
    X_buildings = building[building['building_id'].isin(X['building_id'].unique())]
    X = X.merge(building_pipes.transform(X_buildings), on='building_id', how='left')
    X_weather = weather_train[
        (weather_train['site_id'].isin(X['site_id'].unique())) 
         & (weather_train['timestamp'].isin(X['timestamp'].unique())) 
    ]
    X = x_pipes.transform(
        X.merge(weather_pipes.transform(X_weather), how='left'))
    f_train_y = np.log1p(train.iloc[train_index]['meter_reading'])
    return X, f_train_y


print(getInFoldXY(train.head(10).index))

(  building_id meter site_id          primary_use  floor_count  \
0           0     0       0            Education     3.740234   
1           1     0       0            Education     3.740234   
2           2     0       0            Education     3.740234   
3           3     0       0            Education     3.740234   
4           4     0       0            Education     3.740234   
5           5     0       0            Education     3.740234   
6           6     0       0  Lodging/residential     3.740234   
7           7     0       0            Education     3.740234   
8           8     0       0            Education     3.740234   
9           9     0       0               Office     3.740234   

   log_square_feet  building_age  air_temperature  cloud_coverage  \
0         8.914062           108             25.0               6   
1         7.910156           104             25.0               6   
2         8.585938            91             25.0               6   
3      

In [None]:
%%time

folds = 5

# this stratified strategy from
# https://www.kaggle.com/isaienkov/lightgbm-fe-1-19/notebook
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

models = []
best_scores = []
for train_index, val_index in kf.split(train, train['building_id']):
    f_train_X, f_train_y = getInFoldXY(train_index)
    f_val_X, f_val_y = getInFoldXY(val_index)
    gbm = LGBMRegressor(n_estimators=1000, # for accuracy use large numbers like 6000 
                  learning_rate=0.4,
                  feature_fraction=0.9,
                  subsample=0.1,  # 
                  subsample_freq=1,
                  num_leaves=20,
                  max_depth=10,
                  metric='rmse',
                  lambda_l1= 1,  
                  lambda_l2= 1,
                  verbose= 100)
    gbm.fit(f_train_X, f_train_y,
        eval_set=[(f_val_X, f_val_y)],
        # https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114722#latest-660848
        # eval_metric=lbm_rmslee,
        early_stopping_rounds=20)
    models.append(gbm)
    #y_pred = gbm.predict(f_val_X, num_iteration=gbm.best_iteration_)
    # eval
    #rmsle_score = lbm_rmslee(f_val_X, y_pred)[1]
    best_scores.append(gbm.best_score_)
    gc.collect()


In [None]:
for score in best_scores:
    print(score['valid_0']['rmse'])

In [None]:
imprtc_df = pd.DataFrame()
imprtc_df['feature'] = test_X.columns   
imprtc_df['importance'] = models[0].feature_importances_
imprtc_df.sort_values('importance', ascending=False, inplace= True)
print(imprtc_df)
print(test_X.columns)

In [None]:
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
    res.append(np.expm1(sum([model.predict(test_X.iloc[i:i+step_size]) for model in models])/folds))
    i+=step_size
    
    
    


In [71]:
res = np.concatenate(res)
print(len(res))
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('submission.csv.zip', index=False)
submission

41697600


Unnamed: 0,row_id,meter_reading
0,0,0.361497
1,1,0.611516
2,2,0.006448
3,3,2.185482
4,4,0.556502
5,5,0.000000
6,6,1.874477
7,7,1.968647
8,8,22.731738
9,9,1.221862


In [None]:
#
#
#
#
#
# everything below is older stuff - please ignore
#
#
#
#
#

In [226]:

gbm=LGBMRegressor(n_estimators=500, # for accuracy use large numbers like 6000 
                  learning_rate=0.1,
                  feature_fraction=0.9,
                  subsample=0.1,  # batches of 20% of the data
                  subsample_freq=1,
                  num_leaves=160,
                  max_depth=10,
                  metric='rmse',
                  verbose= 100)


In [None]:
# Cross val testing - can be skipped
scores = cross_val_score(gbm, train_X, train_y, cv=5, 
                        scoring=rmsee_scorer)
print('rmsee scores:\n', scores)


In [84]:
del scores
gc.collect()

87

In [96]:
# Grid param search - can be skpped
grid_param = {
    'gbm__n_estimators': [500],
    'gbm__subsample': [0.1],
    'gbm__learning_rate': [0.1],
    'gbm__num_leaves': [80, 160],
    'gbm__max_depth': [11]
}

gd_sr = GridSearchCV(pipe,
                     param_grid=grid_param,
                     scoring=rmsee_scorer,
                     cv=4,
                     n_jobs=-1)

gd_sr.fit(train_X, train_y)

best_parameters = gd_sr.best_params_
print(best_parameters)
best_result = gd_sr.best_score_
print(best_result)


KeyboardInterrupt: 

In [27]:
sorted(gd_sr.cv_results_.keys())
for key in gd_sr.cv_results_.keys():
    print(str(key) + "  " + str(gd_sr.cv_results_[key]))


mean_fit_time  [607.31617959 748.329434  ]
std_fit_time  [11.42404894  7.16639039]
mean_score_time  [545.60431012 743.97350645]
std_score_time  [15.76616037 10.19180404]
param_gbm__learning_rate  [0.1 0.1]
param_gbm__max_depth  [11 11]
param_gbm__n_estimators  [500 500]
param_gbm__num_leaves  [80 160]
param_gbm__subsample  [0.1 0.1]
params  [{'gbm__learning_rate': 0.1, 'gbm__max_depth': 11, 'gbm__n_estimators': 500, 'gbm__num_leaves': 80, 'gbm__subsample': 0.1}, {'gbm__learning_rate': 0.1, 'gbm__max_depth': 11, 'gbm__n_estimators': 500, 'gbm__num_leaves': 160, 'gbm__subsample': 0.1}]
split0_test_score  [-1.50000092 -1.49220935]
split1_test_score  [-1.11290672 -1.10616635]
split2_test_score  [-1.46453371 -1.4708326 ]
mean_test_score  [-1.35914711 -1.35640277]
std_test_score  [0.17471926 0.17715895]
rank_test_score  [2 1]


In [28]:
del gd_sr
gc.collect

<function gc.collect(generation=2)>

In [60]:
# fit on all the data
pipe.fit(train_X, train_y, 
         gbm__eval_metric=rmsee, gbm__verbose=100)


Pipeline(memory=None,
         steps=[('pre_b_pipes',
                 Pipeline(memory=None,
                          steps=[('fillMean',
                                  <__main__.FillMean object at 0x0000010B91471EF0>),
                                 ('fillZeros',
                                  <__main__.FillZeros object at 0x0000010B96C7D128>),
                                 ('imputeCloudCoverage',
                                  <__main__.ImputeCloudCoverage object at 0x0000010B96C7D0B8>),
                                 ('imputeYearBuilt',
                                  <__main__.ImputeYearBuilt object at 0x0000010B96C7D6A0>),
                                 ('dro...
                               colsample_bytree=1.0, feature_fraction=0.9,
                               importance_type='split', learning_rate=0.1,
                               max_depth=11, metric='rmse',
                               min_child_samples=20, min_child_weight=0.001,
                

In [61]:
# Get features list for fit - can be skipped
imprtc_df = pd.DataFrame()
imprtc_df['feature'] = pre_b_pipes.transform(train_X).columns   
imprtc_df['importance'] = pipe.named_steps['gbm'].feature_importances_
imprtc_df.sort_values('importance', ascending=False, inplace= True)
print(imprtc_df)
print(imprtc_df.nsmallest(7,'importance')['feature'].tolist())



                          feature  importance
0                     building_id       16616
1                           meter        7207
69                           hour        4903
25     precip_depth_1_hr_std_lag3        2708
40      air_temperature_min_lag72        2258
38     air_temperature_mean_lag72        2243
39      air_temperature_max_lag72        2015
48      dew_temperature_min_lag72        1727
55   sea_level_pressure_max_lag72        1605
58      wind_direction_mean_lag72        1582
42      cloud_coverage_mean_lag72        1531
47      dew_temperature_max_lag72        1483
46     dew_temperature_mean_lag72        1477
61       wind_direction_std_lag72        1472
56   sea_level_pressure_min_lag72        1446
62          wind_speed_mean_lag72        1438
45       cloud_coverage_std_lag72        1378
41      air_temperature_std_lag72        1268
53    precip_depth_1_hr_std_lag72        1232
50   precip_depth_1_hr_mean_lag72        1205
65           wind_speed_std_lag72 

In [62]:
set_size = len(test_X)
iterations = 100
batch_size = set_size // iterations

print(set_size, iterations, batch_size)
assert set_size == iterations * batch_size

meter_reading = []
for i in tqdm(range(iterations)):
    pos = i*batch_size
    batch = np.expm1(pipe.predict(test_X.iloc[pos : pos+batch_size]).clip(0))
    meter_reading.extend(batch)

print(len(meter_reading))
assert len(meter_reading) == set_size

41697600 100 416976


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [16:11<00:00, 12.78s/it]


41697600


In [63]:
sub = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
# hack to prevent negative numbers
print(sub.shape)
sub['meter_reading'] = meter_reading
sub.to_csv('submission.csv', index = False)

(41697600, 2)
