In [118]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Apply seaborn's default plot styling.
sns.set()
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# label encoding
# NOTE(review): `le` is not referenced by any cell visible in this notebook.
le = LabelEncoder()

In [119]:
# Explicit column dtypes for each input file, passed to pd.read_csv.
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.int32, 'year_built': np.float32, 'floor_count': np.float32},
    'weather' : {'site_id': np.int8, 'air_temperature': np.float32, 'cloud_coverage': np.float32, 'dew_temperature': np.float32,
                     'precip_depth_1_hr': np.float32, 'sea_level_pressure': np.float32, 'wind_direction': np.float32, 'wind_speed': np.float32}
}

# Resolve every CSV against the candidate input directories. When a file exists
# in more than one directory, the later directory in the list wins.
input_names = ['building_metadata','weather_train','weather_test','train','test']
file_loc = {}
for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
    for name in input_names:
        if path.exists(dir_path + name + '.csv'):
            file_loc[name]= dir_path + name + '.csv'

# Fail with a clear message instead of a KeyError on the first read.
missing_inputs = [name for name in input_names if name not in file_loc]
if missing_inputs:
    raise FileNotFoundError('input CSV(s) not found: ' + ', '.join(missing_inputs))

# Read each file exactly once, after the directory search has finished.
# (The reads used to sit inside the directory loop, so they ran once per
# candidate directory.)
building = pd.read_csv(file_loc['building_metadata'], dtype=file_dtype['building_metadata'])
weather_train = pd.read_csv(file_loc['weather_train'], dtype=file_dtype['weather'])
weather_test = pd.read_csv(file_loc['weather_test'], dtype=file_dtype['weather'])
train = pd.read_csv(file_loc['train'], dtype=file_dtype['train'])
test = pd.read_csv(file_loc['test'], dtype=file_dtype['test'])

# Attach building metadata first (it supplies site_id), then the weather
# readings for the matching site and timestamp. Left joins keep every
# train/test row.
train = train.merge(building, on='building_id', how='left')
test = test.merge(building, on='building_id', how='left')
train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')

In [120]:
def convertToDatetime(df):
    """Parse the ``timestamp`` column of ``df`` into datetimes, in place.

    Prints a fixed-seed sample of 20 converted rows as a sanity check, so
    ``df`` needs at least 20 rows. Returns None.
    """
    parsed = pd.to_datetime(df["timestamp"])
    df["timestamp"] = parsed
    print('-- convertToDatetime done---------------------------')
    preview = df[["timestamp"]].sample(n=20, random_state=42)
    print(preview)
    print('\n\n')



In [121]:
# add features we are 100% sure about
def logSquareFeet(df):
    """Replace ``square_feet`` with its natural log, stored as ``log_square_feet``.

    The column is renamed and converted in place to float16. The function is
    safe to call again on an already converted frame: when ``square_feet`` is
    gone and ``log_square_feet`` exists, the values are left untouched instead
    of being logged a second time.

    Prints a fixed-seed sample of 20 rows, so ``df`` needs at least 20 rows.

    Raises:
        KeyError: if ``df`` has neither ``square_feet`` nor ``log_square_feet``.
    """
    if "square_feet" in df.columns:
        df.rename(columns={"square_feet": "log_square_feet"}, inplace=True)
        df['log_square_feet'] = np.log(df['log_square_feet']).astype(np.float16)
    elif 'log_square_feet' not in df.columns:
        raise KeyError('square_feet')
    print('-- logSquareFeet done---------------------------')
    print(df[["log_square_feet"]].sample(n=20, random_state=42))
    print('\n\n')

    

In [122]:
# Cast the identifier-like columns to the pandas category dtype
def setCatTypes(df):
    """Convert primary_use, meter, site_id and building_id to ``category``, in place.

    Prints a fixed-seed sample of 20 rows of those columns, so ``df`` needs at
    least 20 rows. Returns None.
    """
    cat_cols = ["primary_use", "meter", "site_id", "building_id"]
    for col in cat_cols:
        df[col] = df[col].astype("category")
    print('-- setCatTypes done---------------------------')
    print(df[cat_cols].sample(n=20, random_state=42))
    print('\n\n')


In [123]:
# Imputing cloud coverage
def imputeCloudCoverage(df):
    """Fill missing ``cloud_coverage`` values and store the column as a category.

    A missing value takes the median cloud coverage of its ``site_id``. Rows
    whose site has no observed value at all (or that have no site) fall back
    to the median over the whole frame. The filled values are cast to uint8
    (fractional medians are truncated) and then to ``category``.

    ``df`` is modified in place; a per-site summary of the result is printed.
    Returns None.
    """
    coverage = df['cloud_coverage']
    # Taken before any filling so imputed values do not influence it.
    overall_median = coverage.median()
    # One pass over the frame: each row receives the median of its own site.
    site_median = df.groupby('site_id')['cloud_coverage'].transform('median')
    filled = coverage.fillna(site_median).fillna(overall_median)
    df['cloud_coverage'] = filled.astype(np.uint8).astype('category')
    print('-- impute cloud coverage done---------------------------')
    print(df.groupby(['site_id'])['cloud_coverage'].describe())
    print('\n\n')

In [124]:
# Builds a combined category label from cloud coverage and hour of day,
# formatted as c<cloud_coverage>t<hour>. The values are used as they are;
# no bucketing is applied.
def cloudTimeCat(df):
    """Add ``cloud_time_cat``: 'c' + cloud_coverage + 't' + hour, as a category.

    Both source columns are cast to int before being rendered as text, so
    ``df`` must already have ``cloud_coverage`` and ``hour`` columns that can
    be converted to int. ``df`` is modified in place and a fixed-seed sample
    of 20 rows is printed (``df`` needs at least 20 rows). Returns None.
    """
    cloud_text = df['cloud_coverage'].astype('int').astype('str')
    hour_text = df['hour'].astype('int').astype('str')
    labels = 'c' + cloud_text + 't' + hour_text
    df['cloud_time_cat'] = labels.astype('category')
    print('-- cloudHourCat done---------------------------')
    print(df[['cloud_time_cat']].sample(n=20, random_state=42))
    print('\n\n')


In [125]:
class DropCols(TransformerMixin):
    """Stateless transformer that removes a fixed set of columns from a frame."""

    def __init__(self, drop_cols):
        # Column label(s) handed to DataFrame.drop on every transform call.
        self._drop_cols = drop_cols

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, df, **transform_params):
        """Return a new frame without the configured columns."""
        return df.drop(columns=self._drop_cols)

In [126]:
# Creating building_age
def imputeYearBuilt(df):
    """Fill missing ``year_built`` values and derive ``building_age``.

    A missing ``year_built`` takes the median of its ``site_id``. Rows whose
    site has no known year at all (or that have no site) fall back to the
    median over the whole frame. ``building_age`` is then stored as
    ``year_built - 1900`` cast to uint8, i.e. years elapsed since 1900.

    ``df`` is modified in place; a per-site summary of ``building_age`` is
    printed. Returns None.
    """
    year_built = df['year_built']
    # Taken before any filling so imputed values do not influence it.
    overall_median = year_built.median()
    # One pass over the frame: each row receives the median of its own site.
    site_median = df.groupby('site_id')['year_built'].transform('median')
    df['year_built'] = year_built.fillna(site_median).fillna(overall_median)
    # NOTE(review): the uint8 cast assumes 1900 <= year_built <= 2155.
    df['building_age'] = (df['year_built'] - 1900).astype(np.uint8)
    print('-- impute year built done---------------------------')
    print(df.groupby(['site_id'])['building_age'].describe())
    print('\n\n')
    
class ImputeYearBuilt(TransformerMixin):
    """Transformer that fills missing ``year_built`` values and adds ``building_age``.

    ``fit`` learns nothing: the medians used for filling are computed from
    whichever frame is passed to ``transform``.
    """

    def transform(self, df, **transform_params):
        """Return a new frame with ``year_built`` imputed and ``building_age`` added.

        A missing ``year_built`` takes the median of its ``site_id``; rows
        whose site has no known year fall back to the median over the whole
        frame. ``building_age`` is ``year_built - 1900`` cast to uint8.

        The input frame is left untouched. The result is built with
        ``DataFrame.assign``, which returns a new object, so the caller's
        frame is no longer mutated as a side effect of transforming it.
        """
        year_built = df['year_built']
        # Each row receives the median of its own site in a single pass.
        site_median = df.groupby('site_id')['year_built'].transform('median')
        filled = year_built.fillna(site_median).fillna(year_built.median())
        return df.assign(
            year_built=filled,
            building_age=(filled - 1900).astype(np.uint8),
        )

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns the transformer itself."""
        return self


In [127]:
def addMeterDummies(df):
    """Add boolean columns ``_meter_0`` .. ``_meter_3`` to ``df``, in place.

    ``_meter_<i>`` is True for a row when that row's ``building_id`` appears
    in the global ``train`` frame with ``meter == i``. The lookup always uses
    ``train``, whichever frame is passed in.

    Prints a fixed-seed sample of 20 rows of the new columns, so ``df`` needs
    at least 20 rows. Returns None.
    """
    for meter in range(4):
        rows_with_meter = train.loc[train['meter'] == meter]
        buildings_with_meter = rows_with_meter.building_id.unique()
        df["_meter_"+str(meter)] = df['building_id'].isin(buildings_with_meter)
    flag_cols = ['_meter_0','_meter_1','_meter_2','_meter_3']
    print('-- addMeterDummies done---------------------------')
    print(df[flag_cols].sample(n=20, random_state=42))
    print('\n\n')


In [128]:
def addTimeFeatures(df):
    """Add categorical ``dayofweek``, ``weekday`` and ``hour`` columns, in place.

    All three are read from the ``timestamp`` column, which must already be a
    datetime column. ``dayofweek`` and ``weekday`` are filled from
    ``.dt.dayofweek`` and ``.dt.weekday`` respectively.

    Prints a fixed-seed sample of 20 timestamps, so ``df`` needs at least 20
    rows. Returns None.
    """
    stamp = df["timestamp"].dt
    df['dayofweek'] = stamp.dayofweek.astype('category')  # vs weekend?
    df['weekday'] = stamp.weekday.astype('category')
    # A week-of-year feature was tried here and is currently disabled.
    df["hour"] = stamp.hour.astype('category')
    print('-- addTimeFeatures done---------------------------')
    print(df['timestamp'].sample(n=20, random_state=42))
    print('\n\n')


In [129]:
def addRelativeHumidity(df):
    """Placeholder for a relative-humidity feature; ``df`` is not changed.

    Only the completion banner is printed. Candidate formula:
    http://bmcnoldy.rsmas.miami.edu/Humidity.html
    """
    # Once the feature exists, preview it here:
    #print(df['relative_humidity'].sample(n=20, random_state=42))
    for line in ('-- addRelativeHumidity done---------------------------', '\n\n'):
        print(line)

In [130]:

# Columns grouped by the strategy used to fill their missing values.
fill_w_neg_one = []
fill_w_zero = ['floor_count']
fill_w_popular = []
fill_w_mean = ['air_temperature','dew_temperature', 
              "precip_depth_1_hr", "sea_level_pressure", "wind_speed"]

def generalImputes(df):
    """Fill missing values in ``df`` according to the ``fill_w_*`` lists, in place.

    * ``fill_w_neg_one`` columns are filled with -1
    * ``fill_w_popular`` columns are filled with their most frequent value
    * ``fill_w_zero`` columns are filled with 0
    * ``fill_w_mean`` columns are filled with their own mean

    Prints a fixed-seed sample of 20 rows of the affected columns, so ``df``
    needs at least 20 rows. Returns None.
    """
    # Each result is assigned back to the frame: calling fillna(inplace=True)
    # on a selected column is not guaranteed to write through to ``df``.
    for col in fill_w_neg_one:
        df[col] = df[col].fillna(-1)
    for col in fill_w_popular:
        # value_counts() is sorted by frequency, so the first index label is
        # the most frequent value (indexing with [0] would return a count).
        counts = df[col].value_counts()
        if len(counts) > 0:
            df[col] = df[col].fillna(counts.index[0])
    for col in fill_w_zero:
        df[col] = df[col].fillna(0)
    for col in fill_w_mean:
        df[col] = df[col].fillna(df[col].mean())
    print(df[fill_w_neg_one + fill_w_zero + fill_w_popular + fill_w_mean].sample(n=20, random_state=42))
    
            
# Preprocessing steps, applied in this order to both frames. Every step
# modifies the frame it is given in place.
preprocessing_steps = (
    convertToDatetime,
    addRelativeHumidity,
    addTimeFeatures,
    logSquareFeet,
    imputeYearBuilt,
    imputeCloudCoverage,
    #cloudTimeCat, this feature ranks high but doesn't change the score
    addMeterDummies,
    generalImputes,
    setCatTypes,
)

for df in [train, test]:
    for step in preprocessing_steps:
        step(df)

gc.collect()

# Count the missing values left in each training column.
print('--NaN Checks')
print(train.isna().sum())

-- convertToDatetime done---------------------------
                   timestamp
14245562 2016-09-16 16:00:00
1282718  2016-01-24 06:00:00
13883790 2016-09-10 07:00:00
4781820  2016-04-01 01:00:00
10415393 2016-07-10 04:00:00
1057008  2016-01-20 04:00:00
4507399  2016-03-26 20:00:00
19478829 2016-12-18 23:00:00
8955615  2016-06-14 06:00:00
13799839 2016-09-08 19:00:00
15647011 2016-10-11 11:00:00
2524294  2016-02-16 08:00:00
10016102 2016-07-03 02:00:00
3915750  2016-03-15 03:00:00
17217526 2016-11-08 09:00:00
11478    2016-01-01 04:00:00
18919011 2016-12-09 02:00:00
8709341  2016-06-09 21:00:00
16313567 2016-10-23 07:00:00
6289526  2016-04-27 20:00:00



-- addRelativeHumidity done---------------------------



-- addTimeFeatures done---------------------------
14245562   2016-09-16 16:00:00
1282718    2016-01-24 06:00:00
13883790   2016-09-10 07:00:00
4781820    2016-04-01 01:00:00
10415393   2016-07-10 04:00:00
1057008    2016-01-20 04:00:00
4507399    2016-03-26 20:00:00
19478829 

-- convertToDatetime done---------------------------
                   timestamp
3573457  2017-02-02 08:00:00
8315486  2018-12-18 00:00:00
40305643 2018-06-14 12:00:00
16083617 2018-08-23 23:00:00
37204119 2017-01-02 00:00:00
32144852 2018-12-30 19:00:00
5105044  2017-09-11 03:00:00
36982844 2018-11-30 19:00:00
20487823 2017-04-23 08:00:00
8404196  2018-12-30 19:00:00
6889602  2018-05-26 10:00:00
16963616 2017-11-21 15:00:00
39666699 2018-02-26 12:00:00
26802058 2017-01-09 07:00:00
30785716 2018-06-30 12:00:00
8763147  2017-02-23 08:00:00
19415014 2018-08-06 05:00:00
35698052 2018-05-28 20:00:00
26291343 2017-07-20 10:00:00
25410971 2017-11-29 18:00:00



-- addRelativeHumidity done---------------------------



-- addTimeFeatures done---------------------------
3573457    2017-02-02 08:00:00
8315486    2018-12-18 00:00:00
40305643   2018-06-14 12:00:00
16083617   2018-08-23 23:00:00
37204119   2017-01-02 00:00:00
32144852   2018-12-30 19:00:00
5105044    2017-09-11 03:00:00
36982844 

building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
log_square_feet             0
year_built                  0
floor_count                 0
air_temperature             0
cloud_coverage              0
dew_temperature             0
precip_depth_1_hr           0
sea_level_pressure          0
wind_direction        1449048
wind_speed                  0
dayofweek                   0
weekday                     0
hour                        0
building_age                0
_meter_0                    0
_meter_1                    0
_meter_2                    0
_meter_3                    0
dtype: int64


In [131]:
#drop_cols = ['row_id','wind_direction','year_built','meter_reading','timestamp','precip_depth_1_hr']  
# create test train
#train_y =  np.log1p(train["meter_reading"]) # ask why
#train_X = train.drop(filter(lambda i: i!='row_id', drop_cols), axis=1)
#test_X = test.drop(filter(lambda i: i!='meter_reading', drop_cols), axis=1)


# Release unreferenced objects, then show the schema of the prepared
# training frame: column dtypes first, column index second.
gc.collect()

for schema_view in (train.dtypes, train.columns):
    print(schema_view)


building_id                 category
meter                       category
timestamp             datetime64[ns]
meter_reading                float32
site_id                     category
primary_use                 category
log_square_feet              float16
year_built                   float32
floor_count                  float32
air_temperature              float32
cloud_coverage              category
dew_temperature              float32
precip_depth_1_hr            float32
sea_level_pressure           float32
wind_direction               float32
wind_speed                   float32
dayofweek                   category
weekday                     category
hour                        category
building_age                   uint8
_meter_0                        bool
_meter_1                        bool
_meter_2                        bool
_meter_3                        bool
dtype: object
Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'log

In [138]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    """Root mean squared logarithmic error of ``y_pred`` against ``y``.

    Predictions are clipped at zero first, a workaround so that no negative
    prediction reaches the log-error computation.
    """
    non_negative_pred = y_pred.clip(0)
    msle = mean_squared_log_error(y, non_negative_pred)
    return np.sqrt(msle)

def rmse(y, y_pred):
    """Root mean squared error of ``y_pred`` against ``y``.

    Predictions are clipped at zero first (a workaround to keep negative
    predictions out of the score). The square root is applied to the mean
    squared error so the value really is an RMSE, in the same units as ``y``.
    """
    return np.sqrt(mean_squared_error(y, y_pred.clip(0)))

def rmsee(y, y_pred):
    """RMSLE for targets and predictions that are already in log1p space.

    Both inputs are clipped at zero (a workaround to keep negative values out)
    and mapped back with ``expm1`` before the root mean squared logarithmic
    error is taken.
    """
    actual = np.expm1(y.clip(0))
    predicted = np.expm1(y_pred.clip(0))
    msle = mean_squared_log_error(actual, predicted)
    return np.sqrt(msle)
    
# scikit-learn scorers for the three error metrics. Each metric is an error
# (lower is better), hence greater_is_better=False: the scorer reports the
# negated value.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

# Wraps rmse; this scorer used to wrap rmsle by mistake.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

rmsee_scorer = make_scorer(rmsee, greater_is_better=False)


# Gradient-boosted regressor used as the final pipeline step.
gbm = LGBMRegressor(
    # boosting schedule
    n_estimators=100,  # for accuracy use large numbers like 6000
    learning_rate=0.23,
    # tree shape
    num_leaves=20,
    # row / column sampling
    subsample=0.2,  # batches of 20% of the data
    subsample_freq=1,
    feature_fraction=0.9,
    # reporting
    metric='rmse',
    verbose=100,
)


In [139]:
# Columns removed before the data reaches the model.
drop_cols = ['wind_direction','year_built','timestamp','precip_depth_1_hr']

# Feature union wrapping the year-built imputer. It is not one of the steps
# of the pipelines built below.
union_pipe = FeatureUnion(
    transformer_list=[('imputeYearBuilt', ImputeYearBuilt())]
)

# Frame-to-frame preprocessing: impute year_built / add building_age, then
# drop the columns listed in drop_cols.
transform_steps = [
    ('imputeYearBuilt', ImputeYearBuilt()),
    ('dropClos', DropCols(drop_cols)),
]
tranform_pipes = Pipeline(steps=transform_steps)

# Full model: preprocessing followed by the LightGBM regressor.
pipe = Pipeline(steps=[('tranform_pipes', tranform_pipes), ('gbm', gbm)])



In [88]:
# Run only the preprocessing steps on the training frame and show the
# resulting dtypes. The pipeline is defined as `tranform_pipes`; the
# misspelled `tranform_pipe` raised a NameError on a fresh kernel.
df = tranform_pipes.transform(train)
print(df.dtypes)

building_id           category
meter                 category
meter_reading          float32
site_id               category
primary_use           category
log_square_feet        float16
floor_count            float32
air_temperature        float32
cloud_coverage           uint8
dew_temperature        float32
sea_level_pressure     float32
wind_speed             float32
dayofweek             category
weekday               category
hour                  category
building_age             uint8
cloud_time_cat        category
_meter_0                  bool
_meter_1                  bool
_meter_2                  bool
_meter_3                  bool
dtype: object


In [115]:
# Cross val testing - can be skipped
# (disabled: scores the bare LightGBM model, without the preprocessing pipeline)
#scores = cross_val_score(gbm, train.drop(drop_cols + ['meter_reading'], axis=1), np.log1p(train["meter_reading"]), cv=5, 
#                        scoring=rmsee_scorer)
#print("rmsee scores:\n", scores)




rmsee scores:
 [-1.3220619  -1.28108523 -1.10180097 -1.21826303 -1.41899279]


In [None]:
# Cross val testing - can be skipped
# Five-fold cross-validation of the full pipeline on the log1p-transformed
# meter readings, scored with rmsee_scorer.
scores = cross_val_score(
    estimator=pipe,
    X=train.drop(columns='meter_reading'),
    y=np.log1p(train["meter_reading"]),
    scoring=rmsee_scorer,
    cv=5,
)
print("rmsee scores:\n", scores)


In [66]:
# Grid param search - can be skipped
# Keys address pipeline steps, so model parameters carry the 'gbm__' prefix.
grid_param = {
    #'gbm__n_estimators': [1000, 3000, 6000],
    'gbm__subsample': [0.1]
}

gd_sr = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    scoring=rmsee_scorer,
    cv=3,
    n_jobs=-1,
)

# Search on the log1p-transformed meter readings.
gd_sr.fit(
    X=train.drop(columns='meter_reading'),
    y=np.log1p(train["meter_reading"]),
)

best_parameters = gd_sr.best_params_
print(best_parameters)
best_result = gd_sr.best_score_
print(best_result)

<class 'pandas.core.series.Series'>
{'gbm__subsample': 0.1}
-1.5102730938597186


In [135]:
# fit on all the data
# The target is log1p(meter_reading). Keyword arguments prefixed with 'gbm__'
# are forwarded to the fit call of the 'gbm' step.
# NOTE(review): no eval_set is passed; confirm that eval_metric has an effect
# without one and that rmsee has the callable signature LightGBM expects.
pipe.fit(
    X=train.drop(columns='meter_reading'),
    y=np.log1p(train["meter_reading"]),
    gbm__eval_metric=rmsee,
    gbm__verbose=100,
)


Pipeline(memory=None,
         steps=[('tranform_pipes',
                 Pipeline(memory=None,
                          steps=[('imputeYearBuilt',
                                  <__main__.ImputeYearBuilt object at 0x0000014DC5829710>),
                                 ('dropClos',
                                  <__main__.DropCols object at 0x0000014DC5829470>)],
                          verbose=False)),
                ('gbm',
                 LGBMRegressor(boosting_type='gbdt', class_weight=None,
                               colsample_bytree=1.0, feature_fraction=0.9,
                               importance_type='split', learning_rate=0.23,
                               max_depth=-1, metric='rmse',
                               min_child_samples=20, min_child_weight=0.001,
                               min_split_gain=0.0, n_estimators=6000, n_jobs=-1,
                               num_leaves=20, objective=None, random_state=None,
                               reg_alp

In [None]:
# Pair each model input column with the importance reported by the fitted
# LightGBM step. The column names come from running the preprocessing
# pipeline, which is defined as `tranform_pipes`; the misspelled
# `tranform_pipe` raised a NameError.
feature_names = tranform_pipes.transform(train.drop('meter_reading', axis=1)).columns
imprtc_df = pd.DataFrame({
    "feature": feature_names,
    "importance": pipe.named_steps['gbm'].feature_importances_,
}).sort_values('importance', ascending=False)
print(imprtc_df)
# The seven least important features, as candidates for removal.
print(imprtc_df.nsmallest(7,'importance')['feature'].tolist())



In [136]:
# Free memory first, then predict for the whole test frame in one call.
# row_id is dropped before the frame is handed to the pipeline. Predictions
# are in the log1p space the model was trained on.
gc.collect()
test_y = pipe.predict(test.drop(columns='row_id'))

In [None]:
# Release unreferenced objects before the submission step.
gc.collect();

# The string literal below is disabled code, not a docstring: an earlier
# variant that predicted the test set in chunks of `step_size` rows with the
# bare `gbm` model. It is never executed.
# NOTE(review): the first statement inside it has an unbalanced parenthesis,
# so it would need fixing before being re-enabled.
'''
test_X = test.drop('row_id'), axis=1)

from tqdm import tqdm
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
    res.append(gbm.predict(test_X.iloc[i:i+step_size]))
    i+=step_size
'''

In [137]:
# Locate sample_submission.csv. The usual location is tried first; if it is
# absent, fall back to the directory test.csv was loaded from, because the
# loader accepts more than one input directory.
sample_submission_path = "../input/ashrae-energy-prediction/sample_submission.csv"
if not path.exists(sample_submission_path):
    sample_submission_path = path.join(path.dirname(file_loc['test']), 'sample_submission.csv')
sub = pd.read_csv(sample_submission_path)
#res = np.concatenate(test_y)
# The model predicts log1p(meter_reading): clip negative predictions to zero
# (hack to prevent negative numbers) and map back with expm1.
sub["meter_reading"] = np.expm1(test_y.clip(0))
sub.to_csv("submission.csv", index = False)