In [886]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
import gc
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype

# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.simplefilter('ignore')
# Apply seaborn's default plot styling.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Shared LabelEncoder instance.
# NOTE(review): `le` is not referenced by any cell visible here — confirm it is still needed.
le = LabelEncoder()

In [887]:
# Explicit narrow dtypes for each CSV (smaller than pandas' 64-bit defaults).
# NOTE(review): building_id is read as int16 for train/test but uint16 for the
# building metadata — the merge below joins across the two; confirm this is intended.
df_train_dtypes = {
    'building_id': np.int16,
    'meter': np.int8,
    'meter_reading': np.float32,
}
df_test_dtypes = {
    'building_id': np.int16,
    'meter': np.int8,
}
df_building_metadata_dtypes = {
    'site_id': np.int8,
    'building_id': np.uint16,
    'square_feet': np.int32,
    'year_built': np.float32,
    'floor_count': np.float32,
}
df_weather_dtypes = {
    'site_id': np.int8,
    'air_temperature': np.float32,
    'cloud_coverage': np.float32,
    'dew_temperature': np.float32,
    'precip_depth_1_hr': np.float32,
    'sea_level_pressure': np.float32,
    'wind_direction': np.float32,
    'wind_speed': np.float32,
}

# import - second run you can skip
building = pd.read_csv('../input/ashrae-energy-prediction/building_metadata.csv', dtype=df_building_metadata_dtypes)
weather_train = pd.read_csv('../input/ashrae-energy-prediction/weather_train.csv', dtype=df_weather_dtypes)
weather_test = pd.read_csv('../input/ashrae-energy-prediction/weather_test.csv', dtype=df_weather_dtypes)
train = pd.read_csv('../input/ashrae-energy-prediction/train.csv', dtype=df_train_dtypes)
test = pd.read_csv('../input/ashrae-energy-prediction/test.csv', dtype=df_test_dtypes)

# Attach building metadata first (it supplies site_id), then the weather for
# that site and timestamp. Left joins keep every meter row even without a match.
train = (
    train
    .merge(building, on='building_id', how='left')
    .merge(weather_train, on=['site_id', 'timestamp'], how='left')
)
test = (
    test
    .merge(building, on='building_id', how='left')
    .merge(weather_test, on=['site_id', 'timestamp'], how='left')
)

In [888]:
# NOTE: some buildings have a year_built in the future!
#train[ train['year_built'] > 2000]['year_built'].value_counts().sort_index().plot(kind='bar')

In [889]:
def convertToDatetime(df):
    """Parse the ``timestamp`` column of ``df`` into datetimes, in place.

    Prints a banner followed by a fixed-seed sample of 20 converted rows (so
    the frame needs at least 20 rows). Returns None.
    """
    parsed = pd.to_datetime(df["timestamp"])
    df["timestamp"] = parsed
    print('-- convertToDatetime done---------------------------')
    preview = df[["timestamp"]].sample(n=20, random_state=42)
    print(preview)
    print('\n\n')



In [890]:
# add features we are 100% sure about
def logSquareFeet(df):
    """Replace ``square_feet`` with its natural log in a float16 ``log_square_feet`` column.

    The column is renamed in place (keeping its position) and overwritten with
    ``log(square_feet)``. Calling the function again on an already transformed
    frame leaves the data unchanged. Prints a banner and a fixed-seed sample of
    20 rows. Returns None.
    """
    # Copy-paste leftover from convertToDatetime. Kept so existing callers still
    # get a datetime column, but no longer required for frames without one.
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    # rename() silently ignores a missing key, so without this guard a second
    # call would compute log(log(square_feet)).
    if "square_feet" in df.columns:
        df.rename(columns={"square_feet": "log_square_feet"}, inplace=True)
        df['log_square_feet'] = np.log(df['log_square_feet']).astype(np.float16)
    print('-- logSquareFeet done---------------------------')
    print(df[["log_square_feet"]].sample(n=20, random_state=42))
    print('\n\n')

    

In [891]:
# Cast identifier-like columns to the pandas category dtype
def setCatTypes(df):
    """Cast ``primary_use``, ``meter``, ``site_id`` and ``building_id`` to category dtype, in place.

    Prints a banner and a fixed-seed sample of 20 rows of those columns.
    Returns None.
    """
    categorical_columns = ["primary_use", "meter", "site_id", "building_id"]
    for name in categorical_columns:
        df[name] = df[name].astype("category")
    print('-- setCatTypes done---------------------------')
    print(df[categorical_columns].sample(n=20, random_state=42))
    print('\n\n')


In [892]:
# Imputing cloud coverage
def imputeCloudCoverage(df):
    """Fill missing ``cloud_coverage`` values in place and cast the column to uint8.

    Each missing value is replaced by the median ``cloud_coverage`` of its
    ``site_id``. Where that median is NaN (no observed values for the site)
    the overall column median is used instead. Prints a banner and a per-site
    ``describe()`` summary. Returns None.
    """
    overall_median = df['cloud_coverage'].median()
    # One grouped pass yields each row's site median, replacing a boolean-mask
    # scan of the whole frame per site.
    site_median = df.groupby('site_id')['cloud_coverage'].transform('median')
    filled = df['cloud_coverage'].fillna(site_median).fillna(overall_median)
    # The integer cast truncates fractional medians (e.g. 3.5 -> 3).
    df['cloud_coverage'] = filled.to_numpy().astype(np.uint8)
    # The banner used to say "impute year built", copied from imputeYearBuilt.
    print('-- imputeCloudCoverage done---------------------------')
    print(df.groupby(['site_id'])['cloud_coverage'].describe())
    print('\n\n')

In [894]:
# creates bucketed categories for cloud coverage by time of day
# label format: c[cloud coverage / 2]t[hour / 4]
def cloudTimeCat(df):
    """Add ``cloud_time_cat``, a categorical label of the form ``c<cloud>t<hour>``.

    ``<cloud>`` is ``cloud_coverage / 2`` and ``<hour>`` is ``hour / 4``, each
    truncated to an integer. Both source columns must be castable to int.
    Prints a banner and a fixed-seed sample of 20 labels. Returns None.
    """
    as_int = df[['cloud_coverage', 'hour']].astype('int')
    cloud_bucket = (as_int['cloud_coverage'] / 2).astype('int').astype('str')
    hour_bucket = (as_int['hour'] / 4).astype('int').astype('str')
    labels = 'c' + cloud_bucket + 't' + hour_bucket
    df['cloud_time_cat'] = labels
    df['cloud_time_cat'] = df['cloud_time_cat'].astype('category')
    print('-- cloudHourCat done---------------------------')
    print(df[['cloud_time_cat']].sample(n=20, random_state=42))
    print('\n\n')


In [895]:
# Creating building_age
def imputeYearBuilt(df):
    """Fill missing ``year_built`` values in place and derive ``building_age``.

    Each missing ``year_built`` is replaced by the median of its ``site_id``.
    Where that median is NaN (no known years for the site) the overall column
    median is used instead. ``building_age`` is then ``year_built - 1900``
    stored as uint8, i.e. years since 1900. Prints a banner and a per-site
    ``describe()`` of ``building_age``. Returns None.
    """
    overall_median = df['year_built'].median()
    # One grouped pass yields each row's site median, replacing a boolean-mask
    # scan of the whole frame per site.
    site_median = df.groupby('site_id')['year_built'].transform('median')
    df['year_built'] = df['year_built'].fillna(site_median).fillna(overall_median)
    # uint8 holds 0..255, so only years 1900..2155 are representable.
    # Fractional medians are truncated by the cast.
    df['building_age'] = (df['year_built'] - 1900).to_numpy().astype(np.uint8)
    print('-- impute year built done---------------------------')
    print(df.groupby(['site_id'])['building_age'].describe())
    print('\n\n')

In [896]:
def addMeterDummies(df, meter_source=None):
    """Add boolean columns ``_meter_0`` .. ``_meter_3`` to ``df``, in place.

    ``_meter_i`` is True for a row when its ``building_id`` occurs in
    ``meter_source`` on at least one row with ``meter == i``.

    Parameters
    ----------
    df : DataFrame
        Frame with a ``building_id`` column; receives the four new columns.
    meter_source : DataFrame, optional
        Frame with ``building_id`` and ``meter`` columns used as the lookup.
        Defaults to the module-level ``train`` frame, which is what this
        function always used, so existing single-argument calls are unchanged.

    Prints a banner and a fixed-seed sample of 20 rows. Returns None.
    """
    if meter_source is None:
        # Preserve the original behaviour of reading the global training frame.
        meter_source = train
    for i in range(4):
        buildings_with_meter = meter_source.loc[
            meter_source['meter'] == i, 'building_id'].unique()
        df["_meter_" + str(i)] = df['building_id'].isin(buildings_with_meter)
    print('-- addMeterDummies done---------------------------')
    print(df[['_meter_0','_meter_1','_meter_2','_meter_3']].sample(n=20, random_state=42))
    print('\n\n')


In [897]:
def addTimeFeatures(df):
    """Derive categorical ``dayofweek``, ``weekday`` and ``hour`` columns from ``timestamp``.

    ``timestamp`` must already be a datetime column. Prints a banner and a
    fixed-seed sample of 20 timestamps. Returns None.

    NOTE(review): pandas documents ``dt.weekday`` and ``dt.dayofweek`` as the
    same Monday=0 day number, so the two columns hold identical values. The
    original "vs weekend?" remark suggests a weekend flag may have been the
    intent — confirm before relying on ``weekday`` as a separate feature.
    """
    stamp = df["timestamp"].dt
    calendar_parts = (
        ('dayofweek', stamp.dayofweek),
        ('weekday', stamp.weekday),
        ("hour", stamp.hour),
    )
    for column, values in calendar_parts:
        df[column] = values.astype('category')
    print('-- addTimeFeatures done---------------------------')
    print(df['timestamp'].sample(n=20, random_state=42))
    print('\n\n')


In [898]:

# Columns grouped by the fill rule that generalImputes applies to them.
fill_w_neg_one = []
fill_w_zero = ['floor_count']
fill_w_popular = []
fill_w_mean = ['air_temperature','dew_temperature', 
              "precip_depth_1_hr", "sea_level_pressure", "wind_speed"]

def generalImputes(df):
    """Fill missing values in ``df`` in place, using the module-level column lists.

    * ``fill_w_neg_one`` columns -> -1
    * ``fill_w_popular`` columns -> the column's most frequent value
    * ``fill_w_zero`` columns    -> 0
    * ``fill_w_mean`` columns    -> the column's mean

    Prints a fixed-seed sample of 20 rows of the affected columns. Returns None.
    """
    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place call operates on a temporary and is not guaranteed to
    # update `df` (it does not under pandas Copy-on-Write).
    for col in fill_w_neg_one:
        df[col] = df[col].fillna(-1)
    for col in fill_w_popular:
        # value_counts()[0] is a count, not a value. mode() returns the most
        # frequent value(s) and is empty only when the column is entirely NaN.
        most_common = df[col].mode()
        if not most_common.empty:
            df[col] = df[col].fillna(most_common.iloc[0])
    for col in fill_w_zero:
        df[col] = df[col].fillna(0)
    for col in fill_w_mean:
        df[col] = df[col].fillna(df[col].mean())
    print(df[fill_w_neg_one + fill_w_zero + fill_w_popular + fill_w_mean].sample(n=20, random_state=42))
    
            
# Run the same preprocessing on both frames; each step mutates the frame in place.
# The step order matters:
#   - addTimeFeatures uses the .dt accessor, so convertToDatetime must run first;
#   - cloudTimeCat casts 'hour' (created by addTimeFeatures) and 'cloud_coverage'
#     to int, so it runs after addTimeFeatures and imputeCloudCoverage;
#   - addMeterDummies looks buildings up in the module-level `train` frame for
#     both the train and the test pass.
for df in [train, test]:
    convertToDatetime(df)
    addTimeFeatures(df)
    logSquareFeet(df)
    imputeYearBuilt(df)
    imputeCloudCoverage(df)
    cloudTimeCat(df)
    addMeterDummies(df)
    generalImputes(df)
    setCatTypes(df)
    # Collect garbage after each frame is processed.
    gc.collect()
    





-- convertToDatetime done---------------------------
                   timestamp
14245562 2016-09-16 16:00:00
1282718  2016-01-24 06:00:00
13883790 2016-09-10 07:00:00
4781820  2016-04-01 01:00:00
10415393 2016-07-10 04:00:00
1057008  2016-01-20 04:00:00
4507399  2016-03-26 20:00:00
19478829 2016-12-18 23:00:00
8955615  2016-06-14 06:00:00
13799839 2016-09-08 19:00:00
15647011 2016-10-11 11:00:00
2524294  2016-02-16 08:00:00
10016102 2016-07-03 02:00:00
3915750  2016-03-15 03:00:00
17217526 2016-11-08 09:00:00
11478    2016-01-01 04:00:00
18919011 2016-12-09 02:00:00
8709341  2016-06-09 21:00:00
16313567 2016-10-23 07:00:00
6289526  2016-04-27 20:00:00



-- addTimeFeatures done---------------------------
14245562   2016-09-16 16:00:00
1282718    2016-01-24 06:00:00
13883790   2016-09-10 07:00:00
4781820    2016-04-01 01:00:00
10415393   2016-07-10 04:00:00
1057008    2016-01-20 04:00:00
4507399    2016-03-26 20:00:00
19478829   2016-12-18 23:00:00
8955615    2016-06-14 06:00:00
13799

6289526          1011.799988     2.10000  
-- setCatTypes done---------------------------
                            primary_use meter site_id building_id
14245562  Entertainment/public assembly     1      14        1324
1282718                       Education     0      10        1013
13883790                      Education     1       2         229
4781820                       Education     3       2         217
10415393                      Education     0      15        1434
1057008                 Public services     0      12        1047
4507399                       Education     1       9         911
19478829                      Education     0      12        1039
8955615                          Office     0       2         265
13799839                      Education     0       9         896
15647011                         Office     0       9         973
2524294                 Public services     0       8         813
10016102  Entertainment/public assembly     0       

25410971         1028.400024         0.0  
-- setCatTypes done---------------------------
                            primary_use meter site_id building_id
3573457                       Education     0       2         173
8315486   Entertainment/public assembly     1       2         222
40305643                      Education     2      15        1354
16083617                      Education     0       5         712
37204119                      Education     2      15        1344
32144852                         Office     1      13        1119
5105044   Entertainment/public assembly     0       2         249
36982844                     Healthcare     1      14        1303
20487823                         Office     2       9         945
8404196                       Education     1       2         217
6889602   Entertainment/public assembly     0       2         241
16963616                      Education     0       6         784
39666699                         Office     2      1

In [899]:
# Count the remaining missing values per column of the training frame.
print('--NaN Checks')
missing_per_column = train.isnull().sum()
print(missing_per_column)


--NaN Checks
building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
log_square_feet             0
year_built                  0
floor_count                 0
air_temperature             0
cloud_coverage              0
dew_temperature             0
precip_depth_1_hr           0
sea_level_pressure          0
wind_direction        1449048
wind_speed                  0
dayofweek                   0
weekday                     0
hour                        0
building_age                0
cloud_time_cat              0
_meter_0                    0
_meter_1                    0
_meter_2                    0
_meter_3                    0
dtype: int64


In [900]:
# Columns excluded from the model inputs.
drop_cols = ['row_id','wind_direction','year_built','meter_reading','timestamp','precip_depth_1_hr','cloud_coverage', ]

# Target in log1p space; predictions are mapped back with np.expm1 when the
# submission is written.
train_y = np.log1p(train["meter_reading"])

# 'row_id' is skipped for train (the NaN check output lists no such column) and
# 'meter_reading' is skipped for test — presumably because test lacks it.
train_drop = [name for name in drop_cols if name != 'row_id']
test_drop = [name for name in drop_cols if name != 'meter_reading']
train_X = train.drop(train_drop, axis=1)
test_X = test.drop(test_drop, axis=1)

gc.collect();

print(train_X.dtypes)

building_id           category
meter                 category
site_id               category
primary_use           category
log_square_feet        float16
floor_count            float32
air_temperature        float32
dew_temperature        float32
sea_level_pressure     float32
wind_speed             float32
dayofweek             category
weekday               category
hour                  category
building_age             uint8
cloud_time_cat        category
_meter_0                  bool
_meter_1                  bool
_meter_2                  bool
_meter_3                  bool
dtype: object


In [901]:
# declare model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y``.

    Predictions are clipped at 0 first, so negative predictions are scored as 0.
    """
    non_negative_pred = y_pred.clip(0)
    msle = mean_squared_log_error(y, non_negative_pred)
    return np.sqrt(msle)

def rmse(y, y_pred):
    """Return the root mean squared error of ``y_pred`` against ``y``.

    Predictions are clipped at 0 before scoring, so negative predictions are
    scored as 0.
    """
    # The square root was missing, so the function returned MSE despite its name.
    return np.sqrt(mean_squared_error(y, y_pred.clip(0)))

def rmsee(y, y_pred):
    """Return the RMSLE for targets and predictions given in log1p space.

    Both inputs are clipped at 0 and mapped back with ``expm1`` before the
    root mean squared logarithmic error is computed.
    """
    actual = np.expm1(y.clip(0))
    predicted = np.expm1(y_pred.clip(0))
    return np.sqrt(mean_squared_log_error(actual, predicted))
    
# Scorers for model selection. All three metrics are errors, so they are
# registered with greater_is_better=False (scikit-learn then reports the
# negated value, keeping "higher is better" for its search utilities).
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

# This scorer used to wrap rmsle (copy-paste), duplicating rmsle_scorer.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

rmsee_scorer = make_scorer(rmsee, greater_is_better=False)


# LightGBM regressor settings, kept in one dict so they are easy to tweak.
gbm_params = dict(
    n_estimators=100,       # for accuracy use large numbers like 6000
    learning_rate=0.23,
    feature_fraction=0.9,
    subsample=0.2,          # batches of 20% of the data
    subsample_freq=1,
    num_leaves=20,
    metric='rmse',
    verbose=100,
)
gbm = LGBMRegressor(**gbm_params)


In [902]:
# Cross val testing - can be skipped

# 5-fold cross-validation. rmsee_scorer is built with greater_is_better=False,
# so the printed scores are the negated metric.
scores = cross_val_score(
    estimator=gbm,
    X=train_X,
    y=train_y,
    scoring=rmsee_scorer,
    cv=5,
)
print("rmsee scores:\n", scores)

KeyboardInterrupt: 

In [None]:
# Fit on all the training data.
# `eval_metric` and `verbose` are no longer passed:
#   - no eval_set is supplied, so an eval metric would never be evaluated, and
#     rmsee returns a bare float rather than the (name, value, is_higher_better)
#     tuple LightGBM expects from a custom metric;
#   - LightGBM >= 4.0 removed the `verbose` argument from fit().
gbm.fit(train_X, train_y)


In [None]:
# Raw importance array first, then a table sorted from most to least important.
print(gbm.feature_importances_)
imprtc_df = pd.DataFrame({
    "feature": train_X.columns,
    "importance": gbm.feature_importances_,
})
print(imprtc_df.sort_values(by='importance', ascending=False))

In [None]:
gc.collect();

from tqdm import tqdm

# Predict in fixed-size row batches (presumably to limit peak memory).
# `res` collects one prediction array per batch, in row order.
res = []
step_size = 50000
# Batch starts come straight from step_size. The loop count used to hard-code
# 50000, which would stop matching the slices if step_size were changed.
for i in tqdm(range(0, test_X.shape[0], step_size)):
    res.append(gbm.predict(test_X.iloc[i:i + step_size]))

In [None]:
sub = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")
# `res` is a list of per-batch arrays after the prediction loop. The guard lets
# this cell be re-run once `res` has already been flattened (concatenating the
# flat array again would raise).
if isinstance(res, list):
    res = np.concatenate(res)
# Predictions are in log1p space: clip negatives to 0, then invert with expm1.
# NOTE(review): assumes sample_submission rows are in the same order as test_X — confirm.
sub["meter_reading"] = np.expm1(res.clip(0))
sub.to_csv("submission.csv", index = False)