In [234]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from sklearn.preprocessing import LabelEncoder

warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

In [309]:
# import
building = pd.read_csv('../input/ashrae-energy-prediction/building_metadata.csv')
weather_train = pd.read_csv('../input/ashrae-energy-prediction/weather_train.csv')
weather_test = pd.read_csv('../input/ashrae-energy-prediction/weather_test.csv')
train = pd.read_csv('../input/ashrae-energy-prediction/train.csv')
test = pd.read_csv('../input/ashrae-energy-prediction/test.csv')

In [310]:
# merge
train = train.merge(building, on='building_id', how='left')
test = test.merge(building, on='building_id', how='left')
train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')
del weather_train, weather_test,building
gc.collect();

In [311]:
# Fill NaNs
fill_w_neg_one = ['site_id']
fill_w_zero = ['floor_count']
fill_w_popular = ['primary_use']
fill_w_mean = ['cloud_coverage','year_built','air_temperature','dew_temperature', 
              "precip_depth_1_hr", "sea_level_pressure", "wind_direction", "wind_speed"]

for df in [train, test]:
    for col in fill_w_neg_one:
        df[col].fillna(-1, inplace=True)
    for col in fill_w_popular:
        df[col].fillna(df[col].value_counts()[0], inplace=True)
    for col in fill_w_zero:
        df[col].fillna(0, inplace=True)
    for col in fill_w_mean:
        df[col].fillna(df[col].mean(), inplace=True)
print(train.isnull().sum())

building_id           0
meter                 0
timestamp             0
meter_reading         0
site_id               0
primary_use           0
square_feet           0
year_built            0
floor_count           0
air_temperature       0
cloud_coverage        0
dew_temperature       0
precip_depth_1_hr     0
sea_level_pressure    0
wind_direction        0
wind_speed            0
dtype: int64


In [312]:
# label encoding
le = LabelEncoder()

for df in [train, test]:
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
train["primary_use"].unique()

array([ 0,  4,  6,  1,  7, 11,  8,  9, 15,  2, 10,  3, 14, 13,  5, 12],
      dtype=int64)

In [313]:
# Saving some memory setting types
d_types = {'building_id': 'category', #np.int16,
          'meter': 'category', # np.int8,
          'site_id': 'category', #np.int8,
          'primary_use': 'category',
          'floor_count': 'category',
          'square_feet': np.int32,
          'year_built': np.float16,
          'air_temperature': np.float32,
          'cloud_coverage': np.float16,
          'dew_temperature': np.float32,
          'precip_depth_1_hr': np.float16,
          'sea_level_pressure': np.float32,
          'wind_direction': np.float16,
          'wind_speed': np.float32}

for df in [train, test]:
    for feature in d_types: 
        df[feature] = df[feature].astype(d_types[feature])
    
print(train.dtypes)

building_id           category
meter                 category
timestamp               object
meter_reading          float64
site_id               category
primary_use           category
square_feet              int32
year_built             float16
floor_count           category
air_temperature        float32
cloud_coverage         float16
dew_temperature        float32
precip_depth_1_hr      float16
sea_level_pressure     float32
wind_direction         float16
wind_speed             float32
dtype: object


In [314]:
# drop cols
drop_cols = ['wind_direction']
for df in [train, test]: 
    df = df.drop(drop_cols, axis=1, inplace=True)

In [315]:
# add non-imputed features 
def nonIFeatures(df):
    #time_stamps
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df['dayofweek'] = df["timestamp"].dt.dayofweek.astype('category') # vs weekend?
    df["hour"] = df["timestamp"].dt.hour.astype('category')
    #df["month"] = df["timestamp"].dt.month.astype('category')
    
    # each row should know about other meters 
    for i in range(4):
        df["_meter_"+str(i)] = df['building_id'].isin(
            train.loc[train['meter'] == i].building_id.unique()).astype('category')
    
    df.rename(columns={"square_feet": "log_square_feet"}, inplace=True)
    df['log_square_feet'] = np.float16(np.log(df['log_square_feet']))
    df['year_built'] = np.uint8(df['year_built']-1900)    
    
for df in [train, test]:
    df = nonIFeatures(df)
    
print(train.head())

  building_id meter  timestamp  meter_reading site_id primary_use  \
0           0     0 2016-01-01            0.0       0           0   
1           1     0 2016-01-01            0.0       0           0   
2           2     0 2016-01-01            0.0       0           0   
3           3     0 2016-01-01            0.0       0           0   
4           4     0 2016-01-01            0.0       0           0   

   log_square_feet  year_built floor_count  air_temperature  ...  \
0         8.914062         108         0.0             25.0  ...   
1         7.910156         104         0.0             25.0  ...   
2         8.585938          91         0.0             25.0  ...   
3        10.070312         102         0.0             25.0  ...   
4        11.664062          75         0.0             25.0  ...   

   dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_speed  \
0             20.0           0.796387         1019.700012         0.0   
1             20.0           0

In [316]:
# create test train
train_y =  np.log1p(train["meter_reading"]) # ask why
train_X = train.drop(["meter_reading","timestamp"], axis=1)
test_X = test.drop(["row_id","timestamp"], axis=1)

gc.collect();

print(train_X.columns)
train.loc[train['meter_reading'] < 0]

Index(['building_id', 'meter', 'site_id', 'primary_use', 'log_square_feet',
       'year_built', 'floor_count', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_speed', 'dayofweek', 'hour', '_meter_0', '_meter_1', '_meter_2',
       '_meter_3'],
      dtype='object')


Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,log_square_feet,year_built,floor_count,air_temperature,...,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_speed,dayofweek,hour,_meter_0,_meter_1,_meter_2,_meter_3


In [321]:
# Rob hacking

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

gbm=LGBMRegressor( task= 'train',
                  boosting_type= 'gbdt',
                  objective= 'regression',
                  #n_estimators=6000,
                  learning_rate= 0.09,
                  feature_fraction= 0.9,
                  bagging_fraction= 0.9,
                  subsample=0.2,  # batches of 20% of the data
                  subsample_freq=1,
                  num_leaves=20,
                  verbose= 100)

scores = cross_val_score(gbm, train_X, train_y, cv=5, 
                         scoring=rmsle_scorer)
print("rmsle scores:\n", scores)

rmsle scores:
 [-0.50787072 -0.46765246 -0.38312615 -0.40704553 -0.42841516]


In [322]:
gbm.fit(train_X, train_y, eval_metric=rmsle)


LGBMRegressor(bagging_fraction=0.9, boosting_type='gbdt', class_weight=None,
              colsample_bytree=1.0, feature_fraction=0.9,
              importance_type='split', learning_rate=0.09, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=20,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=0.2,
              subsample_for_bin=200000, subsample_freq=1, task='train',
              verbose=100)

In [323]:
print( gbm.feature_importances_)
imprtc_df = pd.DataFrame()
imprtc_df["feature"] = train_X.columns   
imprtc_df["importance"] = gbm.feature_importances_
print(imprtc_df.sort_values('importance', ascending=False))

[1149  249  100   13   62    3   10  128    1   86    0   18    2   28
   50    0    0    0    1]
               feature  importance
0          building_id        1149
1                meter         249
7      air_temperature         128
2              site_id         100
9      dew_temperature          86
4      log_square_feet          62
14                hour          50
13           dayofweek          28
11  sea_level_pressure          18
3          primary_use          13
6          floor_count          10
5           year_built           3
12          wind_speed           2
8       cloud_coverage           1
18            _meter_3           1
10   precip_depth_1_hr           0
15            _meter_0           0
16            _meter_1           0
17            _meter_2           0


In [324]:
from tqdm import tqdm
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
    res.append(gbm.predict(test_X.iloc[i:i+step_size]))
    i+=step_size

100%|████████████████████████████████████████████████████████████████████████████████| 834/834 [01:51<00:00,  7.26it/s]


In [325]:
sub = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")
res = np.concatenate(res)
# hack to prevent negative numbers
sub["meter_reading"] = np.expm1(res.clip(0))
sub.to_csv("submission.csv", index = False)