In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
import gc
from sklearn.preprocessing import LabelEncoder

warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

In [30]:
# import - second run you can skip
building = pd.read_csv('../input/ashrae-energy-prediction/building_metadata.csv')
weather_train = pd.read_csv('../input/ashrae-energy-prediction/weather_train.csv')
weather_test = pd.read_csv('../input/ashrae-energy-prediction/weather_test.csv')
train = pd.read_csv('../input/ashrae-energy-prediction/train.csv')
test = pd.read_csv('../input/ashrae-energy-prediction/test.csv')

train = train.merge(building, on='building_id', how='left')
test = test.merge(building, on='building_id', how='left')
train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')

In [31]:
# add features we are 100% sure about
def accpetedFeatures(df):
    #time_stamps
    df["timestamp"] = pd.to_datetime(df["timestamp"])    
    df.rename(columns={"square_feet": "log_square_feet"}, inplace=True)
    df['log_square_feet'] = np.float16(np.log(df['log_square_feet']))
    df['year_built'] = np.uint8(df['year_built']-1900)    
    
for df in [train, test]:
    df = accpetedFeatures(df)
    
# label encoding
le = LabelEncoder()

for df in [train, test]:
    df["primary_use"] = np.uint8(le.fit_transform(df["primary_use"]))
    
train["primary_use"].unique()
    

array([ 0,  4,  6,  1,  7, 11,  8,  9, 15,  2, 10,  3, 14, 13,  5, 12],
      dtype=uint64)

In [32]:
# Make a pickle - second run you can skip
mergePickle = {
    'train': train,
    'test': test
}

del weather_train, weather_test,building
gc.collect();
pickle.dump(mergePickle, open( "mergePickle.p", "wb" ))

In [3]:
# Read pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
import gc
from sklearn.preprocessing import LabelEncoder

warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

mergePickle = pickle.load( open( "mergePickle.p", "rb" ) )


In [4]:
train = mergePickle['train']
test = mergePickle['test']
gc.collect();

In [5]:
# Fill NaNs
fill_w_neg_one = ['site_id']
fill_w_zero = ['floor_count']
fill_w_popular = ['primary_use']
fill_w_mean = ['cloud_coverage','year_built','air_temperature','dew_temperature', 
              "precip_depth_1_hr", "sea_level_pressure", "wind_direction", "wind_speed"]

for df in [train, test]:
    for col in fill_w_neg_one:
        df[col].fillna(-1, inplace=True)
    for col in fill_w_popular:
        df[col].fillna(df[col].value_counts()[0], inplace=True)
    for col in fill_w_zero:
        df[col].fillna(0, inplace=True)
    for col in fill_w_mean:
        df[col].fillna(df[col].mean(), inplace=True)
print(train.isnull().sum())

building_id           0
meter                 0
timestamp             0
meter_reading         0
site_id               0
primary_use           0
log_square_feet       0
year_built            0
floor_count           0
air_temperature       0
cloud_coverage        0
dew_temperature       0
precip_depth_1_hr     0
sea_level_pressure    0
wind_direction        0
wind_speed            0
dtype: int64


In [6]:
# Saving some memory setting types
d_types = {'building_id': 'category', #np.int16,
          'meter': 'category', # np.int8,
          'site_id': 'category', #np.int8,
          'primary_use': 'category',
          'floor_count': 'category',
          'year_built': np.float16,
          'air_temperature': np.float32,
          'cloud_coverage': np.float16,
          'dew_temperature': np.float32,
          'precip_depth_1_hr': np.float16,
          'sea_level_pressure': np.float32,
          'wind_direction': np.float16,
          'wind_speed': np.float32}

for df in [train, test]:
    for feature in d_types: 
        df[feature] = df[feature].astype(d_types[feature])
    
print(train.dtypes)

building_id                 category
meter                       category
timestamp             datetime64[ns]
meter_reading                float64
site_id                     category
primary_use                 category
log_square_feet              float16
year_built                   float16
floor_count                 category
air_temperature              float32
cloud_coverage               float16
dew_temperature              float32
precip_depth_1_hr            float16
sea_level_pressure           float32
wind_direction               float16
wind_speed                   float32
dtype: object


In [21]:
# drop cols
drop_cols = ['wind_direction']
for df in [train, test]: 
    df = df.drop(drop_cols, axis=1, inplace=True)

KeyError: "['wind_direction'] not found in axis"

In [8]:
# add features we are less than 100% sure about
def preAccpetedFeatures(df):
    df['dayofweek'] = df["timestamp"].dt.dayofweek.astype('category') # vs weekend?
    df['weekday'] = df["timestamp"].dt.weekday.astype('category')
    df["hour"] = df["timestamp"].dt.hour.astype('category')
    #df["month"] = df["timestamp"].dt.month.astype('category')
    
    # each row should know about other meters 
    for i in range(4):
        df["_meter_"+str(i)] = df['building_id'].isin(
            train.loc[train['meter'] == i].building_id.unique()).astype('category')
        
for df in [train, test]:
    df = preAccpetedFeatures(df)
    
print(train.head())

  building_id meter  timestamp  meter_reading site_id primary_use  \
0           0     0 2016-01-01            0.0       0           0   
1           1     0 2016-01-01            0.0       0           0   
2           2     0 2016-01-01            0.0       0           0   
3           3     0 2016-01-01            0.0       0           0   
4           4     0 2016-01-01            0.0       0           0   

   log_square_feet  year_built floor_count  air_temperature  ...  \
0         8.914062       108.0         0.0             25.0  ...   
1         7.910156       104.0         0.0             25.0  ...   
2         8.585938        91.0         0.0             25.0  ...   
3        10.070312       102.0         0.0             25.0  ...   
4        11.664062        75.0         0.0             25.0  ...   

   precip_depth_1_hr  sea_level_pressure  wind_speed  dayofweek  weekday hour  \
0           0.796387         1019.700012         0.0          4        4    0   
1           0.

In [9]:
# create test train
train_y =  np.log1p(train["meter_reading"]) # ask why
train_X = train.drop(["meter_reading","timestamp"], axis=1)
test_X = test.drop(["row_id","timestamp"], axis=1)

gc.collect();

print(train_X.columns)
train.loc[train['meter_reading'] < 0]

Index(['building_id', 'meter', 'site_id', 'primary_use', 'log_square_feet',
       'year_built', 'floor_count', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_speed', 'dayofweek', 'weekday', 'hour', '_meter_0', '_meter_1',
       '_meter_2', '_meter_3'],
      dtype='object')


Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,log_square_feet,year_built,floor_count,air_temperature,...,precip_depth_1_hr,sea_level_pressure,wind_speed,dayofweek,weekday,hour,_meter_0,_meter_1,_meter_2,_meter_3


In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor


def rmsle(y, y_pred):
    # hack to prevent negative numbers
    return np.sqrt(mean_squared_log_error(y, y_pred.clip(0)))

def rmse(y, y_pred):
    # hack to prevent negative numbers
    return mean_squared_error(y, y_pred.clip(0))

rmsle_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)

rmse_scorer = make_scorer(
    lambda y_true, y_pred : rmsle(y_true, y_pred), 
    greater_is_better=False)


gbm=LGBMRegressor(n_estimators=100, # for accuracy use large numbers like 6000 
                  learning_rate=0.28,
                  feature_fraction=0.9,
                  subsample=0.2,  # batches of 20% of the data
                  subsample_freq=1,
                  num_leaves=20,
                  metric='rmse',
                  verbose= 100)


In [27]:
# Rob hacking

scores = cross_val_score(gbm, train_X, train_y, cv=5, 
                         scoring=rmse_scorer)
print("rmse scores:\n", scores)

rmse scores:
 [-0.47824034 -0.45746087 -0.36639561 -0.39887954 -0.43823683]


In [28]:
gbm.fit(train_X, train_y, eval_metric=rmse, verbose=100)


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.9, importance_type='split', learning_rate=0.28,
              max_depth=-1, metric='rmse', min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
              n_jobs=-1, num_leaves=20, objective=None, random_state=None,
              reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.2,
              subsample_for_bin=200000, subsample_freq=1, verbose=100)

In [29]:
print( gbm.feature_importances_)
imprtc_df = pd.DataFrame()
imprtc_df["feature"] = train_X.columns   
imprtc_df["importance"] = gbm.feature_importances_
print(imprtc_df.sort_values('importance', ascending=False))

[859 225  66   9  60  12   9 137  40 192   1 103  17  51   8  92   2   6
   6   5]
               feature  importance
0          building_id         859
1                meter         225
9      dew_temperature         192
7      air_temperature         137
11  sea_level_pressure         103
15                hour          92
2              site_id          66
4      log_square_feet          60
13           dayofweek          51
8       cloud_coverage          40
12          wind_speed          17
5           year_built          12
6          floor_count           9
3          primary_use           9
14             weekday           8
17            _meter_1           6
18            _meter_2           6
19            _meter_3           5
16            _meter_0           2
10   precip_depth_1_hr           1


In [23]:
gc.collect();

from tqdm import tqdm
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_X.shape[0]/50000)))):
    res.append(gbm.predict(test_X.iloc[i:i+step_size]))
    i+=step_size




  0%|                                                                                          | 0/834 [00:00<?, ?it/s]


  0%|                                                                                | 1/834 [00:05<1:21:20,  5.86s/it]


  0%|▏                                                                               | 2/834 [00:11<1:21:40,  5.89s/it]


  0%|▎                                                                               | 3/834 [00:17<1:22:26,  5.95s/it]


  0%|▍                                                                               | 4/834 [00:23<1:22:07,  5.94s/it]


  1%|▍                                                                               | 5/834 [00:29<1:21:51,  5.92s/it]


  1%|▌                                                                               | 6/834 [00:35<1:21:44,  5.92s/it]


  1%|▋                                                                               | 7/834 [00:41<1:22:14,  5.97s/it]


  1%|▊               

  8%|██████▎                                                                        | 67/834 [07:41<1:35:34,  7.48s/it]


  8%|██████▍                                                                        | 68/834 [07:48<1:33:41,  7.34s/it]


  8%|██████▌                                                                        | 69/834 [07:56<1:37:02,  7.61s/it]


  8%|██████▋                                                                        | 70/834 [08:04<1:36:58,  7.62s/it]


  9%|██████▋                                                                        | 71/834 [08:11<1:35:15,  7.49s/it]


  9%|██████▊                                                                        | 72/834 [08:18<1:32:29,  7.28s/it]


  9%|██████▉                                                                        | 73/834 [08:25<1:31:05,  7.18s/it]


  9%|███████                                                                        | 74/834 [08:32<1:32:27,  7.30s/it]


  9%|███████            

 16%|████████████▌                                                                 | 134/834 [15:27<1:19:24,  6.81s/it]


 16%|████████████▋                                                                 | 135/834 [15:34<1:18:25,  6.73s/it]


 16%|████████████▋                                                                 | 136/834 [15:41<1:18:23,  6.74s/it]


 16%|████████████▊                                                                 | 137/834 [15:47<1:17:29,  6.67s/it]


 17%|████████████▉                                                                 | 138/834 [15:54<1:16:37,  6.61s/it]


 17%|█████████████                                                                 | 139/834 [16:00<1:16:16,  6.59s/it]


 17%|█████████████                                                                 | 140/834 [16:07<1:16:27,  6.61s/it]


 17%|█████████████▏                                                                | 141/834 [16:14<1:17:11,  6.68s/it]


 17%|█████████████▎     

 24%|██████████████████▊                                                           | 201/834 [22:47<1:10:00,  6.64s/it]


 24%|██████████████████▉                                                           | 202/834 [22:54<1:10:07,  6.66s/it]


 24%|██████████████████▉                                                           | 203/834 [23:00<1:10:00,  6.66s/it]


 24%|███████████████████                                                           | 204/834 [23:07<1:09:55,  6.66s/it]


 25%|███████████████████▏                                                          | 205/834 [23:14<1:09:49,  6.66s/it]


 25%|███████████████████▎                                                          | 206/834 [23:20<1:09:42,  6.66s/it]


 25%|███████████████████▎                                                          | 207/834 [23:27<1:09:29,  6.65s/it]


 25%|███████████████████▍                                                          | 208/834 [23:34<1:09:21,  6.65s/it]


 25%|███████████████████

 32%|█████████████████████████                                                     | 268/834 [30:13<1:02:57,  6.67s/it]


 32%|█████████████████████████▏                                                    | 269/834 [30:20<1:02:53,  6.68s/it]


 32%|█████████████████████████▎                                                    | 270/834 [30:26<1:02:41,  6.67s/it]


 32%|█████████████████████████▎                                                    | 271/834 [30:33<1:02:27,  6.66s/it]


 33%|█████████████████████████▍                                                    | 272/834 [30:40<1:02:16,  6.65s/it]


 33%|█████████████████████████▌                                                    | 273/834 [30:46<1:02:11,  6.65s/it]


 33%|█████████████████████████▋                                                    | 274/834 [30:53<1:02:20,  6.68s/it]


 33%|█████████████████████████▋                                                    | 275/834 [31:00<1:03:48,  6.85s/it]


 33%|███████████████████

 40%|████████████████████████████████▏                                               | 335/834 [37:29<52:02,  6.26s/it]


 40%|████████████████████████████████▏                                               | 336/834 [37:35<51:55,  6.26s/it]


 40%|████████████████████████████████▎                                               | 337/834 [37:41<51:47,  6.25s/it]


 41%|████████████████████████████████▍                                               | 338/834 [37:47<51:40,  6.25s/it]


 41%|████████████████████████████████▌                                               | 339/834 [37:54<51:52,  6.29s/it]


 41%|████████████████████████████████▌                                               | 340/834 [38:00<51:30,  6.26s/it]


 41%|████████████████████████████████▋                                               | 341/834 [38:06<51:33,  6.27s/it]


 41%|████████████████████████████████▊                                               | 342/834 [38:12<51:34,  6.29s/it]


 41%|███████████████████

 48%|██████████████████████████████████████▌                                         | 402/834 [44:34<46:24,  6.44s/it]


 48%|██████████████████████████████████████▋                                         | 403/834 [44:40<46:08,  6.42s/it]


 48%|██████████████████████████████████████▊                                         | 404/834 [44:47<46:07,  6.44s/it]


 49%|██████████████████████████████████████▊                                         | 405/834 [44:53<46:04,  6.44s/it]


 49%|██████████████████████████████████████▉                                         | 406/834 [45:00<45:57,  6.44s/it]


 49%|███████████████████████████████████████                                         | 407/834 [45:06<45:45,  6.43s/it]


 49%|███████████████████████████████████████▏                                        | 408/834 [45:13<45:35,  6.42s/it]


 49%|███████████████████████████████████████▏                                        | 409/834 [45:19<45:30,  6.42s/it]


 49%|███████████████████

 56%|████████████████████████████████████████████▉                                   | 469/834 [51:47<39:11,  6.44s/it]


 56%|█████████████████████████████████████████████                                   | 470/834 [51:53<39:03,  6.44s/it]


 56%|█████████████████████████████████████████████▏                                  | 471/834 [52:00<39:09,  6.47s/it]


 57%|█████████████████████████████████████████████▎                                  | 472/834 [52:06<39:02,  6.47s/it]


 57%|█████████████████████████████████████████████▎                                  | 473/834 [52:13<38:52,  6.46s/it]


 57%|█████████████████████████████████████████████▍                                  | 474/834 [52:19<39:23,  6.57s/it]


 57%|█████████████████████████████████████████████▌                                  | 475/834 [52:26<39:29,  6.60s/it]


 57%|█████████████████████████████████████████████▋                                  | 476/834 [52:33<39:35,  6.63s/it]


 57%|███████████████████

 64%|███████████████████████████████████████████████████▍                            | 536/834 [59:22<32:52,  6.62s/it]


 64%|███████████████████████████████████████████████████▌                            | 537/834 [59:29<32:26,  6.55s/it]


 65%|███████████████████████████████████████████████████▌                            | 538/834 [59:36<32:27,  6.58s/it]


 65%|███████████████████████████████████████████████████▋                            | 539/834 [59:42<32:13,  6.55s/it]


 65%|███████████████████████████████████████████████████▊                            | 540/834 [59:48<31:54,  6.51s/it]


 65%|███████████████████████████████████████████████████▉                            | 541/834 [59:56<33:30,  6.86s/it]


 65%|██████████████████████████████████████████████████▋                           | 542/834 [1:00:03<32:45,  6.73s/it]


 65%|██████████████████████████████████████████████████▊                           | 543/834 [1:00:09<32:15,  6.65s/it]


 65%|███████████████████

 72%|████████████████████████████████████████████████████████▍                     | 603/834 [1:06:35<24:36,  6.39s/it]


 72%|████████████████████████████████████████████████████████▍                     | 604/834 [1:06:42<24:33,  6.40s/it]


 73%|████████████████████████████████████████████████████████▌                     | 605/834 [1:06:48<24:33,  6.43s/it]


 73%|████████████████████████████████████████████████████████▋                     | 606/834 [1:06:55<24:30,  6.45s/it]


 73%|████████████████████████████████████████████████████████▊                     | 607/834 [1:07:01<24:26,  6.46s/it]


 73%|████████████████████████████████████████████████████████▊                     | 608/834 [1:07:07<24:12,  6.43s/it]


 73%|████████████████████████████████████████████████████████▉                     | 609/834 [1:07:14<24:02,  6.41s/it]


 73%|█████████████████████████████████████████████████████████                     | 610/834 [1:07:20<23:53,  6.40s/it]


 73%|███████████████████

 80%|██████████████████████████████████████████████████████████████▋               | 670/834 [1:13:41<16:56,  6.20s/it]


 80%|██████████████████████████████████████████████████████████████▊               | 671/834 [1:13:47<16:50,  6.20s/it]


 81%|██████████████████████████████████████████████████████████████▊               | 672/834 [1:13:53<16:46,  6.21s/it]


 81%|██████████████████████████████████████████████████████████████▉               | 673/834 [1:14:00<16:39,  6.21s/it]


 81%|███████████████████████████████████████████████████████████████               | 674/834 [1:14:06<16:35,  6.22s/it]


 81%|███████████████████████████████████████████████████████████████▏              | 675/834 [1:14:12<16:28,  6.22s/it]


 81%|███████████████████████████████████████████████████████████████▏              | 676/834 [1:14:18<16:22,  6.22s/it]


 81%|███████████████████████████████████████████████████████████████▎              | 677/834 [1:14:25<16:21,  6.25s/it]


 81%|███████████████████

 88%|████████████████████████████████████████████████████████████████████▉         | 737/834 [1:20:38<10:14,  6.34s/it]


 88%|█████████████████████████████████████████████████████████████████████         | 738/834 [1:20:44<10:05,  6.31s/it]


 89%|█████████████████████████████████████████████████████████████████████         | 739/834 [1:20:50<09:58,  6.30s/it]


 89%|█████████████████████████████████████████████████████████████████████▏        | 740/834 [1:20:57<09:49,  6.27s/it]


 89%|█████████████████████████████████████████████████████████████████████▎        | 741/834 [1:21:03<09:40,  6.25s/it]


 89%|█████████████████████████████████████████████████████████████████████▍        | 742/834 [1:21:09<09:33,  6.24s/it]


 89%|█████████████████████████████████████████████████████████████████████▍        | 743/834 [1:21:15<09:26,  6.23s/it]


 89%|█████████████████████████████████████████████████████████████████████▌        | 744/834 [1:21:21<09:19,  6.22s/it]


 89%|███████████████████

 96%|███████████████████████████████████████████████████████████████████████████▏  | 804/834 [1:27:10<02:53,  5.79s/it]


 97%|███████████████████████████████████████████████████████████████████████████▎  | 805/834 [1:27:15<02:47,  5.77s/it]


 97%|███████████████████████████████████████████████████████████████████████████▍  | 806/834 [1:27:21<02:41,  5.78s/it]


 97%|███████████████████████████████████████████████████████████████████████████▍  | 807/834 [1:27:27<02:35,  5.78s/it]


 97%|███████████████████████████████████████████████████████████████████████████▌  | 808/834 [1:27:33<02:30,  5.78s/it]


 97%|███████████████████████████████████████████████████████████████████████████▋  | 809/834 [1:27:39<02:24,  5.77s/it]


 97%|███████████████████████████████████████████████████████████████████████████▊  | 810/834 [1:27:44<02:19,  5.81s/it]


 97%|███████████████████████████████████████████████████████████████████████████▊  | 811/834 [1:27:50<02:13,  5.81s/it]


 97%|███████████████████

In [24]:
sub = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")
res = np.concatenate(res)
# hack to prevent negative numbers
sub["meter_reading"] = np.expm1(res.clip(0))
sub.to_csv("submission.csv", index = False)