# ASHRAE - Great Energy Predictor III
### refer from...
- https://www.kaggle.com/rohanrao/ashrae-half-and-half

In [2]:
%run '../../../utils.ipynb'

done


In [3]:
import gc
import os
import random

import lightgbm as lgb
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

path_data = "../../input/"
path_train = path_data + "train.csv"
path_test = path_data + "test.csv"
path_building = path_data + "building_metadata.csv"
path_weather_train = path_data + "weather_train.csv"
path_weather_test = path_data + "weather_test.csv"

plt.style.use("seaborn")
sns.set(font_scale=1)

myfavouritenumber = 0
seed = myfavouritenumber
random.seed(seed)

import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

import pickle
from tqdm import tqdm

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Read in Data

In [4]:
df_train = pd.read_csv(path_train)

building = pd.read_csv(path_building)
le = LabelEncoder()
building.primary_use = le.fit_transform(building.primary_use)

weather_train = pd.read_csv(path_weather_train)

In [5]:
df_train = reduce_mem_usage(df_train)
building = reduce_mem_usage(building)
weather_train = reduce_mem_usage(weather_train)

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to  0.02 Mb (74.9% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)


## Preparing data

In [6]:
# delete site_id: 0
df_train = df_train.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

In [7]:
df_train['meter_reading_log1p'] = np.log1p(df_train['meter_reading'])
df_group = df_train.groupby('building_id')['meter_reading_log1p']
building_mean = df_group.mean().astype(np.float16)
building_median = df_group.median().astype(np.float16)
building_min = df_group.min().astype(np.float16)
building_max = df_group.max().astype(np.float16)
building_std = df_group.std().astype(np.float16)

df_train['building_mean'] = df_train['building_id'].map(building_mean)
df_train['building_median'] = df_train['building_id'].map(building_median)
df_train['building_min'] = df_train['building_id'].map(building_min)
df_train['building_max'] = df_train['building_id'].map(building_max)
df_train['building_std'] = df_train['building_id'].map(building_std)

In [12]:
def prepare_data(X, building_data, weather_data, test=False):
    """
    Preparing final dataset with all features.
    """
    
    X = X.merge(building_data, on="building_id", how="left")
    X = X.merge(weather_data, on=["site_id", "timestamp"], how="left")
    
    X.timestamp = pd.to_datetime(X.timestamp, format="%Y-%m-%d %H:%M:%S")
    X.square_feet = np.log1p(X.square_feet)
    
    if not test:
        X.sort_values("timestamp", inplace=True)
        X.reset_index(drop=True, inplace=True)
    
    gc.collect()
    
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                "2019-01-01"]
    
    X["hour"] = X.timestamp.dt.hour
    X["weekday"] = X.timestamp.dt.weekday
    X["is_holiday"] = (X.timestamp.dt.date.astype("str").isin(holidays)).astype(int)
    
    for window in [3, 72]:
        group_df = X.groupby('site_id')
        cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
        rolled = group_df[cols].rolling(window=window, min_periods=0)
        lag_mean = rolled.mean().reset_index().astype(np.float16)
        lag_max = rolled.max().reset_index().astype(np.float16)
        lag_min = rolled.min().reset_index().astype(np.float16)
        lag_std = rolled.std().reset_index().astype(np.float16)
        for col in cols:
            X[f'{col}_mean_lag{window}'] = lag_mean[col]
            X[f'{col}_max_lag{window}'] = lag_max[col]
            X[f'{col}_min_lag{window}'] = lag_min[col]
            X[f'{col}_std_lag{window}'] = lag_std[col]
    
    drop_features = ["timestamp", "sea_level_pressure", "wind_direction", "wind_speed"]

    X.drop(drop_features, axis=1, inplace=True)

    if test:
        row_ids = X.row_id
        X.drop("row_id", axis=1, inplace=True)
        return X, row_ids
    else:
        y = np.log1p(X.meter_reading)
        X.drop("meter_reading", axis=1, inplace=True)
        return X, y

In [9]:
X_train, y_train = prepare_data(df_train, building, weather_train)

del df_train, weather_train
gc.collect()

175

## Two-fold LightGBM Model split half-and-half

In [10]:
%%time

X_half_1 = X_train[:int(X_train.shape[0] / 2)]
X_half_2 = X_train[int(X_train.shape[0] / 2):]

y_half_1 = y_train[:int(X_train.shape[0] / 2)]
y_half_2 = y_train[int(X_train.shape[0] / 2):]

categorical_features = ["building_id", "site_id", "meter", "primary_use", "hour", "weekday"]

d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, categorical_feature=categorical_features, free_raw_data=False)
d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, categorical_feature=categorical_features, free_raw_data=False)

watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = [d_half_2, d_half_1]

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 40,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse"
}

print("Building model with first half and validating on second half:")
model_half_1 = lgb.train(params, train_set=d_half_1, num_boost_round=1000, valid_sets=watchlist_1, verbose_eval=200, early_stopping_rounds=200)

print("Building model with second half and validating on first half:")
model_half_2 = lgb.train(params, train_set=d_half_2, num_boost_round=1000, valid_sets=watchlist_2, verbose_eval=200, early_stopping_rounds=200)

Building model with first half and validating on second half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.848882	valid_1's rmse: 1.14964
[400]	training's rmse: 0.791538	valid_1's rmse: 1.14681
Early stopping, best iteration is:
[353]	training's rmse: 0.80131	valid_1's rmse: 1.14626
Building model with second half and validating on first half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.843844	valid_1's rmse: 1.15236
[400]	training's rmse: 0.784397	valid_1's rmse: 1.14981
Early stopping, best iteration is:
[352]	training's rmse: 0.794341	valid_1's rmse: 1.14925
CPU times: user 2h 2min 6s, sys: 2min 51s, total: 2h 4min 58s
Wall time: 44min 55s


In [8]:
intermed_path = './tmp/Vopani/'

with open(os.path.join(intermed_path, 'model_half_1_2.pickle'), mode='wb') as fp:
    pickle.dump(model_half_1, fp)
    
with open(os.path.join(intermed_path, 'model_half_2_2.pickle'), mode='wb') as fp:
    pickle.dump(model_half_2, fp)

NameError: name 'model_half_1' is not defined

In [9]:
intermed_path = './tmp/Vopani/'

model_half_1 = pd.read_pickle(os.path.join(intermed_path, 'model_half_1.pickle'))
model_half_2 = pd.read_pickle(os.path.join(intermed_path, 'model_half_2.pickle'))

## Feature Importance

In [10]:
df_fimp_1 = pd.DataFrame()
df_fimp_1["feature"] = X_train.columns.values
df_fimp_1["importance"] = model_half_1.feature_importance()
df_fimp_1["half"] = 1

df_fimp_2 = pd.DataFrame()
df_fimp_2["feature"] = X_train.columns.values
df_fimp_2["importance"] = model_half_2.feature_importance()
df_fimp_2["half"] = 2

df_fimp = pd.concat([df_fimp_1, df_fimp_2], axis=0)

plt.figure(figsize=(14, 7))
sns.barplot(x="importance", y="feature", data=df_fimp.sort_values(by="importance", ascending=False))
plt.title("LightGBM Feature Importance")
plt.tight_layout()

NameError: name 'X_train' is not defined

## Preparing test data

In [13]:
df_test = pd.read_csv(path_test)
weather_test = pd.read_csv(path_weather_test)

df_test = reduce_mem_usage(df_test)
weather_test = reduce_mem_usage(weather_test)

X_test, row_ids = prepare_data(df_test, building, weather_test, test=True)

Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)


In [14]:
df_test['building_mean'] = df_test['building_id'].map(building_mean)
df_test['building_median'] = df_test['building_id'].map(building_median)
df_test['building_min'] = df_test['building_id'].map(building_min)
df_test['building_max'] = df_test['building_id'].map(building_max)
df_test['building_std'] = df_test['building_id'].map(building_std)

## Scoring test data

In [15]:
%%time
pred = np.expm1(model_half_1.predict(X_test, num_iteration=model_half_1.best_iteration)) / 2

del model_half_1
gc.collect()

pred += np.expm1(model_half_2.predict(X_test, num_iteration=model_half_2.best_iteration)) / 2

del model_half_2
gc.collect()

CPU times: user 2h 49min 45s, sys: 7min 47s, total: 2h 57min 32s
Wall time: 1h 6min 1s


In [16]:
%%time

submission = pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(pred, 0, a_max=None)})
submission.loc[submission['meter_reading'] < 0, 'meter_reading'] = 0
submission.to_csv('../../output/' + 'sub_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)

CPU times: user 3min 12s, sys: 6.59 s, total: 3min 19s
Wall time: 3min 30s


In [None]:
%%time

i = 0
res = []
step_size = 50000
for j in tqdm(range(int(np.ceil(X_test.shape[0] / 50000)))):
    res.append(np.expm1(sum([model.predict(X_test.iloc[i:i + step_size]) for model in [model_half_1, model_half_2]]) / 2))
    i += step_size