## Building (and meter) models with Half and Half.
The idea of this notebook is to create a separate model for each building and meter.  
The feature importance of the LGBM model is dominated by building_id. So wouldn't it be nice to build a model specific to each building (or, going further, to each meter in each building)?  

Note that there might be some mistakes in code or logic since this is my first kernel, as well as the first Kaggle competition.

Also based on notebook "Half and Half"(https://www.kaggle.com/rohanrao/ashrae-half-and-half)  
Thanks

In [None]:
import gc
import os
import random

import lightgbm as lgb
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Input locations: competition data plus two auxiliary datasets that sit
# next to it (rows to drop as outliers, and previously trained models).
path_data = "/kaggle/input/ashrae-energy-prediction/"
path_train = path_data + "train.csv"
path_test = path_data + "test.csv"
path_building = path_data + "building_metadata.csv"
path_weather_train = path_data + "weather_train.csv"
path_weather_test = path_data + "weather_test.csv"
path_drops = path_data + "../lier-list/rows_to_drop.csv"
path_model = path_data + "../building-models/building_meter_models.pkl"

# Newer Matplotlib releases no longer ship a style called "seaborn" (it was
# renamed to "seaborn-v0_8"); try the original name first so older
# environments behave exactly as before.
try:
    plt.style.use("seaborn")
except OSError:
    plt.style.use("seaborn-v0_8")
sns.set(font_scale=1)

# Seed Python's random module for reproducibility.
myfavouritenumber = 0
seed = myfavouritenumber
random.seed(seed)

## Reading train data
Reading train data along with building and weather metadata.  
To remove outliers, the rows listed in the output of the following notebook are dropped:
(https://www.kaggle.com/purist1024/ashrae-simple-data-cleanup-lb-1-08-no-leaks/output)

In [None]:
# Load the raw meter readings used for training.
df_train = pd.read_csv(path_train)

# Load building metadata and replace the primary_use column with the
# integer codes produced by LabelEncoder.
building = pd.read_csv(path_building)
le = LabelEncoder()
building.primary_use = le.fit_transform(building.primary_use)

# Load the weather observations that cover the training period.
weather_train = pd.read_csv(path_weather_train)

# remove outlier
# The column named "0" of the drop list holds the index labels of the
# df_train rows to discard.
bad_rows = pd.read_csv(path_drops)
df_train = df_train.drop(index=bad_rows["0"])

In [None]:
## Memory optimization

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe to smaller dtypes to reduce memory usage.

    The dataframe is modified in place and also returned. Memory usage
    before and after is printed.

    Column handling:
      * datetime, categorical and boolean columns are left untouched
        (booleans are already one byte wide; casting them to a float would
        make them larger);
      * object columns are converted to ``category``;
      * columns whose dtype name starts with ``int`` are cast to the smallest
        of int8/int16/int32/int64 whose range contains the column's min and
        max (bounds are inclusive, so e.g. 127 still fits int8);
      * every other column is cast to float16 (only if ``use_float16`` is
        true and the values fit), else float32 if the values fit, else
        float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    use_float16 : bool, default False
        Allow float16 for non-integer numeric columns. This trades precision
        for memory.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if (
            is_datetime(df[col])
            or isinstance(col_type, pd.CategoricalDtype)
            or pd.api.types.is_bool_dtype(col_type)
        ):
            continue

        if col_type == object:
            df[col] = df[col].astype("category")
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Pick the narrowest integer type whose range covers the data.
            # If min/max are NaN (empty column) nothing matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard against a zero-byte frame so the percentage never divides by zero.
    if start_mem > 0:
        print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Shrink the three training-side frames; with use_float16=True non-integer
# numeric columns are stored as float16 when their min/max fit.
df_train = reduce_mem_usage(df_train, use_float16=True)
building = reduce_mem_usage(building, use_float16=True)
weather_train = reduce_mem_usage(weather_train, use_float16=True)

## Preparing data
There are two files with features that need to be merged with the data. One is building metadata that has information on the buildings and the other is weather data that has information on the weather.   

Feature engineering is omitted here for simplicity, but it is worth doing, especially on the weather data.

In [None]:
def prepare_data(X, building_data, weather_data, test=False):
    """
    Join meter rows with building and weather data and derive time features.

    ``X`` is left-joined with ``building_data`` on ``building_id`` and then
    with ``weather_data`` on ``site_id`` and ``timestamp``. The timestamp is
    parsed, and ``hour``, ``weekday`` and ``is_holiday`` (1 when the calendar
    date is in the fixed holiday list below, else 0) are added. Columns that
    are not used as model inputs are dropped.

    When ``test`` is false the rows are additionally sorted by timestamp and
    re-indexed from zero; when it is true the row order is kept.

    Returns the resulting dataframe.
    """
    holiday_dates = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                     "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                     "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                     "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                     "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                     "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                     "2019-01-01"]
    unused_columns = ["timestamp", "sea_level_pressure", "wind_direction", "wind_speed",
                      "site_id", "primary_use", "square_feet", "year_built", "floor_count"]

    merged = X.merge(building_data, on="building_id", how="left")
    merged = merged.merge(weather_data, on=["site_id", "timestamp"], how="left")
    merged["timestamp"] = pd.to_datetime(merged["timestamp"], format="%Y-%m-%d %H:%M:%S")

    if not test:
        # Training rows are put in chronological order.
        merged = merged.sort_values("timestamp").reset_index(drop=True)

    gc.collect()

    stamps = merged["timestamp"]
    merged["hour"] = stamps.dt.hour
    merged["weekday"] = stamps.dt.weekday
    merged["is_holiday"] = stamps.dt.date.astype("str").isin(holiday_dates).astype(int)

    return merged.drop(columns=unused_columns)

In [None]:
# Build the training feature table (merged with metadata, sorted by timestamp).
X_train = prepare_data(df_train, building, weather_train)

In [None]:
# The raw training frame is no longer needed once X_train exists; free it.
del df_train
gc.collect()

## Two-fold LightGBM Model split half-and-half
The data is split into two based on time. Each half is used as the training data for a model.

**Half 1:** The first 50% rows of train data   
**Half 2:** The last 50% rows of train data

In [None]:
def LGBM(X_train, y_train):
    """
    Fit two LightGBM regressors, one on each half of the given rows.

    The rows are cut at ``int(n_rows / 2)``. The first model is trained on
    the first half with the watch list ``[first half, second half]``; the
    second model is trained on the second half with the watch list
    ``[second half, first half]``. ``hour`` and ``weekday`` are declared as
    categorical features.

    Returns the tuple ``(model_half_1, model_half_2)``.
    """
    midpoint = int(X_train.shape[0] / 2)

    # categorical_features = ["building_id", "site_id", "meter", "primary_use", "hour", "weekday"]
    categorical_features = ["hour", "weekday"]

    def as_dataset(features, target):
        # Wrap one half of the data; raw data is kept (free_raw_data=False).
        return lgb.Dataset(features, label=target, categorical_feature=categorical_features, free_raw_data=False)

    d_half_1 = as_dataset(X_train[:midpoint], y_train[:midpoint])
    d_half_2 = as_dataset(X_train[midpoint:], y_train[midpoint:])

    params = {
        "objective": "regression",
        "boosting": "gbdt",
        "num_leaves": 40,
        "learning_rate": 0.05,
        "feature_fraction": 0.85,
        "reg_lambda": 2,
        "metric": "rmse"
    }

    def fit(own_half, other_half):
        # Train on one half; the watch list holds that half followed by the other.
        return lgb.train(params, train_set=own_half, num_boost_round=1000, valid_sets=[own_half, other_half], verbose_eval=200, early_stopping_rounds=200)

    model_half_1 = fit(d_half_1, d_half_2)
    model_half_2 = fit(d_half_2, d_half_1)
    return model_half_1, model_half_2

## Modeling
A model is created by extracting the training data corresponding to each building and meter.   
By repeating this, we finally get 2380 models.

In [None]:
def modeling(X_train, stop=999999):
    """
    Train a pair of half-and-half models for every (building_id, meter) pair.

    The rows of ``X_train`` are grouped by ``building_id`` and ``meter`` in
    ascending order. For each group the target is ``log1p(meter_reading)``
    and the features are all remaining columns except ``building_id``,
    ``meter`` and ``meter_reading``; both are handed to ``LGBM``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Prepared training rows; must contain ``building_id``, ``meter`` and
        ``meter_reading``. It is not modified.
    stop : int, default 999999
        Largest building_id to process; iteration ends at the first
        building_id greater than this value.

    Returns
    -------
    pandas.DataFrame
        One row per processed pair with the columns ``building_id``,
        ``meter``, ``half_1`` and ``half_2`` (the two trained models).
    """
    drop_list = ["building_id", "meter", "meter_reading"]
    records = []

    # A single sorted groupby visits the pairs in the same (building, meter)
    # order as nested sorted loops, without re-scanning the whole frame with
    # a boolean mask for every pair.
    for (bid, meter), X_each in X_train.groupby(["building_id", "meter"], sort=True):
        if stop < bid:
            break

        y_each = np.log1p(X_each.meter_reading)
        # Non in-place drop: the group must not be mutated.
        features = X_each.drop(columns=drop_list)

        model_half_1, model_half_2 = LGBM(features, y_each)
        records.append([bid, meter, model_half_1, model_half_2])

    return pd.DataFrame(records, columns=["building_id", "meter", "half_1", "half_2"])

## ShortCut
Here, I load previously saved models to save time.  
To train the models yourself, uncomment and run the "Normal" code cell below.

In [None]:
# Normal
# df_model = modeling(X_train)
# df_model.to_pickle("building_meter_models.pkl")

In [None]:
# Short Cut
# Load the previously trained per-building/meter models instead of retraining.
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
df_model = pd.read_pickle(path_model)
# Last expression of the cell: shows the model table.
df_model

## Feature Importance
Plotting some of the feature importance from LGBM.

In [None]:
def feature_importance(columns, models, building_id, meter):
    """
    Plot the feature importances of the given models as a bar chart.

    For every model a (feature, importance) table is built from ``columns``
    and ``model.feature_importance()``; the tables are stacked, ordered by
    descending importance and passed to a seaborn bar plot. ``building_id``
    and ``meter`` only appear in the plot title.
    """
    per_model = [
        pd.DataFrame({"feature": columns, "importance": booster.feature_importance()})
        for booster in models
    ]
    # The stack starts from an empty frame, followed by one table per model.
    stacked = pd.concat([pd.DataFrame()] + per_model, axis=0)
    ranked = stacked.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(14, 7))
    sns.barplot(x="importance", y="feature", data=ranked)
    plt.title("LightGBM Feature Importance building_id:{}, meter:{}".format(building_id, meter))
    plt.tight_layout()

In [None]:
# Plot the importances of both half models for every meter of a few buildings.
drop_list = ["building_id", "meter", "meter_reading"]
model_names = ["half_1", "half_2"]

# Feature names are taken from the rows of building 0 / meter 0 after
# removing the non-feature columns.
reference_rows = X_train[(X_train["building_id"] == 0) & (X_train["meter"] == 0)]
colmuns = reference_rows.drop(drop_list, axis=1).columns.values

for bid in [31, 15, 75]:
    building_rows = X_train[X_train["building_id"] == bid]
    meters = np.sort(building_rows["meter"].unique())
    for meter in meters:
        pair_models = df_model[(df_model["building_id"] == bid) & (df_model["meter"] == meter)]
        models = [pair_models[m_name].values[0] for m_name in model_names]
        feature_importance(colmuns, models, bid, meter)

In [None]:
# Training features are not used past this point; release the memory.
del X_train
gc.collect()

## Preparing test data
Preparing test data with same features as train data.

In [None]:
# Load the test rows together with the weather data for the TEST period
# (path_weather_test, not the training-period weather file).
df_test = pd.read_csv(path_test)
weather_test = pd.read_csv(path_weather_test)

# NOTE(review): the training frames were reduced with use_float16=True while
# the test frames use the default; confirm this difference is intended.
df_test = reduce_mem_usage(df_test)
weather_test = reduce_mem_usage(weather_test)

# test=True keeps the original row order (no sorting by timestamp).
X_test = prepare_data(df_test, building, weather_test, test=True)

In [None]:
# Raw test rows and building metadata are already merged into X_test; free them.
del df_test, building
gc.collect()

## Main Predicting
Similar to training, extract test data corresponding to building_id and meter and make predictions.

In [None]:
def predicting(X_test, df_model, stop=9999999):
    """
    Predict meter readings with the per-(building_id, meter) model pairs.

    The rows of ``X_test`` are grouped by ``building_id`` and ``meter`` in
    ascending order. For each group the matching ``half_1`` and ``half_2``
    models are taken from ``df_model``; each model predicts on the log1p
    scale, the predictions are mapped back with ``expm1`` and averaged, and
    the result is clipped at zero.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Prepared test rows; must contain ``row_id``, ``building_id`` and
        ``meter``. All other columns are passed to the models as features.
        It is not modified.
    df_model : pandas.DataFrame
        Table with the columns ``building_id``, ``meter``, ``half_1`` and
        ``half_2``; the first matching row is used for each pair.
    stop : int, default 9999999
        Largest building_id to process; iteration ends at the first
        building_id greater than this value.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` and ``meter_reading``, sorted by ``row_id``.

    Raises
    ------
    IndexError
        If ``df_model`` has no row for a (building_id, meter) pair that
        occurs in ``X_test``.
    """
    drop_list = ["building_id", "meter", "row_id"]
    parts = []

    # One sorted groupby replaces a full boolean-mask scan of X_test for
    # every building and every meter.
    for (bid, meter), X_each in X_test.groupby(["building_id", "meter"], sort=True):
        if stop < bid:
            break

        row_ids = X_each.row_id
        # Non in-place drop: the group must not be mutated.
        features = X_each.drop(columns=drop_list)

        # Look the model pair up once per group.
        pair_models = df_model[(df_model["building_id"] == bid) & (df_model["meter"] == meter)]
        model_half_1 = pair_models["half_1"].values[0]
        model_half_2 = pair_models["half_2"].values[0]

        pred = np.expm1(model_half_1.predict(features, num_iteration=model_half_1.best_iteration)) / 2
        pred += np.expm1(model_half_2.predict(features, num_iteration=model_half_2.best_iteration)) / 2

        parts.append(pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(pred, 0, a_max=None)}))

    # Concatenate once at the end instead of growing the frame inside the loop.
    if parts:
        submission = pd.concat(parts)
    else:
        submission = pd.DataFrame(columns=["row_id", "meter_reading"])
    return submission.sort_values("row_id")

## Scoring test data
Averaging predictions from the two half train data models.

In [None]:
# Run every per-building/meter model pair over the test rows.
submission = predicting(X_test, df_model)
# Last expression of the cell: shows the prediction table.
submission

## Submission
Preparing final file for submission.

In [None]:
# submission = pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(pred, 0, a_max=None)})
# Write the predictions without the dataframe index column.
submission.to_csv("submission.csv", index=False)

**P.S.** If you vote up this kernel, please don't forget to vote up the original Half and Half: https://www.kaggle.com/rohanrao/ashrae-half-and-half