## Half and Half


In [1]:
import gc
import os
import random

import lightgbm as lgb
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

# Seed every global RNG that is in scope so stochastic steps repeat across runs.
seed = 0
random.seed(seed)
# NumPy keeps its own global RNG state, separate from the stdlib `random` module.
np.random.seed(seed)

## Reading train data
Reading train data along with building and weather metadata.

In [11]:
# Raw meter rows used for training.
df_train = pd.read_csv("data/train.csv")

# Weather observations for the train and test periods.
weather_train = pd.read_csv('data/weather_train.csv')
weather_test = pd.read_csv('data/weather_test.csv')

# Building metadata; `primary_use` is replaced by the integer codes produced
# by the label encoder, which stays available as `le`.
le = LabelEncoder()
building = pd.read_csv('data/building_metadata_external.csv')
building["primary_use"] = le.fit_transform(building["primary_use"])

In [12]:
from tools import reduce_mem_usage

# df_train keeps its original dtypes; its down-casting call stays disabled:
# df_train = reduce_mem_usage(df_train, use_float16=True)

# Down-cast the three metadata frames, in the same order as before.
building, weather_train, weather_test = [
    reduce_mem_usage(frame, use_float16=True)
    for frame in (building, weather_train, weather_test)
]

Memory usage of dataframe is 0.09 MB
Memory usage after optimization is: 0.02 MB
Decreased by 76.4%
Memory usage of dataframe is 8.58 MB
Memory usage after optimization is: 2.26 MB
Decreased by 73.7%
Memory usage of dataframe is 17.11 MB
Memory usage after optimization is: 4.50 MB
Decreased by 73.7%


## Preparing data

In [13]:
import holidays

# Site ids grouped by the country whose holiday calendar `holiday_name` uses for them.
in_us = [0, 2, 3, 4, 6, 8, 9, 10, 13, 14, 15]
in_uk = [1, 5]
in_ca = [7, 11]
in_ie = [12]

# One calendar object per country, queried by timestamp in `holiday_name`.
us_cal = holidays.US()
uk_cal = holidays.UK()
ca_cal = holidays.CA()
ie_cal = holidays.IE()

In [14]:
def holiday_name(timestamp, site_id):
    """Look up `timestamp` in the holiday calendar that belongs to `site_id`.

    Sites listed in `in_ca`, `in_uk` and `in_ie` use the Canadian, UK and Irish
    calendars respectively; every other site falls back to the US calendar.

    Returns the result of the calendar's ``get(timestamp)`` call unchanged.
    """
    if site_id in in_ca:
        calendar = ca_cal
    elif site_id in in_uk:
        calendar = uk_cal
    elif site_id in in_ie:
        calendar = ie_cal
    else:
        calendar = us_cal
    return calendar.get(timestamp)

In [15]:
def transform_holidays(df):
    """Add a categorical ``holiday_name`` column to a weather frame.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timestamp`` (anything ``pd.to_datetime`` accepts) and
        ``site_id``. ``timestamp`` is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``holiday_name`` holding the value returned by
        ``holiday_name(timestamp, site_id)`` for each row, or the category
        ``'NONE'`` where that lookup yielded a missing value.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Iterate the two needed columns directly instead of a row-wise
    # DataFrame.apply(axis=1): same per-row lookup, but no Series is built for
    # every row and an empty frame simply yields an empty column.
    names = [
        holiday_name(stamp, site)
        for stamp, site in zip(df['timestamp'], df['site_id'])
    ]
    # dtype=object keeps an empty or all-missing result from being inferred as float.
    df['holiday_name'] = pd.Series(names, index=df.index, dtype=object).astype('category')

    # 'NONE' has to exist as a category before it can be used as the fill value.
    df['holiday_name'] = df['holiday_name'].cat.add_categories(['NONE'])
    df['holiday_name'] = df['holiday_name'].fillna('NONE')
    return df

In [16]:
# def relative_humidity(df):
#     exp = pd.np.exp
#     df['relative_humidity'] = 100*(exp((17.625*df['dew_temperature'])/(243.04+df['dew_temperature'])) / exp((17.625*df['air_temperature'])/(243.04+df['air_temperature'])))
    
#     return df

In [17]:
# def add_lag_feature(weather_df, window=3):
#     group_df = weather_df.groupby('site_id')
#     cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'wind_direction', 'wind_speed']
#     rolled = group_df[cols].rolling(window=window, min_periods=0)
#     lag_mean = rolled.mean().reset_index().astype(np.float16)
#     lag_max = rolled.max().reset_index().astype(np.float16)
#     lag_min = rolled.min().reset_index().astype(np.float16)
#     lag_std = rolled.std().reset_index().astype(np.float16)
#     for col in cols:
#         weather_df[f'{col}_mean_lag{window}'] = lag_mean[col]
#         weather_df[f'{col}_max_lag{window}'] = lag_max[col]
#         weather_df[f'{col}_min_lag{window}'] = lag_min[col]
#         weather_df[f'{col}_std_lag{window}'] = lag_std[col]
        

In [18]:
# Attach the categorical `holiday_name` column to both weather frames
# (train first, then test).
weather_train, weather_test = [
    transform_holidays(frame) for frame in (weather_train, weather_test)
]

# Label-encoding of holiday_name stays disabled:
# le_weather = LabelEncoder()
# weather_train.holiday_name = le_weather.fit_transform(weather_train.holiday_name)
# weather_test.holiday_name = le_weather.transform(weather_test.holiday_name)

In [19]:
# def scale_data(df, scale_list):
#     mean = df[scale_list].mean(axis=0)
#     df[scale_list] = df[scale_list].astype('float32')
#     df[scale_list] -= df[scale_list].mean(axis=0)
#     std = df[scale_list].std(axis=0)
#     df[scale_list] /= df[scale_list].std(axis=0)
    
#     return df

In [20]:
class DataPrep:
    """Turn raw meter rows into model-ready features.

    The instance remembers per-building statistics of ``meter_reading`` that are
    computed during the training call, so the same values can be attached to
    the test rows by a later ``prepare_data(..., test=True)`` call.
    """

    def __init__(self):
        pass

    def prepare_data(self, X, building_data, weather_data, test=False):
        """Merge metadata into ``X`` and derive the model features.

        Parameters
        ----------
        X : pandas.DataFrame
            Meter rows with ``building_id``, ``meter`` and ``timestamp``; also
            ``meter_reading`` when ``test`` is False and ``row_id`` when it is
            True. Its ``timestamp`` column is converted to datetime in place.
        building_data : pandas.DataFrame
            Joined on ``building_id``; must supply ``site_id`` and ``square_feet``.
        weather_data : pandas.DataFrame
            Joined on ``site_id`` and ``timestamp``; must supply
            ``wind_direction``, ``wind_speed`` and ``holiday_name``.
        test : bool, default False
            False: compute and store the per-building statistics, order the rows
            by timestamp and return the training target.
            True: reuse the stored statistics and keep the incoming row order.

        Returns
        -------
        (X, y) when ``test`` is False
            Features without ``timestamp`` / ``meter_reading`` and
            ``y = log1p(meter_reading)``.
        (X, row_ids) when ``test`` is True
            Features (``timestamp`` still included) and the ``row_id`` column.

        Raises
        ------
        AttributeError
            If called with ``test=True`` before any training call has stored the
            per-building statistics.
        """
        if test and not hasattr(self, "building_mean"):
            # Fail before touching the caller's frame; the exception type matches
            # what the missing attribute would have raised further down.
            raise AttributeError(
                "prepare_data(test=True) needs the per-building statistics; "
                "call prepare_data(..., test=False) on the training data first"
            )

        # NOTE: this writes the parsed timestamps back into the caller's frame.
        # `apply_leaks` later merges that same frame on `timestamp`, so it
        # presumably relies on this side effect - keep it.
        X.timestamp = pd.to_datetime(X.timestamp)
        X.timestamp = X.timestamp.astype('datetime64[ns]')

        X = X.merge(building_data, on="building_id", how="left")
        X = X.merge(weather_data, on=["site_id", "timestamp"], how="left")

        if not test:
            df_group = X.groupby('building_id')['meter_reading']
            self.building_mean = df_group.mean().astype(np.float16)
            self.building_median = df_group.median().astype(np.float16)
            self.building_min = df_group.min().astype(np.float16)
            self.building_max = df_group.max().astype(np.float16)
            self.building_std = df_group.std().astype(np.float16)

        X['building_mean'] = X['building_id'].map(self.building_mean)
        X['building_median'] = X['building_id'].map(self.building_median)
        X['building_min'] = X['building_id'].map(self.building_min)
        X['building_max'] = X['building_id'].map(self.building_max)
        X['building_std'] = X['building_id'].map(self.building_std)

        if not test:
            # The original called sort_values / reset_index without keeping their
            # results, so nothing was ever sorted. Training rows are now really
            # ordered by time (stable sort: ties keep their incoming order), which
            # is what the positional half-and-half split further down needs.
            X = X.sort_values("timestamp", kind="mergesort").reset_index(drop=True)
        # Test rows deliberately keep their incoming order: the notebook assigns
        # predictions back to the rows of the test frame by position.

        gc.collect()

        X.square_feet = np.log1p(X.square_feet)

        X["hour"] = X.timestamp.dt.hour
        X["weekday"] = X.timestamp.dt.weekday

        X['is_holiday'] = X['holiday_name'].apply(lambda x: 1 if x != "NONE" else 0)

        if not test:
            # Drop rows of meter 0 whose reading is not positive.
            non_positive_meter0 = (X.meter_reading <= 0) & (X.meter == 0)
            X = X.drop(index=X.index[non_positive_meter0])

        drop_features = ["wind_direction", "wind_speed", 'holiday_name']
        X = X.drop(drop_features, axis=1)

        if test:
            row_ids = X.row_id
            X = X.drop("row_id", axis=1)
            return X, row_ids

        y = np.log1p(X.meter_reading)
        X = X.drop(["meter_reading", 'timestamp'], axis=1)
        return X, y

In [21]:
# One shared instance: it stores the per-building statistics computed from the
# training data so they can be reused when the test data is prepared.
prepr = DataPrep()

In [22]:
# Build the training features and target; this call also stores the
# per-building statistics on `prepr` for the later test-time call.
X_train, y_train = prepr.prepare_data(
    df_train, building, weather_train, test=False
)

# Release the raw training inputs.
del df_train
del weather_train
gc.collect()

30

In [23]:
# NOTE(review): these two loads overwrite the X_train / y_train returned by
# prepr.prepare_data with frames stored on disk; how those files were produced
# is not shown in this notebook - confirm they match the current feature set.
# pd.read_pickle can execute arbitrary code while unpickling, so only load
# files that come from a trusted source.
X_train = pd.read_pickle('pseudo_data/X_pseudo.pkl')
y_train = pd.read_pickle('pseudo_data/y_pseudo.pkl')

In [14]:
# # weather_train = timestamp_align(weather_train)
# X_train, y_train = prepare_data(df_train, building, weather_train)

# # del df_train, weather_train
# gc.collect()

## Two-fold LightGBM Model split half-and-half
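The rows are split into two equal halves by position, which is a split in time only when the training rows are ordered by timestamp. Each half is used as the training data for one model and as the validation set for the other.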

In [28]:
# Row position at which the data is cut: everything before it is the first
# half, everything from it onwards the second half.
half = int(X_train.shape[0] / 2)

X_half_1, X_half_2 = X_train[:half], X_train[half:]
y_half_1, y_half_2 = y_train[:half], y_train[half:]

categorical_features = [
    "building_id", "site_id", "meter", "primary_use",
    "hour", "weekday", 'is_holiday',
]

d_half_1 = lgb.Dataset(
    X_half_1,
    label=y_half_1,
    categorical_feature=categorical_features,
    free_raw_data=False,
)
d_half_2 = lgb.Dataset(
    X_half_2,
    label=y_half_2,
    categorical_feature=categorical_features,
    free_raw_data=False,
)

# Each model is evaluated on its own training half first and on the other
# half second.
watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = [d_half_2, d_half_1]

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 50,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
    'n_jobs': -1
}

# NOTE(review): the two runs use different round caps (1500 vs 3000) - confirm
# that this asymmetry is intended.
print("Building model with first half and validating on second half:")
model_half_1_a = lgb.train(
    params,
    train_set=d_half_1,
    num_boost_round=1500,
    valid_sets=watchlist_1,
    verbose_eval=300,
    early_stopping_rounds=300,
)

print("Building model with second half and validating on first half:")
model_half_2_a = lgb.train(
    params,
    train_set=d_half_2,
    num_boost_round=3000,
    valid_sets=watchlist_2,
    verbose_eval=300,
    early_stopping_rounds=300,
)


Building model with first half and validating on second half:
Training until validation scores don't improve for 300 rounds.
[300]	training's rmse: 0.785985	valid_1's rmse: 1.05302


KeyboardInterrupt: 

In [17]:
# Release the training frames, their halves and the LightGBM datasets.
del X_train
del X_half_1, X_half_2
del y_half_1, y_half_2
del d_half_1, d_half_2
del watchlist_1, watchlist_2
gc.collect()

390

## Preparing test data
Preparing test data with same features as train data.

In [29]:
# NOTE: pd.read_pickle can run arbitrary code while unpickling - the leak file
# must come from a trusted source.
leak = pd.read_pickle('data/site0.pkl')

# Read the raw test rows and down-cast them straight away.
df_test = reduce_mem_usage(pd.read_csv('data/test.csv'))

# `prepr` reuses the per-building statistics stored by the training call.
X_test, row_ids = prepr.prepare_data(
    df_test, building, weather_test, test=True
)

Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.65 MB
Decreased by 71.8%


In [19]:
# def smart_test(df_test):
#     model_part_1 = df_test[df_test.timestamp < pd.to_datetime('2017-07-07 22:00:00')]

#     second_part = df_test[(df_test.timestamp >= pd.to_datetime('2017-07-07 22:00:00')) & (df_test.timestamp <= pd.to_datetime('2017-12-31 23:00:00'))]

#     model_part_1a = df_test[df_test.timestamp >= pd.to_datetime('2018-01-01 00:00:00')]
    
#     return model_part_1.drop('timestamp', axis=1), second_part.drop('timestamp', axis=1),\
#            model_part_1a.drop('timestamp', axis=1)

In [20]:
# a_1, b_2, c_1 = smart_test(X_test)

In [21]:
# del df_test, building, weather_test
# gc.collect()

In [22]:
# pred_a_1 = np.expm1(model_half_1.predict(a_1, num_iteration=model_half_1.best_iteration))
# pred_b_2 = np.expm1(model_half_2.predict(b_2, num_iteration=model_half_2.best_iteration))
# pred_c_1 = np.expm1(model_half_1.predict(c_1, num_iteration=model_half_1.best_iteration))

In [23]:
# total_pred = np.concatenate([pred_a_1, pred_b_2, pred_c_1])

In [24]:
# total_pred.shape

## Scoring test data
Averaging predictions from the two half train data models.

In [31]:
# The models trained in this notebook are bound to model_half_1_a and
# model_half_2_a. The names previously used here (model_half_1 / model_half_2)
# are not defined anywhere in the notebook, so this cell raised NameError on a
# fresh kernel.
# NOTE(review): confirm these are the models the submission is meant to use.

# prepare_data(test=True) keeps `timestamp`, but the training features do not
# contain it; drop it once and reuse the result for both models.
X_test_features = X_test.drop('timestamp', axis=1)

# expm1 reverses the log1p applied to the target in prepare_data; each model
# contributes half of the final prediction.
pred = np.expm1(
    model_half_1_a.predict(X_test_features, num_iteration=model_half_1_a.best_iteration)
) / 2

del model_half_1_a
gc.collect()

pred += np.expm1(
    model_half_2_a.predict(X_test_features, num_iteration=model_half_2_a.best_iteration)
) / 2

del model_half_2_a, X_test_features
gc.collect()

12

In [32]:
def apply_leaks(leak, df_test):
    """Return the ``meter_reading_scraped`` column of `leak`, aligned to `df_test`.

    `df_test` is left-merged with `leak` on ``building_id``, ``timestamp`` and
    ``meter``, so rows of `df_test` without a match in `leak` carry NaN.
    `leak` must provide both ``meter_reading_original`` (discarded) and
    ``meter_reading_scraped`` (returned).
    """
    merged = df_test.merge(
        leak, how="left", on=["building_id", 'timestamp', 'meter']
    )
    merged = merged.drop(columns='meter_reading_original')
    return merged['meter_reading_scraped']

In [33]:
# Scraped readings aligned to df_test via a left merge; rows without a match
# in the leak table hold NaN.
leaked = apply_leaks(leak, df_test)

In [34]:
# Wrap the leaked readings in a frame so the model predictions can sit next to them.
leak_df = pd.DataFrame(leaked)

In [35]:
# NOTE(review): assumes `pred` is a positional array whose rows are in the same
# order as df_test (and therefore as leak_df) - confirm before changing any
# row ordering upstream.
leak_df['pred'] = pred

In [36]:
# Inspect leaked readings next to the model predictions.
leak_df

Unnamed: 0,meter_reading_scraped,pred
0,173.370293,154.514187
1,53.512720,75.474717
2,6.143042,9.710868
3,101.701470,211.103380
4,1141.240666,952.718400
...,...,...
41697595,,7.149443
41697596,,4.436384
41697597,,4.565448
41697598,,177.651064


In [37]:
# Where no leaked reading exists, fall back to the model prediction.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute access: under pandas Copy-on-Write that
# chained in-place form updates a temporary object and leaves leak_df unchanged.
leak_df['meter_reading_scraped'] = leak_df['meter_reading_scraped'].fillna(leak_df['pred'])

In [38]:
# Final per-row values: the leaked reading where available, the model prediction otherwise.
leaked_pred = leak_df['meter_reading_scraped'].values

In [39]:
# Inspect the combined values.
leaked_pred

array([173.37029336,  53.51271968,   6.14304201, ...,   4.56544832,
       177.65106441,   3.38731016])

In [40]:
# pd.DataFrame((pred + total_pred) / 2)

## Submission
Preparing final file for submission.

In [41]:
# Clamp negative values to zero; no upper bound is applied.
final_readings = np.clip(leaked_pred, 0, None)

submission = pd.DataFrame({"row_id": row_ids, "meter_reading": final_readings})
submission.to_csv("submission_leak2.csv", index=False)