# Acknowledgements

Original Kernel: https://www.kaggle.com/yamsam/ashrae-leak-validation-and-more/notebook#Leak-Validation-for-public-kernels(not-used-leak-data),

https://www.kaggle.com/khoongweihao/ashrae-leak-validation-bruteforce-heuristic-search

Additions: Added a search method based on gradient update

# All we need is Leak Validation(LV) ?

* **if you like this kernel, please upvote original kernels.**
* update site-4 and site-15
* Turn GPU on for better performance

this kernel is still work in progress, but i hope you can find something usefull from this.

In [1]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.metrics import mean_squared_error

In [2]:

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
%%time
root = Path('../input/ashrae-feather-format-for-fast-loading')

#train_df = pd.read_feather(root/'train.feather')
test_df = pd.read_feather(root/'test.feather')
#weather_train_df = pd.read_feather(root/'weather_train.feather')
#weather_test_df = pd.read_feather(root/'weather_test.feather')
building_meta_df = pd.read_feather(root/'building_metadata.feather')

CPU times: user 256 ms, sys: 488 ms, total: 744 ms
Wall time: 873 ms


In [4]:
# i'm now using my leak data station kernel to shortcut.
leak_df = pd.read_feather('../input/ashrae-leak-data-station/leak.feather')

leak_df.fillna(0, inplace=True)
leak_df = leak_df[(leak_df.timestamp.dt.year > 2016) & (leak_df.timestamp.dt.year < 2019)]
leak_df.loc[leak_df.meter_reading < 0, 'meter_reading'] = 0 # remove large negative values
leak_df = leak_df[leak_df.building_id!=245]

# Leak Validation for public kernels(not used leak data)

In [5]:
sample_submission1 = pd.read_csv('../input/ashrae-kfold-lightgbm-without-leak-1-08/submission.csv', index_col=0)
sample_submission2 = pd.read_csv('../input/ashrae-half-and-half/submission.csv', index_col=0)
sample_submission3 = pd.read_csv('../input/ashrae-highway-kernel-route4/submission.csv', index_col=0)
sample_submission4 = pd.read_csv('../input/ashrae-energy-prediction-using-stratified-kfold/fe2_lgbm.csv', index_col=0)
sample_submission5 = pd.read_csv('../input/ashrae-2-lightgbm-without-leak-data/submission.csv', index_col=0)
sample_submission6 = pd.read_csv('../input/ashrae-stratified-kfold-v4/submission_SK_drop_ws.csv', index_col=0)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [6]:
test_df['pred1'] = sample_submission1.meter_reading
test_df['pred2'] = sample_submission2.meter_reading
test_df['pred3'] = sample_submission3.meter_reading
test_df['pred4'] = sample_submission4.meter_reading
test_df['pred5'] = sample_submission5.meter_reading
test_df['pred6'] = sample_submission6.meter_reading

test_df.loc[test_df.pred1<0, 'pred1'] = 0
test_df.loc[test_df.pred2<0, 'pred2'] = 0
test_df.loc[test_df.pred3<0, 'pred3'] = 0
test_df.loc[test_df.pred4<0, 'pred4'] = 0
test_df.loc[test_df.pred5<0, 'pred5'] = 0 
test_df.loc[test_df.pred6<0, 'pred6'] = 0 

del  sample_submission1,  sample_submission2,  sample_submission3,  sample_submission4,  sample_submission5,  sample_submission6
gc.collect()

test_df = reduce_mem_usage(test_df)
leak_df = reduce_mem_usage(leak_df)

Memory usage of dataframe is 2505.25 MB
Memory usage after optimization is: 1550.87 MB
Decreased by 38.1%
Memory usage of dataframe is 460.05 MB
Memory usage after optimization is: 299.03 MB
Decreased by 35.0%


In [7]:
leak_df = leak_df.merge(test_df[['building_id', 'meter', 'timestamp', 
                                 'pred1', 'pred2', 'pred3', 'pred4', 'pred5', 'pred6',
                                 'row_id']], left_on = ['building_id', 'meter', 'timestamp'], right_on = ['building_id', 'meter', 'timestamp'], how = "left")
leak_df = leak_df.merge(building_meta_df[['building_id', 'site_id']], on='building_id', how='left')

In [8]:
leak_df['pred1_l1p'] = np.log1p(leak_df.pred1)
leak_df['pred2_l1p'] = np.log1p(leak_df.pred2)
leak_df['pred3_l1p'] = np.log1p(leak_df.pred3)
leak_df['pred4_l1p'] = np.log1p(leak_df.pred4)
leak_df['pred5_l1p'] = np.log1p(leak_df.pred5)
leak_df['pred6_l1p'] = np.log1p(leak_df.pred6)
leak_df['meter_reading_l1p'] = np.log1p(leak_df.meter_reading)

In [9]:
# best : 0.867698
leak_df[['pred1_l1p', 'pred2_l1p', 'pred3_l1p', 'pred4_l1p', 'pred5_l1p', 'pred6_l1p',
         'meter_reading_l1p']].corr()

Unnamed: 0,pred1_l1p,pred2_l1p,pred3_l1p,pred4_l1p,pred5_l1p,pred6_l1p,meter_reading_l1p
pred1_l1p,1.0,0.986133,0.988106,0.977856,0.988009,0.97755,0.862842
pred2_l1p,0.986133,1.0,0.977993,0.967448,0.978677,0.967188,0.856097
pred3_l1p,0.988106,0.977993,1.0,0.966034,0.979566,0.965632,0.85957
pred4_l1p,0.977856,0.967448,0.966034,1.0,0.969502,0.999776,0.867698
pred5_l1p,0.988009,0.978677,0.979566,0.969502,1.0,0.969328,0.861896
pred6_l1p,0.97755,0.967188,0.965632,0.999776,0.969328,1.0,0.86773
meter_reading_l1p,0.862842,0.856097,0.85957,0.867698,0.861896,0.86773,1.0


# Combination Search by using gradient descent

$x$ \- input 

$y$ \- target

$w$ \- weights

Let $f(x)=w^\top x$, we want to minimize

$$L(x,y)=(\log(f(x)+1)-\log(y+1))^2$$

In [10]:
# Prepare data

X_train = np.array([leak_df['pred1'].values,
                    leak_df['pred2'].values, 
                    leak_df['pred3'].values,
                    #leak_df['pred4'].values,
                    leak_df['pred6'].values
                   ]).T
y_train = leak_df.meter_reading_l1p

In [11]:
import tensorflow as tf

weights = tf.Variable([[0.3],
                       [0.3],
                       [0.3],
                       #[0.3],
                       [0.3]
                      ])
steps = 1000

lr = 0.1

opt = tf.optimizers.SGD(lr)

#Speed up the train step by precompiling
@tf.function()
def train_step(opt):
    with tf.GradientTape() as tape:
        y  = tf.matmul(X_train, weights)[:,0]
        loss = tf.reduce_mean((tf.math.log1p(y) - y_train) ** 2)
    grads = tape.gradient(loss, weights)
    opt.apply_gradients([(grads, weights)])
    
    return loss

prev_loss = 9999
for i in range(steps):
    loss = train_step(opt)
    if loss > prev_loss:
        lr /= 2
        opt.lr = lr
        
    prev_loss = loss
    print(f'step: {i} {loss.numpy()}')  

step: 0 0.9793186783790588
step: 1 0.9303372502326965
step: 2 0.918653666973114
step: 3 0.917375922203064
step: 4 0.9171513319015503
step: 5 0.9169761538505554
step: 6 0.9168081283569336
step: 7 0.9166461229324341
step: 8 0.9164896607398987
step: 9 0.9163385033607483
step: 10 0.9161926507949829
step: 11 0.9160512685775757
step: 12 0.9159148335456848
step: 13 0.9157826900482178
step: 14 0.9156548380851746
step: 15 0.9155312180519104
step: 16 0.9154112339019775
step: 17 0.9152951240539551
step: 18 0.9151828289031982
step: 19 0.9150737524032593
step: 20 0.9149682521820068
step: 21 0.9148656725883484
step: 22 0.9147664308547974
step: 23 0.9146701097488403
step: 24 0.9145766496658325
step: 25 0.9144858121871948
step: 26 0.9143977761268616
step: 27 0.914312481880188
step: 28 0.9142294526100159
step: 29 0.9141489267349243
step: 30 0.9140706658363342
step: 31 0.9139946103096008
step: 32 0.9139207601547241
step: 33 0.9138489961624146
step: 34 0.9137791991233826
step: 35 0.913711428642273
step: 

In [12]:
# MSE
## 2019-12-16 : 0.95496225 blend 1234
## 2019-12-18 : 0.9547418 blend 1234
## 2019-12-19 : 0.9513415 blend 12345
## 2019-12-19 : 0.95134723 blend 1345
## 2019-12-19 : 0.95151687 blend 2345
## 2019-12-19 : 0.9515808 blend 345
## 2019-12-19 : 0.9544586 blend 1236
np.sqrt(loss)

0.9544586

# Submit

In [13]:
sample_submission = pd.read_feather(os.path.join(root, 'sample_submission.feather'))

ws = weights.numpy()

w1 = ws[0,0]
w2 = ws[1,0]
w3 = ws[2,0]
w4 = ws[3,0]
#w5 = ws[4,0]
print("The weights are: w1=" + str(w1) + ", w2=" + str(w2) + ", w3=" + str(w3) + ", w4=" + str(w4))

sample_submission['meter_reading'] = w1 * test_df.pred1 +  w2 * test_df.pred2  + w3 * test_df.pred3 + w4 * test_df.pred6
sample_submission.loc[sample_submission.meter_reading < 0, 'meter_reading'] = 0

The weights are: w1=0.091667734, w2=0.040769067, w3=0.38903582, w4=0.3867224


In [14]:
sample_submission.to_csv('submission_sgd_blend_4_final.csv', index=False, float_format='%.4f')

In [15]:
leak_df = leak_df[['meter_reading', 'row_id']].set_index('row_id').dropna()
sample_submission.loc[leak_df.index, 'meter_reading'] = leak_df['meter_reading']

In [16]:
sample_submission.to_csv('submission_sgd_blend_4_final_replaced.csv', index=False, float_format='%.4f')