In [1]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.metrics import mean_squared_error

In [2]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
%%time
root = Path('../input/ashrae-feather-format-for-fast-loading')

#train_df = pd.read_feather(root/'train.feather')
test_df = pd.read_feather(root/'test.feather')
#weather_train_df = pd.read_feather(root/'weather_train.feather')
#weather_test_df = pd.read_feather(root/'weather_test.feather')
building_meta_df = pd.read_feather(root/'building_metadata.feather')

CPU times: user 312 ms, sys: 932 ms, total: 1.24 s
Wall time: 1.05 s


In [4]:
# i'm now using my leak data station kernel to shortcut.
leak_df = pd.read_feather('../input/ashrae-leak-data-station/leak.feather')

leak_df.fillna(0, inplace=True)
leak_df = leak_df[(leak_df.timestamp.dt.year > 2016) & (leak_df.timestamp.dt.year < 2019)]
leak_df.loc[leak_df.meter_reading < 0, 'meter_reading'] = 0 # remove large negative values
leak_df = leak_df[leak_df.building_id!=245]

In [5]:
sample_submission1 = pd.read_csv('../input/ashrae-kfold-lightgbm-without-leak-1-08/submission.csv', index_col=0)
sample_submission2 = pd.read_csv('../input/ashrae-half-and-half/submission.csv', index_col=0)
sample_submission3 = pd.read_csv('../input/ashrae-highway-kernel-route4/submission.csv', index_col=0)
#sample_submission4 = pd.read_csv('../input/ashrae-exploiting-leak-site-5/submission.csv', index_col=0)
sample_submission4 = pd.read_csv('../input/ashrae-energy-prediction-using-stratified-kfold/fe2_lgbm.csv', index_col=0)
#sample_submission5 = pd.read_csv('../input/ashrae-simple-data-cleanup-lb-1-08-no-leaks/submission.csv', index_col=0)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [6]:
test_df['pred1'] = sample_submission1.meter_reading
test_df['pred2'] = sample_submission2.meter_reading
test_df['pred3'] = sample_submission3.meter_reading
test_df['pred4'] = sample_submission4.meter_reading

test_df.loc[test_df.pred1<0, 'pred1'] = 0
test_df.loc[test_df.pred2<0, 'pred2'] = 0
test_df.loc[test_df.pred3<0, 'pred3'] = 0
test_df.loc[test_df.pred4<0, 'pred4'] = 0

del  sample_submission1,  sample_submission2,  sample_submission3, sample_submission4
gc.collect()

test_df = reduce_mem_usage(test_df)
leak_df = reduce_mem_usage(leak_df)

Memory usage of dataframe is 1869.00 MB
Memory usage after optimization is: 1232.74 MB
Decreased by 34.0%
Memory usage of dataframe is 460.05 MB
Memory usage after optimization is: 299.03 MB
Decreased by 35.0%


In [7]:
leak_df = leak_df.merge(test_df[['building_id', 'meter', 'timestamp', 
                                 'pred1', 'pred2', 'pred3', 'pred4',
                                 'row_id']], 
                        left_on = ['building_id', 'meter', 'timestamp'], right_on = ['building_id', 'meter', 'timestamp'], how = "left")
leak_df = leak_df.merge(building_meta_df[['building_id', 'site_id']], on='building_id', how='left')

In [8]:
leak_df['pred1_l1p'] = np.log1p(leak_df.pred1)
leak_df['pred2_l1p'] = np.log1p(leak_df.pred2)
leak_df['pred3_l1p'] = np.log1p(leak_df.pred3)
leak_df['pred4_l1p'] = np.log1p(leak_df.pred4)
leak_df['meter_reading_l1p'] = np.log1p(leak_df.meter_reading)

In [9]:
leak_df[['pred1_l1p', 'pred2_l1p', 'pred3_l1p', 'pred4_l1p',
         'meter_reading_l1p']].corr()

Unnamed: 0,pred1_l1p,pred2_l1p,pred3_l1p,pred4_l1p,meter_reading_l1p
pred1_l1p,1.0,0.986133,0.988106,0.978432,0.862842
pred2_l1p,0.986133,1.0,0.977993,0.968697,0.856097
pred3_l1p,0.988106,0.977993,1.0,0.966693,0.85957
pred4_l1p,0.978432,0.968697,0.966693,1.0,0.867602
meter_reading_l1p,0.862842,0.856097,0.85957,0.867602,1.0


In [10]:
import itertools

#all_combinations = list(np.linspace(0.1,0.9,17))
all_combinations = list(np.linspace(0,1,21))
l = [all_combinations, all_combinations, all_combinations, all_combinations]
# remember to do the reverse!
all_l = list(itertools.product(*l)) + list(itertools.product(*reversed(l)))

filtered_combis = [l for l in all_l if l[0] + l[1] + l[2] + l[3] > 0.93 and 
                   l[0] + l[1] + l[2] + l[3] < 1.03]

In [11]:
%%time
final_combi = []
for m in [0,1,2,3]:
    temp_df = leak_df[leak_df.meter==m]
    best_combi = [] # of the form (i, score)
    for i, combi in enumerate(filtered_combis):
        #print("Now at: " + str(i) + " out of " + str(len(filtered_combis))) # uncomment to view iterations
        score1 = combi[0]
        score2 = combi[1]
        score3 = combi[2]
        score4 = combi[3]
        #score5 = combi[4]
        v = score1 * temp_df['pred1'].values + \
            score2 * temp_df['pred2'].values + \
            score3 * temp_df['pred3'].values + \
            score4 * temp_df['pred4'].values
        vl1p = np.log1p(v)
        curr_score = np.sqrt(mean_squared_error(vl1p, temp_df.meter_reading_l1p))

        if best_combi:
            prev_score = best_combi[0][1]
            if curr_score < prev_score:
                best_combi[:] = []
                best_combi += [(i, curr_score)]
        else:
            best_combi += [(i, curr_score)]

    score = best_combi[0][1]
    final_combi.append(filtered_combis[best_combi[0][0]])
    print("Meter", m, ":", score)
    print("Weights", final_combi[-1])

Meter 0 : 0.68942237
Weights (0.0, 0.05, 0.2, 0.7000000000000001)
Meter 1 : 1.388696
Weights (0.0, 0.0, 0.45, 0.5)
Meter 2 : 0.9412642
Weights (0.0, 0.15000000000000002, 0.30000000000000004, 0.55)
Meter 3 : 1.0830786
Weights (0.0, 0.1, 0.8, 0.05)
CPU times: user 33min 8s, sys: 59.1 s, total: 34min 7s
Wall time: 34min 7s


In [12]:
sample_submission = pd.read_feather(os.path.join(root, 'sample_submission.feather'))

list_res = []
test_df['meter_reading'] = 0
for i in range(4):
    w1, w2, w3, w4 = final_combi[i]
    test_df.loc[test_df.meter==i, 'meter_reading'] = w1 * test_df.loc[test_df.meter==i, 'pred1'] + \
                                                     w2 * test_df.loc[test_df.meter==i, 'pred2'] + \
                                                     w3 * test_df.loc[test_df.meter==i, 'pred3'] + \
                                                     w4 * test_df.loc[test_df.meter==i, 'pred4']

sample_submission['meter_reading'] = test_df['meter_reading']
sample_submission.loc[sample_submission.meter_reading < 0, 'meter_reading'] = 0

In [13]:
leak_df = leak_df[['meter_reading', 'row_id']].set_index('row_id').dropna()
sample_submission.loc[leak_df.index, 'meter_reading'] = leak_df['meter_reading']

In [14]:
%%time
sample_submission.to_csv('submission_with_leak_by_site.csv', index=False, float_format='%.4f')

CPU times: user 4min 31s, sys: 2.02 s, total: 4min 33s
Wall time: 4min 34s
