In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from matplotlib import dates as md
import seaborn as sns
import plotly.graph_objs as go
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.set_config_file(offline=True)

from sklearn.metrics import r2_score

import lightgbm as lgb

from tqdm import tqdm

import os
# Print the full path of every file found under the Kaggle input folder.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Load dataset

In [None]:
# Kaggle input folder of the 'ashrae-energy-prediction' dataset; the CSV files
# loaded below (train.csv, building_metadata.csv) are read from this folder.
path_GEPIII = '/kaggle/input/ashrae-energy-prediction'

In [None]:
# Meter readings (train.csv) and building metadata (building_metadata.csv),
# loaded in that order from the dataset folder.
df_power_meter, df_meta = [
    pd.read_csv(os.path.join(path_GEPIII, csv_name))
    for csv_name in ('train.csv', 'building_metadata.csv')
]

In [None]:
# One metadata row per (building, meter) pair that occurs in the readings,
# keyed by 'merged_id' = '<building_id>_<meter>'.
meter_pairs = df_power_meter[['building_id', 'meter']].drop_duplicates()
df_meta = (
    df_meta
    .merge(meter_pairs, on='building_id')
    .assign(
        merged_id=lambda meta: meta['building_id'].astype('str') + '_' + meta['meter'].astype('str')
    )
)
df_meta

In [None]:
# Reshape the readings to wide format: one row per timestamp, one column per
# (building_id, meter) pair.
df_power_meter = df_power_meter.pivot_table(
    values='meter_reading',
    index='timestamp',
    columns=['building_id', 'meter'],
)
df_power_meter.index = pd.to_datetime(df_power_meter.index)

# Flatten the two column levels into '<building_id>_<meter>' labels, the same
# format as df_meta['merged_id'].
building_labels = df_power_meter.columns.get_level_values(0).astype('str')
meter_labels = df_power_meter.columns.get_level_values(1).astype('str')
df_power_meter.columns = building_labels + '_' + meter_labels
df_power_meter

In [None]:
# Number of non-missing readings per meter column, attached to the metadata
# as a 'count' column.
readings_per_meter = df_power_meter.count()
df_count = readings_per_meter.reset_index().rename(columns={'index': 'merged_id', 0: 'count'})
df_meta = pd.merge(df_meta, df_count, on='merged_id')
df_meta

In [None]:
# Split the meters by site into three disjoint sets:
#   train = sites 0-9, validation = sites 10-12, test = sites 13 and above (13-15).
# The test set is selected by site explicitly. Previously it was "every
# qualifying meter that is not in train", which silently included all of the
# validation meters (sites 10-12) in the test set as well.
# NOTE(review): the original note quoted 64% / 36% of meters for train / test;
# those shares are not re-derived here - confirm them against the new test set.
HOURS_IN_2016 = 8784   # 366 * 24; assumes one reading per hour - TODO confirm
MIN_COVERAGE = 0.9     # keep meters with more than 90% non-missing readings

df_power_meter = df_power_meter.loc['2016']

site_id = df_meta['site_id']
has_enough_readings = df_meta['count'] > HOURS_IN_2016 * MIN_COVERAGE

train_ids = df_meta.loc[has_enough_readings & (site_id < 10), 'merged_id']
valid_ids = df_meta.loc[has_enough_readings & (site_id >= 10) & (site_id < 13), 'merged_id']
test_ids = df_meta.loc[has_enough_readings & (site_id >= 13), 'merged_id']

train_data = df_power_meter.loc[:, train_ids].copy()
valid_data = df_power_meter.loc[:, valid_ids].copy()
test_data = df_power_meter.loc[:, test_ids].copy()

In [None]:
# Normalize meter readings for each meter
def normalize(df):
    """Standardise every column of ``df`` to zero mean and unit standard deviation.

    The input frame is left untouched and a new frame is returned, so the cell
    that calls this function can be re-run without normalising data that has
    already been normalised (which would overwrite the saved mean/std with
    values close to 0 and 1).

    Parameters
    ----------
    df : pandas.DataFrame
        One column per meter.

    Returns
    -------
    tuple
        ``(normalized, mean, std)`` where ``normalized`` has the same shape as
        ``df`` and ``mean`` / ``std`` are the per-column statistics that were
        removed. A column whose standard deviation is 0 ends up as NaN, because
        it is divided by zero.
    """
    mean = df.mean()
    centered = df - mean
    # The standard deviation is taken on the centred values.
    std = centered.std()
    return centered / std, mean, std

# Normalise each split with its own per-meter statistics, in the order
# train, validation, test; the mean and std of every split are kept.
(
    (train_value, train_mean, train_std),
    (valid_value, valid_mean, valid_std),
    (test_value, test_mean, test_std),
) = [normalize(split) for split in (train_data, valid_data, test_data)]

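# Add noise
For each offset of ±1, ±2 and ±3 standard deviations (six offsets in total), a random 0.1% of each meter's not-yet-modified readings is shifted by that offset.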

In [None]:
# Add noise to train data.
# A seeded generator is used so that a fresh "Restart & Run All" picks the same
# noisy rows every time.
noise_rng = np.random.RandomState(42)
train_value_noisy = train_value.copy()

for meter_name in tqdm(train_value.columns):
    df_noisy_data = train_value_noisy[[meter_name]].copy()
    # 'noise' flags rows that already received an offset (1) so that no row is
    # shifted twice.
    df_noisy_data['noise'] = 0

    # Add noise (+-1,2,3 std): each offset is applied to a random 0.1% of the
    # rows that have not been modified yet.
    std = df_noisy_data[meter_name].std()
    for multiplier_std in [-3, -2, -1, 1, 2, 3]:
        untouched_rows = df_noisy_data[df_noisy_data['noise'] == 0]
        random_hours = untouched_rows.sample(frac=0.001, random_state=noise_rng).index
        df_noisy_data.loc[random_hours, meter_name] += multiplier_std * std
        df_noisy_data.loc[random_hours, 'noise'] = 1

    # Fill missing readings forward, then backward for any leading gap.
    # Series.ffill()/bfill() replace the deprecated fillna(method=...) form.
    train_value_noisy[meter_name] = df_noisy_data[meter_name].ffill().bfill()

In [None]:
# Add noise to validation data.
# A seeded generator is used so that a fresh "Restart & Run All" picks the same
# noisy rows every time.
noise_rng = np.random.RandomState(43)
valid_value_noisy = valid_value.copy()

for meter_name in tqdm(valid_value.columns):
    df_noisy_data = valid_value_noisy[[meter_name]].copy()
    # 'noise' flags rows that already received an offset (1) so that no row is
    # shifted twice.
    df_noisy_data['noise'] = 0

    # Add noise (+-1,2,3 std): each offset is applied to a random 0.1% of the
    # rows that have not been modified yet.
    std = df_noisy_data[meter_name].std()
    for multiplier_std in [-3, -2, -1, 1, 2, 3]:
        untouched_rows = df_noisy_data[df_noisy_data['noise'] == 0]
        random_hours = untouched_rows.sample(frac=0.001, random_state=noise_rng).index
        df_noisy_data.loc[random_hours, meter_name] += multiplier_std * std
        df_noisy_data.loc[random_hours, 'noise'] = 1

    # Fill missing readings forward, then backward for any leading gap.
    # Series.ffill()/bfill() replace the deprecated fillna(method=...) form.
    valid_value_noisy[meter_name] = df_noisy_data[meter_name].ffill().bfill()

In [None]:
# Add noise to test data.
# A seeded generator is used so that a fresh "Restart & Run All" picks the same
# noisy rows every time.
noise_rng = np.random.RandomState(44)
test_value_noisy = test_value.copy()

for meter_name in tqdm(test_value.columns):
    df_noisy_data = test_value_noisy[[meter_name]].copy()
    # 'noise' flags rows that already received an offset (1) so that no row is
    # shifted twice.
    df_noisy_data['noise'] = 0

    # Add noise (+-1,2,3 std): each offset is applied to a random 0.1% of the
    # rows that have not been modified yet.
    std = df_noisy_data[meter_name].std()
    for multiplier_std in [-3, -2, -1, 1, 2, 3]:
        untouched_rows = df_noisy_data[df_noisy_data['noise'] == 0]
        random_hours = untouched_rows.sample(frac=0.001, random_state=noise_rng).index
        df_noisy_data.loc[random_hours, meter_name] += multiplier_std * std
        df_noisy_data.loc[random_hours, 'noise'] = 1

    # Fill missing readings forward, then backward for any leading gap.
    # Series.ffill()/bfill() replace the deprecated fillna(method=...) form.
    test_value_noisy[meter_name] = df_noisy_data[meter_name].ffill().bfill()

In [None]:
# Plot 10 train meters (picked with a fixed seed) before and after noise was added.
for meter_name in train_value.sample(n=10, axis=1, random_state=42).columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 3))

    # Both panels share y-limits taken from the noisy series, so the added
    # offsets are directly comparable with the data before noise.
    ymin = train_value_noisy[meter_name].min() * 1.05
    ymax = train_value_noisy[meter_name].max() * 1.05

    # Gaps are filled forward then backward for display only;
    # Series.ffill()/bfill() replace the deprecated fillna(method=...) form.
    before_noise = train_value[meter_name].ffill().bfill()
    after_noise = train_value_noisy[meter_name].ffill().bfill()

    before_noise.plot(title=meter_name+' (raw data)', ylim=(ymin, ymax),
                      ax=axes[0], color='blue')
    after_noise.plot(title=meter_name+' (add noise)', ylim=(ymin, ymax),
                     ax=axes[1], color='orange')
    plt.show()