# Part I

There exists some constant value $x$ that, when predicted for every row, minimizes the RMSLE metric on the training set. 

Part 1 - Find the value that minimizes training RMSLE <br>
Part 2 - If the value that minimizes train RMSLE also translates to LB, we will go deeper and see if there is anything else we can discover

In [None]:
%matplotlib inline
import feather
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numba import jit
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training set from the feather file shipped with the input dataset.
TRAIN_FEATHER_PATH = '../input/ashrae-feather/train.ft'
train_df = feather.read_dataframe(TRAIN_FEATHER_PATH)

RMSE is about 2x faster to calculate than RMSLE (the log increases calculation time); however, we'll stick to RMSLE for our experiments.

In order to achieve optimal speed when calculating the RMSLE, I made sure both the predictions and the ground truths are float32. NumPy casts both arrays to a common dtype before calculating, and if you don't specify float32 for the constant prediction array, it will default to int64 and thus both arrays will end up being cast to float64. Such a tiny detail can increase calculation time by 2x.

In [None]:
# loss functions

def rmse(ytrue, ypred):
    """Root Mean Squared Error between ground truth and predictions.

    Parameters
    ----------
    ytrue : array-like supporting elementwise arithmetic
        Ground-truth values.
    ypred : array-like supporting elementwise arithmetic
        Predicted values; must broadcast against ``ytrue``.

    Returns
    -------
    Square root of the mean squared difference, reduced along axis 0
    (a scalar for 1-D inputs, one value per column for 2-D inputs).
    """
    residual = ypred - ytrue
    mean_squared = np.mean(np.square(residual), axis=0)
    return np.sqrt(mean_squared)

def rmsle(ytrue, ypred):
    """Root Mean Squared Log Error between ground truth and predictions.

    Both inputs are passed through ``log1p`` (i.e. ``log(1 + x)``) before the
    squared difference is averaged along axis 0 and square-rooted.

    Parameters
    ----------
    ytrue : array-like supporting elementwise arithmetic
        Ground-truth values.
    ypred : array-like supporting elementwise arithmetic
        Predicted values; must broadcast against ``ytrue``.

    Returns
    -------
    The RMSLE, reduced along axis 0.
    """
    log_pred = np.log1p(ypred)
    log_true = np.log1p(ytrue)
    squared_log_error = np.square(log_pred - log_true)
    return np.sqrt(np.mean(squared_log_error, axis=0))

def minimize(i):
    """Score the constant prediction ``i`` against the whole training set.

    Builds a float32 array of length ``len(train_df)`` filled with ``i`` and
    returns its RMSLE against ``train_df['meter_reading']``.
    """
    targets = train_df['meter_reading'].values
    constant_pred = np.full(len(train_df), i, dtype=np.float32)
    return rmsle(targets, constant_pred)

In [None]:
# Arithmetic mean of the raw target over the whole training set.
mean_reading = np.mean(train_df['meter_reading'].values)
print(f"Mean Meter Reading: {mean_reading:.2f}")

In [None]:
%%time
# RMSLE of every integer constant prediction in [0, 2117); m[x] is the score for x.
m = [minimize(candidate) for candidate in range(2117)]

In [None]:
# RMSLE as a function of the constant prediction x (log-scaled x axis).
fig, ax = plt.subplots()
ax.set_xscale('log')
ax.plot(np.arange(2117), m)
ax.set_title('Finding the minimum $x$')
ax.set_xlabel('$x$')
ax.set_ylabel('RMSLE')
plt.show()

In [None]:
# The list index equals the candidate value because the sweep started at 0 with step 1.
best_rmsle = np.min(m)
best_x = np.argmin(m)
print(f'Min RMSLE of {best_rmsle:.2f} is found at x={best_x}')

# Part II

Now that we saw that the min value in pt. 1 translated over to LB nicely, we will look at the 'meter' variable

Now let $x$ represent the variable 'meter' and $x_i$ represent the values that minimize each meter [0 - 3]

There exists some minimizing value for each of $x_0$, $x_1$, $x_2$, $x_3$ such that predicting min($x_0$), min($x_1$), min($x_2$) and min($x_3$) for their respective meters achieves the lowest overall RMSLE

Using the same technique above, let's see if we can find the minimum values of $x_i$

In [None]:
def min_x(i, y_true):
    """Return the RMSLE of predicting the constant ``i`` for every entry of ``y_true``.

    The prediction array is float32 and has ``len(y_true)`` elements.
    """
    constant_pred = np.full(len(y_true), i, dtype=np.float32)
    return rmsle(y_true, constant_pred)

In [None]:
# Mean meter reading per meter type. Uses the built-in groupby mean rather
# than passing np.mean to .agg, which pandas has deprecated (FutureWarning).
train_df.groupby('meter')['meter_reading'].mean()

In [None]:
%%time
# Coarse grid search (step 10) for the constant prediction that minimises
# RMSLE separately for each meter type.
#
# The readings of each meter are extracted ONCE up front: filtering the full
# frame with a boolean mask is loop-invariant, and redoing it on every
# iteration dominated the runtime of this cell.
readings_by_meter = {
    meter: train_df.loc[train_df['meter'] == meter, 'meter_reading'].values
    for meter in range(4)
}

m0, m1, m2, m3 = [], [], [], []
for i in tqdm(range(0, 1000, 10)):
    m0.append(min_x(i, readings_by_meter[0]))

for i in tqdm(range(0, 2000, 10)):
    m1.append(min_x(i, readings_by_meter[1]))

# Meter 2 is searched over a 10x wider range than the other meters.
for i in tqdm(range(0, 20000, 10)):
    m2.append(min_x(i, readings_by_meter[2]))

for i in tqdm(range(0, 2000, 10)):
    m3.append(min_x(i, readings_by_meter[3]))

In [None]:
# One panel per meter type: RMSLE of the coarse sweep against the candidate
# constant, on a log-scaled x axis. Each entry pairs the candidate grid used
# in the sweep with the scores it produced.
panels = [
    (np.arange(0, 1000, 10), m0),
    (np.arange(0, 2000, 10), m1),
    (np.arange(0, 20000, 10), m2),
    (np.arange(0, 2000, 10), m3),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 12))
for meter, (ax, (grid, scores)) in enumerate(zip(axes.flat, panels)):
    ax.set_title(f'Finding the minimum $x_{meter}$')
    ax.set_ylabel('RMSLE')
    ax.set_xlabel(f'$x_{meter}$')
    ax.set_xscale('log')
    ax.plot(grid, scores)

plt.show()

In [None]:
# The coarse sweeps used a step of 10 starting at 0, so list index * 10
# recovers the candidate value.
for meter, scores in enumerate((m0, m1, m2, m3)):
    print(f'x{meter}: Min RMSLE of {np.min(scores):.2f} is found at x={np.argmin(scores)*10}')

In [None]:
%%time
# Do this one more time, with step 1 over a 20-wide window per meter, to get
# closer to the minimum for each x.
#
# The readings of each meter are extracted ONCE up front: filtering the full
# frame with a boolean mask is loop-invariant, so there is no reason to redo
# it on every iteration.
readings_by_meter = {
    meter: train_df.loc[train_df['meter'] == meter, 'meter_reading'].values
    for meter in range(4)
}

m0, m1, m2, m3 = [], [], [], []
for i in tqdm(range(40, 60)):
    m0.append(min_x(i, readings_by_meter[0]))

for i in tqdm(range(60, 80)):
    m1.append(min_x(i, readings_by_meter[1]))

for i in tqdm(range(160, 180)):
    m2.append(min_x(i, readings_by_meter[2]))

for i in tqdm(range(10, 30)):
    m3.append(min_x(i, readings_by_meter[3]))

In [None]:
# Each fine sweep used step 1, so list index + window start recovers the
# candidate value. The starts must match the ranges used in the fine sweep.
fine_sweeps = ((m0, 40), (m1, 60), (m2, 160), (m3, 10))
for meter, (scores, start) in enumerate(fine_sweeps):
    print(f'x{meter}: Min RMSLE of {np.min(scores):.3f} is found at x{meter}={np.argmin(scores)+start}')

In [None]:
# Assemble the per-meter constant predictions (p) and the matching ground
# truth (g), in the same meter order, so one overall RMSLE can be computed.

# Ground truth per meter; each subset is filtered once and reused for sizing.
g0 = train_df.loc[train_df['meter'] == 0, 'meter_reading'].values
g1 = train_df.loc[train_df['meter'] == 1, 'meter_reading'].values
g2 = train_df.loc[train_df['meter'] == 2, 'meter_reading'].values
g3 = train_df.loc[train_df['meter'] == 3, 'meter_reading'].values

# Constant prediction per meter. float32 matches what minimize()/min_x() use
# during the search; without an explicit dtype np.full would create an
# integer array here.
p0 = np.full(len(g0), 52, dtype=np.float32)
p1 = np.full(len(g1), 69, dtype=np.float32)
p2 = np.full(len(g2), 167, dtype=np.float32)
p3 = np.full(len(g3), 27, dtype=np.float32)

# Concatenate in NumPy instead of round-tripping every element through a
# Python list; p and g are now ndarrays, which np.array() accepts unchanged.
p = np.concatenate([p0, p1, p2, p3])
g = np.concatenate([g0, g1, g2, g3])

In [None]:
# Overall RMSLE when each meter type is predicted with its own constant.
truth_all = np.array(g)
pred_all = np.array(p)
total_rmsle = rmsle(truth_all, pred_all)
print(f"Total RMSLE: {total_rmsle:.3f}")

Slight decrease in RMSLE over pt. 1

# Submission File

In [None]:
%%time
# Read the sample submission with compact dtypes, then overwrite every
# prediction with the single constant 62, stored as int8.
submission_dtypes = {'row_id': 'int32', 'meter_reading': 'int8'}
sub = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv', dtype=submission_dtypes)
sub['meter_reading'] = np.full(len(sub), 62, dtype='int8')

In [None]:
%%time
# Write the submission without the DataFrame index column.
SUBMISSION_PATH = 'submission.csv'
sub.to_csv(SUBMISSION_PATH, index=False)