In [7]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

In [8]:
def get_date_range(start_date, end_date):
    '''
    Return the list of ISO-formatted dates from "start_date" to "end_date", both inclusive.

    Parameters:
        start_date: first day of the range, e.g. a datetime.date.
        end_date: last day of the range, same type as "start_date".

    Returns:
        A list of strings in chronological order, one per day, each produced by
        isoformat(). The list is empty when "end_date" is earlier than "start_date".

    Example:

    start_date = datetime.date(2019, 9 , 30)
    end_date = datetime.date(2019, 10, 7)
    get_date_range(start_date, end_date)
    '''
    number_of_days = (end_date - start_date).days
    # +1 so that end_date itself is part of the result.
    return [
        (start_date + datetime.timedelta(days=day)).isoformat()
        for day in range(number_of_days + 1)
    ]

def compute_error(pred, real):
    '''
    Compute the competition error between a prediction dataframe and the real values.

    Both dataframes must have the same format: each of the first seven columns
    corresponds to a date and each row corresponds to a counter index, so position
    i,j holds the DELTA of counter i on date j. The last two columns do not refer to
    a daily prediction but to the aggregated values of week_1 and week_2.

    Columns are addressed by position, so the column labels of "pred" and "real" do
    not need to match. Rows are combined with pandas Series arithmetic, which aligns
    on the index labels.

    Parameters:
        pred: dataframe with the predicted values (at least 9 columns).
        real: dataframe with the real values, same layout as "pred".

    Returns:
        The average of two terms:
          * the mean over the seven daily columns of the per-day RMSE across counters;
          * the mean over the two weekly columns of
            sqrt((sum(real) - sum(pred))**2 / number_of_rows).

    Examples:

    import pandas as pd
    import copy

    test = pd.read_pickle('../data/test.pkl')

    compute_error(test, test)

    test_v3 = copy.copy(test)
    test_v3.iloc[:,0] = test_v3.iloc[:,1]
    compute_error(test_v3, test)
    '''
    n_days = 7            # daily columns occupy positions 0..6
    weekly_columns = (7, 8)  # aggregated week_1 and week_2 columns

    # Per-day RMSE over all counters, then averaged over the days.
    daily_rmses = [
        (((real.iloc[:, day] - pred.iloc[:, day]) ** 2 / len(real.iloc[:, day])).sum()) ** (1 / 2)
        for day in range(n_days)
    ]
    rmse_1 = sum(daily_rmses) / n_days

    # NOTE(review): the weekly term compares the column totals over all counters
    # (errors of opposite sign cancel) rather than per-counter errors - confirm this
    # matches the competition rules.
    weekly_rmses = []
    for column in weekly_columns:
        total_difference = real.iloc[:, column].sum() - pred.iloc[:, column].sum()
        weekly_rmses.append(((total_difference ** 2) / len(real.iloc[:, column])) ** (1 / 2))
    rmse_2 = sum(weekly_rmses) / len(weekly_columns)

    return (rmse_1 + rmse_2) / 2

In [13]:
# Show the last rows of the prediction frame `pp`.
# NOTE(review): this cell was executed as In [13] but sits above the In [9] cell,
# which holds the first visible assignment of `pp`; run top to bottom on a fresh
# kernel it would raise NameError. It is the same call as the In [16] cell below.
pp.tail()

Unnamed: 0_level_0,1,2,3,4,5,6,7,Suma1,Suma2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2746,583.34973,584.182491,583.766111,584.598872,585.848014,582.93335,582.93335,4087.611918,4082.61535
2747,1501.06616,673.303716,680.775611,818.92213,422.527712,1276.719838,1075.502737,6448.817905,6043.996129
2748,525.725023,529.202746,540.523116,541.564011,654.51184,438.365268,487.14794,3717.039945,3295.713543
2749,116.78788,116.78788,116.78788,116.78788,116.78788,116.78788,116.78788,817.51516,817.51516
2756,1501.06616,673.303716,680.775611,818.92213,422.527712,1276.719838,1075.502737,6448.817905,6043.996129


In [9]:
# Load the prediction frame that is later scored with compute_error(pp, test)
# and preview its first rows.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling -
# only load files from a trusted source.
pp = pd.read_pickle('../data/prueba.pkl')
pp.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,Suma1,Suma2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,268.59,297.58,322.48,334.6,262.09,260.89,289.05,2035.28,1987.09
1,50.047401,50.5164,52.441,44.098,15.3876,14.8233,9.9253,237.239002,185.221204
2,26.5,5.88,4.76,7.46,25.27,29.07,27.04,125.98,130.73
3,433.63,439.44,442.08,425.31,432.1598,457.47,447.76,3077.8498,3188.9698
4,272.9895,237.075399,226.3041,238.834399,280.4634,316.077299,341.1091,1912.853199,1901.278501


In [16]:
# Show the last rows of the prediction frame `pp`.
pp.tail()

Unnamed: 0_level_0,1,2,3,4,5,6,7,Suma1,Suma2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2746,583.34973,584.182491,583.766111,584.598872,585.848014,582.93335,582.93335,4087.611918,4082.61535
2747,1501.06616,673.303716,680.775611,818.92213,422.527712,1276.719838,1075.502737,6448.817905,6043.996129
2748,525.725023,529.202746,540.523116,541.564011,654.51184,438.365268,487.14794,3717.039945,3295.713543
2749,116.78788,116.78788,116.78788,116.78788,116.78788,116.78788,116.78788,817.51516,817.51516
2756,1501.06616,673.303716,680.775611,818.92213,422.527712,1276.719838,1075.502737,6448.817905,6043.996129


In [17]:
# Show the last rows of the `test` frame.
# NOTE(review): this cell was executed as In [17] but sits above the In [11] cell,
# which holds the first visible assignment of `test`; run top to bottom on a fresh
# kernel it would raise NameError.
test.tail()

Unnamed: 0_level_0,2020-01-18,2020-01-19,2020-01-20,2020-01-21,2020-01-22,2020-01-23,2020-01-24,first_week,second_week
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2746,582.933333,582.933333,582.933333,582.933333,582.933333,582.933333,582.933333,4080.533333,4080.533333
2747,438.500082,363.889917,444.643897,390.535651,428.61882,409.824931,408.652628,2884.665926,2812.730452
2748,435.831081,435.831081,435.831081,435.831081,435.831081,435.831081,435.831081,3050.817568,3050.817568
2749,116.787879,116.787879,116.787879,116.787879,116.787879,116.787879,116.787879,817.515152,817.515152
2756,438.500082,363.889917,444.643897,390.535651,428.61882,409.824931,408.652628,2884.665926,2812.730452


In [10]:
# Inspect the column labels of `pp`. compute_error addresses columns by position
# (iloc), so these labels do not have to match the labels used in `test`.
pp.columns

Index([1, 2, 3, 4, 5, 6, 7, 'Suma1', 'Suma2'], dtype='object')

In [11]:
# Load the frame used as the real values and score the predictions in `pp`
# against it (compute_error takes the prediction first, the real values second).
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling -
# only load files from a trusted source.
# NOTE(review): the 'Extra trees' label presumably names the model that produced
# prueba.pkl - confirm.
test = pd.read_pickle('../data/test.pkl')
error = compute_error(pp, test)
print('Extra trees:', round(error,2))

Extra trees: 157155.46
