In [1]:
import pandas as pd
import datetime
from catboost import Pool, CatBoostRegressor

In [2]:
'''
given a start date in datetime format "start_date" and an "end_date" returns a list of strings with the dates from
"start_date" to "end_date".

Example:

start_date = datetime.date(2019, 9 , 30)
end_date = datetime.date(2019, 10, 7)
get_date_range(start_date, end_date)
'''
def get_date_range(start_date, end_date):
    number_of_days = (end_date-start_date).days
    return [(start_date + datetime.timedelta(days = day)).isoformat() for day in range(number_of_days+1)]

'''
This function expects two dataframes with the same format: for the first seven columns, each column corresponds to a date 
and each row corresponds to a counter index. In position i,j there should be DELTA of counter i in date j. 
For the last two columns of the dataframes they should not reffer to a daily prediction but to the aggregated prediction 
of week_1 and week_2. Given these two dataframes (one for theprediction and one for the real values), 
the function returns de error according to the competition rules.

Examples:

import pandas as pd
import copy

test = pd.read_pickle('../data/test.pkl')

compute_error(test, test)

test_v3 = copy.copy(test)
test_v3.iloc[:,0] = test_v3.iloc[:,1]
compute_error(test_v3, test)

'''
def compute_error(pred, real):
    daily_rmses = []
    for i in range(7):
        daily_rmses.append((((real.iloc[:,i] - pred.iloc[:,i])**2/len(real.iloc[:,i])).sum())**(1/2))
    rmse_1 = sum(daily_rmses)/7
    
    first_week_pred_sum = pred.iloc[:,7].sum()
    second_week_pred_sum = pred.iloc[:,8].sum()
    first_week_real_sum = real.iloc[:,7].sum()
    second_week_real_sum = real.iloc[:,8].sum()
    
    first_week_rmse = (((first_week_real_sum - first_week_pred_sum)**2)/len(real.iloc[:,7]))**(1/2)
    second_week_rmse = (((second_week_real_sum - second_week_pred_sum)**2)/len(real.iloc[:,8]))**(1/2)
    rmse_2 = (first_week_rmse + second_week_rmse)/2
    
    return (rmse_1 + rmse_2)/2

In [15]:
path = '../data/df6.pkl'
df = pd.read_pickle(path)
df.head(2)


Unnamed: 0,ID,DELTA,YEAR_DAY,DATE,MEAN_CONSUMPTION,VARIANCE_CONSUMPTION,WEEKDAY,IS_WEEKEND,sin_WEEKDAY,cos_WEEKDAY,sin_year_day,cos_year_day,PRECIPITATIONS,MIN_TEMP,MEAN_TEMP,MAX_TEMP,SUN,IS_GOOD
0,0,243.0,0,2019-02-01,282.893151,11453.639651,4,0,-0.433884,-0.900969,0.0,1.0,0.0,16.5,17.6,18.7,7.1,1
1,0,236.0,1,2019-02-02,282.893151,11453.639651,5,1,-0.974928,-0.222521,0.017213,0.999852,0.0,9.8,13.4,17.1,6.1,1


In [16]:
start_date = datetime.date(2019, 2 , 1)
end_date = datetime.date(2020, 1, 17)
train = df[df['DATE'].isin(get_date_range(start_date, end_date))]
train = train[train['IS_GOOD']==1]
train.drop(['sin_year_day','cos_year_day','sin_WEEKDAY','cos_WEEKDAY','IS_GOOD','DATE'], axis=1, inplace=True)
train['SUN'] = train['SUN'].fillna(train['SUN'].mean())
train['PRECIPITATIONS'] = train['PRECIPITATIONS'].fillna(train['PRECIPITATIONS'].mean())

start_date = datetime.date(2020, 1 , 18)
end_date = datetime.date(2020, 1, 31)
test = df[df['DATE'].isin(get_date_range(start_date, end_date))]
test = test[test['IS_GOOD']==1]
test.drop(['sin_year_day','cos_year_day','sin_WEEKDAY','cos_WEEKDAY','IS_GOOD'], axis=1, inplace=True)
test['SUN'] = test['SUN'].fillna(test['SUN'].mean())
test['PRECIPITATIONS'] = test['PRECIPITATIONS'].fillna(test['PRECIPITATIONS'].mean())

print('Train:', train.shape, 'Test:', test.shape)

X_train = train.drop(['DELTA'], axis=1)
y_train = train['DELTA']

X_test = test.drop(['DELTA', 'DATE'], axis=1)

Train: (931203, 12) Test: (37142, 13)


In [17]:
X_train

Unnamed: 0,ID,YEAR_DAY,MEAN_CONSUMPTION,VARIANCE_CONSUMPTION,WEEKDAY,IS_WEEKEND,PRECIPITATIONS,MIN_TEMP,MEAN_TEMP,MAX_TEMP,SUN
0,0,0,282.893151,11453.639651,4,0,0.0,16.5,17.6,18.7,7.1
1,0,1,282.893151,11453.639651,5,1,0.0,9.8,13.4,17.1,6.1
2,0,2,282.893151,11453.639651,6,1,0.0,7.7,10.6,13.6,9.3
3,0,3,282.893151,11453.639651,0,0,0.0,4.1,10.6,17.2,9.3
4,0,4,282.893151,11453.639651,1,0,0.0,7.5,14.6,21.6,9.2
...,...,...,...,...,...,...,...,...,...,...,...
990226,2712,346,57.015873,388.230751,0,0,0.0,2.6,9.9,17.2,6.8
990227,2712,347,57.015873,388.230751,1,0,0.0,2.6,9.8,16.9,8.7
990228,2712,348,57.015873,388.230751,2,0,0.0,3.4,10.8,18.1,7.2
990229,2712,349,57.015873,388.230751,3,0,0.0,5.2,12.0,18.8,8.7


In [18]:
X_train.nunique()

ID                      2653
YEAR_DAY                 351
MEAN_CONSUMPTION        2586
VARIANCE_CONSUMPTION    2595
WEEKDAY                    7
IS_WEEKEND                 2
PRECIPITATIONS            38
MIN_TEMP                 173
MEAN_TEMP                151
MAX_TEMP                 160
SUN                       95
dtype: int64

In [19]:
train_1 = Pool(X_train,
                y_train,
                cat_features=[0, 1, 4, 5]) #ID, YEAR_DAY, WEEKDAY, IS_WEEKEND

In [20]:
test_1 = Pool(X_test,
                cat_features=[0, 1, 4, 5])

In [21]:
model = CatBoostRegressor(iterations=500, 
                          depth=6, 
                          learning_rate=0.3
                          )

In [22]:
model.fit(train_1)

0:	learn: 69035.6521546	total: 1.22s	remaining: 10m 9s
1:	learn: 68163.3441592	total: 2.07s	remaining: 8m 34s
2:	learn: 67436.4101406	total: 2.86s	remaining: 7m 53s
3:	learn: 67423.8565681	total: 3.45s	remaining: 7m 8s
4:	learn: 67415.5559147	total: 4.04s	remaining: 6m 39s
5:	learn: 66965.3968031	total: 4.76s	remaining: 6m 32s
6:	learn: 66948.4599822	total: 5.43s	remaining: 6m 22s
7:	learn: 66705.1092570	total: 5.95s	remaining: 6m 5s
8:	learn: 65625.0376756	total: 6.48s	remaining: 5m 53s
9:	learn: 65622.3755788	total: 6.85s	remaining: 5m 35s
10:	learn: 65416.7428283	total: 7.43s	remaining: 5m 30s
11:	learn: 65380.3942961	total: 8.36s	remaining: 5m 39s
12:	learn: 65067.4492710	total: 8.84s	remaining: 5m 31s
13:	learn: 65066.8323677	total: 9.04s	remaining: 5m 14s
14:	learn: 64668.4200989	total: 9.61s	remaining: 5m 10s
15:	learn: 64368.0549808	total: 10.1s	remaining: 5m 5s
16:	learn: 64355.3252370	total: 10.6s	remaining: 5m 1s
17:	learn: 64351.7318509	total: 11.2s	remaining: 4m 59s
18:	le

<catboost.core.CatBoostRegressor at 0x186668ba710>

In [23]:
y_pred = model.predict(test_1)
print(y_pred)

[  64.64537311  462.36114063  317.37827901 ... -594.93863412 -287.44411011
 -227.37823416]


In [26]:
y_pred[y_pred < 0] = 0

In [31]:
len(y_pred)

37142

In [35]:
len(X_test)

37142

In [39]:
test['predicciones'] = y_pred

In [40]:
test.head(5)

Unnamed: 0,ID,DELTA,YEAR_DAY,DATE,MEAN_CONSUMPTION,VARIANCE_CONSUMPTION,WEEKDAY,IS_WEEKEND,PRECIPITATIONS,MIN_TEMP,MEAN_TEMP,MAX_TEMP,SUN,predicciones
351,0,421.0,351,2020-01-18,282.893151,11453.639651,5,1,2.4,7.7,13.2,18.8,4.0,64.645373
352,0,273.0,352,2020-01-19,282.893151,11453.639651,6,1,38.2,9.5,9.6,9.6,0.0,462.361141
353,0,306.0,353,2020-01-20,282.893151,11453.639651,0,0,53.5,5.0,7.1,9.2,0.0,317.378279
354,0,292.0,354,2020-01-21,282.893151,11453.639651,1,0,35.3,6.6,9.8,13.1,0.0,787.876725
355,0,460.0,355,2020-01-22,282.893151,11453.639651,2,0,9.9,9.7,12.0,14.3,0.0,0.0


In [149]:
catboost_preds_limpio = test.drop(['DELTA',
                                    'YEAR_DAY',
                                    'WEEKDAY',
                                    'MEAN_CONSUMPTION',
                                    'VARIANCE_CONSUMPTION',
                                    'IS_WEEKEND',
                                    'PRECIPITATIONS',
                                    'MIN_TEMP',
                                    'MEAN_TEMP',
                                    'MAX_TEMP',
                                    'SUN'], axis=1)
catboost_preds_limpio.set_index('DATE', inplace = True)
catboost_preds_limpio.head(30)

Unnamed: 0_level_0,ID,predicciones
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-18,0,64.645373
2020-01-19,0,462.361141
2020-01-20,0,317.378279
2020-01-21,0,787.876725
2020-01-22,0,0.0
2020-01-23,0,0.0
2020-01-24,0,0.0
2020-01-25,0,0.0
2020-01-26,0,0.0
2020-01-27,0,0.0


In [150]:
final_df_1 = pd.DataFrame(index=['2020-01-18', 
                                '2020-01-19',
                                '2020-01-20',
                                '2020-01-21',
                                '2020-01-22',
                                '2020-01-23',
                                '2020-01-24',
                                '2020-01-25',
                                '2020-01-26',
                                '2020-01-27',
                                '2020-01-28',
                                '2020-01-29',
                                '2020-01-30',
                                '2020-01-31'])

for contador in range(0, len(catboost_preds_limpio['ID'].unique())):
    df = catboost_preds_limpio.loc[(catboost_preds_limpio['ID'] == contador)]
    df = df.drop(['ID'], axis=1)
    final_df_1[contador] = df
final_df_1.head(15)

  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contador] = df
  final_df_1[contado

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2643,2644,2645,2646,2647,2648,2649,2650,2651,2652
2020-01-18,64.645373,0.0,0.0,0.0,158.524232,175.732703,0.0,0.0,0.0,72.475929,...,0.0,0.0,0.0,531.297242,0.0,0.0,,0.0,0.0,
2020-01-19,462.361141,0.0,0.0,146.452308,452.347332,433.526957,364.887102,0.0,0.0,252.319131,...,127.737727,0.0,0.0,994.799127,0.0,0.0,,0.0,426.045393,
2020-01-20,317.378279,0.0,0.0,0.0,234.546599,317.9966,71.249032,0.0,0.0,0.0,...,113.516182,0.0,0.0,714.992278,0.0,0.0,,3.222016,169.932284,
2020-01-21,787.876725,0.184625,0.184625,227.844259,821.749807,660.883164,397.42594,96.80095,0.184625,110.900948,...,162.582393,0.184625,0.184625,1188.704196,0.184625,48.374993,,7.930276,646.918931,
2020-01-22,0.0,0.0,0.0,0.0,204.099063,483.883886,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,308.457597,0.0,0.0,,0.0,0.0,
2020-01-23,0.0,0.0,0.0,0.0,0.0,378.025047,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,
2020-01-24,0.0,0.0,0.0,0.0,0.0,448.281004,78.329043,0.0,0.0,127.675731,...,0.0,0.0,0.0,415.921504,0.0,0.0,,0.0,0.0,
2020-01-25,0.0,0.0,0.0,0.0,0.0,236.64197,179.231076,0.0,0.0,248.764498,...,0.0,0.0,0.0,394.710562,0.0,0.0,,0.0,0.0,
2020-01-26,0.0,0.0,0.0,0.0,0.0,267.058212,81.005365,0.0,0.0,165.933642,...,0.0,0.0,0.0,477.9482,0.0,0.0,,0.0,0.0,
2020-01-27,0.0,0.0,0.0,0.0,0.0,369.631595,27.894753,0.0,0.0,28.297379,...,0.0,0.0,0.0,424.536554,0.0,0.0,,0.0,0.0,


In [110]:
final_df = pd.DataFrame(columns=['ID','Dia_1','Dia_2','Dia_3','Dia_4','Dia_5','Dia_6','Dia_7', 'Dia_8','Dia_9','Dia_10','Dia_11','Dia_12','Dia_13','Dia_14'], index=range(2800))
final_df.head(5)

Unnamed: 0,ID,Dia_1,Dia_2,Dia_3,Dia_4,Dia_5,Dia_6,Dia_7,Dia_8,Dia_9,Dia_10,Dia_11,Dia_12,Dia_13,Dia_14
0,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,


In [154]:
final_df_2 = final_df_1.T
final_df_2.rename(columns={'2020-01-18': "Dia_1",
                            '2020-01-19': "Dia_2",
                            '2020-01-20': "Dia_3",
                            '2020-01-21': "Dia_4",
                            '2020-01-22': "Dia_5",
                            '2020-01-23': "Dia_6",
                            '2020-01-24': "Dia_7",
                            '2020-01-25': "Dia_8",
                            '2020-01-26': "Dia_9",
                            '2020-01-27': "Dia_10",
                            '2020-01-28': "Dia_11",
                            '2020-01-29': "Dia_12",
                            '2020-01-30': "Dia_13",
                            '2020-01-31': "Dia_14",})

Unnamed: 0,Dia_1,Dia_2,Dia_3,Dia_4,Dia_5,Dia_6,Dia_7,Dia_8,Dia_9,Dia_10,Dia_11,Dia_12,Dia_13,Dia_14
0,64.645373,462.361141,317.378279,787.876725,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.184625,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.184625,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,146.452308,0.000000,227.844259,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,158.524232,452.347332,234.546599,821.749807,204.099063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648,0.000000,0.000000,0.000000,48.374993,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2649,,,,,,,,,,,,,,
2650,0.000000,0.000000,3.222016,7.930276,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2651,0.000000,426.045393,169.932284,646.918931,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [155]:
Semana_1 = ['Dia_1','Dia_2','Dia_3','Dia_4','Dia_5','Dia_6','Dia_7']
Semana_2 = ['Dia_8','Dia_9','Dia_10','Dia_11','Dia_12','Dia_13','Dia_14']

In [158]:
final_df_2['Semana_1'] = final_df_2['Dia_1','Dia_2','Dia_3','Dia_4','Dia_5','Dia_6','Dia_7'].sum(axis=1)

KeyError: ('Dia_1', 'Dia_2', 'Dia_3', 'Dia_4', 'Dia_5', 'Dia_6', 'Dia_7')

In [55]:
start_date = datetime.date(2020, 1, 18)
end_date = datetime.date(2020, 1, 31)

def get_date_range(start_date, end_date):
    number_of_days = (end_date-start_date).days
    return [(start_date + datetime.timedelta(days = day)).isoformat() for day in range(number_of_days+1)]

fechas_test = get_date_range(start_date, end_date)

final_df = pd.DataFrame(columns=['ID','Dia_1','Dia_2','Dia_3','Dia_4','Dia_5','Dia_6','Dia_7', 'Dia_8','Dia_9','Dia_10','Dia_11','Dia_12','Dia_13','Dia_14'], index=range(2800))

In [None]:

final_df['Semana_1'] = final_df[['Dia_1','Dia_2','Dia_3','Dia_4','Dia_5','Dia_6','Dia_7']].sum(axis=1)