En este notebook se recogen varias pruebas solo con los contadores cuyas series temporales son completas o casi completas.

1. Predicción haciendo la media entre XGBoost Regressor y Gradient Boosting Regressor
2. Modelos ARIMA para todas las series
3. Ensemble entre ARIMA y XGBoost Regressor

### Imports, utils and train/test creation

In [43]:
import pandas as pd
import datetime
from tqdm import tqdm

import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
import warnings

In [45]:
# Silence every warning category for the rest of the session to keep the cell
# outputs short. NOTE(review): this also hides deprecation and pandas warnings;
# consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

In [39]:
def get_date_range(start_date, end_date):
    """Return the ISO-formatted dates from ``start_date`` to ``end_date``, inclusive.

    Args:
        start_date: First day of the range, as a ``datetime.date``.
        end_date: Last day of the range, as a ``datetime.date``.

    Returns:
        A list of ``'YYYY-MM-DD'`` strings, one per day, in chronological
        order. The list is empty when ``end_date`` precedes ``start_date``.

    Example:
        >>> get_date_range(datetime.date(2019, 9, 30), datetime.date(2019, 10, 2))
        ['2019-09-30', '2019-10-01', '2019-10-02']
    """
    number_of_days = (end_date - start_date).days
    # +1 so that end_date itself is part of the result.
    return [
        (start_date + datetime.timedelta(days=day)).isoformat()
        for day in range(number_of_days + 1)
    ]

def compute_error(pred, real):
    """Compute the competition error between predicted and real values.

    Both dataframes must share the same layout: one row per counter, the
    first seven columns holding the daily DELTA of that counter for seven
    dates, and the last two columns (positions 7 and 8) holding the
    aggregated values of week 1 and week 2 rather than daily values.

    The score is the mean of two terms:

    * the average of the seven per-day RMSEs, and
    * the average of the two weekly errors, each computed as
      ``sqrt((sum(real) - sum(pred)) ** 2 / number_of_rows)``.

    Args:
        pred: Dataframe with the predictions.
        real: Dataframe with the real values.

    Returns:
        The error as a single number; 0 when ``pred`` equals ``real``.

    Note:
        The daily differences use pandas Series subtraction, which aligns
        rows by index label, so both dataframes should carry the same index.

    Example:
        test = pd.read_pickle('../data/test.pkl')
        compute_error(test, test)      # identical frames -> 0

        shifted = test.copy()
        shifted.iloc[:, 0] = shifted.iloc[:, 1]
        compute_error(shifted, test)   # error caused by the altered first day
    """
    n_days = 7
    daily_rmses = []
    for day in range(n_days):
        squared_diff = (real.iloc[:, day] - pred.iloc[:, day]) ** 2
        daily_rmses.append((squared_diff / len(real.iloc[:, day])).sum() ** (1 / 2))
    rmse_1 = sum(daily_rmses) / n_days

    # Weekly terms compare the totals over all counters, not row by row.
    weekly_errors = []
    for week_col in (7, 8):
        total_gap = real.iloc[:, week_col].sum() - pred.iloc[:, week_col].sum()
        weekly_errors.append((total_gap ** 2 / len(real.iloc[:, week_col])) ** (1 / 2))
    rmse_2 = sum(weekly_errors) / 2

    return (rmse_1 + rmse_2) / 2

In [47]:
path = '../data/df6.pkl'

df = pd.read_pickle(path)

# Training window: 2019-02-01 to 2020-01-17, both inclusive.
start_date = datetime.date(2019, 2, 1)
end_date = datetime.date(2020, 1, 17)
# .drop() without inplace=True returns a fresh frame, so the column
# assignments below never write into a filtered selection of `df`.
train = (
    df[df['DATE'].isin(get_date_range(start_date, end_date))]
    .drop(['YEAR_DAY', 'WEEKDAY', 'IS_GOOD', 'DATE'], axis=1)
)
# Missing weather values are replaced by the mean of the same split.
train['SUN'] = train['SUN'].fillna(train['SUN'].mean())
train['PRECIPITATIONS'] = train['PRECIPITATIONS'].fillna(train['PRECIPITATIONS'].mean())

# Test window: 2020-01-18 to 2020-01-31, both inclusive.
start_date = datetime.date(2020, 1, 18)
end_date = datetime.date(2020, 1, 31)
# DATE is kept in `test`; it is dropped only when building X_test.
test = (
    df[df['DATE'].isin(get_date_range(start_date, end_date))]
    .drop(['YEAR_DAY', 'WEEKDAY', 'IS_GOOD'], axis=1)
)
test['SUN'] = test['SUN'].fillna(test['SUN'].mean())
test['PRECIPITATIONS'] = test['PRECIPITATIONS'].fillna(test['PRECIPITATIONS'].mean())

print('Train:', train.shape, 'Test:', test.shape)

# DELTA is the regression target; every remaining column is a feature.
X_train = train.drop(['DELTA'], axis=1)
y_train = train['DELTA']

X_test = test.drop(['DELTA', 'DATE'], axis=1)

Train: (964197, 14) Test: (38458, 15)


### XGBR and GBR for all counters

- The final prediction is the mean between XGBR and GBR
- No lags are used
- The weekly prediction is obtained simply by summing the daily predictions

#### TODO: probar lo mismo pero añadiendo LAG_7

In [22]:
# XGBoost regressor with explicit hyper-parameters.
xgb_params = dict(n_estimators=1000, reg_lambda=1, gamma=0, max_depth=8)
model1 = xgb.XGBRegressor(**xgb_params)

# scikit-learn gradient boosting, constructed with no arguments.
model2 = GradientBoostingRegressor()

# Fit both models on the same training data, announcing each step.
for message, estimator in (('Fitting XGB...', model1), ('Fitting GB...', model2)):
    print(message)
    estimator.fit(X_train, y_train)
print('End fitting.')

Fitting XGB...


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Fitting GB...
End fitting.


In [48]:
# Predict the test window with both fitted models.
y_pred1 = model1.predict(X_test)
y_pred2 = model2.predict(X_test)

# Long-format results: one row per (counter, date), ordered by counter and date.
results_df = pd.DataFrame({
    'ID': test['ID'].values,
    'DATE': test['DATE'].values,
    'y_pred1': y_pred1,
    'y_pred2': y_pred2,
}).sort_values(['ID', 'DATE'])

# The final prediction is the row-wise mean of the two model outputs.
results_df['FINAL'] = results_df[['y_pred1', 'y_pred2']].mean(axis=1)

In [49]:
start_date = datetime.date(2020, 1, 18)
end_date = datetime.date(2020, 1, 31)
fechas_test = get_date_range(start_date, end_date)

# Reshape the long results (one row per counter and date) into a wide frame
# with one row per counter and one column per test date. Pivoting aligns the
# values by ID instead of relying on every date listing the same counters in
# the same order.
daily = (
    results_df
    .pivot(index='ID', columns='DATE', values='FINAL')
    .reindex(columns=fechas_test)
)
if daily.isna().any().any():
    # Fail loudly instead of building silently incomplete weekly sums.
    raise ValueError('Some counters have no prediction for one or more test dates')

day_columns = ['Dia_{}'.format(n) for n in range(1, len(fechas_test) + 1)]
daily.columns = day_columns
# Back to a 0..n-1 index with ID as a regular column.
daily = daily.reset_index()

week_1_columns = day_columns[:7]
week_2_columns = day_columns[7:14]

# final_df keeps the ID, the seven days of the first week and both weekly sums.
final_df = daily[['ID'] + week_1_columns].copy()
ID = list(final_df['ID'].values)
print(len(ID), len(daily['Dia_1']))
print(len(ID), len(daily['Dia_11']))

final_df['Semana_1'] = daily[week_1_columns].sum(axis=1)
final_df['Semana_2'] = daily[week_2_columns].sum(axis=1)

# Same frame without the ID column: seven daily columns plus the two weekly sums.
final_df2 = final_df.drop('ID', axis=1)

2747 2747
2747 2747


In [50]:
# Score the averaged predictions against the values stored in test.pkl.
# Note: `test` is rebound here to the contents of that pickle.
test = pd.read_pickle('../data/test.pkl')
error = compute_error(pred=final_df2, real=test)
print('Mean between XGBR and GBR:', round(error, 2))

Mean between XGBR and GBR: 21080.97


### ARIMA for good counters, XGBR and GBR for bad ones