In [16]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
import warnings
from pmdarima.arima import auto_arima

  from pandas import MultiIndex, Int64Index


In [17]:
warnings.simplefilter('ignore')

In [18]:
'''
given a start date in datetime format "start_date" and an "end_date" returns a list of strings with the dates from
"start_date" to "end_date".

Example:

start_date = datetime.date(2019, 9 , 30)
end_date = datetime.date(2019, 10, 7)
get_date_range(start_date, end_date)
'''
def get_date_range(start_date, end_date):
    number_of_days = (end_date-start_date).days
    return [(start_date + datetime.timedelta(days = day)).isoformat() for day in range(number_of_days+1)]

'''
This function expects two dataframes with the same format: for the first seven columns, each column corresponds to a date 
and each row corresponds to a counter index. In position i,j there should be DELTA of counter i in date j. 
For the last two columns of the dataframes they should not reffer to a daily prediction but to the aggregated prediction 
of week_1 and week_2. Given these two dataframes (one for theprediction and one for the real values), 
the function returns de error according to the competition rules.

Examples:

import pandas as pd
import copy

test = pd.read_pickle('../data/test.pkl')

compute_error(test, test)

test_v3 = copy.copy(test)
test_v3.iloc[:,0] = test_v3.iloc[:,1]
compute_error(test_v3, test)

'''
def compute_error(pred, real):
    daily_rmses = []
    for i in range(7):
        daily_rmses.append((((real.iloc[:,i] - pred.iloc[:,i])**2/len(real.iloc[:,i])).sum())**(1/2))
    rmse_1 = sum(daily_rmses)/7
    
    first_week_pred_sum = pred.iloc[:,7].sum()
    second_week_pred_sum = pred.iloc[:,8].sum()
    first_week_real_sum = real.iloc[:,7].sum()
    second_week_real_sum = real.iloc[:,8].sum()
    
    first_week_rmse = (((first_week_real_sum - first_week_pred_sum)**2)/len(real.iloc[:,7]))**(1/2)
    second_week_rmse = (((second_week_real_sum - second_week_pred_sum)**2)/len(real.iloc[:,8]))**(1/2)
    rmse_2 = (first_week_rmse + second_week_rmse)/2
    
    return (rmse_1 + rmse_2)/2

In [4]:
path = '../data/df6.pkl'

df = pd.read_pickle(path)
start_date = datetime.date(2019, 2 , 1)
end_date = datetime.date(2020, 1, 17)
train = df[df['DATE'].isin(get_date_range(start_date, end_date))]
train = train[train['IS_GOOD']==1]
train = train[['ID','DATE','DELTA']]

start_date = datetime.date(2020, 1 , 18)
end_date = datetime.date(2020, 1, 31)
test = df[df['DATE'].isin(get_date_range(start_date, end_date))]
test = test[test['IS_GOOD']==1]
test = test[['ID','DATE','DELTA']]

print('Train:', train.shape, 'Test:', test.shape)

Train: (931203, 3) Test: (37142, 3)


In [5]:
all_data = pd.concat([train, test])
all_data.shape

(968345, 3)

In [6]:
all_data = all_data[all_data['DATE'] > '2019-10-31']

In [7]:
#Todas las predicciones en un dataframe
predicciones_df = test[test['ID'] == 0]
predicciones_df = predicciones_df.drop(['ID', 'DELTA'], axis=1)
predicciones_df.set_index('DATE', inplace = True)

In [23]:
for contador in tqdm(all_data['ID'].unique()):
    one_counter = all_data[all_data['ID'] == contador]
    #Drop ID
    one_counter.drop(['ID'], axis=1, inplace=True)
    #Fechas a indices
    one_counter.set_index('DATE', inplace = True)
    #MODELO POR CONTADOR
    arima_model = auto_arima(one_counter, 
                    start_p=0, 
                    d=1, 
                    start_q=0, 
                    max_p=3, 
                    max_d=3, 
                    max_q=3, 
                    start_P=0, 
                    D=1, 
                    start_Q=0, 
                    max_P=3, 
                    max_D=3, 
                    max_Q=3, 
                    m=7, 
                    seasonal=True, 
                    error_action='warn', 
                    trace=False, 
                    supress_warnings=True, 
                    stepwise=True, 
                    random_state=2517, 
                    n_jobs=-1,
                    n_fits=10)   
    predicciones_df[f'predict_cont_{contador}'] = arima_model.predict(n_periods = 14)

 61%|████████████████████████████████████████████▉                             | 1610/2653 [2:57:24<1:54:56,  6.61s/it]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
predicciones_df.to_pickle('../data/predicciones_arima.pkl')