In [21]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from skforecast.ForecasterAutoreg import ForecasterAutoreg

# Load the daily price features, parse the date/day columns, index by date
# and reindex to a regular daily frequency.
precios_diarios = (
    pd.read_csv('src/data/data_lake/business/features/precios_diarios.csv')
    .assign(
        fecha=lambda df: pd.to_datetime(df['fecha'], format='%Y-%m-%d'),
        dia_mes=lambda df: pd.to_numeric(df['dia_mes']),
    )
    .set_index('fecha')
    .asfreq('D')
    .sort_index()
)
# Switch from timestamps to daily periods.
precios_diarios.index = pd.DatetimeIndex(precios_diarios.index).to_period('D')
precios_diarios.head()

Unnamed: 0_level_0,precio,dia_mes,dia_mes_binario
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1995-07-20,1.409435,20,0
1995-07-21,4.924333,21,1
1995-07-22,1.2695,22,1
1995-07-23,0.953083,23,1
1995-07-24,4.305917,24,1


In [None]:
def test_train_datasets(data_frame):
    data_train = data_frame[data_frame['dia_mes_binario'] == 0]
    data_test  = data_frame[data_frame['dia_mes_binario'] == 1]
    return data_train, data_test

In [22]:
def test_train_datasets_1(data_frame, porcentaje):
    n = round(len(data_frame)*porcentaje)
    data_train = data_frame[:-n]
    data_test  = data_frame[-n:]
    return data_train, data_test

In [24]:
# Split the prepared daily prices, passing 0.3 as the test fraction.
data_train, data_test = test_train_datasets_1(
    data_frame=precios_diarios,
    porcentaje=0.3,
)

In [25]:
# Display the training partition returned by the split.
data_train

Unnamed: 0_level_0,precio,dia_mes,dia_mes_binario
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1995-07-20,1.409435,20,0
1995-07-21,4.924333,21,1
1995-07-22,1.269500,22,1
1995-07-23,0.953083,23,1
1995-07-24,4.305917,24,1
...,...,...,...
2013-08-01,229.413225,1,0
2013-08-02,199.507241,2,0
2013-08-03,188.789906,3,0
2013-08-04,173.907940,4,0


In [26]:
# Display the test partition returned by the split.
data_test

Unnamed: 0_level_0,precio,dia_mes,dia_mes_binario
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-08-06,153.596007,6,0
2013-08-07,154.258525,7,0
2013-08-08,188.667108,8,0
2013-08-09,188.903924,9,0
2013-08-10,189.606083,10,0
...,...,...,...
2021-04-26,188.325833,26,1
2021-04-27,181.770000,27,1
2021-04-28,175.874167,28,1
2021-04-29,165.203333,29,1


In [47]:
import statsmodels.api as st
import pickle

# SARIMAX on the training prices with `dia_mes` as the exogenous regressor.
# No (seasonal) order is passed, so the statsmodels defaults apply.
forecaster = st.tsa.statespace.SARIMAX(
    endog=data_train[['precio']],
    exog=data_train[['dia_mes']],
    enforce_stationarity=False,
    enforce_invertibility=False,
)

model = forecaster.fit()

# Persist the fitted results. The context manager ensures the file handle is
# flushed and closed even if pickling fails, instead of leaking an open file.
with open('precios-diarios.pkl', 'wb') as archivo_modelo:
    pickle.dump(model, archivo_modelo)

In [28]:
# Autoregressive forecaster: a random forest using 6 lags of the price series.
# NOTE(review): this rebinds `forecaster`, which is also used for the SARIMAX model.
forecaster = ForecasterAutoreg(
    regressor=RandomForestRegressor(random_state=123),
    lags=6,
)

forecaster.fit(y=data_train['precio'])
forecaster



ForecasterAutoreg 
Regressor: RandomForestRegressor(random_state=123) 
Lags: [1 2 3 4 5 6] 
Window size: 6 
Included exogenous: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: [0, 6591] 
Training index type: RangeIndex 
Training index frequency: 1 
Regressor parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 123, 'verbose': 0, 'warm_start': False} 
Creation date: 2022-07-02 15:30:04 
Last fit date: 2022-07-02 15:30:08 
Skforecast version: 0.4.2 

In [29]:
# Forecast horizon: one step per row of the test partition.
pasos = data_test.shape[0]
predictions = forecaster.predict(steps=pasos)
predictions.tail(2)

9415    189.539552
9416    189.650839
Name: pred, dtype: float64

In [31]:
# Out-of-sample forecast from the fitted `model` over `pasos` steps, supplying
# the test partition's `dia_mes` as the exogenous values.
result = model.forecast(steps=pasos, exog=data_test[['dia_mes']])

In [30]:
# Sanity check: the number of predictions is expected to equal `pasos`.
predictions.shape

(2825,)

In [20]:
# Inspect the price column of the training partition.
data_train['precio']

fecha
1995-07-20      1.409435
1995-07-21      4.924333
1995-07-22      1.269500
1995-07-23      0.953083
1995-07-24      4.305917
                 ...    
2013-08-01    229.413225
2013-08-02    199.507241
2013-08-03    188.789906
2013-08-04    173.907940
2013-08-05    165.066455
Freq: D, Name: precio, Length: 6592, dtype: float64

In [15]:
# Preview the first rows of the training partition.
data_train.head()

Unnamed: 0_level_0,precio,dia_mes,dia_mes_binario
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1995-07-20,1.409435,20,0
1995-07-21,4.924333,21,1
1995-07-22,1.2695,22,1
1995-07-23,0.953083,23,1
1995-07-24,4.305917,24,1


In [34]:
# Preview the first forecasted values held in `result`.
result.head()

2013-08-06    164.045835
2013-08-07    163.031417
2013-08-08    162.023163
2013-08-09    161.021036
2013-08-10    160.024999
Freq: D, Name: predicted_mean, dtype: float64

In [36]:
# Turn the forecast Series into a one-column DataFrame.
# NOTE(review): this rebinds `predictions`, which earlier holds the output of
# `forecaster.predict`.
predictions = result.to_frame()
predictions.tail(3)

Unnamed: 0,predicted_mean
2021-04-28,-0.242127
2021-04-29,-0.250775
2021-04-30,-0.259422


In [44]:
# Align actual and predicted prices on their shared index labels (inner join),
# then move the index into a regular column and give the columns their final names.
precio_real = data_test[['precio']]
data_pred = pd.concat([precio_real, predictions], axis=1, join='inner').reset_index()
data_pred.columns = ['fecha', 'precio_promedio_real', 'precio_promedio_pred']

In [45]:
# Preview the first seven rows of actual vs. predicted prices.
data_pred.head(7)

Unnamed: 0,fecha,precio_promedio_real,precio_promedio_pred
0,2013-08-06,153.596007,164.045835
1,2013-08-07,154.258525,163.031417
2,2013-08-08,188.667108,162.023163
3,2013-08-09,188.903924,161.021036
4,2013-08-10,189.606083,160.024999
5,2013-08-11,177.055017,159.035013
6,2013-08-12,163.397886,158.051041


In [46]:
# Write the actual-vs-predicted table to the forecasts folder, without the
# positional index column.
data_pred.to_csv(
    path_or_buf='src/data/data_lake/business/forecasts/precios-diarios.csv',
    index=False,
)