# Predicción empleando nuestro modelo predictivo

**Modelo**: RandomForest

**Variable respuesta**: 'Totales'

**Variables predictoras**: codificadas sin estandarizar.

In [420]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Barra de progreso de un proceso
# ------------------------------------------------------------------------------
from tqdm import tqdm

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')

# Poner descansos en código.
# ------------------------------------------------------------------------------
import time

# Librería para crear archivos pickle.
# ------------------------------------------------------------------------------
import pickle

In [329]:
df = pd.read_csv('data/totales_encod.csv', index_col = 0)
df.head(3)

Unnamed: 0,registro,clima,temperatura,sens_termica,humedad,viento,total,fecha_nueva,festividad,festividad_1,mes_bueno,dia_semana_nuevo,no_laboral_nuevo,año_map,estacion_map
0,1,2,14.1,18.2,81.0,10.7,985,2018-01-01,New Year's Day,1,1,0,0,0,0
1,2,2,14.9,17.7,70.0,16.7,801,2018-01-02,,0,1,1,1,0,0
2,3,1,8.0,9.5,44.0,16.6,1349,2018-01-03,,0,1,2,1,0,0


In [330]:
df.drop(['registro','fecha_nueva','festividad','sens_termica'], axis=1,inplace=True)
df.columns

Index(['clima', 'temperatura', 'humedad', 'viento', 'total', 'festividad_1',
       'mes_bueno', 'dia_semana_nuevo', 'no_laboral_nuevo', 'año_map',
       'estacion_map'],
      dtype='object')

In [97]:
X=df.drop(['total'],axis=1)
y=df['total']

In [98]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
# Definir diccionario con los hiperparámetros a modificar
# min_samples_split: número mínimo de observaciones que debe tener un nodo para dividirse, por defecto [10, 50, 100]
# min_samples_leaf: número mínimo de observaciones que debe tener cada uno de los nodos hijos para que se produzca la division, por defecto [10, 50, 100]
param = {"max_depth": [16,17], "max_features": [1,2,3], "min_samples_split": [5, 75, 200], "min_samples_leaf": [5,75,200]}

In [100]:
gs_rf = GridSearchCV(estimator=RandomForestRegressor(), param_grid= param, cv=10, verbose=-1,
                        return_train_score = True, scoring="neg_mean_squared_error")

In [101]:
gs_rf.fit(x_train, y_train)
bosque = gs_rf.best_estimator_
bosque

In [102]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [103]:
y_pred_test_rf = bosque.predict(x_test)
y_pred_train_rf = bosque.predict(x_train)
df_totales_encod = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest totales codificadas")
df_totales_encod

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,452.901207,341945.670063,584.761208,0.900099,test,Random Forest totales codificadas
1,423.474037,345989.23027,588.208492,0.909325,train,Random Forest totales codificadas


## PROBLEMA: falta columna para año 2023

In [331]:
df.head(3)

Unnamed: 0,clima,temperatura,humedad,viento,total,festividad_1,mes_bueno,dia_semana_nuevo,no_laboral_nuevo,año_map,estacion_map
0,2,14.1,81.0,10.7,985,1,1,0,0,0,0
1,2,14.9,70.0,16.7,801,0,1,1,1,0,0
2,1,8.0,44.0,16.6,1349,0,1,2,1,0,0


In [None]:
######################## SOLUCIÓNAR PROBLEMA ###########################

In [335]:
#Codificamos de nuevo la columna año_bueno, ya que para poder hacer la prediccion neceitamos una columna con el año.
df2=df.copy()
df2['año_map_23']=0
df2.head(3)

Unnamed: 0,clima,temperatura,humedad,viento,total,festividad_1,mes_bueno,dia_semana_nuevo,no_laboral_nuevo,año_map,estacion_map,año_map_23
0,2,14.1,81.0,10.7,985,1,1,0,0,0,0,0
1,2,14.9,70.0,16.7,801,0,1,1,1,0,0,0
2,1,8.0,44.0,16.6,1349,0,1,2,1,0,0,0


In [251]:
#FUNCION CON ERROR QUE NO SE PUDO VOLVER A REPETIR ESTAS MÉTRICAS...
#def probar_modelo(dataframe,parametros):
#    X3=dataframe.drop(['total'],axis=1)
#    y3=dataframe['total']
#    time.sleep(3)
#    x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=3)
#    time.sleep(3)
#    gs_rf3 = GridSearchCV(estimator=RandomForestRegressor(), param_grid= parametros, cv=10, verbose=-1,
#                        return_train_score = True, scoring="neg_mean_squared_error")
#    time.sleep(3)
#    gs_rf3.fit(x3_train, y3_train)
#    time.sleep(3)
#    bosque3 = gs_rf2.best_estimator_ #ERROR FUNCION, por eso salieron bien las metricas, es irreproducible ya...
#    time.sleep(3)
#    y3_pred_test_rf = bosque3.predict(x3_test)
#    y3_pred_train_rf = bosque3.predict(x3_train)
#    time.sleep(3)
#    df_totales_encod3 = metricas(y3_test, y3_train, y3_pred_test_rf, y3_pred_train_rf, "Random Forest totales codificadas")
#
#    return  bosque3,df_totales_encod3

In [224]:
#ERROR QUE NO SE PUDO VOLVER A REPETIR ESTAS MÉTRICAS...
#param5 = {"max_depth": [15,16,17], "max_features": [1,2,3], "min_samples_leaf": [2,3,5],"min_samples_split": [3, 5, 10]}
#bosque5,metricas5=probar_modelo(df2,param5)

In [225]:
#bosque5

In [226]:
#metricas5

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,326.238546,199586.254222,446.750774,0.94476,test,Random Forest totales codificadas
1,342.367171,245731.658964,495.713283,0.934894,train,Random Forest totales codificadas


In [334]:
#param4 = {"max_depth": [15], "max_features": [3], "min_samples_leaf": [2],"min_samples_split": [3]}
#bosque4,metricas4=probar_modelo(df2,param4)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

### PRUEBA1

In [271]:
df2.head(3)

Unnamed: 0,clima,temperatura,humedad,viento,total,festividad_1,mes_bueno,dia_semana_nuevo,no_laboral_nuevo,año_map,estacion_map,año_map_23
0,2,14.1,81.0,10.7,985,1,1,0,0,0,0,0
1,2,14.9,70.0,16.7,801,0,1,1,1,0,0,0
2,1,8.0,44.0,16.6,1349,0,1,2,1,0,0,0


In [338]:
df3=df2.copy()

In [339]:
df3['año_map_18']=df3['año_map'].map({0:1,1:0})
df3['año_map_19']=df3['año_map']
df3.drop(['año_map'],axis=1,inplace=True)
df3.head(3)

Unnamed: 0,clima,temperatura,humedad,viento,total,festividad_1,mes_bueno,dia_semana_nuevo,no_laboral_nuevo,estacion_map,año_map_23,año_map_18,año_map_19
0,2,14.1,81.0,10.7,985,1,1,0,0,0,0,1,0
1,2,14.9,70.0,16.7,801,0,1,1,1,0,0,1,0
2,1,8.0,44.0,16.6,1349,0,1,2,1,0,0,1,0


In [291]:
def probar_modelo2(dataframe,parametros):
    X3=dataframe.drop(['total'],axis=1)
    y3=dataframe['total']
    time.sleep(3)
    x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=9)
    time.sleep(3)
    gs_rf3 = GridSearchCV(estimator=RandomForestRegressor(), param_grid= parametros, cv=10, verbose=-1,
                        return_train_score = True, scoring="neg_mean_squared_error")
    time.sleep(3)
    gs_rf3.fit(x3_train, y3_train)
    time.sleep(3)
    bosque3 = gs_rf3.best_estimator_
    time.sleep(3)
    y3_pred_test_rf = bosque3.predict(x3_test)
    y3_pred_train_rf = bosque3.predict(x3_train)
    time.sleep(3)
    df_totales_encod3 = metricas(y3_test, y3_train, y3_pred_test_rf, y3_pred_train_rf, "Random Forest totales codificadas")

    return  bosque3,df_totales_encod3

In [377]:
param7 = {"max_depth": [15], "max_features": [3], "min_samples_leaf": [2],"min_samples_split": [1]}
bosque7,metricas7=probar_modelo2(df3,param7)

In [378]:
print(bosque7)
metricas7

RandomForestRegressor(max_depth=15, max_features=3, min_samples_leaf=2,
                      min_samples_split=1)


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,474.035113,420103.279688,648.153747,0.900796,test,Random Forest totales codificadas
1,300.671703,185081.880292,430.211437,0.948834,train,Random Forest totales codificadas


In [379]:
df3.head(3)

Unnamed: 0,clima,temperatura,humedad,viento,total,festividad_1,mes_bueno,dia_semana_nuevo,no_laboral_nuevo,estacion_map,año_map_23,año_map_18,año_map_19
0,2,14.1,81.0,10.7,985,1,1,0,0,0,0,1,0
1,2,14.9,70.0,16.7,801,0,1,1,1,0,0,1,0
2,1,8.0,44.0,16.6,1349,0,1,2,1,0,0,1,0


In [414]:
X20=df3.drop(['total'],axis=1)
y20=df3['total']
time.sleep(3)
x20_train, x20_test, y20_train, y20_test = train_test_split(X20, y20, test_size=0.2, random_state=42)
time.sleep(3)
param20= {"max_depth": [15], "max_features": [3], "min_samples_leaf": [2],"min_samples_split": [1]}
gs_rf20 = GridSearchCV(estimator=RandomForestRegressor(), param_grid= param20, cv=10, verbose=-1,
                    return_train_score = True, scoring="neg_mean_squared_error")
time.sleep(3)
gs_rf20.fit(x20_train, y20_train)
time.sleep(3)
bosque20 = gs_rf20.best_estimator_
time.sleep(3)
y20_pred_test_rf = bosque20.predict(x20_test)
y20_pred_train_rf = bosque20.predict(x20_train)
time.sleep(3)
df_totales_encod20 = metricas(y20_test, y20_train, y20_pred_test_rf, y20_pred_train_rf, "Random Forest totales codificadas")

In [415]:
print(bosque20)
df_totales_encod20

RandomForestRegressor(max_depth=15, max_features=3, min_samples_leaf=2,
                      min_samples_split=1)


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,416.56809,305914.440928,553.095327,0.910626,test,Random Forest totales codificadas
1,306.377158,190867.652672,436.884027,0.949978,train,Random Forest totales codificadas


In [416]:
df3.head(3)

Unnamed: 0,clima,temperatura,humedad,viento,total,festividad_1,mes_bueno,dia_semana_nuevo,no_laboral_nuevo,estacion_map,año_map_23,año_map_18,año_map_19
0,2,14.1,81.0,10.7,985,1,1,0,0,0,0,1,0
1,2,14.9,70.0,16.7,801,0,1,1,1,0,0,1,0
2,1,8.0,44.0,16.6,1349,0,1,2,1,0,0,1,0


### PRUEBA2 (NO USAR ESTA, MEJOR LA PRUEBA 1) (lo dejo puesto para que nos acordemos que ya se ha probado esto)

In [380]:
df2.head(3)

Unnamed: 0,clima,temperatura,humedad,viento,total,festividad_1,mes_bueno,dia_semana_nuevo,no_laboral_nuevo,año_map,estacion_map,año_map_23
0,2,14.1,81.0,10.7,985,1,1,0,0,0,0,0
1,2,14.9,70.0,16.7,801,0,1,1,1,0,0,0
2,1,8.0,44.0,16.6,1349,0,1,2,1,0,0,0


In [385]:
param10 = {"max_depth": [15], "max_features": [3], "min_samples_leaf": [2],"min_samples_split": [1]}
bosque10,metricas10=probar_modelo2(df2,param10)

In [382]:
print(bosque10)
metricas10

RandomForestRegressor(max_depth=16, max_features=3, min_samples_leaf=2,
                      min_samples_split=3)


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,505.707497,473517.271931,688.125913,0.888182,test,Random Forest totales codificadas
1,295.878782,177690.337345,421.533317,0.950877,train,Random Forest totales codificadas


## Guardamos pickle del modelo

Guardaremos nuestro modelo en un archivo .pkl (pickle) para poder ejecutar el mismo modelo en otro ordenador pero sin perder el tiempo de ejecutar todo el modelo y correr el riesgo de perder esas métricas tan buenas.

In [434]:
bosque20

In [438]:
#Guardamos nuestro modelo en un pickle para no tener que correr el modelo cada vez que queramos
#  hacer una predicción.
with open('data/modelo_rf_totales.pkl', 'wb') as fp:
    pickle.dump(bosque20,fp)

## Continuació prediccion en 7_1_RF_totales_prediccion.ipynb