# Predicción de ventas

## Preparación del entorno de desarrollo

In [None]:
# Habilitar autocomplete en Kaggle
%config Completer.use_jedi = False

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Análisis exploratorio

### Cargar _datasets_ y mostrar algunos datos

In [None]:
store_df = pd.read_csv("/kaggle/input/rossmann-store-sales/store.csv")
store_df.head()

In [None]:
# The dtype of 'StateHoliday' is set explicitly; otherwise pandas emits a
# warning while parsing the file.
train_df = pd.read_csv(
    "/kaggle/input/rossmann-store-sales/train.csv",
    dtype={'StateHoliday': 'str'},
)
train_df.head()

In [None]:
train_df['StateHoliday'].value_counts()
# Daba error sin especificar el tipo porque se mezclan números y caracteres

In [None]:
# Same explicit dtype for 'StateHoliday' as used for the training file.
test_df = pd.read_csv(
    '/kaggle/input/rossmann-store-sales/test.csv',
    dtype={'StateHoliday': 'str'},
)
test_df.head()

### Unir _datasets_

In [None]:
train_merged_df = train_df.merge(store_df, on='Store')
train_merged_df.head()

In [None]:
test_merged_df = test_df.merge(store_df, on='Store')
test_merged_df.head()

### Datos estadísticos de la columna de salida

In [None]:
train_merged_df['Sales'].describe()

In [None]:
sns.set(style="ticks", color_codes=True, font_scale=1.5)

fig, ax1 = plt.subplots(1, figsize=(18, 10))
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() raises on
# string columns (e.g. 'StateHoliday') instead of silently dropping them.
corr = train_merged_df.select_dtypes(include='number').corr()
sns.heatmap(corr, cmap=sns.diverging_palette(220, 10, as_cmap=True), annot=True, fmt=".2f", ax=ax1)
sns.set(font_scale=1.5)

In [None]:
plt.hist(train_merged_df['Sales'])

## Ingeniería de _features_

### Columnas relacionadas con fechas

In [None]:
# Función para convertir las fechas en columnas que pueda procesar un modelo
def split_date(df):
    """Expand the 'Date' column into calendar columns a model can process.

    The work is done on a copy, so the DataFrame passed in is left untouched
    and the returned frame does not alias it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'Date' converted to datetime and the extra columns
        'Year', 'Month', 'Day' (day of the month) and 'WeekOfYear' (ISO week).
    """
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    dates = df['Date'].dt
    df['Year'] = dates.year
    df['Month'] = dates.month
    df['Day'] = dates.day
    df['WeekOfYear'] = dates.isocalendar().week
    return df

In [None]:
train_merged_date_df = split_date(train_merged_df)
train_merged_date_df.info()

In [None]:
# One box of 'Sales' for each value of 'Day' from 1 to 7.
# NOTE(review): 'Day' is the day of the month produced by split_date; if a
# per-weekday comparison was intended, 'DayOfWeek' is the column to use - confirm.
plt.boxplot([
    train_merged_date_df.loc[train_merged_date_df['Day'] == day, 'Sales']
    for day in range(1, 8)
]);

In [None]:
test_merged_date_df = split_date(test_merged_df)
test_merged_date_df.info()

### Eliminar días de los _stores_ cerrados

Cuando la columna `Open` es `0`, las tiendas están cerradas y no vale la pena considerar esas filas para entrenar el modelo.

In [None]:
# Summary statistics of 'Sales' restricted to the rows where 'Open' is 0.
train_merged_date_df.loc[train_merged_date_df['Open'] == 0, 'Sales'].describe()

In [None]:
train_open_df = train_merged_date_df.loc[train_merged_date_df['Open'] == 1]
test_open_df = test_merged_date_df.loc[test_merged_date_df['Open'] == 1]

In [None]:
# Same plot as before, now limited to the rows kept in train_open_df (Open == 1).
# NOTE(review): 'Day' is the day of the month, not the weekday - confirm that
# 'DayOfWeek' was not the intended column.
plt.boxplot([
    train_open_df.loc[train_open_df['Day'] == day, 'Sales']
    for day in range(1, 8)
]);

### Imputar si hay o no competencia en cada fila

El _dataset_ `store.csv` tiene, dentro del detalle del comercio, la fecha en la que abrió la competencia. En esta sección se van a cambiar esas columnas por una única _feature_ que marque con `0` o `1` si es que existía competencia en la fecha del registro.

In [None]:
def comp_exists(df):
    """Flag whether a competitor was already open on each row's date.

    Works on a copy of ``df`` and reads the columns 'Year', 'Month',
    'CompetitionOpenSinceYear' and 'CompetitionOpenSinceMonth'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an integer 'CompetitionOpen' column (1 when the
        competitor's opening month is not later than the row's month, 0
        otherwise, including rows whose opening date is missing) and without
        the two 'CompetitionOpenSince*' columns.
    """
    df = df.copy()
    months_since_opening = (
        12 * (df['Year'] - df['CompetitionOpenSinceYear'])
        + (df['Month'] - df['CompetitionOpenSinceMonth'])
    )
    # A missing opening date yields NaN here, and NaN >= 0 is False, so those
    # rows get 0. The earlier `0 if x < 0 else 1` mapping turned NaN into 1,
    # which made the trailing fillna(0) a no-op.
    df['CompetitionOpen'] = months_since_opening.ge(0).fillna(False).astype(int)
    return df.drop(columns=['CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth'])

In [None]:
train_exist_comp_df = comp_exists(train_open_df)
test_exist_comp_df = comp_exists(test_open_df)
train_exist_comp_df.head()

In [None]:
plt.boxplot([train_exist_comp_df.loc[train_exist_comp_df['CompetitionOpen'] == 0, 'Sales'], 
            train_exist_comp_df.loc[train_exist_comp_df['CompetitionOpen'] == 1, 'Sales']]);

### Imputar si hay o no promociones

El _dataset_ tiene columnas que brindan información sobre las promociones activas. La columna `PromoInterval` contiene los meses en los que hay una segunda promoción activa (separados por `,`).

In [None]:
from math import ceil
def promo_cols(df):
    """Add the Promo2 features 'Promo2Open' and 'IsPromo2Month'.

    Works on a copy of ``df`` and reads the columns 'Year', 'Month',
    'WeekOfYear', 'Promo2', 'Promo2SinceYear', 'Promo2SinceWeek' and
    'PromoInterval' (comma-separated month abbreviations, e.g.
    'Feb,May,Aug,Nov').

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without 'Promo2SinceYear', 'Promo2SinceWeek' and
        'PromoInterval', and with:

        - 'Promo2Open': months elapsed since Promo2 started; 0 when it has not
          started yet, when the start is missing, or when 'Promo2' is 0.
        - 'IsPromo2Month': 1 when 'Promo2Open' is positive and the row's
          calendar month is listed in 'PromoInterval', else 0.
    """
    # Abbreviations as they are spelled in 'PromoInterval' (note 'Sept').
    month_abbr = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                  7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    df = df.copy()

    # Months since Promo2 was open; weeks are converted with 7 / 30.5.
    months_since_start = (
        12 * (df['Year'] - df['Promo2SinceYear'])
        + (df['WeekOfYear'] - df['Promo2SinceWeek']) * 7 / 30.5
    ).astype('float64')
    df['Promo2Open'] = months_since_start.clip(lower=0).fillna(0) * df['Promo2']

    # Compare the row's own calendar month with the months listed in
    # 'PromoInterval'. The earlier version compared ceil(Promo2Open % 12),
    # i.e. elapsed months, against those calendar months, and relied on exactly
    # four split columns existing.
    row_month = df['Month'].map(month_abbr)
    in_interval = np.fromiter(
        (isinstance(interval, str) and abbr in interval.split(',')
         for abbr, interval in zip(row_month, df['PromoInterval'])),
        dtype=bool,
    )
    promo2_running = (df['Promo2Open'] > 0).to_numpy()
    df['IsPromo2Month'] = (in_interval & promo2_running).astype(int)
    return df.drop(columns=['Promo2SinceYear', 'Promo2SinceWeek', 'PromoInterval'])

In [None]:
train_promo_active_df = promo_cols(train_exist_comp_df)
test_promo_active_df = promo_cols(test_exist_comp_df)
train_promo_active_df.head()

In [None]:
plt.boxplot([train_promo_active_df.loc[train_promo_active_df['IsPromo2Month'] == 0, 'Sales'], 
            train_promo_active_df.loc[train_promo_active_df['IsPromo2Month'] == 1, 'Sales']]);

In [None]:
plt.boxplot([train_promo_active_df.loc[train_promo_active_df['Promo'] == 0, 'Sales'], 
            train_promo_active_df.loc[train_promo_active_df['Promo'] == 1, 'Sales']]);

Se observa mayor influencia de la columna `Promo`, pero la _feature_ creada `IsPromo2Month` también se ve relacionada con la salida.

### Revisar datos faltantes

In [None]:
train_promo_active_df.isna().sum()

### Imputar la máxima distancia a los faltantes

In [None]:
# Missing distances are replaced by the largest distance seen in the training data.
max_distance = train_promo_active_df['CompetitionDistance'].max()
# Assign the filled column back: calling fillna(inplace=True) on a selected
# column is a chained in-place operation that pandas warns about and that has
# no effect on the parent frame once Copy-on-Write is enabled.
train_promo_active_df['CompetitionDistance'] = train_promo_active_df['CompetitionDistance'].fillna(max_distance)
test_promo_active_df['CompetitionDistance'] = test_promo_active_df['CompetitionDistance'].fillna(max_distance)
train_promo_active_df.isna().sum()

Con esto ya no quedan `NaN`

### Analizar valores únicos

In [None]:
train_promo_active_df.nunique()

### Separar _features_ y _labels_

In [None]:
features_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 
              'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen', 
              'Day', 'Month', 'Year', 'WeekOfYear',  'Promo2', 
              'Promo2Open', 'IsPromo2Month']
labels_col = 'Sales'

In [None]:
train_inputs = train_promo_active_df[features_cols].copy()
targets = train_promo_active_df[labels_col].copy()
test_inputs = test_promo_active_df[features_cols].copy()

In [None]:
numeric_cols = ['Store', 'Promo', 'SchoolHoliday', 
              'CompetitionDistance', 'CompetitionOpen', 'Promo2', 'Promo2Open', 'IsPromo2Month',
              'Day', 'Month', 'Year', 'WeekOfYear',  ]
categorical_cols = ['DayOfWeek', 'StateHoliday', 'StoreType', 'Assortment']

### Normalización de las _features_

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(train_inputs[numeric_cols])

In [None]:
train_norm_inputs = train_inputs.copy()
train_norm_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
test_norm_inputs = test_inputs.copy()
test_norm_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

### _One Hot encoding_ de las _features_

In [None]:
from sklearn.preprocessing import OneHotEncoder
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old name
# was removed in 1.4; fall back to the old keyword on releases that predate it.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder = encoder.fit(train_inputs[categorical_cols])
# `get_feature_names` was replaced by `get_feature_names_out` (added in 1.0,
# old method removed in 1.2).
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
else:
    encoded_cols = list(encoder.get_feature_names(categorical_cols))

In [None]:
train_norm_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
test_norm_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

### Obtener `X_train` y `X_val` para entrenar el modelo

Si bien existe un _dataset_ para _test_ dentro de los archivos de datos, este conjunto no contiene la columna de salida, porque está pensado para la competencia de Kaggle.

Con esta finalidad, se va a crear un _dataframe_ de validación que será el que se utilizará en este trabajo.

In [None]:
X = train_norm_inputs[numeric_cols + encoded_cols]
X_test = test_norm_inputs[numeric_cols + encoded_cols]

In [None]:
from sklearn.model_selection import train_test_split
# Fix the seed so the validation split, and every RMSE reported below, is
# reproducible across runs (same seed as the XGBoost models in this notebook).
X_train, X_val, train_targets, val_targets = train_test_split(X, targets, test_size=0.1, random_state=42)

## Modelo

### Modelo Base

In [None]:
class BaseModel(object):
    """Common interface for the models in this notebook.

    Subclasses are expected to override ``fit`` and ``predict``.
    """

    def __init__(self):
        # Double-underscore names are mangled per class, so this attribute is
        # stored as `_BaseModel__model` and is distinct from a `self.__model`
        # assigned inside a subclass.
        self.__model = None

    def fit(self, X, Y):
        """Train the model on features ``X`` and targets ``Y``.

        Raises
        ------
        NotImplementedError
            Always; subclasses must provide the implementation.
        """
        # Raising (instead of returning the NotImplemented sentinel) makes a
        # missing override fail loudly rather than look like a successful fit.
        raise NotImplementedError("fit() must be implemented by the subclass")

    def predict(self, X):
        """Return predictions for features ``X``.

        Raises
        ------
        NotImplementedError
            Always; subclasses must provide the implementation.
        """
        raise NotImplementedError("predict() must be implemented by the subclass")

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(a, b):
    """Root mean squared error between ``a`` and ``b``.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword was deprecated in scikit-learn 1.4
    and removed in 1.6. For single-output targets both forms give the same value.
    """
    return mean_squared_error(a, b) ** 0.5

### Modelo Promedio

El siguiente modelo es para tener un punto de partida para comparación. El modelo más simple, que se propone, es devolver el promedio de las ventas como predicción.

In [None]:
class AverageModel(BaseModel):
    """Baseline that predicts the mean of the training targets for every row."""

    def fit(self, X, y):
        """Store ``y.mean()``; ``X`` is not used. Returns None."""
        self.__model = y.mean()

    def predict(self, X):
        """Return an array of shape ``(X.shape[0], 1)`` filled with the stored mean."""
        n_rows = X.shape[0]
        constant_column = np.ones((n_rows, 1))
        return constant_column * self.__model

In [None]:
average_model = AverageModel()

#### Entrenamiento

In [None]:
average_model.fit(X_train, train_targets)

#### Predicción

In [None]:
average_preds = average_model.predict(X_train)

#### Evaluación

In [None]:
rmse(average_preds, train_targets)

In [None]:
average_val_preds = average_model.predict(X_val)
rmse(average_val_preds, val_targets)

Este resultado es el punto de partida para comparar los siguientes modelos.

### Modelo XGBoost

Para comenzar con el modelo, se buscaron hiperparámetros que permitan un entrenamiento y predicción rápidos, pero con mejor _performance_ que el modelo promedio. Además se configuró para utilizar la GPU, bajando el tiempo de entrenamiento de **15 minutos** a **51 segundos**.

In [None]:
from xgboost import XGBRegressor

In [None]:
model = XGBRegressor(n_jobs=-1, random_state=42, n_estimators=10, 
                     learning_rate=0.2, max_depth=10, subsample=0.9, 
                     colsample_bytree=0.7, tree_method='gpu_hist', gpu_id=0)

#### Entrenamiento

In [None]:
%%time
model.fit(X_train, train_targets)

#### Predicción

In [None]:
preds = model.predict(X_train)

#### Evaluación

In [None]:
rmse(preds, train_targets)

In [None]:
val_preds = model.predict(X_val)
rmse(val_preds, val_targets)

Se observa una mejora significativa en la estimación que arroja el modelo XGBoost, comparado con el promedio.

## Ajuste de hiperparámetros

In [None]:
def params_tunning(**params):
    """Train an XGBRegressor with the given hyperparameters and print its RMSE.

    The model is fitted on the notebook-level ``X_train`` / ``train_targets``
    and scored on both the training data and ``X_val`` / ``val_targets``.

    Parameters
    ----------
    **params
        Keyword arguments forwarded to ``XGBRegressor``. They take precedence
        over the defaults below, so any of them (e.g. ``learning_rate`` or
        ``max_depth``) can be overridden without a duplicate-keyword error.

    Returns
    -------
    None
        The train and validation RMSE are printed, not returned.
    """
    hyperparams = dict(n_jobs=-1, random_state=42,
                       learning_rate=0.2, max_depth=10, subsample=0.9,
                       colsample_bytree=0.7, tree_method='gpu_hist', gpu_id=0)
    # Caller-supplied values win over the defaults.
    hyperparams.update(params)
    model = XGBRegressor(**hyperparams)
    model.fit(X_train, train_targets)
    train_rmse = rmse(model.predict(X_train), train_targets)
    val_rmse = rmse(model.predict(X_val), val_targets)
    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))

In [None]:
params_tunning(n_estimators=100)

In [None]:
params_tunning(n_estimators=1000)

In [None]:
params_tunning(n_estimators=2000)

In [None]:
params_tunning(n_estimators=5000)

Se observa que a partir de **2000 estimadores** el modelo comienza a mostrar _overfitting_.