In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# CONSTANTS

In [None]:
# Paths of the competition CSV files (relative to the notebook's working directory).
# INPUT_TRAIN / INPUT_TEST are read in the data-loading section; SUBMISSION is the
# sample submission file read back at the end and filled with the predictions.
INPUT_TRAIN = "../input/tabular-playground-series-jan-2022/train.csv"
INPUT_TEST = "../input/tabular-playground-series-jan-2022/test.csv"
SUBMISSION = "../input/tabular-playground-series-jan-2022/sample_submission.csv"

# FUNCTIONS

In [None]:
def check_and_plot_nan_percentage(df=None, x_offset=0, y_offset=0, print_values=True):
    """
    Plots, as a bar chart, the percentage of missing values in each column of an input dataframe

            Parameters:
                    df (DataFrame): A pandas Dataframe. If None, a message is printed
                                    and nothing is plotted
                    x_offset (float): Horizontal offset applied to the label of each bar
                    y_offset (float): Vertical offset applied to the label of each bar
                    print_values (boolean): Set it to True to display the percentage
                                            on every bar whose value is greater than 0

            Return:
                    None

    """
    if df is None:
        print("Input dataframe is None : exit")
        return

    # Percentage of NaN per column, in column order.
    n_rows = df.shape[0]
    values = [100 * df[c].isna().sum() / n_rows for c in df.columns]

    # One bar per column, drawn at x = 0, 1, 2, ...
    positions = range(len(df.columns))

    plt.figure(figsize=(9, 6))
    plt.title("NaN percentage per column",
              fontsize=16,
              fontweight='bold',
              pad=20
              )
    plt.bar(positions, values, edgecolor='black')
    plt.xticks(positions, df.columns, rotation=90)

    if print_values:
        for position, value in zip(positions, values):
            # Columns without any missing value get no label.
            if value > 0:
                plt.text(position + x_offset, value + y_offset,
                         str(round(value, 1)))
    plt.show()

In [None]:
def evaluate_smape_model(model=None, X_test=None, y_test=None, biais_values=None):
    """
    Makes predictions on X_test, prints the SMAPE between the rounded predictions and y_test,
    then optionally evaluates the SMAPE obtained when a constant bias is added to the predictions.

            Parameters:
                    model : A fitted model exposing a predict(X) method
                    X_test (array): Features to predict on
                    y_test (array): True target values compared with the predictions
                    biais_values (List): Biases added in turn to the predictions; the SMAPE
                                         of each one is computed, the best one is printed
                                         and the whole curve is plotted

            Return:
                    None (results are printed and plotted)

    """
    if model is None:
        print("Input model is None")
        return
    if X_test is None:
        print("X_test is None")
        return
    if y_test is None:
        print("y_test is None")
        return

    # Targets and predictions are both shaped as column vectors: a 1-D y_test would
    # otherwise broadcast against the (n, 1) predictions into an (n, n) matrix.
    y_true = np.asarray(y_test).reshape(-1, 1)
    y_pred = np.round(model.predict(X_test)).reshape(-1, 1).astype(int)

    smp = smape(y_true=y_true, y_pred=y_pred)
    print("SMAPE on test set =", smp)
    print("#" * 10)

    if biais_values is not None and len(biais_values) > 0:
        smps = [smape(y_true=y_true, y_pred=y_pred + b) for b in biais_values]

        # Look the best bias up in the argument itself, not in a notebook-level global.
        best_smp = min(smps)
        print("Best SMAPE on test =", best_smp,
              " biais =", biais_values[smps.index(best_smp)])
        print("#" * 10)

        plt.plot(biais_values, smps)
        plt.title("Impact of biais on predictions")
        plt.ylabel("SMAPE Score")
        plt.xlabel("Biais value")
        plt.show()

In [None]:
def load_train_test_set():
    """
    Loads the train and test arrays saved as .npy files in the data folder.

    If one of the four files is missing, a message naming the first missing set
    is printed and a list of four None is returned instead.

            Return:
                    X_train (Array): Set use for training in fit method
                    X_test (Array): Set use to predict
                    y_train (Array): Set use for training in fit method
                    y_test (Array): Set use to compare predictions and true values

    """
    # (name shown in the message, file path), listed in the order they are returned.
    expected_files = (
        ("X_train", "data/x_train.npy"),
        ("X_test", "data/x_test.npy"),
        ("y_train", "data/y_train.npy"),
        ("y_test", "data/y_test.npy"),
    )

    # Nothing is loaded unless every file is present.
    for set_name, path in expected_files:
        if not os.path.exists(path):
            print(set_name + " has not been created or has been deleted : cannot continue")
            return [None, None, None, None]

    return [np.load(path) for _, path in expected_files]

# 1. DATA LOADING

In [None]:
df_train = pd.read_csv(INPUT_TRAIN)
df_test = pd.read_csv(INPUT_TEST)

# 2. FIRST DATA LOOK AROUND

In [None]:
df_train.info()

In [None]:
df_test.info()

In [None]:
print("Are test columns in train columns ?",
      df_test.columns.isin(df_train.columns).all())

### 2.2 Check missing values on each column

In [None]:
check_and_plot_nan_percentage(
    df=df_train, x_offset=0, y_offset=0, print_values=True
)

In [None]:
check_and_plot_nan_percentage(
    df=df_test, x_offset=0, y_offset=0, print_values=True
)

In [None]:
df_train.head()

In [None]:
df_test.head()

### 2.3 Store categorical columns and check unique values between train and test set

In [None]:
CATEGORICAL_COLUMNS = ["country", "store", "product"]

In [None]:
# The categories are compared as sets: an element-wise `==` between the two arrays
# returned by unique() depends on the order of first appearance and is not
# meaningful when the arrays have different lengths.
for c in CATEGORICAL_COLUMNS:
    train_values = df_train[c].unique()
    test_values = df_test[c].unique()
    print("Train:", train_values)
    print("Test :", test_values)
    print("Are train and test values the same ?",
          set(train_values) == set(test_values))

### 2.4 Convert date to datetime object with the right format

In [None]:
# Named DATE_FORMAT so that the built-in format() is not shadowed for the rest of the notebook.
# NOTE(review): this pattern assumes dates written as YYYY/MM/DD — confirm against the CSV files.
DATE_FORMAT = '%Y/%m/%d'
df_train['date'] = pd.to_datetime(df_train['date'], format=DATE_FORMAT)
df_test['date'] = pd.to_datetime(df_test['date'], format=DATE_FORMAT)

### 2.5 Describe numerical columns

In [None]:
# Summary statistics of every column of the train set that is not categorical.
non_categorical_columns = [
    name for name in df_train.columns if name not in CATEGORICAL_COLUMNS
]
for name in non_categorical_columns:
    print(df_train[name].describe())
    print("\n")

In [None]:
df_train['num_sold'].hist()
plt.show()

# 3. CLEANING

In [None]:
df_train = df_train.drop(columns=["row_id"])

# 4. FEATURE ENGINEERING

In [None]:
# The same calendar features are derived from the date on both frames (modified in place).
for frame in (df_train, df_test):
    dates = frame["date"].dt
    frame["weekday"] = dates.dayofweek
    frame["month"] = dates.month
    frame["year"] = dates.year
    # Flag set to 1 when the weekday number is 5 or 6, 0 otherwise.
    frame['is_weekend'] = (dates.weekday >= 5).astype(int)

In [None]:
df_train = df_train.drop(columns=["date"])
df_test = df_test.drop(columns=["date"])

I drop the `date` column from each dataset after the feature engineering: it is now redundant with the features derived from it (weekday, month, year, is_weekend), and keeping both would couple their inertia.

# 5. DATA EXPLORATION

In [None]:
years = list(df_train["year"].unique())
months = list(df_train["month"].unique())
days = list(df_train["weekday"].unique())
countries = list(df_train["country"].unique())
stores = list(df_train["store"].unique())
products = list(df_train["product"].unique())

In [None]:
# sort_index() orders the bars as 0 then 1, so the hard-coded tick labels below always
# match their bar; value_counts() alone orders the bars by decreasing frequency.
# NOTE(review): value_counts() measures the share of rows, not of num_sold — confirm
# that the title and y label describe what is intended.
weekend_share = df_train["is_weekend"].value_counts(normalize=True).sort_index()
ax = weekend_share.plot(kind="bar",
                        title="Sales distribution over weekends",
                        ylabel="Sales percentage"
                        )
ax.set_xticklabels(["During week", "During weekend"])
plt.show()

In [None]:
# Sales totals per year, and per year broken down by month and by weekday.
total_sales_per_year = []
total_sales_per_month_per_year = []
total_sales_per_day_over_year = []

for current_year in years:
    sales_of_year = df_train[df_train["year"] == current_year]
    total_sales_per_year.append(sales_of_year["num_sold"].sum())
    total_sales_per_month_per_year.append(
        [sales_of_year[sales_of_year["month"] == m]["num_sold"].sum() for m in months])
    total_sales_per_day_over_year.append(
        [sales_of_year[sales_of_year["weekday"] == d]["num_sold"].sum() for d in days])

# Yearly totals become shares of the overall total (the result is a NumPy array).
total_sales_per_year = np.true_divide(total_sales_per_year, sum(total_sales_per_year))

plt.bar(years, total_sales_per_year)
plt.xticks(years)
plt.title("Total sales over years")
plt.ylabel("Percentage")
plt.show()

# One panel per year: sales per month.
fig, axs = plt.subplots(1, len(years), figsize=[16, 6], sharey=True)
for panel, current_year, monthly_sales in zip(axs, years, total_sales_per_month_per_year):
    panel.bar(months, monthly_sales)
    panel.set_title("Year: " + str(current_year))
    panel.set_xticks(months)
    panel.set_xlabel("Month")
plt.show()

# One panel per year: sales per weekday.
fig, axs = plt.subplots(1, len(years), figsize=[16, 6], sharey=True)
for panel, current_year, daily_sales in zip(axs, years, total_sales_per_day_over_year):
    panel.bar(days, daily_sales)
    panel.set_title("Year: " + str(current_year))
    panel.set_xticks(days)
    panel.set_xlabel("Weekday")
plt.show()

* Sales grow from year to year.
* A seasonality is observable over the years: sales increase in December and January, decrease in February, increase again in March, April and May, then decrease until August and increase again from September.
* Sales are concentrated on weekends.

In [None]:
# Sales totals per country, and per country broken down by year.
total_sales_per_country = []
total_sales_per_country_over_years = []

for current_country in countries:
    sales_of_country = df_train[df_train["country"] == current_country]
    total_sales_per_country.append(sales_of_country["num_sold"].sum())
    total_sales_per_country_over_years.append(
        [sales_of_country[sales_of_country["year"] == y]["num_sold"].sum() for y in years])

# Country totals become shares of the overall total (the result is a NumPy array).
total_sales_per_country = np.true_divide(
    total_sales_per_country, sum(total_sales_per_country))

plt.bar(countries, total_sales_per_country)
plt.xticks(countries)
plt.title("Total sales over countries")
plt.ylabel("Sales percentage")
plt.show()

# One panel per country: sales per year.
fig, axs = plt.subplots(1, len(countries), figsize=[16, 6], sharey=True)
for panel, current_country, yearly_sales in zip(
        axs, countries, total_sales_per_country_over_years):
    panel.bar(years, yearly_sales)
    panel.set_xticks(years)
    panel.set_title("Country: " + str(current_country))
    panel.set_xlabel("Year")
plt.show()

In [None]:
# Sales totals per store, and per store broken down by year, country and product.
total_sales_per_store = []
total_sales_per_store_over_years = []
total_sales_per_store_per_country = []
total_sales_per_store_per_product = []

for current_store in stores:
    sales_of_store = df_train[df_train["store"] == current_store]
    total_sales_per_store.append(sales_of_store["num_sold"].sum())
    total_sales_per_store_over_years.append(
        [sales_of_store[sales_of_store["year"] == y]["num_sold"].sum() for y in years])
    total_sales_per_store_per_country.append(
        [sales_of_store[sales_of_store["country"] == c]["num_sold"].sum() for c in countries])
    total_sales_per_store_per_product.append(
        [sales_of_store[sales_of_store["product"] == p]["num_sold"].sum() for p in products])

# Store totals become shares of the overall total (the result is a NumPy array).
total_sales_per_store = np.true_divide(total_sales_per_store, sum(total_sales_per_store))

plt.bar(stores, total_sales_per_store)
plt.xticks(stores)
plt.title("Total sales per store over the years")
plt.ylabel("Sales percentage")
plt.show()

# One panel per store: sales per year.
fig, axs = plt.subplots(1, len(stores), figsize=[16, 6], sharey=True)
for panel, current_store, yearly_sales in zip(axs, stores, total_sales_per_store_over_years):
    panel.bar(years, yearly_sales)
    panel.set_xticks(years)
    panel.set_title("Store: " + str(current_store))
    panel.set_xlabel("Year")
plt.show()

# One panel per store: sales per country.
fig, axs = plt.subplots(1, len(stores), figsize=[16, 6], sharey=True)
for panel, current_store, country_sales in zip(axs, stores, total_sales_per_store_per_country):
    panel.bar(countries, country_sales)
    panel.set_xticks(countries)
    panel.set_title("Store: " + str(current_store))
    panel.set_xlabel("Country")
plt.show()

# One panel per store: sales per product.
fig, axs = plt.subplots(1, len(stores), figsize=[16, 6], sharey=True)
for panel, current_store, product_sales in zip(axs, stores, total_sales_per_store_per_product):
    panel.bar(products, product_sales)
    panel.set_xticks(products)
    panel.set_title("Store: " + str(current_store))
    panel.set_xlabel("Product")
plt.show()

In [None]:
# Sales totals per product, and per product broken down by year and by country.
total_sales_per_product = []
total_sales_per_product_over_years = []
total_sales_per_product_per_country = []

for current_product in products:
    sales_of_product = df_train[df_train["product"] == current_product]
    total_sales_per_product.append(sales_of_product["num_sold"].sum())
    total_sales_per_product_over_years.append(
        [sales_of_product[sales_of_product["year"] == y]["num_sold"].sum() for y in years])
    total_sales_per_product_per_country.append(
        [sales_of_product[sales_of_product["country"] == c]["num_sold"].sum()
         for c in countries])

# Product totals become shares of the overall total (the result is a NumPy array).
total_sales_per_product = np.true_divide(
    total_sales_per_product, sum(total_sales_per_product))

plt.bar(products, total_sales_per_product)
plt.xticks(products)
plt.title("Total sales per product over the years")
plt.ylabel("Sales percentage")
plt.show()

# One panel per product: sales per year.
fig, axs = plt.subplots(1, len(products), figsize=[16, 6], sharey=True)
for panel, current_product, yearly_sales in zip(
        axs, products, total_sales_per_product_over_years):
    panel.bar(years, yearly_sales)
    panel.set_xticks(years)
    panel.set_title("Products: " + str(current_product))
    panel.set_xlabel("Year")
plt.show()

# One panel per product: sales per country.
fig, axs = plt.subplots(1, len(products), figsize=[16, 6], sharey=True)
for panel, current_product, country_sales in zip(
        axs, products, total_sales_per_product_per_country):
    panel.bar(countries, country_sales)
    panel.set_xticks(countries)
    panel.set_title("Product: " + str(current_product))
    panel.set_xlabel("Country")
plt.show()

# 6.PREPROCESSING

### 6.1 Encode categorical columns

In [None]:
# One encoder per categorical column, kept in `label_encoders` so that each fitted
# mapping stays available (a single reused encoder only remembers the last column
# it was fitted on). Each encoder is fitted on the train values and then applied to
# the test values of the same column.
label_encoders = {}

for c in CATEGORICAL_COLUMNS:
    # `le` stays bound to the most recent encoder, as later cells reuse that name.
    le = preprocessing.LabelEncoder()
    df_train[c] = le.fit_transform(df_train[c])
    df_test[c] = le.transform(df_test[c])
    label_encoders[c] = le

### 6.2 Construct X and y arrays

In [None]:
# Target values as a column vector of shape (n_rows, 1).
y = np.array(df_train["num_sold"]).reshape(-1, 1)

In [None]:
# The feature columns are taken from the train frame and reused to select the test
# columns, so both matrices hold the same features in the same order whatever other
# columns the test frame carries (e.g. row_id). A feature missing from the test
# frame raises a KeyError instead of silently misaligning the matrices.
feature_columns = df_train.columns.drop("num_sold")
X = np.array(df_train[feature_columns])
X_to_pred = np.array(df_test[feature_columns])

### 6.3 Split train dataframe into train and test set

In [None]:
# 80 % / 20 % split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save set to use it in different branches
# makedirs(exist_ok=True) creates the folder only when it is missing, without a
# separate existence check.
os.makedirs("data/", exist_ok=True)
np.save("data/x_train.npy", X_train)
np.save("data/x_test.npy", X_test)
np.save("data/y_train.npy", y_train)
np.save("data/y_test.npy", y_test)

### 6.4 Create the appropriate score

In [None]:
def smape(y_true, y_pred):
    """
    Symmetric mean absolute percentage error (SMAPE), expressed in percent.

        SMAPE = 100 / n * sum(2 * |y_pred - y_true| / (|y_true| + |y_pred|))

            Parameters:
                    y_true (array): True values
                    y_pred (array): Predicted values, with as many elements as y_true

            Return:
                    smape (float): 0 for a perfect prediction, 200 at most

    """
    # Both inputs are flattened so that a column vector compared with a 1-D array
    # is scored element by element instead of being broadcast to an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # A term whose true and predicted values are both 0 is a perfect prediction:
    # it contributes 0 instead of the NaN produced by 0 / 0.
    terms = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator), where=denominator != 0)
    return (100 / y_true.shape[0]) * np.sum(terms)

# SMAPE is an error measure: the lower it is, the better the model performs.
# greater_is_better=False declares it as such to scikit-learn when the scorer is built.
smape_score = make_scorer(score_func=smape, greater_is_better=False)

# 7. Training

I will train Gradient boosting + random forest regressor on the train set. The training will be performed on a grid to optimize hyperparameters and validate them through a cross validation on 5-folds. 

GridSearchCV will be used to achieve it.

First, load the train and test sets from the data folder so that every branch / trial / model uses exactly the same split.

In [None]:
X_train, X_test, y_train, y_test = load_train_test_set()

In [None]:
gboost = GradientBoostingRegressor(random_state=0)
# max_features=None makes every feature a candidate at each split. It replaces the
# "auto" alias, which had the same meaning for this estimator but is rejected by
# recent scikit-learn releases.
parameters = {'n_estimators': (50, 75, 100, 150, 200, 250, 300, 500),
              'learning_rate': (0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.1, 0.15, 0.2),
              'max_depth': (3, 5, 8), "max_features": [None, "log2"]
              }
# Exhaustive search over the grid, each candidate being scored with SMAPE on 5 folds.
res = GridSearchCV(estimator=gboost, param_grid=parameters,
                   scoring=smape_score, cv=5).fit(X_train, y_train.ravel())

print("Best params found through the gridsearch= ", res.best_params_)

opti_gboost = res.best_estimator_

Below I define a list of values that will be added to the output predictions in order to check if the global minimum has been reached.

In [None]:
# Candidate biases: every integer from -30 to 29.
biais = list(range(-30, 30))

In [None]:
evaluate_smape_model(model=opti_gboost, X_test=X_test, y_test=y_test, biais_values=biais)

# 8. Impact of replacing outliers by mean

In [None]:
X_train, X_test, y_train, y_test = load_train_test_set()

In [None]:
# Targets above the 96th percentile of num_sold are replaced by the mean of num_sold.
# NOTE(review): y_test is altered as well, so the SMAPE of this experiment is measured
# against modified targets — confirm this is intended.
# NOTE(review): the replacement is written into the loaded arrays in place; if they
# hold integers the value is cast to their dtype — TODO confirm.
y = np.array(df_train["num_sold"])
limit = round(np.percentile(y, 96), 2)
replacement = np.mean(y)
for targets in (y_train, y_test):
    outliers = targets > limit
    targets[outliers] = replacement

In [None]:
# Grid search on `gboost` with the current `parameters` grid and training targets.
grid_search = GridSearchCV(estimator=gboost, param_grid=parameters,
                           scoring=smape_score, cv=5)
res = grid_search.fit(X_train, y_train.ravel())

print("Best params found through the gridsearch= ", res.best_params_)

opti_gboost2 = res.best_estimator_

In [None]:
evaluate_smape_model(model=opti_gboost2, X_test=X_test, y_test=y_test, biais_values=biais)

# 9. Impact of replacing outliers by limit (96th percentile)

In [None]:
X_train, X_test, y_train, y_test = load_train_test_set()

In [None]:
# Targets above the 96th percentile of num_sold are clamped to that percentile.
# NOTE(review): y_test is altered as well, so the SMAPE of this experiment is measured
# against modified targets — confirm this is intended.
# NOTE(review): the limit is written into the loaded arrays in place; if they hold
# integers the value is cast to their dtype — TODO confirm.
y = np.array(df_train["num_sold"])
limit = round(np.percentile(y, 96), 2)
for targets in (y_train, y_test):
    outliers = targets > limit
    targets[outliers] = limit

In [None]:
# Grid search on `gboost` with the current `parameters` grid and training targets.
grid_search = GridSearchCV(estimator=gboost, param_grid=parameters,
                           scoring=smape_score, cv=5)
res = grid_search.fit(X_train, y_train.ravel())

print("Best params found through the gridsearch= ", res.best_params_)

opti_gboost3 = res.best_estimator_

In [None]:
evaluate_smape_model(model=opti_gboost3, X_test=X_test, y_test=y_test, biais_values=biais)

# 10. Train the random forest regressor as the gradient boosting

In [None]:
X_train, X_test, y_train, y_test = load_train_test_set()

In [None]:
# A fixed random_state makes the forests, and therefore the selected hyperparameters
# and the reported SMAPE, reproducible from one run of the notebook to the next.
rfr = RandomForestRegressor(random_state=0)
parameters = {'n_estimators': (100, 200, 300, 500, 1000, 1500, 2000),
              'max_depth': [None, 3, 5],
              'bootstrap': [True, False]
              }
# Exhaustive search over the grid, each candidate being scored with SMAPE on 5 folds.
res = GridSearchCV(estimator=rfr, param_grid=parameters,
                   scoring=smape_score, cv=5).fit(X_train, y_train.ravel())

print("Best params found through the gridsearch: ", res.best_params_)

opti_rfr = res.best_estimator_

A high number of estimators in a random forest regressor helps to avoid overfitting but consumes more computation time.

In [None]:
evaluate_smape_model(model=opti_rfr, X_test=X_test, y_test=y_test, biais_values=biais)

# 11. Impact of replacing outliers by mean

In [None]:
X_train, X_test, y_train, y_test = load_train_test_set()

In [None]:
# Targets above the 96th percentile of num_sold are replaced by the mean of num_sold.
# NOTE(review): y_test is altered as well, so the SMAPE of this experiment is measured
# against modified targets — confirm this is intended.
# NOTE(review): the replacement is written into the loaded arrays in place; if they
# hold integers the value is cast to their dtype — TODO confirm.
y = np.array(df_train["num_sold"])
limit = round(np.percentile(y, 96), 2)
replacement = np.mean(y)
for targets in (y_train, y_test):
    outliers = targets > limit
    targets[outliers] = replacement

In [None]:
# Grid search on `rfr` with the current `parameters` grid and training targets.
grid_search = GridSearchCV(estimator=rfr, param_grid=parameters,
                           scoring=smape_score, cv=5)
res = grid_search.fit(X_train, y_train.ravel())

print("Best params found through the gridsearch: ", res.best_params_)

opti_rfr2 = res.best_estimator_

In [None]:
evaluate_smape_model(model=opti_rfr2, X_test=X_test, y_test=y_test, biais_values=biais)

# 12. Impact of replacing outliers by limit

In [None]:
X_train, X_test, y_train, y_test = load_train_test_set()

In [None]:
# Targets above the 96th percentile of num_sold are clamped to that percentile.
# NOTE(review): y_test is altered as well, so the SMAPE of this experiment is measured
# against modified targets — confirm this is intended.
# NOTE(review): the limit is written into the loaded arrays in place; if they hold
# integers the value is cast to their dtype — TODO confirm.
y = np.array(df_train["num_sold"])
limit = round(np.percentile(y, 96), 2)
for targets in (y_train, y_test):
    outliers = targets > limit
    targets[outliers] = limit

In [None]:
# Grid search on `rfr` with the current `parameters` grid and training targets.
grid_search = GridSearchCV(estimator=rfr, param_grid=parameters,
                           scoring=smape_score, cv=5)
res = grid_search.fit(X_train, y_train.ravel())

print("Best params found through the gridsearch: ", res.best_params_)

opti_rfr3 = res.best_estimator_

In [None]:
evaluate_smape_model(model=opti_rfr3, X_test=X_test, y_test=y_test, biais_values=biais)

# 13. Forecast the number of sales for the coming years (Test dataframe)

Here the gradient boosting model with a bias of -2 and without preprocessing of num_sold is selected, even though its SMAPE on the test set is not the lowest. Indeed, clamping the values of num_sold to a limit derived from past data is only a good idea if future sales behave like past sales. But the data exploration shows steady market growth from year to year, so clamped values could not represent this growth.

In [None]:
X_train, X_test, y_train, y_test = load_train_test_set()

In [None]:
best_biais = -2

In [None]:
num_sold = np.round(opti_gboost.predict(X_to_pred)).reshape(-1, 1).astype(int) + best_biais

In [None]:
df = pd.read_csv(SUBMISSION)
df["num_sold"] = num_sold
df.to_csv("submission.csv", index=False)

# 14. Sales predictions per store

In [None]:
df_train_original = pd.read_csv(INPUT_TRAIN)

In [None]:
# Decode the categorical columns of df_test back to their original labels: the encoder
# is refitted on the raw train labels of each column, then used to invert the codes.
for c in CATEGORICAL_COLUMNS:
    # A column that no longer holds numeric codes has already been decoded (cell run
    # a second time): skip it instead of failing on inverse_transform.
    if not pd.api.types.is_numeric_dtype(df_test[c]):
        continue
    df_train_original[c] = le.fit_transform(df_train_original[c])
    df_test[c] = le.inverse_transform(df_test[c])

In [None]:
df_test["num_sold"] = num_sold

In [None]:
# Predicted sales summed per store; the two totals must add up to the sum of all predictions.
is_kagglemart = df_test["store"] == "KaggleMart"
is_kagglerama = df_test["store"] == "KaggleRama"
kagglemart_sales = df_test.loc[is_kagglemart, "num_sold"].sum()
kagglerama_sales = df_test.loc[is_kagglerama, "num_sold"].sum()

print("Sales predictions for KaggleMart", kagglemart_sales)
print("Sales predictions for KaggleRama", kagglerama_sales)
print("Checksum valid ?", num_sold.sum() == kagglemart_sales + kagglerama_sales)

<b> Conclusion:</b> KaggleRama will have the higher number of sales for 2019.

# REFERENCES

* Tensor girl: https://www.kaggle.com/usharengaraju/tensorflow-tf-data-keraspreprocessinglayers-w-b for the cheat code to compute the feature is_weekend in one line of code

* SMAPE formula : https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error