Hello all! Thanks to the big Kaggle community, I am learning a lot every day, so if you have any comments on my work, do not hesitate to share them!
Also, if you found this work interesting, please leave an upvote! :)

# 0 - Introduction

In [None]:
# import libraries

# basic libraries
import pandas as pd
import numpy as np
import tensorflow as tf

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing libraries
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, LabelEncoder, StandardScaler, RobustScaler

# model libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost.sklearn import XGBRegressor

from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

# evaluation libraries
from sklearn.metrics import accuracy_score

In [None]:
# constants
# NOTE(review): none of these are referenced in the visible part of this
# notebook (the model trained below is a RandomForestRegressor) — presumably
# leftovers from a Keras template; confirm before removing.
DROPOUT = 0.5
EPOCHS = 10
BATCH_SIZE = 50
FOLDS = 10

# Keras compile settings: optimizer instance, loss name and metric names.
OPTIMIZER = tf.keras.optimizers.Adam()
LOSS ='categorical_crossentropy'
METRICS = ["accuracy"]

In [None]:
# functions
def sales_duration(data):
    """Print the time span covered by ``data.date`` in days and in years.

    The ``date`` column is converted to datetime in place, so the caller's
    DataFrame is modified. Years are computed as days / 365. Returns None.
    """
    data.date = pd.to_datetime(data.date)
    span = data.date.max() - data.date.min()
    elapsed_days = span.days
    print(elapsed_days, 'days')
    print(elapsed_days / 365, 'years')
    
def sales_per_day():
    """Draw a histogram of ``train_df.num_sold`` on a new 7x4 figure.

    Reads the module-level ``train_df``; returns nothing.
    """
    _, axes = plt.subplots(figsize=(7, 4))
    plt.hist(train_df.num_sold, color='mediumblue')

    axes.set_xlabel("Sales Per day")
    axes.set_ylabel("Count")
    axes.set_title("Distribution of Sales Per Day")

def daily_sales(data):
    """Return the total ``num_sold`` per calendar day.

    Args:
        data: DataFrame with a ``date`` column (date strings or datetimes)
            and a numeric ``num_sold`` column. It is not modified.

    Returns:
        DataFrame with one row per distinct day, sorted by date, with the
        columns ``date`` (datetime, set to midnight) and ``num_sold`` (sum).
    """
    daily_data = data[['date', 'num_sold']].copy()
    # Bucket by day explicitly. Slicing the text form with ``str(x)[:-3]``
    # only yields daily buckets when ``date`` is already a Timestamp; on
    # 'YYYY-MM-DD' strings it cuts off the day and silently groups by month.
    daily_data['date'] = pd.to_datetime(daily_data['date']).dt.normalize()
    return daily_data.groupby('date')['num_sold'].sum().reset_index()

def time_plot(data, x_col, y_col, title):
    """Plot ``y_col`` against ``x_col`` as a line chart on a new 15x5 figure.

    Args:
        data: DataFrame holding both columns.
        x_col: name of the column drawn on the x axis.
        y_col: name of the column drawn on the y axis.
        title: title of the plot.
    """
    fig, ax = plt.subplots(figsize=(15, 5))
    # x and y are passed by keyword: seaborn >= 0.12 only accepts ``data``
    # positionally.
    sns.lineplot(x=x_col, y=y_col, data=data, ax=ax,
                 color='mediumblue', label='Total Sales')

    ax.set(xlabel="Date",
           ylabel="Sales",
           title=title)

    sns.despine()
    
def get_diff(data):
    """Return a copy of ``data`` with a ``sales_diff`` column added.

    ``sales_diff`` is each row's ``num_sold`` minus the previous row's
    ``num_sold``. Rows containing any missing value are then dropped, which
    always removes the first row because it has no predecessor.

    Args:
        data: DataFrame with a numeric ``num_sold`` column. It is left
            untouched; the new column is only added to the returned frame.

    Returns:
        A new DataFrame with the extra ``sales_diff`` column.
    """
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return data.assign(sales_diff=data.num_sold.diff()).dropna()

def plot_results(results, original_df):
    """Overlay predicted and original sales over time on a new 15x5 figure.

    Args:
        results: DataFrame with ``date`` and ``num_sold`` columns; drawn in
            red and labelled "Predicted".
        original_df: DataFrame with ``date`` and ``num_sold`` columns; drawn
            in blue and labelled "Original".
    """
    fig, ax = plt.subplots(figsize=(15, 5))
    # x and y are passed by keyword: seaborn >= 0.12 only accepts ``data``
    # positionally.
    sns.lineplot(x='date', y='num_sold', data=original_df, ax=ax,
                 label='Original', color='mediumblue')
    sns.lineplot(x='date', y='num_sold', data=results, ax=ax,
                 label='Predicted', color='Red')

    ax.set(xlabel="Date",
           ylabel="Sales")

# 1 - Data analysis

About the data:

The train set has data from 2015 to 2018. 
The test set has data only from 2019.

The main challenge for me was to find out how to deal with the 'date' feature. I found these two articles very useful for this:

- https://towardsdatascience.com/machine-learning-with-datetime-feature-engineering-predicting-healthcare-appointment-no-shows-5e4ca3a85f96
- https://towardsdatascience.com/5-machine-learning-techniques-for-sales-forecasting-598e4984b109

Do not hesitate to check them out !

In [None]:
# import dataset
# NOTE(review): paths look like the Kaggle notebook input layout — adjust when running elsewhere.
train_df = pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv")
test_df = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv")

# Both frames in one list so preprocessing steps can loop over them; the list
# holds references, so in-place changes also show up in train_df / test_df.
dataset_df = [train_df,test_df]

# Loaded for its "row_id" column, which is reused when building the submission file.
sample_sub_df = pd.read_csv("../input/tabular-playground-series-jan-2022/sample_submission.csv")

In [None]:
# check for missing values: number of nulls per column in the training set
train_df.isnull().sum()

In [None]:
# same per-column null count for the test set
test_df.isnull().sum()

In [None]:
# "row_id" is not used as a feature, so remove it in place from both frames
train_df.drop(columns="row_id", inplace=True)
test_df.drop(columns="row_id", inplace=True)

In [None]:
# do we have equal data for all stores ? for all countries ?
# Print the number of training rows per store, then draw the same counts as a bar chart.
print(train_df["store"].value_counts())
sns.catplot(x="store", kind="count", palette="ch:.25", data=train_df)

In [None]:
# number of training rows per country, then per product
for column in ("country", "product"):
    print(train_df[column].value_counts())

In [None]:
# mean sales per (country, store): it seems like KaggleRama is selling more in all countries
store_means = train_df.groupby(['country', 'store'], as_index=False)['num_sold'].mean()
print(store_means)

In [None]:
# mean sales per (country, product): the best seller is the Kaggle Hat
product_means = train_df.groupby(['country', 'product'], as_index=False)['num_sold'].mean()
print(product_means)

In [None]:
# mean sales per country: Norway has the best results
country_means = train_df.groupby(['country'], as_index=False)['num_sold'].mean()
print(country_means)

-------------------------------

In [None]:
# study of holidays -> https://www.kaggle.com/drcapa/holidays-finland-norway-sweden-20152019
# NOTE(review): holi_df is loaded and inspected, but it is not joined to the
# train/test features anywhere in the visible part of this notebook.
holi_df = pd.read_csv("../input/holidays-finland-norway-sweden-20152019/Holidays_Finland_Norway_Sweden_2015-2019.csv")

In [None]:
# preview the first rows of the holiday table
holi_df.head()

In [None]:
# Parse the holiday dates; values that cannot be parsed become NaT (errors='coerce').
# NOTE(review): the format ends with a literal 'T' — confirm the CSV's "Date"
# values really carry it, otherwise every row may be coerced to NaT silently.
holi_df["Date"] = pd.to_datetime(holi_df["Date"], format = '%Y-%m-%dT', errors = 'coerce')

---------------------------------------

# 2 - Data Observation

In [None]:
# Duration of train_df -> from 2015 to 2018
# Side effect: sales_duration also converts train_df's 'date' column to datetime in place.
sales_duration(train_df)

In [None]:
# Duration of test_df -> only 2019
# Side effect: sales_duration also converts test_df's 'date' column to datetime in place.
sales_duration(test_df)

In [None]:
# let's plot the sales for train_df (histogram of num_sold)
sales_per_day()

In [None]:
# sum num_sold per date bucket for the training set and preview the result
train_daily_df = daily_sales(train_df)
train_daily_df.head()

In [None]:
# line chart of the aggregated training sales over time
time_plot(train_daily_df, 'date', 'num_sold', 'Daily Sales')

# 3 - Data preprocessing

In [None]:
"""
The 'date' feature needs to be modified

My first idea was to erase the '-' in the 'date' feature and turn the object 'date' into an 'int' with the code below:

for dataset in dataset_df:
    dataset["date"] = dataset["date"].str.replace("-","").astype(str).astype(int)
    
Then, I decided to use the 'to_datetime' function from pandas to transform 'date' from object to datetime type and then
split it into several subfeatures to be able to choose what I want to put in my model (year, month, day ...)
"""

In [None]:
# Parse 'date' into datetime. The raw values are expected to be plain
# 'YYYY-MM-DD' strings, so the format must not end with a literal 'T': with
# errors='coerce' a non-matching format turns rows into NaT without raising.
for dataset in dataset_df:
    dataset["date"] = pd.to_datetime(dataset["date"], format='%Y-%m-%d', errors='coerce')

In [None]:
# Split 'date' into calendar features that can be selected for the model.
for dataset in dataset_df:
    dataset["date_year"] = dataset["date"].dt.year
    dataset["date_month"] = dataset["date"].dt.month
    # Series.dt.week is deprecated and removed in pandas 2.0;
    # isocalendar().week gives the same ISO week number. It is cast from
    # UInt32 to int64 to match the other integer features
    # (assumes 'date' holds no NaT — TODO confirm).
    dataset["date_week"] = dataset["date"].dt.isocalendar().week.astype("int64")
    dataset["date_day"] = dataset["date"].dt.day
    dataset["date_dayofweek"] = dataset["date"].dt.dayofweek
    # placeholder, filled in by the weekend step that follows
    dataset["weekend"] = 0

In [None]:
# fill the weekend column: 1 for Saturday (5) and Sunday (6), 0 otherwise.
# Done as one vectorised assignment per frame: writing row by row through
# dataset['weekend'][i] is chained assignment, which pandas warns about and
# which does not write through to the frame under copy-on-write.
for dataset in dataset_df:
    dataset["weekend"] = dataset["date_dayofweek"].isin([5, 6]).astype(int)

In [None]:
# preview the training set
train_df.head()

In [None]:
# preview the holiday table again
holi_df.head()

In [None]:
# column dtypes and non-null counts, followed by a preview of the first rows
train_df.info()
train_df.head()

In [None]:
# mapping the categorical features ('country', 'store' and 'product') to integer codes
# Each encoder is fitted on the training data only and then reused for the
# test data, so a given category always gets the same code in both frames.
# A category that only exists in the test data raises a ValueError instead of
# being silently mapped to an unrelated code.
for column in ("country", "store", "product"):
    encoder = LabelEncoder()
    encoder.fit(train_df[column])
    for dataset in dataset_df:
        dataset[column] = encoder.transform(dataset[column])

In [None]:
# we will not be using the 'date' feature in our model
# same for the 'date_year' feature because the model will train on values from 2015 to 2018 and it will only
# be tested on values from 2019
# (test_df keeps both columns: its 'date' is still needed for the result plot,
# and the test features are selected through `cols` anyway)
del train_df['date']
del train_df['date_year']

cols = ["country", "store", "product", "date_month", "date_week", "date_day", "date_dayofweek", "weekend"]

# Scaling
# Fit the scaler on the training features only and apply that same
# transformation to the test features; fitting it a second time on the test
# set would put the two frames on different scales.
scaler = StandardScaler()
train_df[cols] = scaler.fit_transform(train_df[cols])
test_df[cols] = scaler.transform(test_df[cols])

In [None]:
# preview the training set after encoding and scaling
train_df.head()

# 4 - Model and training

In [None]:
# seed NumPy's global random generator (presumably to make the model fit below repeatable — confirm)
np.random.seed(713)

In [None]:
# Select the features through `cols` so X_train has exactly the same columns,
# in the same order, as X_test, instead of relying on whatever columns happen
# to be left in train_df.
X_train = train_df[cols].values
y_train = train_df["num_sold"].values

In [None]:
# test features, restricted to the feature columns listed in `cols`
X_test = test_df[cols].values

In [None]:
# Random forest with 100 trees, each limited to depth 20, fitted on the
# training features; the XGBoost alternative is kept commented out.
model = RandomForestRegressor(n_estimators=100, max_depth=20)
#model = XGBRegressor(n_estimators=100, learning_rate=0.2, objective='reg:squarederror')
model.fit(X_train, y_train)

# predicted num_sold for every test row
y_pred = model.predict(X_test)

# 5 - Submission and result observation

In [None]:
# first ten predictions
y_pred[:10]

In [None]:
# to submit: put the row_id column of the sample submission next to the predictions
results = pd.Series(y_pred[:], name="num_sold")
submission_columns = [sample_sub_df["row_id"], results]
submission = pd.concat(submission_columns, axis=1)

# write the file without the DataFrame index
submission.to_csv("to_submit.csv", index=False)

In [None]:
# results observation
# Attach the predictions to the test rows, aggregate them by date, and plot
# them on the same axes as the aggregated training sales.
test_preds = pd.concat([test_df, results],axis = 1)
submission_daily_df = daily_sales(test_preds)

plot_results(submission_daily_df, train_daily_df)

# 6 - Conclusion

It seems like our model is not that bad. Of course, there are still a lot of things to do, but it is able to capture the peaks and the ups and downs.

Areas of improvement:
- train more models
- more feature engineering on 'date' (e.g. holidays)
- ...