In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# importing useful packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide display defaults: seaborn's default theme for figures and a
# thousands separator whenever pandas renders a float.
sns.set_theme()
pd.set_option('display.float_format', '{:,}'.format)

**Importing Datasets:**

In [None]:
# Read the competition's train and test splits from the Kaggle input folder.
train_path = r'../input/tabular-playground-series-jan-2022/train.csv'
test_path = r'../input/tabular-playground-series-jan-2022/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# NOTE(review): disabled cell. The variable is called `holidays`, yet the path
# points at a GDP dataset — confirm which dataset was intended before re-enabling.
#holidays = pd.read_csv(r'../input/gdp-of-finland-norway-and-sweden-2015-2019/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv', parse_dates=['date'])
#holidays.head()

## EDA

**First overview of the datasets:**

In [None]:
# Report both shapes and the fraction of all rows that belong to the train set.
n_train_rows, n_test_rows = train.shape[0], test.shape[0]
print(
    f'train shape: {train.shape}',
    f'test shape: {test.shape}',
    f'train rows / (train + test rows): {n_train_rows/(n_train_rows+n_test_rows)}',
    sep='\n',
)

Which columns compose the datasets?

In [None]:
# Show the column index of each split, one line per split.
for split_name, frame in (('train', train), ('test', test)):
    print(f'{split_name} columns: {frame.columns}')

As we can see, we have to predict the sales that occurred in each store, in order to identify which stores will be the best ones going forward.

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Convert the `date` column of both splits to pandas datetimes.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

How many missing values are there in training and test sets?

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Number of missing values in each column of the test frame.
test.isna().sum()

There are no NA's, great news.

- How many / Which countries contain a Kaggle store? 
- How many Kaggle stores are there in the dataset?
- How many / Which products are there? 
- Which countries/stores sell more?

In [None]:
# Distinct values of the `country` column.
train['country'].unique()

Kaggle stores are situated only in northern Europe.

In [None]:
# Distinct values of the `store` column.
train['store'].unique()

There are two different stores.

In [None]:
# Distinct values of the `product` column.
train['product'].unique()

As we could expect, it seems that the stores sell only nerd products :)

In [None]:
# Total units sold per country, largest first.
country_sold = (
    train.groupby('country')['num_sold']
    .sum()
    .sort_values(ascending=False)
)
# Display copy with thousands separators; `country_sold` itself stays numeric.
country_sold.map('{:,}'.format)

In [None]:
# Pie chart of each country's share of the total units sold.
fig, ax = plt.subplots()
ax.pie(country_sold.values, labels=country_sold.index, autopct='%0.1f%%')
ax.set_title('Sales share per country')
plt.show()

In [None]:
# Row count and mean units sold for every (country, store, product) combination.
products_mean = train.groupby(['country', 'store', 'product']).agg(
    {'product': 'count', 'num_sold': 'mean'})
# The original cell called `.map(...)` on both columns but discarded the results,
# so the formatting never reached the displayed table. Build a formatted copy
# for display instead and keep `products_mean` numeric.
products_mean.assign(
    product=products_mean['product'].map('{:,}'.format),
    num_sold=products_mean['num_sold'].map('{:,}'.format),
)

- Countries, stores and products sold are equally distributed in train set, but some places sell more in quantity. I would say that every product and every store are registered for every day between 2015 and 2018
- Over the years Norway sold more than 4 million pieces, about 43% of all sales
- Finland is the country with the worst results
- It seems that the Kaggle Hat is the best product everywhere, followed by the Kaggle Mug.

Let's see which store sells more between KaggleRama and KaggleMart in each country: 


In [None]:
# Mean units sold per row for each (country, store) pair.
store_sold_average = train.groupby(['country', 'store'])[['num_sold']].mean()
store_sold_average

In [None]:
# Total units sold per store, drawn as shares of the overall total.
store_sold_total = train.groupby(['store'])['num_sold'].sum()
fig, ax = plt.subplots()
ax.pie(store_sold_total.values, labels=store_sold_total.index, autopct='%0.1f%%')
plt.show()

- KaggleRama seems to be the best store, both on average and on total sales, accounting for 63.5% of the total amount

Let's see the sales distribution for each country and store.

In [None]:
# Density of `num_sold`, one curve per country.
ax = sns.kdeplot(data=train, x='num_sold', hue='country')
ax.set_title('Sales distribution')
plt.show()

In [None]:
# Density of `num_sold`, one curve per store.
ax = sns.kdeplot(data=train, x='num_sold', hue='store')
ax.set_title('Sales distribution')
plt.show()

- Sales distribution is right skewed in all countries and stores

The df is a timeseries, so I'll transform date column into the index and we can then add year, month and weekday name columns:

In [None]:
# Use the dates as the row index of both splits. Passing the Series (rather
# than the column name) keeps `date` available as a regular column as well.
for frame in (train, test):
    frame.set_index(frame['date'], inplace=True)
train.head()

In [None]:
# Derive calendar features from the datetime index.
train_dates = train.index
train['Year'] = train_dates.year
train['Month'] = train_dates.month
train['Weekday'] = train_dates.day_name()
train.head()

Let's visualize some possible time patterns:

In [None]:
# Line plot of every `num_sold` value against the date index.
fig, ax = plt.subplots(figsize=(12, 6))
train['num_sold'].plot(ax=ax, linewidth=0.5)

Sales are highly variable, but we can instantly see some patterns:
- there is always a peak in sales around December/January, then they immediately go down
- afterwards sales increase until May/June/July and then go down until October/November
- the cycle is repeated

We can say that there are regular seasonality and cyclical trends.

Let's see if the trend is the same for each country and store:

In [None]:
# One line chart of `num_sold` over time per country.
# `train` is filtered directly: the previous version deep-copied the whole
# frame on every iteration even though nothing in the loop mutates it.
for country in train['country'].unique():
    country_sales = train.loc[train['country'] == country, 'num_sold']
    country_sales.plot(linewidth=0.5)
    plt.title(country)
    plt.show()

- The trend is exactly the same for each country, and what about the stores?


In [None]:
# One line chart of `num_sold` over time per (country, store) pair.
# The rows must be restricted to BOTH the current country and the current
# store; filtering on the store alone drew the same two all-country plots
# under every country title.
for country in train['country'].unique():
    in_country = train['country'] == country
    for store in train['store'].unique():
        in_store = train['store'] == store
        train.loc[in_country & in_store, 'num_sold'].plot(linewidth=0.5)
        plt.title([country, store])
        plt.show()

The same for the stores.

Let's see if kaggle sales are going up through the years as it seems and which are the best months and days of the week to sell:

In [None]:
# Total units sold per calendar year, rendered with thousands separators.
train['num_sold'].resample('Y').sum().map('{:,}'.format)

Kaggle sales have been going up over the last years.

In [None]:
# Total units sold per calendar year ...
year_sales = train['num_sold'].resample('Y').sum()
# ... and the relative change of each year versus the previous one.
year_sales.pct_change()

Sales increased by about 7% from 2017 to 2018!

In [None]:
# Total units sold per (year, month), ordered by year and then by volume,
# both descending.
year_month_group = (
    train.groupby(['Year', 'Month'])[['num_sold']]
    .sum()
    .sort_values(by=['Year', 'num_sold'], ascending=False)
)
# Replace the totals with thousands-separated strings for display.
year_month_group['num_sold'] = year_month_group['num_sold'].map('{:,}'.format)
year_month_group

In [None]:
# Total units sold per calendar month across all years, best month first.
month_group = (
    train.groupby(['Month'])[['num_sold']]
    .sum()
    .sort_values(by=['num_sold'], ascending=False)
)
# Replace the totals with thousands-separated strings for display.
month_group['num_sold'] = month_group['num_sold'].map('{:,}'.format)
month_group

- December is the month with most sales, probably for Christmas
- The first part of the year, from January to May, is a good time for kaggle pockets

In [None]:
# Total units sold per (year, weekday), ordered by year and then by volume,
# both descending.
weekday_sales = (
    train.groupby(['Year', 'Weekday'])[['num_sold']]
    .sum()
    .sort_values(by=['Year', 'num_sold'], ascending=False)
)
# Replace the totals with thousands-separated strings for display.
weekday_sales['num_sold'] = weekday_sales['num_sold'].map('{:,}'.format)
weekday_sales

- Most sales are concentrated in the weekend.


Let's see the variability in each year and month.

In [None]:
# Spread of `num_sold` within each year.
ax = sns.boxplot(x='Year', y='num_sold', data=train)
ax.set_ylabel('num_sold')
ax.set_title('Sales by Year')

- Variability through each year is pretty much the same.

- There are many outliers. We'll have to take care of that in the model training section

In [None]:
# Spread of `num_sold` within each calendar month.
sns.boxplot(data=train, x='Month', y='num_sold')
plt.ylabel('num_sold')
# The x-axis is the month, so the title must say so (it previously repeated
# the "Sales by Year" title of the preceding plot).
plt.title('Sales by Month')

The last plots didn't tell us much more than what we saw earlier.

Let's explore Sales average per month trend

In [None]:
# Mean units sold per (country, year, month), flattened back to plain columns.
df = train.groupby(['country', 'Year', 'Month'])['num_sold'].mean().reset_index()
fig = plt.figure(figsize=(20, 13))
# 2x2 grid: one panel per year from 2015 to 2018, one line per country.
for panel, year in enumerate(range(2015, 2019), start=1):
    ax = fig.add_subplot(2, 2, panel)
    yearly = df[df['Year'] == year]
    for country in ('Norway', 'Sweden', 'Finland'):
        monthly = yearly[yearly['country'] == country]
        ax.plot(monthly['Month'], monthly['num_sold'], label=country)
    ax.set_title(f'Avg Monthly Sales Trend in {year}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Month')
    ax.legend()

- Trends are pretty much the same, lines are almost the same line translated, especially for Sweden and Finland

What about the stores?

In [None]:
# Mean units sold per (store, year, month), flattened back to plain columns.
df = train.groupby(['store', 'Year', 'Month'])['num_sold'].mean().reset_index()
fig = plt.figure(figsize=(20, 13))
# 2x2 grid: one panel per year from 2015 to 2018, one line per store.
for panel, year in enumerate(range(2015, 2019), start=1):
    ax = fig.add_subplot(2, 2, panel)
    yearly = df[df['Year'] == year]
    for store in ('KaggleRama', 'KaggleMart'):
        monthly = yearly[yearly['store'] == store]
        ax.plot(monthly['Month'], monthly['num_sold'], label=store)
    ax.set_title(f'Avg Monthly Sales Trend in {year}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Month')
    ax.legend()

- Same insights from the stores

And what about the products?

In [None]:
# Distinct product names, used as the line labels in the next cell.
train['product'].unique()

In [None]:
# Mean units sold per (product, year, month), flattened back to plain columns.
df = train.groupby(['product', 'Year', 'Month'])['num_sold'].mean().reset_index()
fig = plt.figure(figsize=(20, 13))
# 2x2 grid: one panel per year from 2015 to 2018, one line per product.
for panel, year in enumerate(range(2015, 2019), start=1):
    ax = fig.add_subplot(2, 2, panel)
    yearly = df[df['Year'] == year]
    for product in ('Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker'):
        monthly = yearly[yearly['product'] == product]
        ax.plot(monthly['Month'], monthly['num_sold'], label=product)
    ax.set_title(f'Avg Monthly Sales Trend in {year}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Month')
    ax.legend()

- Like for the countries and stores, the trend for the products is similar for every year.
- Hats peak in April and December and have minimum sales in September-October.
- Mugs peak in December and their sales dip in July-August.
- Stickers show almost the same sales throughout the year, irrespective of country, store, etc.

## Model building

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math

**1. Preparing data for training**

I'll one-hot encode the categorical variables, except for weekdays: weekday names are mapped to the integers 1 (Monday) through 7 (Sunday). For the time fields I intended to keep only month and weekday, as we saw that the year does not seem to influence the sales much; note, however, that the code below also leaves the `Year` column among the predictors.

In [None]:
# One-hot encode the three categoricals and append the dummies to `train`.
categorical_columns = ['country', 'store', 'product']
# One dummy per categorical is removed so the remaining ones are measured
# against it (Finland, KaggleMart and Kaggle Sticker).
baseline_dummies = ['country_Finland', 'store_KaggleMart', 'product_Kaggle Sticker']
train_encoded = (
    pd.concat([train, pd.get_dummies(train[categorical_columns])], axis=1)
    .drop(columns=['row_id', 'date'] + categorical_columns + baseline_dummies)
)
weekday_dict = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7,
}
# Replace weekday names with their 1-7 codes from `weekday_dict`.
train_encoded['Weekday'] = train_encoded['Weekday'].map(weekday_dict)
train_encoded.head()

`num_sold` column is on another scale in comparison with other variables, but it is the target variable so we don't have to scale it.

Dividing the predictors from the target column:

In [None]:
# Separate the target from the predictors. The target is excluded by name
# instead of by position (`iloc[:, 1:]`), so the split stays correct even if
# the column order of `train_encoded` changes.
y = train_encoded['num_sold']
X = train_encoded.drop(columns=['num_sold'])

In [None]:
# Fit a linear regression on the full training matrix.
# NOTE(review): `fit` presumably returns the estimator itself, in which case
# both names refer to the same fitted object — confirm against scikit-learn docs.
linear_regression = LinearRegression()
linear_regression_fit = linear_regression.fit(X, y)

In [None]:
# In-sample error: the predictions are scored on the same rows the model was
# fitted on.
y_pred = linear_regression_fit.predict(X)
train_rmse = math.sqrt(mean_squared_error(y, y_pred))
print(f'Linear regression RMSE: {train_rmse}')

We are very far from having a good result.
In any case, let's see model coefficients:

In [None]:
# Table of the fitted coefficients, indexed by the predictor names in `X`.
pd.DataFrame(linear_regression_fit.coef_, X.columns, columns=['Coefficients'])

- Month has a negative coefficient, but we saw earlier that December is the best month every year. Maybe it's a good idea to classify each month by past results
- Kaggle Hat has a much higher coefficient than Kaggle Mug, and it's a good sign that they are both positive. In fact Kaggle Sticker is the worst product
- Same insights for countries field coefficients
- Weekday has a positive coefficient and that's correct: in fact we saw that sales go up from Monday to Friday

**I think that a linear model is not a good solution for this problem.**

In [None]:
# Sales class of each calendar month (1 = weakest, 3 = strongest), listed in
# order from January to December.
month_dict = dict(enumerate((3, 2, 3, 3, 3, 2, 1, 1, 1, 1, 1, 3), start=1))

# Weekend flag for the 1-7 weekday codes: 6 and 7 map to 1, the rest to 0.
weekend_dict = {weekday: int(weekday >= 6) for weekday in range(1, 8)}
# Derive the two engineered features from the numeric weekday and month.
train_encoded['IsWeekend'] = train_encoded['Weekday'].map(weekend_dict)
train_encoded['month_class'] = train_encoded['Month'].map(month_dict)
train_encoded.head()

Train correlations:

In [None]:
# Enlarges the default figure size through the global rcParams, so the setting
# also applies to figures created by later cells.
plt.rcParams['figure.figsize'] = (20, 10)
# Annotated heatmap of the pairwise correlations between all encoded columns.
sns.heatmap(train_encoded.corr(), annot=True, cmap="coolwarm")

**Target column deductions:**
- It seems to be negatively correlated with Month column, but as we saw the best month for sales is december. I think I will only keep month class column in model training
- Other columns interact with `num_sold` columns like we saw in EDA section: the day of the week influence the number of sales, the hat is the most sold product, Norway is the country with more sales and KaggleRama store sells more than KaggleMart

**Interactions between predictors:**
- `Weekday` column is highly correlated with `IsWeekend` column. I can think about excluding one of those.

In [None]:
# Predictors for the second model: everything except the target and the raw
# Month/Weekday columns, which are replaced by `month_class` and `IsWeekend`.
# A single name-based drop returns a new frame; the previous positional slice
# followed by an in-place drop modified a slice of `train_encoded`, which can
# trigger SettingWithCopyWarning and depended on the column order.
X2 = train_encoded.drop(columns=['num_sold', 'Month', 'Weekday'])
# Refits the same estimator object on the new feature set.
linear_regression_fit = linear_regression.fit(X2, y)
# In-sample error, as for the first model.
y_pred2 = linear_regression_fit.predict(X2)
print(f'Linear regression RMSE: {math.sqrt(mean_squared_error(y, y_pred2))}')

The root mean square error is a little bit lower. It seems that new features worked!

Let's see new coefficients.

In [None]:
# Table of the refitted coefficients, indexed by the predictor names in `X2`.
pd.DataFrame(linear_regression_fit.coef_, X2.columns, columns=['Coefficients'])

Now let's try some ensembles to boost our predictions, let's begin with a Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Hold out 25% of the rows for evaluation, with a fixed seed.
# NOTE(review): the rows are presumably shuffled before splitting, so the
# hold-out would mix dates with the training rows — consider a time-ordered
# split for this time series; confirm against the scikit-learn docs.
X_train, X_test, y_train, y_test = train_test_split(
     X2, y, test_size=0.25, random_state=123)

In [None]:
# Fit a random forest on the training part and score it on the hold-out part.
# The seed is fixed (same value as the split above) so that the reported RMSE
# and the feature importances are reproducible across runs.
random_forest = RandomForestRegressor(random_state=123)
random_forest_fit = random_forest.fit(X_train, y_train)
test_rf_pred = random_forest_fit.predict(X_test)
print(f'Random forest RMSE: {math.sqrt(mean_squared_error(y_test, test_rf_pred))}')

Random Forest works much better, but I think that the key to improving further is working on feature engineering.

In [None]:
# Importance of each predictor according to the fitted forest, as a bar chart
# sorted from most to least important.
importances = random_forest.feature_importances_
feature_importance = pd.DataFrame({'Feature importance': importances}, index=X2.columns)
ranked_importance = feature_importance.sort_values(by='Feature importance', ascending=False)
ranked_importance.plot.bar(legend=None, title='Feature importance')

- It seems that features importance values are aligned with what we have seen earlier

In [None]:
import xgboost as xgb

# Gradient-boosted trees, scored on the same hold-out part as the forest.
# 'reg:squarederror' is the current name of the squared-error objective; the
# former alias 'reg:linear' is deprecated and may be rejected by newer
# XGBoost releases.
xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xgb_fit = xgb_reg.fit(X_train, y_train)
test_xgb_pred = xgb_fit.predict(X_test)
print(f'XGBoost RMSE: {math.sqrt(mean_squared_error(y_test, test_xgb_pred))}')

In [None]:
# Preview of the training frame, which now includes the calendar columns.
train.head()

In [None]:
# Column names currently present in the training frame.
train.columns

In [None]:
# Mean 2018 sales for every (country, store, product, month, weekday) group.
sales_keys = ['country', 'store', 'product', 'Month', 'Weekday']
sales_2018 = train[train['Year'] == 2018]
avg_sales_2018 = sales_2018.groupby(sales_keys, as_index=False)['num_sold'].mean()
# 2019 estimate: the 2018 mean plus 8% of it.
avg_sales_2018['num_sold_2019'] = avg_sales_2018['num_sold'] + avg_sales_2018['num_sold'] * 0.08
avg_sales_2018.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# test['Year'].unique() 
# Work on a copy so that `test` itself is left untouched by this cell.
test2 = test.copy()
test_dates = pd.to_datetime(test.index)
test2['Year'] = test_dates.year
test2['Month'] = test_dates.month
test2['Weekday'] = test_dates.day_name()
# Attach the 2018-based estimate of the matching group to every test row.
merge_keys = ['country', 'store', 'product', 'Month', 'Weekday']
test_pred = test2.merge(
    avg_sales_2018,
    how='inner',
    left_on=merge_keys,
    right_on=merge_keys,
)
test_pred.head()

In [None]:
# Disabled: export of the mean-based predictions in `test_pred` to
# `avg_predictions.csv`.
#submission_df = pd.DataFrame({'row_id': test_pred['row_id'],'num_sold': test_pred['num_sold_2019']})
#submission_df.to_csv('avg_predictions.csv', index = False)

Since there are many outliers, I could try to use the median for the predictions

In [None]:
# Median 2018 sales for every (country, store, product, month, weekday) group.
merge_keys = ['country', 'store', 'product', 'Month', 'Weekday']
median_sales_2018 = train[train['Year'] == 2018].groupby(merge_keys, as_index=False)['num_sold'].median()
# 2019 estimate: the 2018 median plus 8% of it.
median_sales_2018['num_sold_2019'] = median_sales_2018['num_sold'] + median_sales_2018['num_sold'] * 0.08
# A left join keeps every test row. The previous inner join would silently
# drop any row whose group is absent from 2018, producing an incomplete
# submission. `validate` additionally guarantees that no test row is duplicated.
median_pred = test2.merge(
    median_sales_2018,
    how='left',
    on=merge_keys,
    validate='many_to_one',
)
rows_without_prediction = median_pred['num_sold_2019'].isna()
if rows_without_prediction.any():
    raise ValueError(
        f'{rows_without_prediction.sum()} test rows have no matching 2018 group; '
        'the submission would be incomplete'
    )
submission_df = pd.DataFrame({'row_id': median_pred['row_id'],'num_sold': median_pred['num_sold_2019']})
submission_df.to_csv('median_predictions.csv', index = False)

**3. Test preprocessing and predictions**

In [None]:
# Test frame before the calendar columns are added.
test.head()

In [None]:
# Derive the same calendar features as for `train` from the datetime index.
test_dates = test.index
test['Year'] = test_dates.year
test['Month'] = test_dates.month
test['Weekday'] = test_dates.day_name()

In [None]:
# Test frame after the calendar columns were added.
test.head()

In [None]:
# Encode the test frame the same way as the training frame: append the
# dummies, then drop identifiers, raw categoricals and the reference dummies.
test_dummies = pd.get_dummies(test[['country', 'store', 'product']])
test_encoded = pd.concat([test, test_dummies], axis=1)
test_encoded = test_encoded.drop(
    columns=['row_id', 'date', 'country', 'store', 'product',
             'country_Finland', 'store_KaggleMart', 'product_Kaggle Sticker'])
# Weekday names -> 1-7 codes.
test_encoded['Weekday'] = test_encoded['Weekday'].map(weekday_dict)
test_encoded.head()

In [None]:
# Add the engineered features, then remove the raw Month/Weekday columns they
# were derived from.
test_encoded = test_encoded.assign(
    IsWeekend=test_encoded['Weekday'].map(weekend_dict),
    month_class=test_encoded['Month'].map(month_dict),
).drop(columns=['Month', 'Weekday'])
test_encoded.head()

Baseline predictions with Random forest regressor:

In [None]:
# Disabled: would score `test_encoded` with the fitted random forest and write
# the predictions to `submit_rf.csv`.
# preds = random_forest_fit.predict(test_encoded)
# submission_df = pd.DataFrame({'row_id':test['row_id'],'num_sold':preds})
# submission_df.to_csv('submit_rf.csv', index = False)

**Credits to other notebooks in the competition**