In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# Tabular Playground Series - Jan 2022
In the [Tabular Playground Series - Jan 2022 competition](https://www.kaggle.com/c/tabular-playground-series-jan-2022) we are tasked with predicting the 2019 sales of three products (the Kaggle Mug, the Kaggle Hat and the Kaggle Sticker) in two stores (KaggleMart and KaggleRama) across three countries (Finland, Sweden and Norway).
We are provided with training data for the years 2015 to 2018.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from urllib.request import urlopen
from PIL import Image

In [None]:
# Read the competition's training and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jan-2022/train.csv',
        '../input/tabular-playground-series-jan-2022/test.csv',
    )
)

In [None]:
# Print the column names, non-null counts and dtypes of the training data.
df_train.info()

#### We can see that the data has no missing values! For each of the 18 country-store-product combinations (3 countries × 2 stores × 3 products), we have sales data for 1461 days, i.e. every day of the four years 2015 to 2018.

In [None]:
# (number of rows, number of columns) of the training data.
df_train.shape

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Drop the `row_id` column from the training data.
# `errors='ignore'` keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
df_train = df_train.drop(columns='row_id', errors='ignore')
df_train.head()

In [None]:
# Parse the `date` column into pandas datetimes, then display the new dtype.
df_train['date'] = pd.to_datetime(df_train['date'])
df_train['date'].dtype

In [None]:
def EDA(df):
    """Print the number of distinct values, and the values themselves, for
    every object-dtype column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Columns whose dtype is not ``object`` are skipped.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            # Read from the `df` argument (not a global frame) so the helper
            # reports on whichever DataFrame the caller passes in.
            categories = df[col].unique()
            # '\033[1m' / '\033[0m' switch bold text on and off in the output.
            print('\033[1m' + 'Total Unique values in {} :'.format(col) + '\033[0m', len(categories))
            print('\t\033[1m' + 'Categories in {} :'.format(col) + '\033[0m', categories)

# Summarise the object-dtype (categorical) columns of the training data.
EDA(df_train)

# Histograms 


In [None]:
# Draw one histogram of `num_sold` per (country, store, product) group,
# placing the panels on a 6 x 3 subplot grid.
fig = plt.figure(figsize=(20, 12))
grouped = df_train.groupby(['country', 'store', 'product'])
for slot, (combi, df) in enumerate(grouped, start=1):
    ax = fig.add_subplot(6, 3, slot, ymargin=0.5)
    ax.hist(df['num_sold'], bins=50, color='turquoise')
    ax.set_title(combi)

fig.suptitle('Histograms of num_sold', y=1)
fig.tight_layout(h_pad=3.0)
plt.show()

The histograms for every country-store-product combination are all skewed. Predictions for the outlier days will be much less accurate than for regular days.

In [None]:
# Parse the `date` strings of both the training and the test frame and use the
# dates as a DatetimeIndex (the `.resample()` calls below need one).
# `drop=False` keeps `date` available as a regular column as well.
# The loop must cover `df_test` too; listing `df_train` twice left the test
# dates unconverted.
for df in [df_train, df_test]:
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True, drop=False)

In [None]:
# For every (country, store, product) group: total the sales per calendar month
# ('MS' = month-start bins), then average those totals by month of the year.
plt.figure(figsize=(20, 16))
for i, (combi, df) in enumerate(df_train.groupby(['country', 'store', 'product'])):
    ax = plt.subplot(6, 3, i+1, ymargin=1)
    # Resample only `num_sold`: summing the whole frame would also try to
    # aggregate the string columns and the datetime `date` column, which
    # raises on recent pandas versions.
    monthly_totals = df['num_sold'].resample('MS').sum()
    resampled = monthly_totals.groupby(monthly_totals.index.month).mean()
    ax.bar(range(1, 13), resampled)
    # Pass the labels as an explicit list of one-letter month names.
    ax.set_xticks(ticks=range(1, 13), labels=list('JFMAMJJASOND'))
    ax.set_title(combi)
    # The y-axis starts at the smallest bar so the seasonal differences stand out.
    ax.set_ylim(resampled.min(), resampled.max())
plt.suptitle('Monthly sales for 2015-2018', y=1.03)
plt.tight_layout(h_pad=3.0)
plt.show()

## Monthly sales & seasonal variation
A plot of the monthly totals shows the seasonal variation and a growing trend. The growth looks more significant for the stickers than for the hats.

In [None]:
# Plot the monthly sales totals of every (country, store, product) group
# and save the figure to 'monthly_sales.png'.
fig = plt.figure(figsize=(20, 16))
for i, (combi, df) in enumerate(df_train.groupby(['country', 'store', 'product'])):
    ax = plt.subplot(6, 3, i+1, ymargin=0.5)
    # 'MS' bins the rows by calendar month (month-start frequency).
    resampled = df.resample('MS').num_sold.sum()
    ax.bar(range(len(resampled)), resampled)
    ax.set_title(combi)
    # The y-axis starts at the smallest bar so the variation stands out.
    ax.set_ylim(resampled.min(), resampled.max())
    # One tick per year, placed on each January bar.
    ax.set_xticks(range(0, 48, 12), [f"Jan {y}" for y in range(2015, 2019)])
plt.suptitle('Monthly sales for 2015-2018', y=1)
plt.tight_layout()
# Save BEFORE showing: in a notebook, plt.show() releases the current figure,
# so a later savefig() would write an empty image.
fig.savefig('monthly_sales.png')
plt.show()
plt.close(fig)

In [None]:
# Bar chart with the number of training rows per country.
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot()
# Horizontal grid lines are drawn behind the bars (lower zorder).
ax.grid(color="lightgrey", alpha=0.7, linewidth=1.5, axis="y", zorder=0)
sns.countplot(x="country", data=df_train, ax=ax, zorder=2)
ax.set_title("Country Distribution", loc="left", fontsize=25, pad=5, y=1, zorder=3)
# length=0 hides the tick marks and keeps only the tick labels.
ax.xaxis.set_tick_params(color="#000", labelsize=15, pad=10, length=0)
ax.set_xlabel("")
ax.set_ylabel("Count", fontsize=15, fontfamily="serif", labelpad=10)
ax.yaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.set_yticks(range(0, 9000, 1000))
# Use plt.show() rather than fig.show(): the latter warns on the non-GUI
# backend that notebooks use.
plt.show()