In [None]:
# Imports

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import gridspec

# Data Preparation

In [None]:
# Read the training data. 'date' is parsed to datetime so the `.dt`
# accessors used further down the notebook work on it.
train = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv',
    parse_dates=['date'],
)

# Drop the 'row_id' column. Reassigning (rather than `inplace=True`) keeps
# the step explicit about which frame later cells use.
train = train.drop(columns=['row_id'])

# Remove "Kaggle" from product and store.
# product: keep the second space-separated token of each value.
# store:   drop the first six characters (len('Kaggle') == 6).
# Both columns go through the vectorised `.str` accessor, so a missing or
# space-less product becomes NaN instead of raising inside a per-row lambda.
train['product'] = train['product'].str.split(' ').str[1]
train['store'] = train['store'].str[6:]

# Viewing the first few observations
train.head()

# EDA

## Description of Data

In [None]:
# Column dtypes and non-null counts, a divider line, then summary statistics
# (the final expression is rendered as the cell's output).
train.info()
divider = '-' * 50
print(divider)

train.describe()

In [None]:
# Describing the categorical variables

## Function that creates a description on categorical features
def describe_categorical(X):
    """Display `describe()` of the object-dtype columns of `X` as an HTML table.

    Parameters
    ----------
    X : DataFrame
        Only its columns whose dtype equals "object" are summarised.

    Returns
    -------
    None
        The table is rendered through IPython's `display`.
    """
    from IPython.display import display, HTML

    object_columns = X.columns[X.dtypes == "object"]
    summary = X[object_columns].describe()
    display(HTML(summary.to_html()))
    
## Summarise the object-dtype (categorical) columns of the training data
describe_categorical(X=train)

In [None]:
# Helper Functions for Plotting

# Font sizes shared by the plotting helpers, keyed by the element they style.
fontsize = dict(title=18, label=16, tick=14)

# Tick labels for a January-to-December axis.
months = 'Jan Feb Mar Apr May Jun Jul Aug Sept Oct Nov Dec'.split()

# Colour palette used by the line and pie plots.
colors = [
    'salmon',
    'steelblue',
    '#029386',
    '#AF8FE9',
]

## Plotting data by month
def plot_time_series(df, cat, g='month', ax=None, xax=True, lw=5):
    """Plot 'num_sold' against `df[g]`, one line per distinct value of `df[cat]`.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns `cat`, `g` and 'num_sold'.
    cat : str
        Column whose distinct values each get their own line and legend entry.
    g : str, default 'month'
        Column plotted on the x axis.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 20x6 figure is created when omitted.
    xax : bool, default True
        When true, place one x tick at every distinct value of `df[g]`.
    lw : float, default 5
        Line width.

    Returns
    -------
    None

    Notes
    -----
    Every label, tick and title is applied to `ax` itself rather than to
    pyplot's current axes, so the function also works when `ax` is not the
    most recently created subplot.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(20, 6))

    # Cycle through the palette so categories beyond len(colors) are still
    # drawn instead of being silently dropped.
    for position, label in enumerate(df[cat].unique()):
        subset = df.loc[df[cat] == label]
        ax.plot(
            subset[g],
            subset['num_sold'],
            c=colors[position % len(colors)],
            label=label,
            linewidth=lw,
        )

    ax.legend(title=f"{cat.capitalize()}")
    ax.set_xlabel(f'{g.capitalize()}', fontsize=fontsize['label'])

    if xax:
        ticks = df[g].unique()
        ax.set_xticks(ticks)
        if g == 'month':
            # Label each tick with the name of the month it actually sits on.
            # Assumes df['month'] holds calendar month numbers 1-12.
            ax.set_xticklabels([months[int(m) - 1] for m in ticks])

    ax.tick_params(axis='both', labelsize=fontsize['tick'])
    ax.set_ylabel('Items Sold', fontsize=fontsize['label'])
    ax.set_title(f'{g.capitalize()}ly Sales by {cat.capitalize()}', fontsize=fontsize['title'])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

def addtext(x, y):
    """Annotate the current axes with the rounded `y` values in bold white text.

    For every position in `x`, the rounded value `y[position]` is written
    centred at that x coordinate, 40 data units below the rounded value.
    """
    label_font = {'color': 'white', 'weight': 'bold'}
    for position, x_value in enumerate(x):
        amount = round(y[position])
        plt.text(x_value, amount - 40, amount, ha='center', fontdict=label_font)
    
## Plotting bar graph
def plot_bar(df, cat, legend=False, ax=None):
    """Draw a bar chart of 'num_sold' per `cat`, tallest bar first, with value labels.

    Parameters
    ----------
    df : DataFrame
        Must provide 'num_sold' and `cat`; `cat` may be the index, because
        the frame is passed through `reset_index()` first.
    cat : str
        Column used for the bar categories.
    legend : bool, default False
        When true, add a legend titled with the capitalised `cat`.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure is created when omitted.

    Returns
    -------
    None

    Notes
    -----
    The y axis and all four spines are hidden; the value printed inside each
    bar replaces them. All drawing goes through `ax`, not pyplot's current
    axes.
    """
    if ax is None:
        _, ax = plt.subplots()

    ranked = df.reset_index().sort_values(by='num_sold', ascending=False)
    ax.bar(ranked[cat], ranked['num_sold'], color='steelblue')

    # Pair every bar with its own height positionally. Looking the height up
    # with ranked['num_sold'][idx] would be a label lookup against the
    # pre-sort index and could annotate a bar with another bar's value.
    for category, height in zip(ranked[cat], ranked['num_sold']):
        value = round(height)
        # The label sits 40 data units below the top of the bar.
        ax.text(
            category,
            value - 40,
            value,
            ha='center',
            fontdict={'color': 'white', 'weight': 'bold', 'size': 14},
        )

    ax.tick_params(axis='both', labelsize=fontsize['tick'])
    ax.set_xlabel(cat.capitalize(), fontsize=fontsize['label'])
    ax.set_ylabel('Items Sold', fontsize=fontsize['label'])
    ax.set_title(f'Average Items Sold by {cat.capitalize()}', fontsize=fontsize['title'])
    if legend:
        ax.legend(title=f"{cat.capitalize()}")

    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    ax.get_yaxis().set_visible(False)
    
## function to add annotations
def addlabels(x, y):
    """Annotate the current axes with the rounded `y` values in bold white size-12 text.

    For every position in `x`, the rounded value `y[position]` is written
    centred at that x coordinate, 40 data units below the rounded value.
    """
    label_font = {'color': 'white', 'weight': 'bold', 'size': 12}
    for position, x_value in enumerate(x):
        amount = round(y[position])
        plt.text(x_value, amount - 40, amount, ha='center', fontdict=label_font)
        
## Plotting pie chart        
def plot_pie(df, ax=None, title=None):
    """Draw a pie chart of `df['num_sold']`, one wedge per index entry of `df`.

    Parameters
    ----------
    df : DataFrame
        Provides 'num_sold' (wedge sizes) and an index (wedge labels).
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure is created when omitted.
    title : str, optional
        Title placed above the pie when given.

    Returns
    -------
    None
    """
    if ax is None:
        _, ax = plt.subplots()

    wedges, texts, pcts = ax.pie(
        df['num_sold'],
        labels=df.index,
        colors=colors,
        autopct='%.1f%%',
        wedgeprops={'linewidth': 3.0, 'edgecolor': 'white'},
        textprops={'size': 'large'},
        startangle=90
    )

    # Colour each label like its own wedge. Reading the colour back from the
    # wedge (instead of indexing `colors`) also works when there are more
    # wedges than palette entries.
    for text, wedge in zip(texts, wedges):
        text.set_color(wedge.get_facecolor())

    plt.setp(pcts, color='white', fontweight=500, fontsize=fontsize['tick'])
    plt.setp(texts, fontweight=600, fontsize=fontsize['label'])

    # Tighten the figure that owns `ax`, not whichever figure is current.
    ax.figure.tight_layout()
    if title:
        ax.set_title(title, fontsize=fontsize['title'])

# Dealing with Time

In [None]:
# Earliest and latest timestamps present in the training data
first_date, last_date = train['date'].min(), train['date'].max()
print(f"Range of time series: {first_date} -> {last_date}")

In [None]:
# Derive calendar features from 'date': year, month, ISO week and day of month
train = train.assign(
    year=train['date'].dt.year,
    month=train['date'].dt.month,
    week=train['date'].dt.isocalendar().week,
    day=train['date'].dt.day,
)

In [None]:
# Mean items sold for every (year, month) pair, as a flat frame

time = (
    train
    .groupby(['year', 'month'])['num_sold']
    .mean()
    .reset_index()
)

In [None]:
# Every observation plotted against its date, one thin line per year
fig, ax = plt.subplots(figsize=(20, 6))
plot_time_series(df=train, cat='year', g='date', ax=ax, xax=False, lw=.4)

In [None]:
# Mean monthly items sold, one line per year

plot_time_series(df=time, cat='year')

## Insights by Year
* There is a yearly seasonality for all years
    * around Christmas time, there is a huge spike
    * around the summer the sales go down and hit a minimum in August and September
* the sales are increasing from year to year

## Items Sold by Country, Product, Store

In [None]:
# Mean 'num_sold' per country, product, store and year, each sorted from
# highest to lowest. The same aggregation is applied to every key.

country, product, store, year = (
    train
    .groupby([key])
    .agg({'num_sold': 'mean'})
    .sort_values(by='num_sold', ascending=False)
    for key in ('country', 'product', 'store', 'year')
)

In [None]:
# One pie chart per grouping, laid out side by side

fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(20, 8))
fig.tight_layout()

pie_panels = [
    (country, 'Country Sales'),
    (product, 'Product Sales'),
    (store, 'Store Sales'),
    (year, 'Yearly Sales'),
]
for panel_ax, (panel_df, panel_title) in zip(axs, pie_panels):
    plot_pie(df=panel_df, ax=panel_ax, title=panel_title)

## Insights on Sales
* Norway makes the most sales
* Hats are the #1 sellers
* The store Rama makes the most sales
* Sales are increasing yearly

# Country

In [None]:
# Mean items sold per country (highest first), and per (month, country) pair

country_group = (
    train
    .groupby('country')['num_sold']
    .mean()
    .reset_index()
    .sort_values(by='num_sold', ascending=False)
)
country_time = (
    train
    .groupby(['month', 'country'])['num_sold']
    .mean()
    .reset_index()
)

In [None]:
## Figure with two panels: a narrow one on the left, a wide one on the right
fig = plt.figure(figsize=(20, 6))
spec = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[1, 2], wspace=0.2)

## Left panel: bar graph of mean items sold per country
## (each panel is created right before it is drawn on)
ax = fig.add_subplot(spec[0])
plot_bar(df=country, cat='country', ax=ax)

## Right panel: monthly time series per country
ax2 = fig.add_subplot(spec[1])
plot_time_series(df=country_time, cat='country', ax=ax2)

## Insights on Country by Year
* the country that makes the most sales is Norway
    * by about 150 more per month
* each country follows the same seasonal pattern as the year

# Store

In [None]:
# Mean items sold for every (month, store) pair

store_time = (
    train
    .groupby(['month', 'store'])['num_sold']
    .mean()
    .reset_index()
)

In [None]:
# Mean items sold per (store, country), sorted descending by country and then
# by store, followed by one array of per-store means for each country

t = (
    train
    .groupby(['store', 'country'])['num_sold']
    .mean()
    .reset_index()
    .sort_values(by=['country', 'store'], ascending=False)
)
swe_store = t.loc[t['country'] == 'Sweden', 'num_sold'].values
fin_store = t.loc[t['country'] == 'Finland', 'num_sold'].values
nor_store = t.loc[t['country'] == 'Norway', 'num_sold'].values

In [None]:
## Figure with two panels: a narrow one on the left, a wide one on the right
fig = plt.figure(figsize=(20, 6))
spec = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[1, 2], wspace=0.2)

## Left panel: grouped bar graph, one group per store and one bar per country
ax = fig.add_subplot(spec[0])

barWidth = 0.20
br1 = np.arange(store_time['store'].nunique())
br2 = [pos + barWidth for pos in br1]
br3 = [pos + barWidth for pos in br2]

## (bar positions, bar heights, colour, legend label) for each country
country_bars = (
    (br1, fin_store, 'steelblue', 'Finland'),
    (br2, nor_store, 'salmon', 'Norway'),
    (br3, swe_store, 'teal', 'Sweden'),
)
for positions, heights, bar_color, country_name in country_bars:
    ax.bar(positions, heights, color=bar_color, width=barWidth, label=country_name)
    addlabels(positions, heights)

## Ticks sit under the middle bar of each group. The labels are hard-coded
## and must match the store order of fin_store / nor_store / swe_store.
ax.set_xticks((br2[0], br2[1]), ('Rama', 'Mart'), fontsize=fontsize['tick'])
ax.set_xlabel('Store', fontsize=fontsize['label'])
ax.legend(title='Country')
ax.set_title('Average Items Sold by Store', fontsize=fontsize['title'])
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.get_yaxis().set_visible(False);

## Right panel: monthly time series per store
ax2 = fig.add_subplot(spec[1])
plot_time_series(store_time, 'store', ax=ax2)

## Insights on Stores
* There are only two different stores
* Rama makes the most sales compared to Mart
* The yearly cycle is the same even at the store level

# Products



In [None]:
# Mean items sold per (month, product), and per (country, store, product)
# sorted from lowest to highest

product_time = (
    train
    .groupby(['month', 'product'])['num_sold']
    .mean()
    .reset_index()
)
gp = (
    train
    .groupby(['country', 'store', 'product'])[['num_sold']]
    .mean()
    .sort_values(by='num_sold', ascending=True)
)

In [None]:
# Mean monthly items sold, one line per product

plot_time_series(product_time, cat='product', g='month');

In [None]:
# Horizontal bars of mean items sold for every (country, store, product) combination

gp.plot(
    kind='barh',
    figsize=(18, 8),
    legend=False,
    title="Sales by Country, Store, Product",
);

## Insights on Products
* Hats follow the trend of the yearly cycle
    * It makes sense, people wear hats during the winter times and not the summer
* Mugs follow a somewhat different pattern
    * It does spike around winter time
    * Levels off rest of the year
* Stickers are leveled off
    * This makes sense since the time of the year will not affect the decision to buy a sticker or not
    * Might be better off not modeling this feature