# Initializing the Notebook...

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    paths = [os.path.join(dirname, name) for name in filenames]
    for path in paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training set into the frame every later cell works from.
train_csv = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
df = pd.read_csv(train_csv)

In [None]:
# Parse the 'date' column into pandas datetimes so it can serve as a time axis.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
# Preview the first five rows of the training data.
df.head(n=5)

# Sum of all products sold
First, let's look at the aggregate sum of all products sold for each company before we get into the time series portion of it.

In [None]:
def absolute_value(val):
    """Turn a pie-wedge percentage back into an absolute count.

    Used as the ``autopct`` callback of ``plt.pie``. ``val`` is a percentage
    (0-100); the result is that share of ``sum(pie_data)``, rounded to zero
    decimals.

    Note: ``pie_data`` is read from module scope at call time, so it must
    hold the values of the pie currently being drawn.
    """
    total = sum(pie_data)
    return np.round(val / 100. * total, 0)

# One wedge per store, sized by that store's total of 'num_sold'.
labels = df['store'].unique()
pie_data = [
    df.loc[df['store'] == store_name, 'num_sold'].sum()
    for store_name in labels
]

plt.figure(figsize=(8, 8))
# absolute_value reads the module-level pie_data, so it must be set before this call.
plt.pie(pie_data, labels=labels, autopct=absolute_value)
plt.title(
    f'Total sum of product sold between {labels[0]} and {labels[1]}',
    fontsize=16,
)
plt.axis('equal')
plt.show()

So, it looks as if KaggleRama has been the bigger vendor so far in this competition. So why not end the competition here and just make KaggleRama the official vendor? Well, as data scientists we know that past performance does not matter as much as future performance. KaggleMart may not have been in the game long enough to grow its store as much as KaggleRama, while still offering the better product, customer service, user experience, etc., that would set it apart as the better merchandise vendor.

# Product sold by each company during the timeframe

In [None]:
# Plot 'num_sold' against 'date', with one line per store.
plt.figure(figsize=(14, 9))
sns.lineplot(x='date', y='num_sold', hue='store', data=df)
plt.title(
    'Product sold by each company for the timeframe given',
    fontsize=18,
)
plt.show()

This graph shows the number of products sold versus time for each store. Unsurprisingly, for any given day, KaggleRama seems to outperform KaggleMart in products sold. Each vendor has spikes in sales right before the new year, when the significant others of data scientists buy all the bad random forest and modeling pun sweaters they can handle. Then, partway through the year, there are smaller spikes in sales for the yearly Christmas-in-July celebration. It is probably worth noting that each year sales seem to follow a roughly sinusoidal path, with peaks in February (since data science is a work of love (for data)). And lastly, each year, you can see the rate of products being sold increasing for each vendor.

# Breakdown of each store's sales by product

In [None]:
# Split the training data into one frame per store.
is_mart = df['store'] == 'KaggleMart'
is_rama = df['store'] == 'KaggleRama'
mart = df.loc[is_mart]
rama = df.loc[is_rama]

In [None]:
def absolute_value(val):
    """Map a wedge percentage (0-100) to the matching absolute quantity.

    Passed to ``plt.pie`` as ``autopct``. The total is taken from the
    module-level ``pie_data`` at the moment of the call, so ``pie_data``
    has to be rebound before each pie is drawn.
    """
    fraction = val / 100.
    return np.round(fraction * sum(pie_data), 0)
# Side-by-side pies: share of 'num_sold' per product, one panel per store.
plt.figure(figsize=(12, 4))
panels = (
    (1, mart, 'Breakdown of products sold by KaggleMart'),
    (2, rama, 'Breakdown of products sold by KaggleRama'),
)
for position, store_df, title in panels:
    labels = store_df['product'].unique()
    # Rebind pie_data before each plt.pie call: absolute_value reads it as a global.
    pie_data = [
        store_df.loc[store_df['product'] == item, 'num_sold'].sum()
        for item in labels
    ]
    plt.subplot(1, 2, position)
    plt.pie(pie_data, labels=labels, autopct=absolute_value)
    plt.title(title, fontsize=16)
    plt.axis('equal')

plt.show()

In [None]:
# Share of each store's total 'num_sold' contributed by each product.
# Values are fractions of the store total (0-1), not multiplied by 100.
#
# Select 'num_sold' *before* summing: calling .sum() on the whole grouped
# frame also tries to add up the datetime 'date' column and the text columns,
# which pandas >= 2.0 rejects with a TypeError instead of silently dropping them.
mart_product_share = mart.groupby('product')['num_sold'].sum() / mart['num_sold'].sum()
rama_product_share = rama.groupby('product')['num_sold'].sum() / rama['num_sold'].sum()

print(f"Percentages of product sold per product for KaggleMart:\n {mart_product_share}\n")
print(f"Percentages of product sold per product for KaggleRama:\n {rama_product_share}")

How interesting! It looks as if Kaggle fans are buying roughly the same percentages of each different product. The main difference we've seen so far between these stores is the quantity of products being sold. Let's look at this further, and see the breakdown over the years.

In [None]:
# Side-by-side line plots of 'num_sold' over 'date', one line per product,
# with one panel per store.
plt.figure(figsize=(19, 9))
panels = (
    (1, mart, 'Breakdown of products sold by KaggleMart'),
    (2, rama, 'Breakdown of products sold by KaggleRama'),
)
for position, store_df, title in panels:
    plt.subplot(1, 2, position)
    sns.lineplot(x='date', y='num_sold', hue='product', data=store_df)
    plt.title(title, fontsize=18)

plt.show()

It looks like the patterns continue to be symmetric between the two vendors. Each product roughly follows the same patterns we talked about previously; this is just an added breakdown by product. Stickers seem to perform the worst, with mugs in the middle and hats being the top performer. Lastly, for each product, regardless of the vendor, the peak number of sales increases from year to year.

# Products sold by Country

In [None]:
def absolute_value(val):
    """Return the absolute amount that a pie-wedge percentage stands for.

    Serves as the ``autopct`` callback for ``plt.pie``: ``val`` is a
    percentage (0-100) and the result is that portion of the sum of the
    module-level ``pie_data``, rounded to zero decimals. ``pie_data`` is
    looked up when the function is called, not when it is defined.
    """
    return np.round(val / 100. * sum(pie_data), decimals=0)

# Side-by-side pies: share of 'num_sold' per country, one panel per store.
plt.figure(figsize=(16, 4))
panels = (
    (1, mart, 'Breakdown of product sold by KaggleMart in each country'),
    (2, rama, 'Breakdown of products sold by KaggleRama in each country'),
)
for position, store_df, title in panels:
    labels = store_df['country'].unique()
    # Rebind pie_data before each plt.pie call: absolute_value reads it as a global.
    pie_data = [
        store_df.loc[store_df['country'] == place, 'num_sold'].sum()
        for place in labels
    ]
    plt.subplot(1, 2, position)
    plt.pie(pie_data, labels=labels, autopct=absolute_value)
    plt.title(title, fontsize=16)
    plt.axis('equal')

plt.show()

In [None]:
# Share of each store's total 'num_sold' contributed by each country.
# Values are fractions of the store total (0-1), not multiplied by 100.
#
# Select 'num_sold' *before* summing: calling .sum() on the whole grouped
# frame also tries to add up the datetime 'date' column and the text columns,
# which pandas >= 2.0 rejects with a TypeError instead of silently dropping them.
mart_country_share = mart.groupby('country')['num_sold'].sum() / mart['num_sold'].sum()
rama_country_share = rama.groupby('country')['num_sold'].sum() / rama['num_sold'].sum()

print(f"Percentages of product sold per country for KaggleMart:\n {mart_country_share}\n")
print(f"Percentages of product sold per country for KaggleRama:\n {rama_country_share}")

Wow! The pattern continues as the percentages are similar while the volume from KaggleRama is just higher than that of KaggleMart. For each store, Norway has the most buyers with Sweden in second and Finland being the smallest market. I guess we know why hats are the most sold product now! Gotta keep those ears warm...

# Total Sales by Date

In [None]:
# Per-date totals of 'num_sold' for each store, plus a running total.
mart_sales = mart.groupby('date')['num_sold'].sum().to_frame()
rama_sales = rama.groupby('date')['num_sold'].sum().to_frame()

# The per-date totals are already in 'num_sold', so accumulate those directly
# instead of repeating the groupby.
mart_sales['cumulative'] = mart_sales['num_sold'].cumsum()
rama_sales['cumulative'] = rama_sales['num_sold'].cumsum()

# Tag each frame with its store so the combined frame can be coloured by store.
mart_sales['store'] = 'KaggleMart'
rama_sales['store'] = 'KaggleRama'

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
cum_sales = pd.concat([mart_sales, rama_sales])
# 'date' is the index name at this point; sort on it, then turn it back into a column.
cum_sales = cum_sales.sort_values('date').reset_index()

In [None]:
# Plot the running total of 'num_sold' against 'date', with one line per store.
plt.figure(figsize=(12, 6))
sns.lineplot(x='date', y='cumulative', hue='store', data=cum_sales)
plt.title(
    'Total amount of products sold per store',
    fontsize=18,
)
plt.show()

Here is the cumulative number of products sold for each store. As we can see, the rate of sales is fairly linear with a slight upward curve, which means that both of these stores are growing in popularity. The steeper slope of products sold by KaggleRama relative to KaggleMart comes from KaggleRama consistently selling more products, per product, per country. For this competition between stores, it's going to be about which store has the greatest concavity in the graph shown. From the graph you cannot immediately tell which store's sales are increasing at a faster rate, as both seem to be increasing in a fairly linear fashion.

# Conclusion

So far, it seems that KaggleRama has had the majority of the market when it comes to selling products within the countries given, with Norway being the largest market opportunity for both stores when it comes to selling hats. For the overall competition, it may help to have a check in the predictions to make sure that the percentages outlined here are roughly followed. It would definitely be surprising for stickers to have a massive increase in num_sold based on what we saw here today.

Hope you enjoyed this EDA and that this inspires some interesting thoughts and/or conversation. Happy New Year!

All the best,
BTK