In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import os

In [None]:
# Walk the competition input folder and load every file it contains,
# choosing the target variable from a keyword in the file name.
for dirname, _, filenames in os.walk('../input/tabular-playground-series-jan-2022'):
    for filename in filenames:
        csv_path = os.path.join(dirname, filename)
        if 'train' in filename:
            train = pd.read_csv(csv_path)
            train.info()
        elif 'test' in filename:
            test = pd.read_csv(csv_path)
            test.info()
            print("=" * 50)
        else:
            # Anything that is neither train nor test is read as the submission frame.
            submission = pd.read_csv(csv_path)

In [None]:
# Print the per-column count of missing values for both data sets,
# each followed by a separator line.
null_reports = (
    ("Check for Null values in Train csv", train),
    ("Check for Null values in Test csv", test),
)
separator = "=" * 50
for heading, frame in null_reports:
    print(heading, frame.isnull().sum())
    print(separator)

In [None]:
# Display descriptive statistics of the training frame as the cell output.
train.describe()

There could be some outliers in the `num_sold` field.

#### KPI Variables

In [None]:
# Headline facts about the training set, rendered as a two-column plotly table.
kpi_names = [
    'Number of Countries',
    'Number of stores',
    'Number of Different Products',
    'Window Start Date',
    'Window End Date',
    '#Rows in training set',
    '#Date Points in Train Dataset',
]
kpi_values = [
    train['country'].nunique(),
    train['store'].nunique(),
    train['product'].nunique(),
    train['date'].min(),
    train['date'].max(),
    train.shape[0],
    train['date'].nunique(),
]

kpi_table = go.Table(
    header=dict(values=['KPI', 'Value']),
    cells=dict(values=[kpi_names, kpi_values]),
)
fig = go.Figure(data=[kpi_table])

fig.update_layout({"title": 'BASIC KPIS of TRAIN DATA'}, height=500, width=500)
fig.show()

#### TIME SERIES CHART : AVG SALES ON EACH DAY

In [None]:

# Average units sold per calendar date, pooled over all rows of `train`.
# Only `num_sold` is aggregated: leaving the string `store` column in the frame
# makes `.mean()` raise TypeError on pandas >= 2.0 (older versions silently
# dropped it), so the numeric column is selected before aggregating.
train_aux = train.groupby('date')['num_sold'].mean().reset_index()

daily_trace = go.Scatter(
    x=train_aux['date'],
    y=train_aux['num_sold'],
    marker_color='red',
    text="sales",
)
fig = go.Figure(data=daily_trace)
fig.update_layout({"title": 'Avg Sales by date for all stores and products',
                   "xaxis": {"title": "Date"},
                   "yaxis": {"title": "Avg Unit Sold"},
                   "showlegend": False})
fig.show()

Increasing Trend in Sales across Years.

There is a dip in sales after May, which recovers towards year end. This pattern is steady across years.

#### Store vs Avg. Sales

In [None]:
# Mean units sold per (date, store); each point is sized by that mean and
# coloured by store.
train_aux = (
    train
    .groupby(['date', 'store'])['num_sold']
    .mean()
    .reset_index()
)

fig = px.scatter(
    train_aux,
    x="date",
    y="num_sold",
    color='store',
    color_continuous_scale="earth",
    size='num_sold',
    size_max=30,
)

layout_options = {
    "title": 'Correlation between store and Sales (total avg sales and promotion on each day)',
    "xaxis": {"title": "date"},
    "yaxis": {"title": "Sales"},
    "showlegend": False,
}
fig.update_layout(layout_options)
fig.show()

Across the years, KaggleRama produces more sales than KaggleMart.

#### Country vs Avg. Sales

In [None]:
# Mean units sold per (date, store, country), plus a combined
# "<store> in <country>" label and the calendar year of each date.
train_aux = (
    train
    .groupby(['date', 'store', 'country'])['num_sold']
    .mean()
    .reset_index()
    .assign(**{
        'store-country': lambda frame: frame['store'] + " in " + frame['country'],
        'year': lambda frame: pd.to_datetime(frame['date']).dt.year,
    })
)
train_aux.head()


In [None]:
# Yearly trend of `num_sold`, one line per "store in country" series.
fig, ax = plt.subplots(figsize=(10, 8))

# zip stops at the shorter sequence, so at most six series are drawn.
line_colors = ['red', 'orange', 'brown', 'green', 'yellow', 'black']
for store, color in zip(train_aux['store-country'].unique(), line_colors):
    sns.lineplot(
        data=train_aux[train_aux['store-country'] == store],
        x='year',
        y='num_sold',
        linewidth=1.5,
        color=color,  # apply the colour paired with this series
        label=str(store),
        ax=ax,
    )

plt.tight_layout()

In [None]:
# Regression plot of num_sold against year, faceted into one panel per
# country (rows) and store (columns).
sns.lmplot(
    data=train_aux,
    x='year',
    y='num_sold',
    row='country',
    col='store',
)

In all the countries, KaggleRama has contributed more to sales than KaggleMart. Furthermore, the rate of increase in sales is higher for KaggleRama than for KaggleMart.

In [None]:
# extract date features
# Parse the date column once and derive every calendar feature from that
# single result instead of re-parsing the whole column for each feature.
parsed_dates = pd.to_datetime(train['date'])
train['year'] = parsed_dates.dt.year
train['month_name'] = parsed_dates.dt.month_name()
train['month'] = parsed_dates.dt.month
train['day'] = parsed_dates.dt.day
train['day_of_week_name'] = parsed_dates.dt.day_name()
train['day_of_week'] = parsed_dates.dt.day_of_week


In [None]:
# Median units sold per (year, store).
# `num_sold` is selected before aggregating so that `.median()` is never
# applied to the string columns of `train` (TypeError on pandas >= 2.0).
by_feature_num_sold_df = (
    train.groupby(['year', 'store'])['num_sold'].median().reset_index()
)
px.bar(by_feature_num_sold_df, x='year', y='num_sold', color='store')

In [None]:
# Median units sold per (month, store), ordered by month number so the bars
# follow the calendar rather than alphabetical month names.
# `num_sold` is selected before aggregating so that `.median()` is never
# applied to the string columns of `train` (TypeError on pandas >= 2.0).
by_feature_num_sold_df = (
    train.groupby(['month', 'month_name', 'store'])['num_sold']
    .median()
    .reset_index()
    .sort_values('month', ascending=True)
)
px.bar(by_feature_num_sold_df, x='month_name', y='num_sold', color='store')

In [None]:
# Median units sold per (weekday, store), ordered by weekday number so the
# bars follow the week rather than alphabetical day names.
# `num_sold` is selected before aggregating so that `.median()` is never
# applied to the string columns of `train` (TypeError on pandas >= 2.0).
by_feature_num_sold_df = (
    train.groupby(['day_of_week', 'day_of_week_name', 'store'])['num_sold']
    .median()
    .reset_index()
    .sort_values('day_of_week', ascending=True)
)
px.bar(by_feature_num_sold_df, x='day_of_week_name', y='num_sold', color='store')

In [None]:
# Median units sold per (day of month, store), ordered by day.
# `num_sold` is selected before aggregating so that `.median()` is never
# applied to the string columns of `train` (TypeError on pandas >= 2.0).
by_feature_num_sold_df = (
    train.groupby(['day', 'store'])['num_sold']
    .median()
    .reset_index()
    .sort_values('day', ascending=True)
)
px.bar(by_feature_num_sold_df, x='day', y='num_sold', color='store')

Sales increase continuously from 2015 to 2018; the rate of increase is higher for KaggleRama than for KaggleMart.

Sales peak in January and December: after January they dip gradually to a minimum in July, and from August they rise again to reach the peak in December.

Within a week, sales start to increase from Thursday and are highest at the weekend. In all cases KaggleRama's sales are higher than KaggleMart's.

Sales do not change much across the days of a month; they stay more or less the same, with some minor spikes.

#### Analysis of Sales in Stores w.r.t types of Products

In [None]:
# Median units sold per (store, product).
# `num_sold` is selected before aggregating so that `.median()` is never
# applied to the string columns of `train` (TypeError on pandas >= 2.0).
by_feature_num_sold_df = (
    train.groupby(['store', 'product'])[['num_sold']].median().reset_index()
)
px.bar(by_feature_num_sold_df, x='product', y='num_sold', color='store')

In [None]:
# Median units sold per (year, store, product), then a regression plot of
# that median against year, faceted by product (rows) and store (columns).
# `num_sold` is selected before aggregating so that `.median()` is never
# applied to the string columns of `train` (TypeError on pandas >= 2.0).
by_feature_num_sold_df = (
    train.groupby(['year', 'store', 'product'])[['num_sold']].median().reset_index()
)
sns.lmplot(
    data=by_feature_num_sold_df,
    x='year',
    y='num_sold',
    row='product',
    col='store',
)

KaggleRama is contributing much more than KaggleMart in sales of all the products across years.