# Loading libraries and data

In [None]:
# importing necessary libraries
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
# Read the three competition CSV files (train, test, sample submission),
# in that order, into their respective DataFrames.
df_train, df_test, subms = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jan-2022/train.csv',
        '../input/tabular-playground-series-jan-2022/test.csv',
        '../input/tabular-playground-series-jan-2022/sample_submission.csv',
    )
)

# First look at the data

In [None]:
# First look at the data: sample rows, shapes, schema, summary statistics
# and dtypes, printed as plain text with a rule after every section.
_SECTION_RULE = '\n --------------------------------------------------------------------- \n'

print('\n *** Train data ***')
print(df_train.head())
print(_SECTION_RULE)
print('\n *** Test data ***')
print(df_test.head())
print(_SECTION_RULE)
print('\n *** Shape of Train data ***')
print(df_train.shape)
print(_SECTION_RULE)
print('\n *** Shape of Test data ***')
print(df_test.shape)
print(_SECTION_RULE)
print('\n *** Info ***')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df_train.info()
print(_SECTION_RULE)
print('\n *** Columns in the data *** \n')
print(df_train.columns)
print(_SECTION_RULE)
print('\n *** Description of the data *** \n')
print(df_train.describe())
print(_SECTION_RULE)
print('\n *** Data types ***')
print(df_train.dtypes)
print(_SECTION_RULE)

In [None]:
# Per-column count of null entries, first for train and then for test.
# Passing sep='\n' to a single print() yields the same text as two print() calls.
print('\n Missing values in train dataset \n', df_train.isna().sum(), sep='\n')
print('\n ------------------------------------ \n')
print('\n Missing values in test dataset \n', df_test.isna().sum(), sep='\n')

In [None]:
# Smallest and largest value of the 'date' column in each dataset.
train_start, train_end = df_train['date'].min(), df_train['date'].max()
print('*** Time frame for train data ***')
print('Beginning date', train_start)
print('Ending date', train_end)

print('\n ------------------------------------ \n')

test_start, test_end = df_test['date'].min(), df_test['date'].max()
print('*** Time frame for test data ***')
print('Beginning date', test_start)
print('Ending date', test_end)

In [None]:
# Number of distinct values in every column, for train and then for test.
print('Unique features in train data', df_train.nunique(), sep='\n')
print('\n ------------------------------------ \n')
print('Unique features in test data', df_test.nunique(), sep='\n')

In [None]:
# Frequency table of each key column in the training data, with a rule
# printed between consecutive tables (none before the first or after the last).
print('Checking features in train data')
for position, column in enumerate(('date', 'country', 'store', 'product')):
    if position > 0:
        print('\n ------------------------------------ \n')
    print(df_train[column].value_counts())

**Inference**
* Sales data is given for 1461 days, from 2015-01-01 to 2018-12-31 <br>
* Sales for the year 2019 have to be predicted
* Each date has 18 entries (3 countries * 3 products * 2 stores)
* There are no missing values

# Visualization

In [None]:
# Sum num_sold for every (date, store) pair and draw one line per store.
grp = (
    df_train
    .groupby(['date', 'store'])
    .agg({'num_sold': 'sum'})
    .reset_index()
)
fig = px.line(data_frame=grp, x='date', y='num_sold', color='store')
# The title is applied after creation (not passed to px.line) so the figure's
# layout stays exactly as before.
fig.update_layout(title='Sales by Date and Store type')
fig.show()

**Inference**
* KaggleRama sells more products than KaggleMart
* Sales increase at the end of each year

In [None]:
# Split the training rows by store; both subsets are reused by later cells.
is_kmart = df_train['store'] == 'KaggleMart'
is_krama = df_train['store'] == 'KaggleRama'
train_KMart = df_train[is_kmart]
train_KRama = df_train[is_krama]

# KaggleMart: sum num_sold for every (date, product) pair, one line per product.
grp_KMart = (
    train_KMart
    .groupby(['date', 'product'])
    .agg({'num_sold': 'sum'})
    .reset_index()
)
fig = px.line(data_frame=grp_KMart, x='date', y='num_sold', color='product')
fig.update_layout(title='KaggleMart sales by Date and Products')
fig.show()

In [None]:
# KaggleRama: sum num_sold for every (date, product) pair, one line per product.
grp_KRama = (
    train_KRama
    .groupby(['date', 'product'])
    .agg({'num_sold': 'sum'})
    .reset_index()
)
fig = px.line(data_frame=grp_KRama, x='date', y='num_sold', color='product')
fig.update_layout(title='KaggleRama  sales by Date and Products')
fig.show()

**Inference**
* Sales of hats are higher than sales of mugs and stickers in both stores
* Sales of stickers are consistent except for spikes at the end of each year
* Sales of hats fluctuate strongly: they increase from March to June every year and again at year end

In [None]:
# KaggleMart: sum num_sold for every (date, country) pair, one line per country.
grp_KMart = (
    train_KMart
    .groupby(['date', 'country'])
    .agg({'num_sold': 'sum'})
    .reset_index()
)
fig = px.line(data_frame=grp_KMart, x='date', y='num_sold', color='country')
fig.update_layout(title='KaggleMart sales by Date and Country')
fig.show()

In [None]:
# KaggleRama: sum num_sold for every (date, country) pair, one line per country.
grp_KRama = (
    train_KRama
    .groupby(['date', 'country'])
    .agg({'num_sold': 'sum'})
    .reset_index()
)
fig = px.line(data_frame=grp_KRama, x='date', y='num_sold', color='country')
fig.update_layout(title='KaggleRama  sales by Date and Country')
fig.show()

**Inference**
* Sales are higher in Norway than in the other two countries

In [None]:
# Replace the string 'date' column of each frame with parsed datetime values,
# which the .dt accessor used in later cells requires.
df_train['date'] = pd.to_datetime(df_train['date'])
df_test['date'] = pd.to_datetime(df_test['date'])

In [None]:
def feat1(df):
    """Add calendar features derived from the ``date`` column.

    The frame is modified in place and the same object is returned, so both
    ``feat1(df)`` and ``df = feat1(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column (the ``.dt`` accessor is
        used, so plain strings must be converted beforehand).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns added in order: ``dayofweek``
        (Monday=0), ``dayofmonth``, ``dayofyear``, ``quarter``, ``month``,
        ``year``, ``weekend`` (1 for Saturday/Sunday, else 0) and ``week``
        (ISO week number, capped at 52).
    """
    dates = df['date']

    df['dayofweek'] = dates.dt.dayofweek
    df['dayofmonth'] = dates.dt.day
    df['dayofyear'] = dates.dt.dayofyear
    df['quarter'] = dates.dt.quarter
    df['month'] = dates.dt.month
    df['year'] = dates.dt.year

    # dayofweek is 0..6, so integer division by 5 equals 1 only for 5 and 6.
    df['weekend'] = (df['dayofweek'] // 5 == 1).astype('int')

    # Fold ISO week 53 into week 52. clip() replaces the former chained
    # assignment `df['week'][mask] = 52`, which triggers SettingWithCopyWarning
    # and is a silent no-op under pandas Copy-on-Write.
    df['week'] = dates.dt.isocalendar().week.clip(upper=52).astype('int')
    return df

# Add the calendar features to both datasets (train first, then test).
df_train, df_test = feat1(df_train), feat1(df_test)

In [None]:
def hbar(col):
    """Show a bar chart of the mean ``num_sold`` per value of ``col``.

    Reads the module-level ``df_train``. Despite the function name, the bars
    are vertical (``orientation='v'``): ``col`` values go on the x axis and
    the mean of ``num_sold`` on the y axis. The figure is displayed as a side
    effect.

    Parameters
    ----------
    col : str
        Name of a ``df_train`` column to group by; it is also appended to the
        chart title.

    Returns
    -------
    plotly.graph_objects.Bar
        The bar trace that was plotted.
    """
    # Mean sales per group, ordered by descending group value.
    mean_sales = (
        df_train
        .groupby(col)
        .agg({'num_sold': 'mean'})
        .reset_index()
        .sort_values(col, ascending=False)
    )

    trace = go.Bar(
        x=list(mean_sales[col]),
        y=list(mean_sales['num_sold']),
        orientation='v',
    )
    layout = go.Layout(
        title='Average sales by ' + col,
        xaxis_title="",
        yaxis_title="",
        width=650,
    )
    fig = go.Figure(data=[trace], layout=layout)

    # Crimson tick labels on both axes; x labels are tilted by 45 degrees.
    fig.update_xaxes(tickangle=45, tickfont={'color': 'crimson'})
    fig.update_yaxes(tickangle=0, tickfont={'color': 'crimson'})
    fig.show()
    return trace
    
# Average-sales charts by day of week, month and year (drawn in that order);
# the returned traces are kept for possible reuse.
trace1, trace2, trace3 = (
    hbar(feature) for feature in ('dayofweek', 'month', 'year')
)

**Inference**
* Sales increase on Fridays and during the weekend
* Maximum sales are seen in December
* Sales are low from July to November
* Sales of products increase year on year