### Import the necessary packages.

In [None]:
!pip install -q klib

In [None]:
import klib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualization
from matplotlib import pyplot as plt # visualization
%matplotlib inline
import warnings
import re
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder

### Import train and test dataset

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-jan-2022/train.csv"
)
test = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-jan-2022/test.csv"
)

### Structure of the train and test dataset.

In [None]:
# Print column names, dtypes and non-null counts of the train set.
train.info()

#### The above information shows that there are 26298 data entries and 6 columns.

#### The target column is **num_sold**.

In [None]:
# Print column names, dtypes and non-null counts of the test set.
test.info()

#### The above information shows that there are 6570 data entries and 5 columns.

#### Let's check whether there are any missing values in the train and test datasets, column by column.

In [None]:
# Number of null entries in each column of the train set.
train.isnull().sum()

#### The above information shows that the train dataset doesn't have any missing values.

In [None]:
# Number of null entries in each column of the test set.
test.isnull().sum()

#### The above information shows that the test dataset doesn't have any missing values.

#### Let's take a glimpse at the train dataset.

In [None]:
# Show the first rows of the train set.
train.head()

### Let's explore and visualize each column of the train dataset. First, let's see the distribution of the target column (**num_sold**).

In [None]:
def box_plot(df,col):
    """Draw a green boxplot of one column on a new 8x8 figure.

    Parameters
    ----------
    df : DataFrame holding the data to plot.
    col : str, name of the column drawn on the y-axis; it is also used
        (capitalized) in the title and as the y-axis label.

    Returns nothing; the plot is drawn on the current matplotlib figure.
    """
    text_size = 15
    plt.figure(figsize=(8, 8))
    sns.boxplot(y=col, data=df, color="green")
    plt.title(f"{col.capitalize()} Distribution Boxplot", fontsize=text_size)
    plt.ylabel(col, fontsize=text_size)
    plt.yticks(fontsize=text_size)

In [None]:
# Distribution plot of the target column, drawn with klib.
klib.dist_plot(train['num_sold'])

In [None]:
# Boxplot of the target column using the box_plot helper.
box_plot(train,'num_sold')

In [None]:
# Summary statistics of the target column.
train['num_sold'].describe()

#### The above histogram shows that the **num_sold** column is positively skewed.

#### The boxplot explains there are outliers above the third quartile.

#### The average number of items sold is 387.

#### The minimum number of items sold is 70.

#### The maximum number of items sold is 2884.

### Let's see Country column distribution and see how item sales differ in those countries.

In [None]:
def group_wise_box(df,groupcol,value,title=""):
    """Draw one box of ``value`` per category of ``groupcol`` on a new 10x6 figure.

    Parameters
    ----------
    df : DataFrame holding both columns.
    groupcol : str, column whose categories go on the x-axis.
    value : str, numeric column whose distribution is drawn on the y-axis.
    title : str, figure title (empty by default).

    Returns nothing; the plot is drawn on the current matplotlib figure.
    """
    plt.figure(figsize=(10, 6))
    categories = df[groupcol]
    measurements = df[value]
    sns.boxplot(x=categories, y=measurements)
    plt.title(title, fontsize=25)
    plt.xlabel(groupcol, fontsize=20)
    plt.ylabel(value, fontsize=20)
    plt.xticks(fontsize=18)

In [None]:
def group_summary(df,groupcol,value):
    """Return ``describe()`` statistics of ``value`` for each group of ``groupcol``.

    Parameters
    ----------
    df : DataFrame to summarize.
    groupcol : column name (or list of names) to group by.
    value : name of the column to describe.

    Returns
    -------
    DataFrame with the grouping key(s) as regular columns followed by the
    ``describe()`` statistics, ordered from highest to lowest group mean.
    """
    per_group_stats = df.groupby(groupcol)[value].describe()
    flattened = per_group_stats.reset_index()
    return flattened.sort_values(by='mean', ascending=False)

In [None]:
def count_plot(df,col,title=""):
    """Draw a bar chart of how often each value of ``col`` occurs, on a new 10x6 figure.

    Parameters
    ----------
    df : DataFrame holding the column.
    col : str, column whose values are counted; also used as the x-axis label.
    title : str, figure title (empty by default).

    Bars follow the order of ``value_counts()`` (most frequent value first).
    Returns nothing; the plot is drawn on the current matplotlib figure.
    """
    plt.figure(figsize=(10, 6))
    column = df[col]
    bar_order = df[col].value_counts().index
    sns.countplot(x=column, order=bar_order)
    plt.title(title, fontsize=25)
    plt.xlabel(col, fontsize=20)
    plt.xticks(fontsize=14)

In [None]:
# Bar chart of the number of train rows per country.
count_plot(train,'country',title="Country Distribution")

In [None]:
# Number of train rows per country.
train['country'].value_counts()

In [None]:
# Boxplot of num_sold per country (title typo "County" corrected to "Country").
group_wise_box(train,'country','num_sold',title="Country Wise Sales Distribution")

In [None]:
# describe() statistics of num_sold per country, highest mean first.
group_summary(train,'country','num_sold')

#### The above count plot shows that every country has the same number of sales records.

#### The group by boxplot explains that the number of sold items is low in Finland when compared to Sweden and Norway.


### Let's see Store column distribution and see how item sales differ in those stores.

In [None]:
# Bar chart of the number of train rows per store.
count_plot(train,'store',title="Store Distribution")

In [None]:
# Number of train rows per store.
train['store'].value_counts()

In [None]:
# Boxplot of num_sold per store.
group_wise_box(train,'store','num_sold',title="Store Wise Sales Distribution")

In [None]:
# describe() statistics of num_sold per store, highest mean first.
group_summary(train,'store','num_sold')

#### The above count plot shows that every store has the same number of sales records.

#### The group by boxplot explains that the number of sold items is low in kagglemart store.

#### There is a high difference in the mean number of sold items between kagglerama and kagglemart stores.


### Let's see product column distribution and see how item sales differ in those products.

In [None]:
# Bar chart of the number of train rows per product.
count_plot(train,'product',title="Product Distribution")

In [None]:
# Number of train rows per product.
train['product'].value_counts()

In [None]:
# Boxplot of num_sold per product.
group_wise_box(train,'product','num_sold',title="Product Wise Sales Distribution")

In [None]:
# describe() statistics of num_sold per product, highest mean first.
group_summary(train,'product','num_sold')

#### The above count plot shows that every product has the same number of sales records.

#### The grouped boxplot shows that the number of kaggle stickers sold is very low.

#### There is a high difference in the mean number of sold items between product categories.


### Let's create a grouped histogram, boxplot, and see the sales distribution.

In [None]:
# One panel per (store, country); inside each panel an unfilled polygon histogram
# of num_sold is drawn per product (hue).
g = sns.FacetGrid(
    data=train,
    row="store",
    col="country",
    hue='product',
    height=5,
    aspect=.9,
    margin_titles=True,
    legend_out=True,
)
g.map_dataframe(sns.histplot, x="num_sold", element="poly", fill=False)
g.add_legend();



In [None]:
# One panel per (store, country); inside each panel, one orange box of num_sold per product.
g = sns.FacetGrid(train, col="country", row="store", margin_titles=True, height=5, aspect=.9)
# No hue variable is mapped on this grid, so there are no legend entries to show;
# the former g.add_legend() call only produced an empty legend and was removed.
g.map_dataframe(sns.boxplot, x="product", y="num_sold", color='Orange');



In [None]:
# describe() statistics of num_sold for every (country, store, product) combination, highest mean first.
group_summary(train,['country','store','product'],'num_sold')

#### The above grouped histogram shows that, across every country and store, the kaggle sticker has the lowest number of items sold.

#### The kaggle hat is the most sold item across all countries store.

#### The boxplot explains that there are outliers in all groups.

#### Let's explore the date column and see how the sales changes over the years. 

#### Let's convert the date column to date format.

In [None]:

# Parse the raw date strings to datetime so the .dt accessor and time-axis plots can be used.
train['date'] = pd.to_datetime(train['date'], errors='coerce')
# errors='coerce' silently turns unparseable values into NaT; fail loudly here instead of
# letting such rows disappear unnoticed from the plots and date features that follow.
assert train['date'].notna().all(), "some values in 'date' could not be parsed"

In [None]:
# Line plot of num_sold against date over the whole train set, on a wide 15x5 figure.
plt.figure(figsize=(15, 5))
sns.lineplot(data=train, x="date", y="num_sold");

In [None]:
# One panel per (store, country); inside each panel, one num_sold-over-date line per product.
g = sns.FacetGrid(
    data=train,
    row="store",
    col="country",
    hue='product',
    height=7,
    aspect=.9,
    margin_titles=True,
)
g.map_dataframe(sns.lineplot, x="date", y="num_sold")
g.set_titles(size=15)
g.add_legend();



#### The above line plot shows that more items are sold at the end of the year.

### Let's break the date column into the day, month, year, quarter, week format and see how sales are changed.

In [None]:
# Derive calendar features from the 'date' column (which must already be datetime).
# One assignment per feature, in the same column order as before.
train['day'] = train['date'].dt.day
train['day_label'] = train['date'].dt.day_name()
train['day_number'] = train['date'].dt.dayofweek  # Monday=0 ... Sunday=6
train['month_number'] = train['date'].dt.month
train['month_label'] = train['date'].dt.strftime('%b')
train['year_quarter'] = train['date'].dt.quarter
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; isocalendar().week is the
# replacement. It returns a nullable UInt32 column, so cast back to what .dt.week produced:
# int64 when every date is valid, float64 (NaN for NaT) otherwise.
train['week_of_year'] = train['date'].dt.isocalendar().week.astype(
    'float64' if train['date'].isna().any() else 'int64'
)
train['year'] = train['date'].dt.year

### Let's see how sales are changing over each month of the year and country.

In [None]:
# One panel per (year, country); inside each panel, one num_sold-by-month line per product.
# sharex=False gives every panel its own x-axis.
g = sns.FacetGrid(
    data=train,
    row="year",
    col="country",
    hue='product',
    height=7,
    aspect=.9,
    sharex=False,
    margin_titles=True,
)
g.map_dataframe(sns.lineplot, x="month_label", y="num_sold")
g.set_titles(size=15)
g.add_legend();



#### The above plot shows that, every year, sales are high in the months of April to June and November to December.

### Let's see how sales are changing over each week of the year and country.

In [None]:
# One panel per (year, country); inside each panel, one num_sold-by-week-of-year line per product.
# sharex=False gives every panel its own x-axis.
g = sns.FacetGrid(
    data=train,
    row="year",
    col="country",
    hue='product',
    height=7,
    aspect=.9,
    sharex=False,
    margin_titles=True,
)
g.map_dataframe(sns.lineplot, x="week_of_year", y="num_sold")
g.set_titles(size=15)
g.add_legend();



#### The above plot shows that, every year, sales are high in weeks 10 to 25 and 50 to 52.

### Let's see how sales are changing over each day of the week and country.

In [None]:
# One panel per (year, country); inside each panel, one num_sold-by-weekday line per product.
# sharex=False gives every panel its own x-axis.
g = sns.FacetGrid(
    data=train,
    row="year",
    col="country",
    hue='product',
    height=7,
    aspect=.9,
    sharex=False,
    margin_titles=True,
)
g.map_dataframe(sns.lineplot, x="day_label", y="num_sold")
g.set_titles(size=15)
g.add_legend();



#### The above plot explains that the sales are high on the weekend days of each week.

### Let's see how sales are changing over each quarter of the year and country.

In [None]:
# One panel per (year, country); inside each panel, one num_sold-by-quarter line per product.
# sharex=False gives every panel its own x-axis.
g = sns.FacetGrid(
    data=train,
    row="year",
    col="country",
    hue='product',
    height=7,
    aspect=.9,
    sharex=False,
    margin_titles=True,
)
g.map_dataframe(sns.lineplot, x="year_quarter", y="num_sold")
g.set_titles(size=15)
g.add_legend();



#### The above plot shows that sales are high in the 2nd and 4th quarters of each year.