# Import Libraries

In [None]:
%%capture
!pip install pycaret[full]
!pip install autoviz

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import os
import shutil
import warnings
import seaborn as sns
import datatable as dt
from pathlib import Path
import plotly.express as px
from pycaret.regression import *
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
%matplotlib inline
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the train and test CSVs of the Tabular Playground Series (Jan 2022)
# dataset from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')

In [None]:
def add_date_features(frame):
    """Return ``frame`` with its ``date`` column expanded into calendar features.

    The ``date`` column is parsed with ``pd.to_datetime`` and replaced by the
    integer columns ``year``, ``month``, ``day``, ``dayofweek``, ``dayofmonth``,
    ``dayofyear`` and ``weekday`` (appended in that order).

    Notes:
        * ``dayofmonth`` holds the *number of days in the month*
          (``days_in_month``), not the day number; the day number is ``day``.
        * ``weekday`` carries the same values as ``dayofweek``. Both are kept so
          the feature set seen by the model is unchanged.

    Args:
        frame: DataFrame that contains a ``date`` column.

    Returns:
        A new DataFrame without ``date`` and with the feature columns added.
        If ``frame`` has no ``date`` column (the cell was already run), it is
        returned unchanged so that re-running the cell does not raise.
    """
    if 'date' not in frame.columns:
        return frame

    stamps = pd.to_datetime(frame['date'])
    expanded = frame.assign(
        year=stamps.dt.year,
        month=stamps.dt.month,
        day=stamps.dt.day,
        dayofweek=stamps.dt.dayofweek,
        dayofmonth=stamps.dt.days_in_month,
        dayofyear=stamps.dt.dayofyear,
        weekday=stamps.dt.weekday,
    )
    return expanded.drop(columns='date')


# Apply the identical transformation to both splits so their columns stay aligned.
train = add_date_features(train)
test = add_date_features(test)

In [None]:
# Preview the first rows of the training frame after the date features were added.
train.head()

# EDA

In [None]:
# Pie chart of `num_sold` with one sector per value of `product`.
fig = px.pie(train, values='num_sold', names='product', title= "Distribution of Products Sold")
fig.show()

In [None]:
# Pie chart of `num_sold` with one sector per value of `country`.
fig = px.pie(train, values='num_sold', names='country', title= "Distribution of Products Sold by Country")
fig.show()

In [None]:
def _product_totals(frame, country):
    """Sum ``num_sold`` per ``product`` for the rows of ``frame`` in ``country``.

    Returns a two-column DataFrame (``product``, ``num_sold``) with a fresh
    integer index.
    """
    in_country = frame.loc[frame['country'] == country]
    return in_country.groupby('product')['num_sold'].sum().reset_index()


# Per-product sales totals for each of the three countries plotted below.
norway = _product_totals(train, 'Norway')
finland = _product_totals(train, 'Finland')
sweden = _product_totals(train, 'Sweden')

In [None]:
# One donut chart per country, stacked vertically.
# Each entry: (subplot row, per-product totals, centre title, seaborn palette name).
donut_specs = [
    (1, norway, 'Distribution of<br>Kaggle Products<br>sales in Norway', 'Reds'),
    (2, finland, 'Distribution of<br>Kaggle Products<br>sales in Finland', 'Blues'),
    (3, sweden, 'Distribution of<br>Kaggle Products<br>sales in Sweden', 'Wistia'),
]

fig = make_subplots(
    rows=3,
    cols=1,
    specs=[[{'type': 'domain'}] for _ in donut_specs],
    vertical_spacing=0.1,
)

for row, totals, caption, palette in donut_specs:
    donut = go.Pie(
        values=totals['num_sold'],
        labels=totals['product'],
        title=dict(text=caption, font=dict(size=18, family='monospace')),
        hole=0.5,
        hoverinfo='label+percent',
    )
    fig.add_trace(donut, row, 1)

    # Style only the trace that lives in this subplot cell.
    fig.update_traces(
        row=row,
        col=1,
        hoverinfo='label+percent',
        textinfo='label+percent',
        textfont_size=12,
        opacity=0.8,
        showlegend=False,
        marker=dict(
            colors=sns.color_palette(palette).as_hex(),
            line=dict(color='#000000', width=1),
        ),
    )

# Kept as the last expression of the cell so the notebook renders the figure.
fig.update_layout(
    margin=dict(t=0, b=0, l=0, r=0),
    height=1200,
    font_family='monospace',
)

In [None]:
# Total units sold per country as a two-column frame (country, num_sold).
sold = train.groupby('country')['num_sold'].sum().reset_index()

In [None]:
# Bar chart of total units sold per country.
# The three marker colours are applied to the rows of `sold` in order.
country_bars = go.Bar(
    x=sold['country'],
    y=sold['num_sold'],
    text=sold['num_sold'],
    textposition='outside',
    marker_color=['#2d8fb5', '#eb8f8f', '#ebe88f'],
    width=0.4,
)
fig = go.Figure()
fig.add_trace(country_bars)

# Axis cosmetics: no grid or axis lines, outside ticks with larger labels on x.
fig.update_xaxes(
    tickfont=dict(size=17),
    tickmode='array',
    ticklen=6,
    showline=False,
    showgrid=False,
    ticks='outside',
)
fig.update_yaxes(
    showgrid=False,
    categoryorder='total ascending',
    showline=False,
)

fig.update_layout(
    font_family='monospace',
    title=dict(text='Total units sold by Country', x=0.525),
    margin=dict(t=80, b=0, l=70, r=40),
    font=dict(color='black'),
    showlegend=False,
)
fig.show()

In [None]:
# Run AutoViz on the in-memory training frame.
# The first positional argument is an empty string because the data is handed
# over through `dfte`; `depVar` is empty, so no dependent variable is named.
sep = ","
dft = AV.AutoViz(
    '',
    sep=sep,
    depVar="",
    dfte=train,
    header=0,
    verbose=0,
    lowess=False,
    chart_format="svg",
    max_rows_analyzed=150000,
    max_cols_analyzed=30,
)

In [None]:
# Credit to https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``.
    Elements where both the actual and the predicted value are zero
    contribute 0 instead of a 0/0 result.

    Args:
        y_true: Actual values (array-like: list, ndarray, pandas Series, ...).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The mean SMAPE over all elements as a float (lower is better).
    """
    # Work on plain float arrays so that pandas inputs are compared by position
    # (not aligned by index) and plain Python lists are accepted as well.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    # Magnitudes of BOTH series go into the denominator; using the signed
    # actuals would shrink or flip the denominator for negative values.
    denominator = (np.abs(actual) + np.abs(forecast)) / 200.0
    abs_error = np.abs(actual - forecast)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with, so no divide-by-zero warning is raised.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff)

In [None]:
# Count the missing values in each column of the training frame.
train.isna().sum()

# Modelling

In [None]:
# Initialise the PyCaret regression experiment on the training frame.
# `session_id` fixes the random seed so that model comparison, blending and the
# resulting submission are reproducible across Restart & Run All.
# NOTE(review): `data_split_shuffle = False` keeps the rows in their existing order
# for the train/hold-out split; this presumably relies on the rows being sorted
# by date - TODO confirm.
setup(data = train,
            target = 'num_sold',
            session_id = 42,
            data_split_shuffle = False, # we do not use "future" observations to predict "past" observations
            create_clusters = True,
            use_gpu = True,
            silent = True,
            n_jobs = -1)

In [None]:
# Register the SMAPE function with PyCaret under the id/name 'SMAPE';
# `greater_is_better = False` marks it as an error metric (lower is better).
add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better = False)

In [None]:
# Number of models to keep from the comparison.
N = 3
# Compare the candidate models, sorted by the SMAPE metric, and keep the N selected ones.
top = compare_models(sort = 'SMAPE', n_select = N)

In [None]:
# Combine the selected models into a single blended estimator.
blend = blend_models(top)
# Called without `data`; presumably this scores the blend on PyCaret's hold-out
# split - confirm against the PyCaret docs. The trailing `;` hides the returned frame.
predict_model(blend);

In [None]:
# Finalize the blend before predicting on the test set.
# NOTE(review): finalize_model presumably refits on the full dataset including the
# hold-out rows, in which case the score printed below is optimistic - confirm.
final_blend = finalize_model(blend)
predict_model(final_blend);

In [None]:
# Free unreferenced objects before scoring the test frame.
gc.collect()
# Predict with the finalized blend on the feature-engineered test frame.
unseen_predictions = predict_model(final_blend, data=test)
unseen_predictions.head()

In [None]:
gc.collect()

# Every test row must have exactly one prediction.
assert len(test.index) == len(unseen_predictions)

# Pair each row_id with its predicted value (the `Label` column) by position.
submission_rows = zip(test['row_id'], unseen_predictions['Label'])
sub = pd.DataFrame(list(submission_rows), columns=['row_id', 'num_sold'])

sub.to_csv('submission.csv', index=False)

sub.head()

Please upvote if you like this work.

I am working on this notebook daily, so check back for more information.

Thank you.