# Part 1: EDA

In [None]:
# Core libraries for data handling (pandas) and plotting (matplotlib, seaborn)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation warnings;
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Global matplotlib defaults for every figure below: high-resolution output
# and no top/right axis borders.
plt.rcParams.update(
    {
        "figure.dpi": 200,
        "axes.spines.top": False,
        "axes.spines.right": False,
    }
)

Loading dataset

In [None]:
# Locations of the competition CSV files (relative to the notebook).
train_csv = "../input/tabular-playground-series-jan-2022/train.csv"
test_csv = "../input/tabular-playground-series-jan-2022/test.csv"

# Read both splits into pandas DataFrames.
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Keep independent snapshots of the freshly loaded frames. A plain assignment
# would only alias `train`/`test`, so the in-place column edits made in later
# cells (date conversion, dayofweek, weekend) would silently change the
# "backup" as well.
train_data = train.copy()
test_data = test.copy()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Preview the first 10 training rows.
train.head(10)

In [None]:
# Preview the first 10 test rows.
test.head(10)

Checking for null values 

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

The `date` column is stored as text, so we need to convert it to a datetime type.

In [None]:
# Column dtypes of the training frame, inspected before converting `date`.
train.dtypes

In [None]:
# Column dtypes of the test frame, inspected before converting `date`.
test.dtypes

In [None]:
# Parse the `date` column of both splits into datetime values.
for frame in (train, test):
    frame["date"] = pd.to_datetime(frame["date"])

Finding unique values in the train data

In [None]:
# Row count for each distinct value of `country` in the training data.
c=train["country"].value_counts()

In [None]:
# Row count for each distinct value of `store` in the training data.
s=train["store"].value_counts()

In [None]:
# Row count for each distinct value of `product` in the training data.
p=train["product"].value_counts()

Visualizing Product

In [None]:
# Pie chart of how the training rows are split across products.
# Uses an explicit Axes and a title so the figure stands on its own.
fig, ax = plt.subplots()
ax.pie(p.values, labels=p.index)
ax.set_title("Share of training rows by product")
ax.legend()
plt.show()

Visualizing stores using a bar chart

In [None]:
# Bar chart of the number of training rows per store.
# Uses an explicit Axes plus title and axis labels so the figure is readable
# without the surrounding text.
fig, ax = plt.subplots()
ax.bar(s.index, s.values, color="purple")
ax.set_title("Training rows per store")
ax.set_xlabel("Store")
ax.set_ylabel("Number of rows")
plt.show()

Visualizing countries using a bar chart

In [None]:
# Bar chart of the number of training rows per country.
# Uses an explicit Axes plus title and axis labels so the figure is readable
# without the surrounding text.
fig, ax = plt.subplots()
ax.bar(c.index, c.values, color="pink")
ax.set_title("Training rows per country")
ax.set_xlabel("Country")
ax.set_ylabel("Number of rows")
plt.show()

In [None]:
# Display the per-country row counts computed above.
c

In [None]:
# Display the per-store row counts computed above.
s

In [None]:
# Display the per-product row counts computed above.
p

Counting the values in each of these columns shows that the dataset is well balanced across countries, stores, and products.

- Train data starts in 2015 and ends in 2018 
- Test data starts in 2019 and ends in 2019

In [None]:
# Report the first and last dates covered by the training data.
train_first_date = train["date"].min()
train_last_date = train["date"].max()
print("Train-Data")
print("Duration Starts from: ", train_first_date)
print("Duration Ends from: ", train_last_date)

In [None]:
# Report the first and last dates covered by the test data.
test_first_date = test["date"].min()
test_last_date = test["date"].max()
print("Test-Data")
print("Duration Starts from: ", test_first_date)
print("Duration Ends from: ", test_last_date)

Grouping by months

In [None]:
# Index the training rows by date once so both monthly aggregations can reuse it.
train_by_date = train.set_index("date")

# Mean num_sold per month, with one column per (country, store, product).
train_month = (
    train_by_date
    .groupby([pd.Grouper(freq="M"), "country", "store", "product"])["num_sold"]
    .mean()
    .unstack([1, 2, 3])
)

# Mean num_sold per month, with one column per country.
train_month_country = (
    train_by_date
    .groupby([pd.Grouper(freq="M"), "country"])["num_sold"]
    .mean()
    .unstack()
)

In [None]:
# Preview the first rows of the monthly means per (country, store, product).
train_month.head()

Grouping by month makes it easier to see overall trends.

In [None]:
# Mean num_sold per calendar month over the whole training set.
monthly_mean = (
    train.set_index("date")
    .groupby([pd.Grouper(freq="M")])["num_sold"]
    .mean()
)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 9))
monthly_mean.plot(ax=ax)
ax.set_title("Monthly Trend", fontsize=15, loc="left")
plt.show()

In [None]:
# Mean num_sold per calendar month, one line per country.
monthly_mean_by_country = (
    train.set_index("date")
    .groupby([pd.Grouper(freq="M"), "country"])["num_sold"]
    .mean()
    .unstack()
)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 9))
monthly_mean_by_country.plot(ax=ax)
ax.set_title("Monthly Trends By Country")
plt.show()

Grouping by days
> Time series data such as product sales often have different distributions on weekends and weekdays.

In [None]:
# Day-of-week number for every row (Monday=0 ... Sunday=6).
# `.dt.weekday` is the pandas alias of `.dt.dayofweek`.
train["dayofweek"] = train["date"].dt.weekday

Here is a visualization of the monthly average for each day of the week, to see the weekend trends:

- 0: Mon
- 1: Tue
- 2: Wed
- 3: Thu
- 4: Fri
- 5: Sat
- 6: Sun

In [None]:
# Mean num_sold per calendar month, one line per day of the week.
monthly_mean_by_weekday = (
    train.set_index("date")
    .groupby([pd.Grouper(freq="M"), "dayofweek"])["num_sold"]
    .mean()
    .unstack()
)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(9, 6))
monthly_mean_by_weekday.plot(ax=ax)
ax.set_title("Trend by Day of the Week")
plt.show()

Weekends vs Weekdays

In [None]:
# Flag Saturday (5) and Sunday (6). A vectorized comparison replaces the
# row-by-row `apply(lambda ...)` and yields the same boolean column.
train["weekend"] = train["dayofweek"] >= 5

# Mean num_sold per calendar month, one line for weekends and one for weekdays.
fig, ax = plt.subplots(1, 1, figsize=(12, 9))
(
    train.set_index("date")
    .groupby([pd.Grouper(freq="M"), "weekend"])["num_sold"]
    .mean()
    .unstack()
    .plot(ax=ax)
)
ax.set_title("Weekend vs Weekdays Trend Comparison")
plt.show()

# New Notebook Starts Here
> # Part 2: Prediction

In [None]:
!pip install pycaret[full]

In [None]:
import gc

import cudf
import numpy as np
import pandas as pd
from pycaret.regression import *

In [None]:
# Competition CSV locations (relative to the notebook).
train_path = '../input/tabular-playground-series-jan-2022/train.csv'
test_path = '../input/tabular-playground-series-jan-2022/test.csv'

# Read each file with cuDF using `row_id` as the index, then convert the
# result to a pandas DataFrame.
train = cudf.read_csv(train_path, index_col = 'row_id').to_pandas()
test = cudf.read_csv(test_path, index_col = 'row_id').to_pandas()

In [None]:
def feature_eng(df):
    """Replace the ``date`` column of ``df`` with calendar features, in place.

    Columns added:
        week      -- ISO week number of the date
        year      -- 'Y' + four-digit year, as a string label
        quarter   -- 'Q' + quarter number, as a string label
        day       -- day of the month
        dayofyear -- day of the year, shifted back by one from 29 Feb onwards
                     in leap years so 1 March is day 60 in every year
        weekend   -- True for Saturday/Sunday
        weekday   -- 'WD' + weekday number (Monday=0), as a string label

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.
            It is modified in place and the ``date`` column is removed.

    Returns:
        None. Callers rely on the in-place modification of ``df``.
    """
    dates = pd.to_datetime(df['date'])

    # `Series.dt.week` is deprecated and removed in pandas 2.x;
    # `dt.isocalendar().week` returns the same ISO week number. It comes back
    # as a nullable UInt32, so cast to a plain NumPy dtype as `.dt.week` gave
    # (int64, or float64 when some dates are missing).
    iso_week = dates.dt.isocalendar().week
    df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    df['year'] = 'Y' + dates.dt.year.astype(str)
    df['quarter'] = 'Q' + dates.dt.quarter.astype(str)
    df['day'] = dates.dt.day

    # In leap years, move every day from 29 Feb (day 60) onwards back by one so
    # the same calendar date maps to the same dayofyear in every year.
    day_of_year = dates.dt.dayofyear
    leap_shift = dates.dt.is_leap_year & (day_of_year >= 60)
    df['dayofyear'] = day_of_year.mask(leap_shift, day_of_year - 1)

    weekday_number = dates.dt.weekday
    df['weekend'] = weekday_number >= 5
    df['weekday'] = 'WD' + weekday_number.astype(str)

    # In-place drop is deliberate: the function's contract is to mutate `df`.
    df.drop(columns=['date'], inplace=True)

# Add the calendar features to both splits (each frame is modified in place).
for frame in (train, test):
    feature_eng(frame)

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|)), counting a
    term as 0 when both the true and the predicted value are 0.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The SMAPE as a float (NaN for empty input).
    """
    # Work on plain float arrays so lists, scalars and pandas objects with
    # differing indexes are compared position by position.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Use |y_true| as well as |y_pred| so the denominator cannot cancel out or
    # go negative when the observed values are negative.
    denominator = (np.abs(actual) + np.abs(predicted)) / 200.0
    abs_error = np.abs(actual - predicted)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # their initial 0.0, avoiding divide-by-zero warnings and inf/NaN values.
    diff = np.zeros_like(denominator)
    np.divide(abs_error, denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

In [None]:
# Initialise the PyCaret regression experiment on the engineered training frame.
# `session_id` fixes the random seed so model comparison and blending give the
# same results on every Restart & Run All.
# NOTE(review): `data_split_shuffle = False` only affects the train/hold-out
# split; confirm whether cross-validation should also be time-ordered
# (PyCaret's `fold_strategy` option).
reg = setup(data = train,
            target = 'num_sold',
            session_id = 42, #fixed seed for reproducible runs
            normalize = True, #normalisation helps some algorithms
            normalize_method = 'robust', #resilient to outliers
            transform_target = True, #applies transformation to target column
            data_split_shuffle = False, #so that we do not use "future" observations to predict "past" observations
            create_clusters = True,
            feature_interaction = True,
            use_gpu = True,
            silent = True,
            n_jobs = -1)

In [None]:
# `models` comes from the `pycaret.regression` star import; per the PyCaret
# docs it returns a table of the estimators available in this session.
models()

In [None]:
# Register the SMAPE function defined above as a custom PyCaret metric with
# id and display name 'SMAPE'; lower values are better.
add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better = False)

In [None]:
# Number of best models to keep for blending.
N = 3
# Compare the candidate models, rank them by the custom SMAPE metric, and keep the top N.
top = compare_models(sort = 'SMAPE', n_select = N)

In [None]:
# Combine the selected top models into a single blended estimator.
blend = blend_models(top)
# Score the blend; with no `data` argument PyCaret presumably uses the
# hold-out split created by setup() (confirm against the PyCaret docs).
# The trailing `;` suppresses the returned frame.
predict_model(blend);

In [None]:
# Finalize the blended model for use on unseen data.
# NOTE(review): per the PyCaret docs, finalize_model refits on the full
# dataset including the hold-out split, so the score printed by the
# predict_model call below is likely optimistic. Confirm before quoting it.
final_blend = finalize_model(blend)
predict_model(final_blend);

In [None]:
# Free unreferenced objects before running inference.
gc.collect()
# Predict on the engineered test frame with the finalized blend.
unseen_predictions_blend = predict_model(final_blend, data=test)
# Preview the first rows of the prediction frame.
unseen_predictions_blend.head()

In [None]:
gc.collect()

# There must be exactly one prediction per test row.
assert len(unseen_predictions_blend) == len(test.index)

# Pair each test row id (the frame's index) with its predicted value, by position.
sub = pd.DataFrame(
    {
        'row_id': list(test.index),
        'num_sold': list(unseen_predictions_blend.Label),
    }
)

sub.to_csv('submission.csv', index = False)

print(sub)