In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Loading the Datasets

In [None]:
# Read the three competition files; only the training frame uses `row_id` as its index.
submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
train = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv',
    index_col='row_id',
)
print("--Done--")

In [None]:
# Peek at two randomly drawn training rows (no seed is passed, so the rows differ between runs).
train.sample(2)

In [None]:
# Peek at two randomly drawn test rows (no seed is passed, so the rows differ between runs).
test.sample(2)

In [None]:
# Report (rows, columns) for both frames.
for label, frame in (('Training', train), ('Test', test)):
    print(f'Shape of {label} Dataset: {frame.shape}')

In [None]:
# Summary statistics of the training frame (pandas describes numeric columns by default).
train.describe()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Total number of missing cells across every column of the training frame.
train.isna().sum().sum()

#### Observations:
- We have a large enough dataset to train our model
- Creating a validation set could be an issue
- The only numerical column is the target variable (`num_sold`); everything else is currently categorical. We will convert the date to a datetime object, as we need to analyse trends in the dataset
- There are no null values in the dataset. Peace.

## Exploratory Data Analysis

In [None]:
# Frequency of each category level, computed the same way for the train and test frames.
_count_cols = ('store', 'country', 'product')
train_store, train_country, train_product = (train[col].value_counts() for col in _count_cols)
test_store, test_country, test_product = (test[col].value_counts() for col in _count_cols)

In [None]:
def visualize(x_axis, y_axis, title):
    """Draw a bar plot with each bar annotated by its height.

    Parameters
    ----------
    x_axis : array-like
        Category labels, one per bar.
    y_axis : array-like
        Bar heights, aligned with ``x_axis``.
    title : str
        Title shown above the plot.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.spines[["right", "top"]].set_visible(False)
    # x/y are passed by keyword (seaborn >= 0.12 rejects positional data arguments)
    # and the plot is bound to `ax` explicitly instead of the implicit current axes.
    sns.barplot(x=x_axis, y=y_axis, alpha=0.8, palette='ocean', ax=ax)
    ax.set_title(title, fontweight='bold')
    if ax.patches:
        # Guarded so that an empty input does not raise ZeroDivisionError.
        ax.set_box_aspect(2.5 / len(ax.patches))
    for bar in ax.patches:
        height = bar.get_height()
        # Centre the label horizontally and place it at half the bar's height.
        x = bar.get_x() + bar.get_width() / 2
        y = bar.get_y() + height / 2
        ax.text(x, y, height, ha="center", fontweight='bold')
    plt.show()

In [None]:
# Row counts per store, train versus test.
for counts, split in ((train_store, 'Train'), (test_store, 'Test')):
    visualize(counts.index, counts.values, f'Distribution count of items from stores - {split}')

In [None]:
# Row counts per country, train versus test.
for counts, split in ((train_country, 'Train'), (test_country, 'Test')):
    visualize(counts.index, counts.values, f'Distribution count of items for Countries - {split}')

In [None]:
# Row counts per product, train versus test.
for counts, split in ((train_product, 'Train'), (test_product, 'Test')):
    visualize(counts.index, counts.values, f'Distribution count of products - {split}')

#### Observations:
- Labels are equally distributed for Training and Test set for Country, Products and Store

## In-Depth Analysis

In [None]:
# Parse the `YYYY-MM-DD` strings so the column supports datetime operations.
parsed_dates = pd.to_datetime(train['date'], format='%Y-%m-%d')
train['date'] = parsed_dates
train.info()

We have successfully converted the `date` column to a datetime object

In [None]:
# Total units sold per calendar day. Only `num_sold` is aggregated: the frame
# also holds text columns, which pandas >= 2.0 no longer drops silently from sum().
g_time = train.groupby('date', as_index=False)['num_sold'].sum()

fig, ax = plt.subplots(figsize=(20, 7))
# x/y are given by keyword; seaborn >= 0.12 rejects positional data arguments.
sns.lineplot(data=g_time, x='date', y='num_sold', color='crimson', alpha=0.8, ax=ax)
ax.spines[["top", "right"]].set_visible(False)
ax.grid(linestyle='-.', alpha=0.3, color='b')

ax.set_title('Trend of sold items by day', fontweight='bold', fontsize=18)
ax.set_xlabel('Date', fontsize=15, fontweight='bold')
ax.set_ylabel('Items Sold', fontsize=15, fontweight='bold')
plt.show()

> Sales grow from year to year, and a massive surge is seen around the start of every year.

In [None]:
# Total units sold per country. Only `num_sold` is summed: the frame also holds
# datetime and text columns, which pandas >= 2.0 no longer drops silently from sum().
g_country = train.groupby('country', as_index=False)['num_sold'].sum()
visualize(g_country['country'], g_country['num_sold'], 'Sold items by Country')

Norway wins the race of sales followed by Sweden and a close third, Finland

In [None]:
# Total units sold per store. Only `num_sold` is summed: the frame also holds
# datetime and text columns, which pandas >= 2.0 no longer drops silently from sum().
g_store = train.groupby('store', as_index=False)['num_sold'].sum()
visualize(g_store['store'], g_store['num_sold'], 'Sold items by Store')

KaggleRama undoubtedly has a strong marketing team, I'm convinced.

In [None]:
# Total units sold per product. Only `num_sold` is summed: the frame also holds
# datetime and text columns, which pandas >= 2.0 no longer drops silently from sum().
g_product = train.groupby('product', as_index=False)['num_sold'].sum()

fig, ax = plt.subplots(figsize=(15, 5))
# Bind the plot to the axes created above rather than the implicit current axes.
sns.barplot(x="product", y='num_sold', data=g_product, alpha=0.8, saturation=3, palette='ocean', ax=ax)
ax.set_box_aspect(2.5 / len(ax.patches))
for bar in ax.patches:
    height = bar.get_height()
    # Centre the label horizontally and place it at half the bar's height.
    x = bar.get_x() + bar.get_width() / 2
    y = bar.get_y() + height / 2
    ax.text(x, y, height, ha="center", fontweight='bold')
ax.set_title('Sold items by Product', fontsize=16, fontweight='bold')
plt.show()

Kaggle Hat is the most sold product followed by Kaggle Mug and finally Kaggle Sticker

### Breaking the `date` column to Year, Month and Days 

In [None]:
# Derive the calendar year plus the month and weekday *names* from the parsed date.
date_accessor = train['date'].dt
train['year'] = date_accessor.year
train['month'] = date_accessor.month_name()
train['day'] = date_accessor.day_name()
train.sample(2)

In [None]:
# Total units sold per year. Only `num_sold` is summed: the frame also holds
# datetime and text columns, which pandas >= 2.0 no longer drops silently from sum().
g_year = train.groupby('year', as_index=False)['num_sold'].sum()

fig, ax = plt.subplots(figsize=(20, 5))
# x/y are given by keyword; seaborn >= 0.12 rejects positional data arguments.
sns.lineplot(data=g_year, x='year', y='num_sold', linewidth=3, color='crimson', ax=ax)
ax.spines[['top', 'right']].set_visible(False)
ax.grid(linestyle='-.', alpha=0.3, color='b')

ax.set_title("Yearly trend of sales", fontsize=16, fontweight='bold')
# One tick per year, so the axis does not show fractional years.
ax.set_xticks(g_year['year'].tolist())
ax.set_xlabel('Year', fontsize=15, fontweight='bold')
ax.set_ylabel('Items Sold', fontsize=15, fontweight='bold')
plt.show()

> We see a Linear increase of sales starting from 2016

In [None]:
# Calendar order for the month names; groupby would otherwise sort them alphabetically.
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Total units sold per month name. Only `num_sold` is summed: the frame also holds
# datetime and text columns, which pandas >= 2.0 no longer drops silently from sum().
g_month = train.groupby('month', as_index=False)['num_sold'].sum()
g_month['month'] = pd.Categorical(g_month['month'], categories=months, ordered=True)
g_month = g_month.sort_values('month')

fig, ax = plt.subplots(figsize=(20, 5))
# x/y are given by keyword; seaborn >= 0.12 rejects positional data arguments.
sns.lineplot(data=g_month, x='month', y='num_sold', linewidth=3, color='dodgerblue', sort=False, ax=ax)
ax.spines[['top', 'right']].set_visible(False)
ax.grid(linestyle='-.', alpha=0.3, color='b')

ax.set_title("Monthly trend of sales", fontsize=16, fontweight='bold')
ax.set_xlabel('Month', fontsize=15, fontweight='bold')
ax.set_ylabel('Items Sold', fontsize=15, fontweight='bold')
plt.show()

This is interesting. 

- We know that there was a massive increase of sales around the end of the year **December, January**
- There are two peaks: another surge in sales occurs around the **March, April, May** period, probably due to *Easter* and *St. Patrick's Day*
- We see a drop in sales from **June till September**

In [None]:
# Weekday order, Monday first. The spelling must match `Series.dt.day_name()` exactly:
# a misspelt entry is not a known category, so that weekday would become NaN and
# drop out of the plot.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Total units sold per weekday. Only `num_sold` is summed: the frame also holds
# datetime and text columns, which pandas >= 2.0 no longer drops silently from sum().
g_day = train.groupby('day', as_index=False)['num_sold'].sum()
g_day['day'] = pd.Categorical(g_day['day'], categories=days, ordered=True)
g_day = g_day.sort_values('day')

fig, ax = plt.subplots(figsize=(20, 5))
# x/y are given by keyword; seaborn >= 0.12 rejects positional data arguments.
sns.lineplot(data=g_day, x='day', y='num_sold', linewidth=3, color='green', sort=False, ax=ax)
ax.spines[['top', 'right']].set_visible(False)
ax.grid(linestyle='-.', alpha=0.3, color='b')

ax.set_title("Daily trend of sales", fontsize=16, fontweight='bold')
ax.set_xlabel('Days', fontsize=15, fontweight='bold')
ax.set_ylabel('Items Sold', fontsize=15, fontweight='bold')
plt.show()

A trend of increase in sales is seen at the start of weekends

In [None]:
# Total units sold per (year, country). Only `num_sold` is summed: the frame also holds
# datetime and text columns, which pandas >= 2.0 no longer drops silently from sum().
count_year = train.groupby(['year', 'country'], as_index=False)['num_sold'].sum()

fig, ax = plt.subplots(figsize=(20, 5))
# x/y/hue are given by keyword; seaborn >= 0.12 rejects positional data arguments.
sns.lineplot(
    data=count_year, x='year', y='num_sold', hue='country',
    palette=['orange', 'purple', 'seagreen'], linewidth=3, ax=ax,
)
ax.spines[['top', 'right']].set_visible(False)
ax.grid(linestyle='-.', alpha=0.3, color='b')

ax.set_title("Yearly trend of sales based on Country", fontsize=16, fontweight='bold')
ax.set_xlabel('Year', fontsize=15, fontweight='bold')
ax.set_ylabel('Items Sold', fontsize=15, fontweight='bold')
# Ticks come from this cell's own frame, so the cell does not depend on an earlier one.
ax.set_xticks(count_year['year'].unique().tolist())
plt.show()

> It's probably a minor change, but Finland's sales seem to grow faster than Sweden's. It is possible that Finland overtakes Sweden in the upcoming years.

In [None]:
# Total units sold per (month, country). Only `num_sold` is summed: the frame also holds
# datetime and text columns, which pandas >= 2.0 no longer drops silently from sum().
count_month = train.groupby(['month', 'country'], as_index=False)['num_sold'].sum()
count_month['month'] = pd.Categorical(count_month['month'], categories=months, ordered=True)
# Sorting on country as well keeps the country -> colour assignment deterministic.
count_month = count_month.sort_values(['month', 'country'])

fig, ax = plt.subplots(figsize=(20, 5))
# x/y/hue are given by keyword; seaborn >= 0.12 rejects positional data arguments.
sns.lineplot(
    data=count_month, x='month', y='num_sold', hue='country',
    palette=['orange', 'purple', 'seagreen'], linewidth=3, ax=ax,
)
ax.spines[['top', 'right']].set_visible(False)
ax.grid(linestyle='-.', alpha=0.3, color='b')

ax.set_title("Monthly trend of sales based on Country", fontsize=16, fontweight='bold')
ax.set_xlabel('Month', fontsize=15, fontweight='bold')
ax.set_ylabel('Items Sold', fontsize=15, fontweight='bold')
plt.show()

In [None]:
# Total units sold per (weekday, country). Only `num_sold` is summed: the frame also holds
# datetime and text columns, which pandas >= 2.0 no longer drops silently from sum().
count_day = train.groupby(['day', 'country'], as_index=False)['num_sold'].sum()
count_day['day'] = pd.Categorical(count_day['day'], categories=days, ordered=True)
# Sorting on country as well keeps the country -> colour assignment deterministic.
count_day = count_day.sort_values(['day', 'country'])

fig, ax = plt.subplots(figsize=(20, 5))
# x/y/hue are given by keyword; seaborn >= 0.12 rejects positional data arguments.
sns.lineplot(
    data=count_day, x='day', y='num_sold', hue='country',
    palette=['orange', 'purple', 'seagreen'], linewidth=3, ax=ax,
)
ax.spines[['top', 'right']].set_visible(False)
ax.grid(linestyle='-.', alpha=0.3, color='b')

ax.set_title("Daily trend of sales based on Country", fontsize=16, fontweight='bold')
ax.set_xlabel('Days', fontsize=15, fontweight='bold')
ax.set_ylabel('Items Sold', fontsize=15, fontweight='bold')
plt.show()

The per-country monthly and daily plots lead to pretty similar conclusions.

In [None]:
# Rolling mean of the *daily* totals. The window is computed on `g_time` (one row
# per date) rather than on the raw `train` rows: a rolling result over `train` is
# indexed by `row_id`, so assigning it to `g_time` would align it with the wrong
# rows, and `train` also contains non-numeric columns. `.mean()` is used because
# the plot is labelled as a rolling average.
ROLLING_WINDOW_DAYS = 18  # original window size, now measured in days
g_time['roll'] = g_time['num_sold'].rolling(window=ROLLING_WINDOW_DAYS).mean()

fig, ax = plt.subplots(figsize=(20, 7))
ax.grid(linestyle='-.', alpha=0.3, color='b')
sns.lineplot(x='date', y='num_sold', data=g_time, color='crimson', alpha=0.8, ax=ax)
sns.lineplot(x='date', y='roll', data=g_time, color='black', alpha=0.8, linewidth=3, ax=ax)
ax.spines[["top", "right"]].set_visible(False)
ax.set_title('Trend of sold items by day vs Rolling average by day', fontweight='bold', fontsize=18)
ax.set_xlabel('Date', fontsize=15, fontweight='bold')
ax.set_ylabel('Items Sold', fontsize=15, fontweight='bold')
plt.show()

# Creating the Model

In [None]:
from xgboost import XGBRegressor
import lightgbm as lgb

### Preprocessing the data

In [None]:
# Numeric calendar features derived from the parsed `date` column.
# NOTE(review): `day_of_month` stores the number of days in the month
# (`days_in_month`); the day number itself is stored in `day_date`.
train_dates = train['date'].dt
train['day_of_year'] = train_dates.dayofyear
train['day_of_month'] = train_dates.days_in_month
train['day_date'] = train_dates.day

In [None]:
# Give the test frame the same date-derived columns that were added to the training frame.
# NOTE(review): `day_of_month` stores the number of days in the month
# (`days_in_month`); the day number itself is stored in `day_date`.
test['date'] = pd.to_datetime(test['date'], format='%Y-%m-%d')
test_dates = test['date'].dt

test['day_of_year'] = test_dates.dayofyear
test['day_of_month'] = test_dates.days_in_month
test['day_date'] = test_dates.day
test['year'] = test_dates.year
test['month'] = test_dates.month_name()
test['day'] = test_dates.day_name()

In [None]:
# One-hot encode every object-typed column; the column list is taken from the
# training frame and reused for the test frame.
catergorical_cols = list(train.select_dtypes(include='object').columns)
df_train, df_test = (
    pd.get_dummies(frame, columns=catergorical_cols) for frame in (train, test)
)

### Feature Engineering

In [None]:
# Bucket the sales volume into four ordinal bands:
#   1: num_sold <= 200, 2: (200, 500], 3: (500, 800], 4: everything else.
# The vectorised form replaces a per-row loop that wrote through chained indexing
# (`df['col'].loc[i] = ...`), which is slow and is not guaranteed to modify the frame.
# WARNING(review): `high_value` is derived directly from `num_sold`, the column the
# model is trained to predict, so using it as a feature leaks the target.
sold = df_train['num_sold']
df_train['high_value'] = np.select(
    [sold <= 200, sold <= 500, sold <= 800],  # evaluated in order; first match wins
    [1, 2, 3],
    default=4,
)

df_train.high_value.value_counts()

In [None]:
# For each test row label i, copy `high_value` from the training row with the same
# label when both rows fall on the same day of the year; otherwise keep 0.
# The vectorised form replaces a per-row loop that wrote through chained indexing
# (`df.col.loc[i] = ...`), which is slow and is not guaranteed to modify the frame.
train_rows = df_train.loc[df_test.index, ['day_of_year', 'high_value']]
same_day = train_rows['day_of_year'].to_numpy() == df_test['day_of_year'].to_numpy()
df_test['high_value'] = np.where(same_day, train_rows['high_value'].to_numpy(), 0)

df_test.high_value.value_counts()

In [None]:
# Label-based hold-out split on `row_id`. `.loc` slices include BOTH end points,
# so the training slice stops at SPLIT_ROW - 1; otherwise row SPLIT_ROW would be
# in the training and the validation set at the same time.
SPLIT_ROW = 21038
# `.copy()` makes the parts independent frames so that `pop` below does not
# operate on a slice of `df_train`.
X_train = df_train.loc[:SPLIT_ROW - 1].copy()
X_valid = df_train.loc[SPLIT_ROW:].copy()
y_train = X_train.pop('num_sold')
y_valid = X_valid.pop('num_sold')

In [None]:
# Remove the raw datetime column from both feature frames, and the id and
# datetime columns from the test frame.
X_train, X_valid = (part.drop(columns='date') for part in (X_train, X_valid))
df_test = df_test.drop(columns=['row_id', 'date'])

In [None]:
# Report the shape of every frame/series that feeds the models.
_named_parts = (
    ('X_train', X_train),
    ('X_valid', X_valid),
    ('y_train', y_train),
    ('y_valid', y_valid),
    ('df_test', df_test),
)
for label, part in _named_parts:
    print(f'Shape of {label} : {part.shape}')

### Defining the Metric

In [None]:
def smape(actual, predicted):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``100 * mean(|predicted - actual| / ((|actual| + |predicted|) / 2))``.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predicted : array-like
        Predicted values, matched to ``actual`` by position.

    Returns
    -------
    float
        The SMAPE in percent. A pair in which both values are 0 contributes 0
        to the mean instead of producing NaN from a 0/0 division.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2
    # Divide only where the denominator is non-zero; the remaining entries keep the
    # 0 they were initialised with (a zero denominator implies a zero numerator).
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

In [None]:
# Hyper-parameters passed to XGBRegressor.
# `eta` was removed: XGBoost documents it as an alias of `learning_rate`, and the
# dict previously set both to different values (0.0277... and 0.016), which left
# the effective step size ambiguous. `learning_rate` is kept.
# NOTE(review): confirm 0.016 is the intended learning rate.
params = {
    'lambda': 0.0027558604139151484,  # L2 regularisation (alias of reg_lambda)
    'alpha': 0.011141013575816665,    # L1 regularisation (alias of reg_alpha)
    'colsample_bytree': 0.7,
    'subsample': 1.0,
    'learning_rate': 0.016,
    'n_estimators': 2000,
    'max_depth': 15,
    'min_child_weight': 31
}

### Creating the Model

In [None]:
# Fit XGBoost with early stopping monitored on the validation split, then score
# the validation predictions with SMAPE.
# NOTE(review): passing `early_stopping_rounds` to `fit` may not be accepted by
# newer xgboost releases - confirm against the installed version.
model = XGBRegressor(**params)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=100,
    verbose=False,
)
predictions = model.predict(X_valid)

score = smape(y_valid, predictions)
score

In [None]:
# Hyper-parameters passed to LGBMRegressor.
lgb_params = {
    'objective': 'mae',
    'n_estimators': 20000,
    'random_state': 42,
    'learning_rate': 5e-3,
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    'reg_alpha': 10.0,
    'reg_lambda': 1e-1,
    'min_child_weight': 256,
    'min_child_samples': 20,
}

model_lgb = lgb.LGBMRegressor(**lgb_params)
# Early stopping and log silencing are configured through callbacks: the
# `early_stopping_rounds` and `verbose` arguments of `fit` were removed in
# LightGBM 4.0, while the callbacks below are also available in 3.3.
model_lgb.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='mape',
    callbacks=[
        lgb.early_stopping(stopping_rounds=200, verbose=False),
        lgb.log_evaluation(period=0),
    ],
)
predictions_lgb = model_lgb.predict(X_valid)

# Validation SMAPE of the LightGBM model.
score = smape(y_valid, predictions_lgb)
score

In [None]:
# Preview the submission template instead of rendering the whole frame.
submission.head()

In [None]:
# Put the test columns in exactly the order the model was trained on. The date
# features were added to `train` and `test` in a different sequence (`year` is
# created first in train but after `day_date` in test), so the two frames do not
# share a column order, and a model fed by position would read the wrong features.
# Selecting by name also fails loudly (KeyError) if a training column is missing.
test_features = df_test[X_train.columns]
test_predictions = model.predict(test_features)
submission['num_sold'] = test_predictions
submission.head()

In [None]:
# Write the predictions to disk; index=False keeps the DataFrame index out of the file.
submission.to_csv('submission2.csv', index=False)
print("--Done--")

 ### ---Will be updating the model---
 
 
 Special Mentions to notebooks:
  - https://www.kaggle.com/mhslearner/tps-jan-2022-time-series-forcasting
  - [HyperParameters taken from Notebook](https://www.kaggle.com/rhythmcam/tps-01-22-xgboost-optuna-basic)