# Imports

In [None]:
# Data Manipulation, Linear Algebra
import pandas as pd
import numpy as np

# Plots
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt

#ignore warning messages 
import warnings
warnings.filterwarnings('ignore')

# Data Preprocessing

## Getting the Data

In [None]:
# Kaggle input directory that holds the competition CSV files.
base_path = "../input/tabular-playground-series-jan-2022/"

# Labelled rows, unlabelled rows and the submission template.
train_data = pd.read_csv(f"{base_path}train.csv")
test_data = pd.read_csv(f"{base_path}test.csv")
sample_submission_data = pd.read_csv(f"{base_path}sample_submission.csv")

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the first rows of the test data.
test_data.head()

## Concatenating Training and Testing Data

In [None]:
# Stack the train and test features so that feature engineering and encoding
# are applied to both at once. The target is dropped by name rather than by
# position, so the result does not depend on "num_sold" being the last column
# of train.csv.
train_features = train_data.drop(columns="num_sold")

# ignore_index=True renumbers the stacked rows 0..n-1 in a single step.
full_data = pd.concat([train_features, test_data], ignore_index=True)
full_data

## Feature Engineering

In [None]:
# Credit to https://www.kaggle.com/ranjeetshrivastav/tps-jan-21-base-xgb

# Parse the raw date strings once; every calendar feature is read from the
# parsed column through its ``.dt`` accessor.
full_data['date'] = pd.to_datetime(full_data['date'])
date_accessor = full_data['date'].dt

# New column name -> ``.dt`` attribute it is filled from.
# NOTE(review): 'dayofmonth' stores ``days_in_month`` (the length of the month,
# not the day number), and 'weekday' is read from ``dt.weekday``, which pandas
# documents as the same value as ``dt.dayofweek``.
calendar_features = {
    'year': 'year',
    'month': 'month',
    'day': 'day',
    'dayofweek': 'dayofweek',
    'dayofmonth': 'days_in_month',
    'dayofyear': 'dayofyear',
    'weekday': 'weekday',
}
for feature_name, accessor_attr in calendar_features.items():
    full_data[feature_name] = getattr(date_accessor, accessor_attr)

# The raw date and the row identifier are no longer needed as features.
full_data.drop(columns=['date', 'row_id'], inplace=True)

In [None]:
# Display the combined frame after feature engineering.
full_data

# Analysis and EDA on FullData

In [None]:
def Pie_Bar_Plot(data, col):
    """Plot the value distribution of one column as a pie chart and a bar chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to summarise.
    col : str
        Name of the column whose value counts are plotted; also used
        (capitalised) in the two panel titles.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Count values in the frame that was passed in. The previous version read
    # the global ``full_data`` here and silently ignored ``data``.
    column_value_counts = data[col].value_counts()

    # Pull every wedge out by the same offset so the slices are separated.
    explode = [0.1] * len(column_value_counts)

    # One row, two panels: pie on the left, bars on the right.
    f, ax = plt.subplots(1, 2, figsize=(18, 8))

    # Pie plot with percentage labels.
    column_value_counts.plot.pie(explode=explode, autopct='%1.1f%%', ax=ax[0], shadow=True)
    ax[0].set_title(f'{col.capitalize()} Pie Plot', fontsize=16, fontweight="bold")
    ax[0].set_ylabel('')

    # Bar plot of the same counts.
    sns.barplot(x=column_value_counts.index, y=column_value_counts.values, ax=ax[1])
    ax[1].set_title(f'{col.capitalize()} Count Plot', fontsize=16, fontweight="bold")
    plt.show()

    # Blank lines between consecutive figures in the cell output.
    print("\n\n")

## Distribution Plots using Pie and Bar Plot

In [None]:
# Columns to visualise: everything in full_data except "day" and "dayofyear"
# (presumably skipped because they have too many distinct values for a
# readable pie chart - confirm).
skipped_columns = {"day", "dayofyear"}
cols_to_plot = [name for name in full_data.columns.to_list() if name not in skipped_columns]

In [None]:
# Draw the pie/bar distribution figure for each selected column.
for col in cols_to_plot:
    Pie_Bar_Plot(full_data, col)

## Crosstabs

In [None]:
# Row counts per (country, store) pair, with totals, shaded by magnitude.
country_store_counts = pd.crosstab(full_data["country"], full_data["store"], margins=True)
country_store_counts.style.background_gradient(cmap='summer_r')

In [None]:
# Row counts per (country, year) pair, with totals, shaded by magnitude.
country_year_counts = pd.crosstab(full_data["country"], full_data["year"], margins=True)
country_year_counts.style.background_gradient(cmap='summer_r')

In [None]:
# Row counts per (country, month) pair, with totals, shaded by magnitude.
country_month_counts = pd.crosstab(full_data["country"], full_data["month"], margins=True)
country_month_counts.style.background_gradient(cmap='summer_r')

In [None]:
# Row counts per (year, month) pair, with totals, shaded by magnitude.
year_month_counts = pd.crosstab(full_data["year"], full_data["month"], margins=True)
year_month_counts.style.background_gradient(cmap='summer_r')

## Year vs Number of Sales (for Training Data)

In [None]:
# Convert the training "date" column from strings to datetimes before it is
# used as the x-axis of the sales plots.
train_data["date"] = pd.to_datetime(train_data["date"])

Thanks to https://www.kaggle.com/vad13irt/tps-jan-2022-exploratory-data-analysis for the idea behind the plots below.

In [None]:
def Date_Sales_Plot(data, hue=None):
    """Draw a line plot of products sold over time.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "date" column (x-axis) and a "num_sold" column (y-axis).
    hue : str, optional
        Name of a column in ``data`` used to split the plot into one coloured
        line per value. When omitted, a single line is drawn and no legend is
        added.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(25, 6))

    # Plot the frame that was passed in. The previous version always plotted
    # the global ``train_data`` and silently ignored ``data``.
    sns.lineplot(data=data, x="date", y="num_sold", hue=hue, err_style=None)

    plt.xlabel("Date", fontsize=12, fontweight="bold")
    plt.ylabel("Products Sold", fontsize=12, fontweight="bold")
    plt.title("Date vs Products Sold", fontsize=16, fontweight="bold")

    # A legend is only restyled when the lines are split by a hue column.
    if hue:
        plt.legend(fontsize=13, edgecolor=None, facecolor=None, markerscale=2, handlelength=1, title=None)
    plt.show()

In [None]:
# Overall sales over time, without splitting by any column.
Date_Sales_Plot(train_data)

In [None]:
# Sales over time, one line per country.
Date_Sales_Plot(train_data, hue="country")

In [None]:
# Sales over time, one line per store.
Date_Sales_Plot(train_data, hue="store")

In [None]:
# Sales over time, one line per product.
Date_Sales_Plot(train_data, hue="product")

# Preparing Data

## OneHotEncoding Categorical Variables

In [None]:
# Indicator columns for the three categorical features.
categorical_dummies = pd.get_dummies(full_data[["country", "store", "product"]])

# Only these date parts are kept as model inputs.
date_features = full_data[["year", "month", "day"]]

# Final feature matrix: dummies first, then the date parts, side by side.
full_data = pd.concat([categorical_dummies, date_features], axis=1)

In [None]:
# Separate the stacked frame back into its training and testing parts:
# the first len(train_data) rows are the training rows, the rest are test rows.
n_train_rows = len(train_data)
train = full_data.iloc[:n_train_rows]
test = full_data.iloc[n_train_rows:]

# Machine Learning Model

In [None]:
# Train Test Split for training and testing the Regressor
from sklearn.model_selection import train_test_split

# Different Mathematical functions to calculate the Accuracy of Regression Model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Using Possibly every Regression Algorithm available in Sklearn
from sklearn import neighbors, tree, ensemble
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split reproducible.
target = train_data["num_sold"]
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Comparison table: MLA_testing writes one row of hold-out metrics per regressor.
MLA_compare = pd.DataFrame()

# MLA_testing appends each fitted regressor's predictions for `test` here.
predictions_list = []

def MLA_testing(MLA):
    """Fit every regressor in ``MLA``, score it and keep its test predictions.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``test``. For each regressor, one row of metrics (row labels start at
    0 on every call) is written into the module-level ``MLA_compare`` and the
    predictions for ``test`` are appended to ``predictions_list``.

    Parameters
    ----------
    MLA : iterable
        Regressor objects exposing ``fit`` and ``predict``.
    """
    for row_index, regressor in enumerate(MLA):
        # Fit on the training split, then evaluate on the hold-out split.
        regressor.fit(X_train, y_train)
        holdout_pred = regressor.predict(X_test)

        mse = mean_squared_error(y_test, holdout_pred)
        MLA_name = type(regressor).__name__

        # Column name -> value for this regressor's row, in display order.
        scores = {
            'MLA Name': MLA_name,
            'R2 Score': r2_score(y_test, holdout_pred),
            'Mean Squared Error': mse,
            'Root Mean Squared Error': np.sqrt(mse),
            'Mean Absolute Error': mean_absolute_error(y_test, holdout_pred),
        }
        for column, value in scores.items():
            MLA_compare.loc[row_index, column] = value

        # Keep the predictions on the real test set for later use.
        predictions_list.append(regressor.predict(test))

        print(MLA_name, "Done")

In [None]:
# Regressors to compare, all with default hyper-parameters. The list order
# determines the row order in MLA_compare and the order of predictions_list.
MLA = [
    # Nearest neighbours
    neighbors.KNeighborsRegressor(),

    # Single decision tree
    tree.DecisionTreeRegressor(),

    # scikit-learn ensembles
    ensemble.RandomForestRegressor(),
    ensemble.ExtraTreesRegressor(),
    ensemble.GradientBoostingRegressor(),

    # Gradient-boosting libraries; silent=True presumably mutes CatBoost's
    # training log - confirm against the CatBoost docs
    XGBRegressor(),
    CatBoostRegressor(silent=True),
    LGBMRegressor(),
]

# Fit and score every regressor in the list.
MLA_testing(MLA=MLA)

In [None]:
# Rank the regressors from highest to lowest R^2 and show the top ten.
ranked_by_r2 = MLA_compare.sort_values(by="R2 Score", ascending=False)
MLA_compare = ranked_by_r2.reset_index(drop=True)
MLA_compare.iloc[:10]

In [None]:
# Rank the regressors from lowest to highest mean squared error and show the top ten.
ranked_by_mse = MLA_compare.sort_values(by="Mean Squared Error", ascending=True)
MLA_compare = ranked_by_mse.reset_index(drop=True)
MLA_compare.iloc[:10]

# Submission File

In [None]:
# Ensemble by simple averaging: every model contributes equally to each row.
predictions_list = [np.array(x) for x in predictions_list]

# Average across models (axis 0) in one vectorised call instead of looping
# over the rows in Python. The result is a 1-D array with one averaged
# prediction per test row.
prediction = np.mean(predictions_list, axis=0)

In [None]:
# Put the averaged predictions into the target column of the submission template.
sample_submission_data["num_sold"] = prediction

In [None]:
# Write the submission file without the DataFrame index column.
sample_submission_data.to_csv("submission.csv", index=False)