In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Directory holding the competition CSVs. Defined once so that pointing the
# notebook at a different location is a one-line change.
INPUT_DIR = "../input/tabular-playground-series-jan-2022"

# Training data: `row_id` is popped into its own Series, so `train_df` keeps
# only the remaining columns.
train_path = f"{INPUT_DIR}/train.csv"
train_df = pd.read_csv(train_path)
train_ids = train_df.pop("row_id")

# Test data, handled the same way as the training data.
test_path = f"{INPUT_DIR}/test.csv"
test_df = pd.read_csv(test_path)
test_ids = test_df.pop("row_id")

# Sample submission, loaded as-is (its `row_id` column stays in the frame).
sample_submission_path = f"{INPUT_DIR}/sample_submission.csv"
sample_submission_df = pd.read_csv(sample_submission_path)

In [None]:
# Bare expression: the notebook renders the training frame as this cell's output.
train_df

Number of rows: 26298.<br>
Number of features: 4 (1 datetime feature and 3 categorical features).

<h1>Sample Submission</h1>

The submission file requires 2 columns: `row_id` and `num_sold`. `num_sold` is the column we must predict.<br>An example submission is shown below.

In [None]:
# Bare expression: the notebook renders the sample submission as this cell's output.
sample_submission_df

<h1>Exploratory Data Analysis</h1>

<h2>Palettes & Functions</h2>

In [None]:
# Hex colour palettes used by the country, store and product plots.
country_colors = ["#003281", "#BB042B", "#FECD00"]
store_colors = ["#704DF6", "#309975"]
product_colors = ["#0692B3", "#F370AD", "#D4A427"]

# Preview each palette as a strip of swatches (same order as defined above).
sns.palplot(country_colors)
sns.palplot(store_colors)
sns.palplot(product_colors)

In [None]:
def hide_spines(ax, spines=("top", "right", "bottom", "left")):
    """Hide the named spines (border lines) of an axes.

    Parameters
    ----------
    ax : matplotlib-style axes
        Any object whose ``ax.spines[name]`` exposes ``set_visible``.
    spines : iterable of str, optional
        Names of the spines to hide. Defaults to all four sides. The default
        is a tuple rather than a list so it cannot be mutated between calls.

    Returns
    -------
    None
        The axes is modified in place.
    """
    for spine in spines:
        ax.spines[spine].set_visible(False)

<h2>Missing values</h2>

In [None]:
# Print the frame summary (per-column non-null counts and dtypes) to check for missing values.
train_df.info()

We don't have missing values.

<h2>Features Analysis</h2>

In [None]:
# Bar chart: number of training rows per country.
fig = plt.figure(figsize=(10, 7), facecolor="#fff")
ax = fig.add_subplot(facecolor="#fff")

# Grid is given zorder=0 and the bars zorder=2, which is meant to keep the
# guide lines underneath the bars.
ax.grid(axis="y", color="lightgrey", alpha=0.7, linewidth=1.5, zorder=0)
sns.countplot(data=train_df, x="country", palette=country_colors, ax=ax, zorder=2)

# Title and axis labels (the x label is cleared after seaborn has drawn).
ax.set_title(
    "Country Distribution",
    loc="left", y=1, pad=5,
    color="#000", fontsize=25, fontweight="bold", fontfamily="serif",
    zorder=3,
)
ax.set_xlabel(None)
ax.set_ylabel("Count", fontsize=15, fontfamily="serif", labelpad=10)

# Tick styling: length=0 removes the tick marks and keeps only the labels.
ax.xaxis.set_tick_params(color="#000", grid_color="#000", labelsize=15, pad=10, length=0)
ax.yaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.set_yticks(range(0, 9000, 1000))

hide_spines(ax)
fig.show()

In [None]:
# Bar chart: number of training rows per store.
fig = plt.figure(figsize=(10, 7), facecolor="#fff")
ax = fig.add_subplot(facecolor="#fff")

# Grid is given zorder=0 and the bars zorder=2, which is meant to keep the
# guide lines underneath the bars.
ax.grid(axis="y", color="lightgrey", alpha=0.7, linewidth=1.5, zorder=0)
sns.countplot(data=train_df, x="store", palette=store_colors, ax=ax, zorder=2)

# Title and axis labels (the x label is cleared after seaborn has drawn).
ax.set_title(
    "Store Distribution",
    loc="left", y=1, pad=5,
    color="#000", fontsize=25, fontweight="bold", fontfamily="serif",
    zorder=3,
)
ax.set_xlabel(None)
ax.set_ylabel("Count", fontsize=15, fontfamily="serif", labelpad=10)

# Tick styling: length=0 removes the tick marks and keeps only the labels.
ax.xaxis.set_tick_params(color="#000", grid_color="#000", labelsize=15, pad=10, length=0)
ax.yaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.set_yticks(range(0, 13000, 1000))

hide_spines(ax)
fig.show()

In [None]:
# Bar chart: number of training rows per product.
fig = plt.figure(figsize=(10, 7), facecolor="#fff")
ax = fig.add_subplot(facecolor="#fff")

# Grid is given zorder=0 and the bars zorder=2, which is meant to keep the
# guide lines underneath the bars.
ax.grid(axis="y", color="lightgrey", alpha=0.7, linewidth=1.5, zorder=0)
sns.countplot(data=train_df, x="product", palette=product_colors, ax=ax, zorder=2)

# Title and axis labels (the x label is cleared after seaborn has drawn).
ax.set_title(
    "Product Distribution",
    loc="left", y=1, pad=5,
    color="#000", fontsize=25, fontweight="bold", fontfamily="serif",
    zorder=3,
)
ax.set_xlabel(None)
ax.set_ylabel("Count", fontsize=15, fontfamily="serif", labelpad=10)

# Tick styling: length=0 removes the tick marks and keeps only the labels.
ax.xaxis.set_tick_params(color="#000", grid_color="#000", labelsize=15, pad=10, length=0)
ax.yaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.set_yticks(range(0, 9000, 1000))

hide_spines(ax)
fig.show()

All features are balanced.

<h2>Number of Sales Analysis</h2>

In [None]:
# Line chart of `num_sold` over time for the whole training set.
fig = plt.figure(figsize=(25, 7))
fig.set_facecolor("#fff")
ax = fig.add_subplot()
ax.set_facecolor("#fff")
ax.grid(color="lightgrey", alpha=0.7, linewidth=1, axis="both", zorder=0)

# `date` is loaded without date parsing, so plotting it directly gives a
# categorical axis with one tick (and grid line) per distinct day. Convert it
# on a copy -- train_df itself is left untouched -- to get a real time axis.
sns.lineplot(
    x="date",
    y="num_sold",
    color="#20BEFF",
    err_style=None,
    data=train_df.assign(date=pd.to_datetime(train_df["date"])),
    linewidth=1,
    ax=ax,
    zorder=2,
)

# Axis labels and tick styling (length=0 hides the tick marks themselves).
ax.set_ylabel("Num Sold", fontsize=15, fontfamily="serif", labelpad=10)
ax.set_xlabel("Date", fontsize=15, fontfamily="serif", labelpad=10)
ax.xaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.yaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.set_title("Number of sales", loc="left", color="#000", fontsize=25, pad=5, fontweight="bold", fontfamily="serif", y=1.05, zorder=3)
hide_spines(ax)
fig.show()

We see that the plot of `num_sold` looks like a cosine function but has 5 peaks. Not surprisingly, these peaks occur on the 1st of January, because people buy a lot of things around this date, so you may need some tricks to predict them better.

In [None]:
# Line chart of `num_sold` over time, one line per country.
fig = plt.figure(figsize=(25, 7))
fig.set_facecolor("#fff")
ax = fig.add_subplot()
ax.set_facecolor("#fff")
ax.grid(color="lightgrey", alpha=0.7, linewidth=1, axis="both", zorder=0)

# `date` is loaded without date parsing, so plotting it directly gives a
# categorical axis with one tick (and grid line) per distinct day. Convert it
# on a copy -- train_df itself is left untouched -- to get a real time axis.
# Line colours come from `palette` through the hue mapping.
sns.lineplot(
    x="date",
    y="num_sold",
    hue="country",
    data=train_df.assign(date=pd.to_datetime(train_df["date"])),
    palette=country_colors,
    err_style=None,
    linewidth=1,
    ax=ax,
    zorder=2,
)

# Axis labels and tick styling (length=0 hides the tick marks themselves).
ax.set_ylabel("Num Sold", fontsize=15, fontfamily="serif", labelpad=10)
ax.set_xlabel("Date", fontsize=15, fontfamily="serif", labelpad=10)
ax.xaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.yaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.set_title("Countries vs Number of sales", loc="left", color="#000", fontsize=25, pad=5, fontweight="bold", fontfamily="serif", y=1.05, zorder=3)
hide_spines(ax)
# Re-create the legend without a title, laid out horizontally in the corner.
ax.legend(loc="upper right", ncol=3, fontsize=15, edgecolor=None, facecolor=None, markerscale=2, labelcolor="#000", handlelength=1, title=None)
fig.show()

We see that Finland has the lowest sales, while Norway has the highest, so you could train 3 separate models, one per country, to predict sales.

In [None]:
# Line chart of `num_sold` over time, one line per store.
fig = plt.figure(figsize=(25, 7))
fig.set_facecolor("#fff")
ax = fig.add_subplot()
ax.set_facecolor("#fff")
ax.grid(color="lightgrey", alpha=0.7, linewidth=1, axis="both", zorder=0)

# `date` is loaded without date parsing, so plotting it directly gives a
# categorical axis with one tick (and grid line) per distinct day. Convert it
# on a copy -- train_df itself is left untouched -- to get a real time axis.
sns.lineplot(
    x="date",
    y="num_sold",
    data=train_df.assign(date=pd.to_datetime(train_df["date"])),
    hue="store",
    palette=store_colors,
    err_style=None,
    linewidth=1,
    ax=ax,
    zorder=2,
)

# Axis labels and tick styling (length=0 hides the tick marks themselves).
ax.set_ylabel("Num Sold", fontsize=15, fontfamily="serif", labelpad=10)
ax.set_xlabel("Date", fontsize=15, fontfamily="serif", labelpad=10)
ax.xaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.yaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.set_title("Stores vs Number of sales", loc="left", color="#000", fontsize=25, pad=5, fontweight="bold", fontfamily="serif", y=1.05, zorder=3)
hide_spines(ax)
# Re-create the legend without a title, laid out horizontally in the corner.
ax.legend(loc="upper right", ncol=3, fontsize=15, edgecolor=None, facecolor=None, markerscale=2, labelcolor="#000", handlelength=1, title=None)
fig.show()

From this plot, we see that `KaggleMart` has lower sales than `KaggleRama`.

In [None]:
# Line chart of `num_sold` over time, one line per product.
fig = plt.figure(figsize=(25, 7))
fig.set_facecolor("#fff")
ax = fig.add_subplot()
ax.set_facecolor("#fff")
ax.grid(color="lightgrey", alpha=0.7, linewidth=1, axis="both", zorder=0)

# `date` is loaded without date parsing, so plotting it directly gives a
# categorical axis with one tick (and grid line) per distinct day. Convert it
# on a copy -- train_df itself is left untouched -- to get a real time axis.
sns.lineplot(
    x="date",
    y="num_sold",
    data=train_df.assign(date=pd.to_datetime(train_df["date"])),
    hue="product",
    palette=product_colors,
    err_style=None,
    linewidth=1,
    ax=ax,
    zorder=2,
)

# Axis labels and tick styling (length=0 hides the tick marks themselves).
ax.set_ylabel("Num Sold", fontsize=15, fontfamily="serif", labelpad=10)
ax.set_xlabel("Date", fontsize=15, fontfamily="serif", labelpad=10)
ax.xaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.yaxis.set_tick_params(color="#000", labelsize=12, pad=5, length=0)
ax.set_title("Products vs Number of sales", loc="left", color="#000", fontsize=25, pad=5, fontweight="bold", fontfamily="serif", y=1.05, zorder=3)
hide_spines(ax)
# Re-create the legend without a title, laid out horizontally in the corner.
ax.legend(loc="upper right", ncol=3, fontsize=15, edgecolor=None, facecolor=None, markerscale=2, labelcolor="#000", handlelength=1, title=None)
fig.show()

The most popular product is `KaggleHat`. Its plot has a wave-like shape, while the other products stay almost stable.