In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Taking a look at the data

In [None]:
# Load the test split, parsing the `date` column as datetimes so the `.dt`
# accessor can be used on it later.
test_df = pd.read_csv(
    "../input/tabular-playground-series-jan-2022/test.csv",
    parse_dates=['date'],
)
test_df.head()

In [None]:
# Load the training split, parsing the `date` column as datetimes so it can
# serve as a time index for plotting and as a source of calendar features.
train_df = pd.read_csv(
    "../input/tabular-playground-series-jan-2022/train.csv",
    parse_dates=['date'],
)
train_df.head()

In [None]:
# One line per (country, store, product) combination, with the date on the x-axis.
series_keys = ['country', 'store', 'product']

plot_df = train_df.set_index('date')

# Long -> wide: the date index is kept as-is and every key combination becomes
# its own column of `num_sold` values.
wide_sales = plot_df[series_keys + ['num_sold']].pivot(
    columns=series_keys,
    values='num_sold',
)
wide_sales.plot(figsize=(18, 6));

In [None]:
# Sample submission, indexed by `row_id`; used further down as the template
# for the prediction file.
# The path is relative, like the train/test reads in this notebook, so all
# three inputs resolve against the same directory layout instead of mixing a
# relative and an absolute (/kaggle/...) location.
submission = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/sample_submission.csv',
    index_col='row_id')

## Let's do some EDA.

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Summary statistics of the training frame, using describe()'s default column selection.
train_df.describe()

On average, the number of items sold per row is 388.

Let's take a look at how many items were sold in each country.

In [None]:
# Apply seaborn's default theme before any axes are created.
sns.set()

fig, ax = plt.subplots(figsize=(12, 6))

# barplot aggregates with its default estimator (the mean), so each bar is the
# mean `num_sold` over that country's rows; error bars are switched off.
sns.barplot(data=train_df, x='country', y='num_sold', ci=None, ax=ax)

# Print the (rounded) bar height above each bar.
ax.bar_label(ax.containers[0], fmt="%.0f", padding=2)
ax.set_title("Items sold in Each Country", size=15)
plt.show()

It seems that Norway had the most sales, followed by Sweden (ouh!!).

Let's take a look at the products available.

In [None]:
# Distinct product names present in the training data.
train_df['product'].unique()

Now let's look at how each of these products did.

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# barplot aggregates with its default estimator (the mean), so each bar is the
# mean `num_sold` over that product's rows; error bars are switched off.
sns.barplot(data=train_df, x='product', y='num_sold', ci=None, ax=ax)

# Print the (rounded) bar height above each bar.
ax.bar_label(ax.containers[0], fmt="%.0f", padding=2)
ax.set_title("Individual Item Sales", size=15)
plt.show()


The Kaggle Hat is the best-selling item, followed by the Kaggle Mug.

Let's take a look at the sales of each store.

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# barplot aggregates with its default estimator (the mean), so each bar is the
# mean `num_sold` over that store's rows; error bars are switched off.
sns.barplot(data=train_df, x='store', y='num_sold', ci=None, ax=ax)

# Print the (rounded) bar height above each bar.
ax.bar_label(ax.containers[0], fmt="%.0f", padding=2)
ax.set_title("Individual Store Sales", size=15)
plt.show()

KaggleRama seems to be making more sales than KaggleMart.

In [None]:
# Separate the target from the features. `pop` removes `num_sold` from
# train_df in place, and `date` is dropped at the end of this cell, so the cell
# can only be run once per freshly loaded train_df/test_df.
y = train_df.pop('num_sold')

# Expand the timestamp into plain integer calendar features, identically for
# the training and the test frame.
calendar_parts = ('year', 'month', 'day', 'dayofweek')
for frame in (train_df, test_df):
    for part in calendar_parts:
        frame[part] = getattr(frame['date'].dt, part)

# The raw datetime column is not passed on to the model.
train_df = train_df.drop(columns='date')
test_df = test_df.drop(columns='date')

In [None]:
# Integer codes for the three categorical columns. The numbering is arbitrary;
# it only has to be applied identically to the training and the test frame.
country_map = {'Finland': 0, 'Sweden': 1, 'Norway': 2}
store_map = {'KaggleMart': 0, 'KaggleRama': 1}
product_map = {'Kaggle Mug': 0, 'Kaggle Hat': 1, 'Kaggle Sticker': 2}

category_maps = {
    'country': country_map,
    'store': store_map,
    'product': product_map,
}

for frame_name, frame in (('train_df', train_df), ('test_df', test_df)):
    for column, mapping in category_maps.items():
        encoded = frame[column].map(mapping)

        # Series.map yields NaN for every value that is not a key of the dict.
        # That happens for an unexpected label and also when this cell is run a
        # second time (the column then already holds 0/1/2, which are not keys).
        # Fail loudly, and before overwriting the column, instead of passing
        # NaNs on to the model.
        missing = encoded.isna()
        if missing.any():
            unmapped = frame.loc[missing, column].unique().tolist()
            raise ValueError(
                f"{frame_name}[{column!r}] contains values without a code: "
                f"{unmapped}. Was this cell already run on this frame?"
            )

        frame[column] = encoded

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed: a random forest draws random bootstrap samples and feature
# subsets, so without it every run would write a different submission file.
RANDOM_STATE = 42

# Baseline model, fitted on every column train_df holds at this point.
# NOTE(review): only `date` and `num_sold` were removed from the loaded frame;
# if the CSVs carry an identifier column (e.g. row_id) it is used as a feature
# here -- confirm that is intended.
model = RandomForestRegressor(
    n_estimators=500,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
model.fit(train_df, y)

# Reuse the sample submission (indexed by row_id) as the output template.
# Assumes test_df rows are in the same order as the sample submission --
# TODO confirm.
benchmark = submission.copy()
benchmark['num_sold'] = model.predict(test_df)

benchmark.to_csv('rf_submission.csv')