In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## ***Objective***

Figure out which of the store chains would have the best sales going forward. Build forecasting models to help us decide.

In [None]:
# Read the competition's train and test splits, then preview the training frame.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-jan-2022/train.csv",
        "../input/tabular-playground-series-jan-2022/test.csv",
    )
)
df_train

## ***Exploratory Data Analysis***

In [None]:
# Show the dtype pandas inferred for every training column.
df_train.dtypes

In [None]:
# Check for duplicate rows in both splits.
# A cell only renders its last expression, so the two counts are gathered into a
# single Series; evaluated as separate statements, the training count would be
# computed and silently discarded.
pd.Series(
    {
        'train': df_train.duplicated().sum(),
        'test': df_test.duplicated().sum(),
    },
    name='duplicate_rows',
)

In [None]:
# Keep a snapshot of the raw training data (row_id included). The guard stops a
# re-run of this cell from overwriting the snapshot with the already-trimmed frame.
if 'row_id' in df_train.columns:
    df = df_train.copy()

# Remove the row_id column from the working frames.
# Rebinding with errors='ignore' (instead of inplace=True) lets the cell be
# re-run without raising KeyError once the column is gone.
df_train = df_train.drop(columns='row_id', errors='ignore')
df_test = df_test.drop(columns='row_id', errors='ignore')

In [None]:
# List the columns that remain in the training frame.
df_train.columns

In [None]:
# Number of training rows per country.
country_counts = df_train['country'].value_counts()
country_counts.plot.bar(color='blue');

In [None]:
# Number of training rows per store.
store_counts = df_train['store'].value_counts()
store_counts.plot.bar(color='blue');

In [None]:
# Number of training rows per product.
product_counts = df_train['product'].value_counts()
product_counts.plot.bar(color='blue');

In [None]:
# Distribution of num_sold for each product.
sns.boxplot(
    data=df_train,
    x='product',
    y='num_sold',
    palette="coolwarm",
);

In [None]:
# Distribution of num_sold per country, split by product.
sns.boxplot(
    data=df_train,
    x='country',
    y='num_sold',
    hue='product',
    palette="coolwarm",
);

From the previous graph we can see that:

* Norway bought the largest number of products compared to Sweden and Finland.
* Kaggle hat was the most sold product in every country.
* Kaggle sticker was the least sold product in every country.

In [None]:
# Distribution of num_sold per store, split by product.
sns.boxplot(
    data=df_train,
    x='store',
    y='num_sold',
    hue='product',
    palette="coolwarm",
);

We can see from the previous graph that:

* KaggleRama had higher sales compared to KaggleMart.
* The highest product sold was Kaggle hat, followed by Kaggle mug, and lastly Kaggle sticker.

In [None]:
# Parse the date column of both splits into datetime64 values.
for frame in (df_train, df_test):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Derive year / month / day columns from the parsed date in both splits.
for frame in (df_train, df_test):
    for part in ('year', 'month', 'day'):
        frame[part] = getattr(frame['date'].dt, part)

df_train

In [None]:
# num_sold by year, one line per store.
sns.lineplot(
    x="year",
    y="num_sold",
    hue='store',
    palette="coolwarm",
    data=df_train,
);

As we can see from the line plot, KaggleRama had higher sales than KaggleMart in every year.

In [None]:
# Month abbreviations in calendar order, used for a readable x axis.
month_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

# Relabel on a copy so df_train keeps its numeric month column.
df_train_copy = df_train.copy()
# Assign the relabelled column back explicitly: `df[col].replace(..., inplace=True)`
# is chained assignment, which leaves the frame unchanged under pandas
# copy-on-write. An ordered categorical also pins the axis to Jan..Dec instead of
# relying on how string labels happen to be ordered.
df_train_copy['month'] = pd.Categorical(
    df_train_copy['month'].map(dict(enumerate(month_labels, start=1))),
    categories=month_labels,
    ordered=True,
)

# Create side-by-side subplots sharing the y axis so the stores are comparable.
fig, ax = plt.subplots(1, 2, sharey=True, figsize=(25, 5))

# Adjust subplot margins (top margin leaves room for the suptitle).
plt.subplots_adjust(top=0.82, left=0.42)

# Remove the top and right spines from the axes of the current figure.
sns.despine()

fig.suptitle("KaggleMart VS KaggleRama Sales/Year", size=15)

# Add per-panel titles, aligned to the left.
ax[0].set_title("KaggleMart", loc='left')
ax[1].set_title("KaggleRama", loc='left')

# Arguments shared by both panels. `ci=None` disables the confidence band; it is
# kept as-is for compatibility with the seaborn version this notebook targets
# (newer releases spell it `errorbar=None`).
line_kwargs = dict(x='month', y='num_sold', hue='year', ci=None,
                   marker='o', palette="coolwarm")

# One line per year for each store; the legend is drawn only on the right panel
# to avoid duplicating it.
sns.lineplot(data=df_train_copy.loc[df_train_copy['store'] == 'KaggleMart'],
             legend=False, ax=ax[0], **line_kwargs);
sns.lineplot(data=df_train_copy.loc[df_train_copy['store'] == 'KaggleRama'],
             ax=ax[1], **line_kwargs);

# Place the single legend outside the right-hand panel.
ax[1].legend(loc=(1.1, 0.5));

In [None]:
# Remove the raw date column from both frames.
df_train = df_train.drop(columns='date')
df_test = df_test.drop(columns='date')

## ***Modelling***

In [None]:
# One-hot encode the categorical columns of both splits.
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

# The two frames are encoded independently, so a category level present in only
# one split would give them different columns. Align the test frame to the
# training feature columns (everything except the target): levels missing from
# test are added as all-zero columns, levels unseen in training are dropped.
feature_columns = df_train.columns.drop('num_sold')
df_test = df_test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Split the training frame into features and target.
# row_id is deliberately NOT re-added as a feature: rebuilding it from each
# frame's positional index makes the test ids restart at 0, so the column would
# carry a different meaning in train and test and distort the predictions.
# The submission takes its row_id from sample_submission.csv instead.
X_train = df_train.drop(columns='num_sold')
y_train = df_train['num_sold']

# The models are fitted on X_train and applied to df_test, so both must expose
# the same feature columns in the same order.
assert list(X_train.columns) == list(df_test.columns), (
    "train and test feature columns differ"
)

In [None]:
# Baseline model: ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)

# Predict num_sold for the test features.
y_pred = reg.predict(df_test)

In [None]:
# R^2 of the linear model on the training data.
# The test split has no labels, and scoring the model against its own
# predictions (reg.score(df_test, y_pred)) is 1.0 by construction, so it says
# nothing about fit quality. This training score is optimistic; a hold-out
# split is needed for an honest estimate.
reg.score(X_train, y_train)

In [None]:
# Start from the sample submission and overwrite num_sold with the
# linear-regression predictions.
output = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
output['num_sold'] = y_pred

# Round predictions up to whole units.
#idea comes from https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/299162
output['num_sold'] = np.ceil(output['num_sold'])
output.to_csv('linreg_submission.csv', index=False)

In [None]:
# Display the linear-regression submission frame.
output

In [None]:
# NOTE: this whole cell is disabled (commented out) and is kept only as a record
# of the hyperparameter search. Before re-enabling it:
#   * XGBRegressor must be imported first (the import lives in a later cell);
#   * NOTE(review): 'reg:linear' and 'silent' are legacy XGBoost parameter names
#     (newer releases use 'reg:squarederror' and 'verbosity') - confirm against
#     the installed version.
# # Search for the best hyperparameters
# from sklearn.model_selection import GridSearchCV

# xgb1 = XGBRegressor()
# parameters = {
#               'objective':['reg:linear'],
#               'learning_rate': [.03, 0.05, .07], 
#               'max_depth': [5, 6, 7],
#               'min_child_weight': [4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [500]}

# xgb_grid = GridSearchCV(xgb1,
#                         parameters,
#                         cv = 2,
#                         n_jobs = 5,
#                         verbose=True)

# xgb_grid.fit(X_train,
#          y_train)

# print(xgb_grid.best_score_)
# print(xgb_grid.best_params_)

In [None]:
from xgboost import XGBRegressor

# Hyperparameters for the gradient-boosted tree regressor.
xgb_params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.07,
    'max_depth': 10,
    'min_child_weight': 4,
    'n_estimators': 700,
    'subsample': 0.7,
}

# Build and fit the model on the training features.
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Predict num_sold for the test features.
y_pred = model.predict(df_test)

In [None]:
# Start from the sample submission and overwrite num_sold with the XGBoost
# predictions.
output = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
output['num_sold'] = y_pred

# Round predictions up to whole units.
#idea comes from https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/299162
output['num_sold'] = np.ceil(output['num_sold'])
output.to_csv('xgboost_submission3.csv', index=False)

In [None]:
# Display the XGBoost submission frame.
output