Problem statement

There are two (fictitious) independent store chains selling Kaggle merchandise that want to become the official outlet for all things Kaggle. We've decided to see if the Kaggle community could help us figure out which of the store chains would have the best sales going forward. So, we've collected some data and are asking you to build forecasting models to help us decide.

Help us figure out whether KaggleMart or KaggleRama should become the official Kaggle outlet!

Import

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

Load

In [None]:

import os

# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

Read

In [None]:
# Load the training data, the test data and the sample submission in one pass.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-jan-2022/train.csv",
        "/kaggle/input/tabular-playground-series-jan-2022/test.csv",
        "/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv",
    )
)

In [None]:
# Display the frame read from train.csv as the cell output.
train

In [None]:
# Display the frame read from test.csv as the cell output.
test

In [None]:
# Display the frame read from sample_submission.csv as the cell output.
submission

Analyse target

In [None]:
# Density histogram with a KDE overlay of the raw target.
# histplot replaces the deprecated sns.distplot and matches the seaborn >= 0.11
# API that the rest of this notebook already relies on (sns.displot).
sns.histplot(train['num_sold'], kde=True, stat='density');

Remove outliers

In [None]:
# Blank out (set to NaN) target values outside the 1.5 * IQR fences.
# The bounds get descriptive names so the built-in min()/max() are not shadowed
# for every later cell in the kernel.
for col in ['num_sold']:
    q75, q25 = np.percentile(train.loc[:, col], [75, 25])
    iqr = q75 - q25

    upper_bound = q75 + (1.5 * iqr)
    lower_bound = q25 - (1.5 * iqr)

    is_outlier = (train[col] < lower_bound) | (train[col] > upper_bound)
    # Series.mask upcasts the column as needed, so NaN is never written
    # directly into an integer-typed column.
    train[col] = train[col].mask(is_outlier)


In [None]:
# Count the missing values in the 'num_sold' column.
train['num_sold'].isnull().sum()


In [None]:

# Drop only the rows whose target is missing. Restricting the check to
# 'num_sold' avoids silently discarding rows that have a NaN in another column.
train = train.dropna(subset=['num_sold'])
train


In [None]:
# Keep the target column aside; it is removed from the feature frame later.
target = train['num_sold']

# Density histogram with a KDE overlay of the target.
# histplot replaces the deprecated sns.distplot.
sns.histplot(target, kde=True, stat='density');

Combine train and test

In [None]:
# Stack the training features (target removed) on top of the test rows so that
# feature engineering is applied to both at once.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and
# keeps the original row labels, as append did by default.
combi = pd.concat([train.drop(columns=['num_sold']), test])
combi

Drop row_id

In [None]:
# Remove the 'row_id' column from the combined frame.
combi = combi.drop(columns=['row_id'])
combi

Check for null values

In [None]:
# Count the missing values per column of the combined frame.
combi.isnull().sum()

Parse date as datetime

In [None]:
# Parse the 'date' column into datetimes; unparseable entries become NaT.
combi = combi.assign(
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce')
)
combi

Check if date is weekend

In [None]:
from datetime import datetime

# dayofweek runs from 0 (Monday) to 6 (Sunday); values above 4 are Sat/Sun.
weekday = combi['date'].dt.dayofweek
combi["day_of_week"] = weekday
combi["is_weekend"] = weekday.gt(4)

combi

Convert boolean to integer

In [None]:
# Turn the boolean weekend flag into 0/1 integers.
combi['is_weekend'] = combi['is_weekend'].astype('int64')
combi

In [None]:
# Plot the distribution of the 'day_of_week' column.
sns.displot(combi['day_of_week'])

In [None]:
# Plot the distribution of the 'is_weekend' column.
sns.displot(combi['is_weekend'])

Get year, month and day from date

In [None]:
# Split the datetime column into separate year / month / day columns.
date_accessor = combi['date'].dt
for part in ('year', 'month', 'day'):
    combi[part] = getattr(date_accessor, part)

combi

Check for Christmas Day (25 December)

In [None]:
# Flag rows dated 25 December with 1, everything else with 0.
# The comparison has to be element-wise (== combined with &): `series is 12`
# tests object identity, is always False, and left the column constant at 0.
combi['xmas1'] = ((combi['month'] == 12) & (combi['day'] == 25)).astype(int)

combi

Check for the day after Christmas (26 December)

In [None]:
# Flag rows dated 26 December with 1, everything else with 0.
# The comparison has to be element-wise (== combined with &): `series is 12`
# tests object identity, is always False, and left the column constant at 0.
combi['xmas2'] = ((combi['month'] == 12) & (combi['day'] == 26)).astype(int)

combi

Check for new year

In [None]:
# Flag rows dated 1 January with 1, everything else with 0.
# The comparison has to be element-wise (== combined with &): `series is 1`
# tests object identity, is always False, and left the column constant at 0.
combi['new_year'] = ((combi['month'] == 1) & (combi['day'] == 1)).astype(int)

combi

Check for Easter

In [None]:
# Easter Sunday dates for the years handled by this notebook (2015-2019).
easter_sundays = pd.to_datetime([
    '2015-04-05',
    '2016-03-27',
    '2017-04-16',
    '2018-04-01',
    '2019-04-21',
])

# Flag rows whose date is one of the Easter Sundays with 1, everything else
# with 0. Membership is tested element-wise: the previous `series is 2015`
# chain compared object identity, was always False, and left the column
# constant at 0.
combi['easter'] = combi['date'].isin(easter_sundays).astype(int)

combi

Ordinal encode

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

# Replace every object-typed column with its ordinal codes; the encoder is
# re-fitted for each column in turn.
object_cols = [name for name in combi.columns if combi[name].dtype == "object"]
for name in object_cols:
    column_2d = combi[name].to_numpy().reshape(-1, 1)
    combi[name] = enc.fit_transform(column_2d)
combi


Convert date to number

In [None]:
import datetime

# Encode the date as the integer YYYYMMDD. Year-first ordering keeps the number
# chronological (a later date always yields a larger value); the previous
# day-first '%d%m%Y' layout put the day in the most significant digits, so the
# resulting integers were not ordered by date.
combi['date_num'] = combi['date'].dt.strftime('%Y%m%d').astype(int)
combi

Drop date and year

In [None]:
# Remove the 'date' and 'year' columns from the feature frame.
combi.drop(columns=['date', 'year'], inplace=True)
combi

Check for null values

In [None]:
# Count the missing values per column of the combined frame.
combi.isnull().sum()

Normalise 

In [None]:
#combi = (combi - combi.min()) / (combi.max() - combi.min())
#combi

Standardise combi

In [None]:
#combi = (combi - np.average(combi)) / (np.std(combi))
#combi

Define X and y

In [None]:
# The first len(train) rows of combi are the training rows; the rest are test.
n_train = len(train)
y = target
X = combi.iloc[:n_train]
X_test = combi.iloc[n_train:]

Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)
tuple(part.shape for part in (X_train, X_val, y_train, y_val, X_test))

Define model

In [None]:
# NOTE(review): this experimental opt-in import is presumably only required on
# scikit-learn < 1.0; it is kept so the cell still runs on older versions.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Fix the seed so any randomness inside the estimator (for example an internal
# early-stopping validation split) is reproducible under Restart & Run All.
# 42 matches the seed used for train_test_split.
model = HistGradientBoostingRegressor(random_state=42).fit(X_train, y_train)
# score() of a regressor is R^2, here measured on the training split.
print(model.score(X_train, y_train))

Predict on validation set

In [None]:
# Predict the held-out validation rows and report the model's score on them.
y_pred = model.predict(X_val)
val_score = model.score(X_val, y_val)
print(val_score)

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error on the validation split. The square root is taken
# explicitly because the `squared=False` keyword is deprecated and has been
# removed from recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse

In [None]:
# Side-by-side table of validation targets and the model's predictions.
comparison_columns = {
    'Actual': y_val,
    'Predicted': y_pred,
}
df = pd.DataFrame(comparison_columns)
df

In [None]:
# Scatter of predicted vs. measured validation values, with a dashed diagonal
# spanning the range of the full target for reference.
fig, ax = plt.subplots()
ax.scatter(y_val, y_pred, edgecolors=(0, 0, 0))
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, 'k--', lw=4)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

Predict on test set and submit

In [None]:
# Predict the test rows and convert to non-negative integer counts.
preds = model.predict(X_test)
# Round to the nearest integer before casting: a bare astype(int) truncates
# toward zero and systematically under-predicts. Negative values are clipped
# to 0 because a sales count cannot be negative.
preds = np.clip(np.round(preds), 0, None).astype(int)
preds

In [None]:
# Fill the sample submission with the predictions, write it to disk, then read
# the written file back to display what was saved.
submission['num_sold'] = preds
submission.to_csv('submission.csv', index=False)
submission = pd.read_csv("submission.csv")
submission