In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below, so API removals show up only as hard errors.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Parse the 'date' column while reading. `parse_dates=True` only asks pandas to
# try date-parsing the index (column 0 here), which leaves 'date' as strings.
train = pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv", index_col=0, parse_dates=["date"])
test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv", index_col=0, parse_dates=["date"])

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# (rows, columns) of the train and test frames.
print(train.shape, test.shape)

In [None]:
# Column dtypes and non-null counts of the training data as loaded.
train.info()

In [None]:
# Make sure 'date' is a datetime column in both frames so that the `.dt`
# accessor can be used on it later.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Re-check the dtypes after converting 'date' with pd.to_datetime.
train.info()

In [None]:
# Number of distinct values in each column of the training data.
train.nunique()

In [None]:
# Names of the object-dtype (text) columns, in frame order.
categorical_cols = train.select_dtypes('object').columns.tolist()
print(categorical_cols)

In [None]:
# Frequency table of every categorical column, each followed by blank lines.
for name in categorical_cols:
    counts = train[name].value_counts()
    print(f"{name}:")
    print(f"{counts}")
    print("\n")

##### Date range covered by the train and test data

In [None]:
# First and last date present in the training data.
print(train['date'].min())
print(train['date'].max())

In [None]:
# First and last date present in the test data.
print(test['date'].min())
print(test['date'].max())

## EDA

Reference: [this notebook](https://www.kaggle.com/hasanbasriakcay/playground-jan-22-eda-feature-engineering/notebook)

In [None]:
# One bar per training row: units sold against the sale date.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(train['date'], train['num_sold'])

In [None]:
# Overlay the distribution of num_sold for each country on a shared axis.
# `sns.distplot` is deprecated and has been removed from current seaborn;
# `histplot(stat="density", kde=True)` draws the same histogram + KDE pair.
fig, ax = plt.subplots(figsize=(12, 6))
for country in ('Norway', 'Finland', 'Sweden'):
    sns.histplot(
        x=train.loc[train['country'] == country, 'num_sold'],
        stat='density',
        kde=True,
        label=country,
        ax=ax,
    )
ax.set(title='Distribution of num_sold by country', xlabel='num_sold')
ax.legend()
plt.show()

In [None]:
# One box plot of num_sold per categorical column, side by side.
fig, ax = plt.subplots(1, 3, figsize=(12, 6))
for panel, column in zip(ax, ('country', 'product', 'store')):
    sns.boxplot(data=train, x=column, y='num_sold', ax=panel)
plt.show()

## Feature Engineering

Reference: [this notebook](https://www.kaggle.com/maxencefzr/tps-jan22-eda-simple-catboost?scriptVersionId=84486229&cellId=26)

In [None]:
import holidays

In [None]:
# Build a date -> holiday-name lookup from the Finnish, Norwegian and Swedish
# calendars for 2015-2019.
# NOTE(review): the lookup is keyed by date only, so the three calendars are
# merged; when two of them share a date, the calendar processed later wins.
HOLIDAY_YEARS = [2015, 2016, 2017, 2018, 2019]

holiday_list = [[day, label] for day, label in holidays.Finland(years=HOLIDAY_YEARS).items()]
holiday_list += [[day, label] for day, label in holidays.Norway(years=HOLIDAY_YEARS).items()]
# Skip the entries named exactly 'Söndag' and strip the ", Söndag" suffix from
# the remaining Swedish names.
holiday_list += [
    [day, label.replace(", Söndag", "")]
    for day, label in holidays.Sweden(years=HOLIDAY_YEARS).items()
    if label != 'Söndag'
]

holiday_dict = {day: label for day, label in holiday_list}

In [None]:
def create_features(df):
    """Add calendar, holiday and interaction features to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``date`` column plus ``country``, ``store``
        and ``product`` columns. The frame is modified in place and the
        ``date`` column is dropped at the end, so calling this twice on the
        same frame raises ``KeyError``.

    Returns
    -------
    None

    Notes
    -----
    Holiday names are looked up in the module-level ``holiday_dict``.
    """
    dates = df['date']

    df['day'] = dates.dt.day
    df['month'] = dates.dt.month
    df['year'] = dates.dt.year
    df['DayOfYear'] = dates.dt.dayofyear
    df['weekday'] = dates.dt.weekday
    # `Series.dt.weekofyear` was removed in pandas 2.0. `isocalendar().week`
    # is its replacement; it returns a nullable UInt32, so cast back to a
    # plain int64 (this assumes no missing dates -- TODO confirm).
    df['WeekOfYear'] = dates.dt.isocalendar().week.astype('int64')
    df['quarter'] = dates.dt.quarter
    # Saturday (5) and Sunday (6) count as weekend.
    df['weekend'] = (dates.dt.weekday >= 5).astype(int)

    # Dates missing from holiday_dict map to NaN: flag first, then label them.
    df['holiday_name'] = dates.map(holiday_dict)
    df['is_holiday'] = np.where(df['holiday_name'].notnull(), 1, 0)
    df['holiday_name'] = df['holiday_name'].fillna("No holiday")

    # NOTE(review): despite its name, 'DayOfMonth' holds the number of days in
    # the month and therefore duplicates 'daysinmonth' (the day number is
    # already in 'day'). Kept unchanged so the feature set stays the same.
    df['DayOfMonth'] = dates.dt.days_in_month
    df['daysinmonth'] = dates.dt.days_in_month

    # Concatenated category strings used as interaction features.
    country = df['country'].astype(str)
    store = df['store'].astype(str)
    product = df['product'].astype(str)
    df['country_store_product'] = country + store + product
    df['country_store'] = country + store
    df['store_product'] = store + product
    df['country_product'] = country + product

    df.drop(columns=['date'], inplace=True)
    
# Featurize both frames in place (train first, then test).
for frame in (train, test):
    create_features(frame)

In [None]:
# train['lag1'] = train['num_sold'].shift(1)

## Feature Selection

In [None]:
# Absolute pairwise correlations of the numeric columns only: recent pandas
# raises when `DataFrame.corr()` meets object columns instead of skipping them.
corr_matrix = train.select_dtypes(include='number').corr().abs()
# Keep the strict upper triangle so each pair appears once and the diagonal is
# dropped. The builtin `bool` replaces the `np.bool` alias removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)
print(upper_tri)

In [None]:
# Heat map of the upper-triangle correlations on an explicitly created axis.
heatmap_fig, heatmap_ax = plt.subplots()
corr = sns.heatmap(upper_tri, ax=heatmap_ax)
plt.show()

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
def make_mi_scores(X, y):
    """Return mutual-information scores of each column of ``X`` against ``y``.

    Object and category columns are integer-encoded on a copy of ``X`` (the
    caller's frame is left untouched). Every integer-dtype column is passed to
    ``mutual_info_regression`` as a discrete feature. The result is a Series
    named "MI Scores", indexed by column name and sorted highest first.
    """
    features = X.copy()
    for name in features.select_dtypes(["object", "category"]):
        codes, _uniques = features[name].factorize()
        features[name] = codes
    # After the encoding above, integer dtype marks a discrete feature.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return ranked.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` on the current pyplot figure.

    The scores are sorted ascending before plotting and the bars are labelled
    with the Series index.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = ordered.index.tolist()
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

In [None]:
# List the columns currently present in the training frame.
train.columns

In [None]:
# train.lag1 = train.lag1.fillna(0)

In [None]:
# Separate predictors and target for the MI analysis without touching `train`.
X = train.drop(columns='num_sold')
y = train['num_sold'].copy()

In [None]:
# Mutual-information score of every feature against num_sold, highest first.
mi_scores = make_mi_scores(X, y)

In [None]:
# Display the ranked scores.
mi_scores

In [None]:
# Reuse the scores already stored in `mi_scores` rather than recomputing them;
# make_mi_scores fixes random_state=0, so the values are the same.
plot_mi_scores(mi_scores)

In [None]:
# to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
# print(to_drop)

In [None]:
# train.drop(columns= to_drop, inplace=True)
# test.drop(columns= to_drop, inplace=True)

## Categorical encoding

In [None]:
# One-hot encode every object/category column.
categorical_cols = train.select_dtypes(["object", "category"]).columns.tolist()
train = pd.get_dummies(train, columns=categorical_cols)
test = pd.get_dummies(test, columns=categorical_cols)

# The two frames are encoded independently, so a category that occurs in only
# one of them (e.g. a holiday_name value) yields different dummy columns.
# Align test to the train feature columns: train-only dummies are added as 0,
# test-only dummies are dropped, and the column order matches train.
feature_cols = [col for col in train.columns if col != 'num_sold']
test = test.reindex(columns=feature_cols, fill_value=0)

In [None]:
# Show which columns were one-hot encoded.
categorical_cols

In [None]:
# Move the target out of `train`, leaving only feature columns behind.
y = train.pop('num_sold')

## Cross-validation

To avoid training on data from the future, we use scikit-learn's `TimeSeriesSplit`, which always validates on rows that come after the rows used for training.

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Six ordered splits; the argument is passed by keyword for readability.
folds = TimeSeriesSplit(n_splits=6)

In [None]:
def smape(actual, forecast):
    """Symmetric mean absolute percentage error (SMAPE), in percent.

    Computes ``100 * mean(|forecast - actual| / ((|actual| + |forecast|) / 2))``.

    Parameters
    ----------
    actual, forecast : array-like
        Observed and predicted values of the same length; they are compared
        position by position.

    Returns
    -------
    float
        SMAPE between 0 and 200. A position where both values are zero counts
        as zero error instead of producing NaN from 0/0.
    """
    actual = np.asarray(actual, dtype=float)
    forecast = np.asarray(forecast, dtype=float)

    num = np.abs(forecast - actual)
    den = (np.abs(actual) + np.abs(forecast)) / 2

    # den == 0 only when both values are 0, i.e. a perfect prediction; leave
    # those entries at 0 rather than dividing.
    ratio = np.divide(num, den, out=np.zeros_like(num), where=den != 0)
    return 100 * np.mean(ratio)

## CatBoost Regressor model

In [None]:
from catboost import CatBoostRegressor

# Hyper-parameters are the same for every fold, so define them once.
N_ITERATIONS = 1000
# The patience must stay below the iteration budget or early stopping can never
# fire (a patience of 2000 with 1000 iterations is a no-op).
EARLY_STOPPING_ROUNDS = 100
params = {'eval_metric': 'SMAPE', 'iterations': N_ITERATIONS}

# Test predictions are averaged over the folds; validation SMAPE is kept per fold.
y_pred = np.zeros(len(test))
scores = []

for fold, (train_id, test_id) in enumerate(folds.split(train)):
    print(f"Fold: {fold}")

    # TimeSeriesSplit yields positional indices, hence .iloc.
    X_train, y_train = train.iloc[train_id], y.iloc[train_id]
    X_valid, y_valid = train.iloc[test_id], y.iloc[test_id]

    cat = CatBoostRegressor(**params)
    cat.fit(X_train, y_train, eval_set=(X_valid, y_valid),
            early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=1000)

    print('\n')
    valid_pred = cat.predict(X_valid)
    valid_score = smape(y_valid, valid_pred)
    scores.append(valid_score)

    # Each fold contributes an equal share of the final test prediction.
    y_pred += cat.predict(test) / folds.n_splits

In [None]:
# Average validation SMAPE across the folds.
score = np.mean(scores)
print(f"Mean SMAPE: {score}")

In [None]:
# Fill the sample submission with the averaged predictions, rounded up to the
# next whole number.
submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
submission['num_sold'] = np.ceil(y_pred)
submission

In [None]:
# Write the predictions to the working directory, leaving out the DataFrame index.
submission.to_csv('./submission.csv', index=False)