# Library

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Data loading

In [None]:
# Competition data plus the auxiliary GDP and weather tables.
train_set = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
test_set = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')
gdp = pd.read_csv('/kaggle/input/tps-jan-2022-gdp-data-long-format/gdp_long.csv')
weather = pd.read_csv('/kaggle/input/finland-norway-and-sweden-weather-data-20152019/nordics_weather.csv')
weather['date'] = pd.to_datetime(weather['date'])

# Tag every row with its origin so the combined frame can be split again later.
train_set['type'] = 'Train'
test_set['type'] = 'Test'

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. ignore_index=True replaces the separate reset_index(drop=True).
df = pd.concat([train_set, test_set], ignore_index=True)

# Work with the target on a log scale from here on. Rows whose num_sold is
# missing after the concat simply stay NaN (np.log(NaN) is NaN).
df['num_sold'] = np.log(df['num_sold'])
df.head()

In [None]:
# Calendar features derived from the sale date.
df['date'] = pd.to_datetime(df['date'])
df['dateofweek'] = df['date'].dt.dayofweek
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter
df['day_of_year'] = df['date'].dt.day_of_year
# NOTE(review): despite its name this column holds the number of days in the
# month (.dt.days_in_month), not the day of the month (.dt.day) — confirm
# which one was intended.
df['date_of_month'] = df['date'].dt.days_in_month

# Attach GDP. With no arguments, merge() performs an inner join on the columns
# the two frames have in common.
# NOTE(review): an inner join silently drops rows without a GDP match — verify
# that the row count is unchanged.
df = df.merge(gdp)
# The weather merge is currently disabled:
# df = df.merge(weather)

# Restore row_id order after the merge. drop=True discards the old index
# instead of materialising it as an 'index' column: that column would
# otherwise survive into the model features as a copy of the row position,
# even though row_id itself is excluded from the features.
df = df.sort_values(by='row_id').reset_index(drop=True)

In [None]:
# Median of the (log) target for every country / store / month / weekday
# combination in the training rows, pivoted so each product is a column.
trend_keys = ['country', 'store', 'product', 'month', 'dateofweek']
train_only = df[df.type == 'Train']
trend = train_only.groupby(trend_keys)['num_sold'].median().unstack(level='product')

# Express hat and mug medians relative to the sticker median.
sticker_median = trend['Kaggle Sticker']
trend['Hat/Sticker'] = trend['Kaggle Hat'] / sticker_median
trend['Mug/Sticker'] = trend['Kaggle Mug'] / sticker_median
trend

In [None]:
def build_dummy(df, column):
    """Return one-hot indicators for ``column`` scaled by a per-level weight.

    The weight of a level is ``log(median(num_sold))`` computed over the rows
    of ``df`` whose ``type`` is ``'Train'``. Weights are matched to the
    indicator columns by level label, not by position, so the result does not
    depend on the two sort orders agreeing. Levels that have no Train rows
    fall back to the overall Train median instead of raising a shape error.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``type``, ``num_sold`` and ``column``.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        Float frame with one ``"<column>_<level>"`` column per level and the
        same index as ``df``; each row holds the level's weight in its own
        column and 0 elsewhere.
    """
    train_rows = df[df.type == 'Train']

    # Median target per level, indexed by the level value itself.
    level_median = train_rows.groupby(column)['num_sold'].median()

    # Build the indicators without a prefix so their labels are the raw
    # levels; cast to float so the result dtype does not depend on whether
    # get_dummies returns uint8 or bool.
    dummy = pd.get_dummies(df[column]).astype(float)

    # Align by label; levels unseen in Train use the overall Train median.
    level_median = level_median.reindex(dummy.columns)
    level_median = level_median.fillna(train_rows['num_sold'].median())

    # NOTE(review): elsewhere in this notebook num_sold is already
    # log-transformed before this function is called, which makes this a log
    # of a log — confirm that is intended.
    weight = np.log(level_median)

    return dummy.mul(weight, axis='columns').add_prefix(f'{column}_')

In [None]:
# Append the weighted one-hot encodings to the feature frame.
# ('date_of_month' and 'month' encodings are currently disabled.)
encoded_columns = ['country', 'store', 'product', 'dateofweek', 'quarter']
encoded_frames = [build_dummy(df, name) for name in encoded_columns]
df = pd.concat([df, *encoded_frames], axis=1)
df.shape

In [None]:
# Seasonal (Fourier) features: sine/cosine pairs for the weekly, monthly and
# yearly cycles.
weekday = df['date'].dt.dayofweek
month_no = df['date'].dt.month
yearday = df['date'].dt.day_of_year

# Weekly and monthly cycles: first harmonic only.
for k in range(1, 2):
    df[f'dateofweek_sin{k}'] = np.sin(2* np.pi * weekday / 7 * k)
    df[f'dateofweek_cos{k}'] = np.cos(2* np.pi * weekday / 7 * k)
    df[f'month_sin{k}'] = np.sin(2* np.pi * month_no / 12 * k)
    df[f'month_cos{k}'] = np.cos(2* np.pi * month_no / 12 * k)

# Yearly cycle: harmonics 1 through 4, using a fixed 365-day period.
for k in range(1, 5):
    df[f'dayofyear_sin{k}'] = np.sin(2* np.pi * yearday / 365 * k)
    df[f'dayofyear_cos{k}'] = np.cos(2* np.pi * yearday / 365 * k)

In [None]:
# Frame dimensions after the seasonal sine/cosine columns were added.
df.shape

In [None]:
# Display the engineered feature frame for a visual sanity check.
df

# Visualization

In [None]:
# One line per (country, store, product) combination, showing the
# log-transformed target over the training rows.
plt.figure(figsize=(20, 10), dpi=80)
train_target_by_series = df[df.type == 'Train'].groupby(['country','store','product'])['num_sold']
train_target_by_series.plot(legend = True)
plt.show()

# Training XGBoost model

In [None]:
# Identifier and raw categorical columns are not fed to the model.
non_feature_columns = ['row_id','date','country','store','product']
df_drop = df.drop(columns = non_feature_columns)

# Split back into the original train / test rows and remove the origin tag.
train_set = df_drop[df.type == 'Train'].drop(columns = 'type')
test_set = df_drop[df.type == 'Test'].drop(columns = 'type')

# Features are every remaining column except the target.
X = train_set.drop(columns = 'num_sold')
y = train_set['num_sold']
test_x = test_set.drop(columns = 'num_sold')
test_y = test_set['num_sold']

In [None]:
# Hyper-parameters forwarded to xgb.XGBRegressor below.
# NOTE(review): XGBoost documents 'eta' as an alias of 'learning_rate'; both
# are set here with different values (0.01 vs 0.0025), so which one takes
# effect is not obvious — keep only one of them.
# NOTE(review): 'lambda' / 'alpha' are presumably the L2 / L1 regularisation
# strengths (aliases of reg_lambda / reg_alpha) — confirm against the XGBoost
# parameter docs.
params ={'lambda': 0.001,
         'alpha': 3,
         'eta': 0.01, 
         'colsample_bytree': .85, 
         'subsample': 0.6, 
         'learning_rate': 0.0025, 
         'n_estimators': 8000, 
         'max_depth': 18, 
         'min_child_weight': 3}
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU runtime and is
# reported as deprecated in newer XGBoost releases — confirm against the
# installed version.
xgb_reg = xgb.XGBRegressor(**params,tree_method = 'gpu_hist')

In [None]:
# Hold out 25% of the training rows for validation. shuffle=False keeps the
# existing row order, so the split is positional (the trailing rows become the
# validation set) rather than random.
X_train,X_val,y_train,y_val = train_test_split(X,y, test_size = .25, shuffle=False)

In [None]:
# pd.options.display.max_rows = 10
# X_train.describe().T

In [None]:
# Fit on the training split while tracking MAPE on two evaluation sets: the
# training split first, the held-out validation split second. The target is
# the log-transformed num_sold, so the reported MAPE is on the log scale.
# NOTE(review): newer XGBoost releases expect eval_metric on the estimator
# rather than as a fit() argument — confirm against the installed version.
xgb_reg.fit(X_train, y_train,
            eval_metric = ['mape'],
            eval_set = [[X_train, y_train],[X_val, y_val]])

In [None]:
y_predT = xgb_reg.predict(X_train)
y_predV = xgb_reg.predict(X_val)

# Actual (x) vs predicted (y) on the training split.
plt.figure(figsize=(12, 6), dpi=80)
plt.plot(y_train,y_predT,'ro')
plt.show()

# Actual (x) vs predicted (y) on the validation split.
plt.figure(figsize=(12, 6), dpi=80)
plt.plot(y_val,y_predV,'ro')
plt.show()

# Distribution of validation predictions (green) against the actual
# validation targets (blue).
plt.figure(figsize=(12, 6), dpi=80)
sns.kdeplot(y_predV, color="green", shade=True)
sns.kdeplot(y_val, color="blue", shade=True)
plt.show()

# Learning curves. 'validation_0' / 'validation_1' follow the order of the
# eval_set passed to fit(): training split first, validation split second.
# The labels are only visible once a legend is drawn, and the second curve is
# the validation split, not the test set.
results = xgb_reg.evals_result()
plt.figure(figsize=(10, 8))
plt.plot(results['validation_0']['mape'], label='train')
plt.plot(results['validation_1']['mape'], label='validation')
plt.xlabel('boosting round')
plt.ylabel('MAPE (log-scale target)')
plt.legend()
plt.show()

In [None]:
# temp = df[df.type == 'Train']
# temp['pred_sold'] = np.append(y_predT,y_predV)
# temp.groupby(['country','store','product'])['num_sold','pred_sold'].plot(legend = True)
# plt.show()

In [None]:
def mape(y, y_pred, **kwargs):
    """Mean absolute percentage error of ``y_pred`` relative to ``y``.

    The denominator ``|y|`` is floored at float64 machine epsilon so that
    zero targets do not cause a division by zero. Extra keyword arguments are
    accepted and ignored.
    """
    floor = np.finfo(np.float64).eps
    abs_error = np.abs(y_pred - y)
    scale = np.maximum(np.abs(y), floor)
    return np.mean(abs_error / scale)


In [None]:
# Refit on the complete training set, then predict both the training rows
# (in-sample check) and the test rows (for the submission).
xgb_reg.fit(X, y)
pred_train = xgb_reg.predict(X)
pred_test = xgb_reg.predict(test_x)

# Take explicit copies: later cells add columns to these frames, and writing
# into a boolean-mask slice of df triggers pandas' SettingWithCopyWarning.
train_df = df[df.type == 'Train'].copy()
test_df = df[df.type == 'Test'].copy()

# In-sample MAPE, measured on the log-transformed target.
print('Mape: %.4f' % mape(train_df['num_sold'],pred_train))

In [None]:
# In-sample check of the refit model: actual log-target on the x axis,
# prediction on the y axis, drawn as red markers.
actual_train = train_df['num_sold']
plt.figure(figsize=(12, 6), dpi=80)
plt.plot(actual_train, pred_train, 'ro')
plt.show()

In [None]:
series_keys = ['country','store','product']

# Actual log-target per (country, store, product) over the training rows.
plt.figure(figsize=(16, 8), dpi=80)
df[df.type == 'Train'].groupby(series_keys)['num_sold'].plot(legend = True)
plt.show()

# In-sample predictions for the same series.
train_df['pred_sold'] = pred_train
plt.figure(figsize=(16, 8), dpi=80)
train_df.groupby(series_keys)['pred_sold'].plot(legend = True)
plt.show()

# Test-period predictions, stored in the (otherwise empty) num_sold column.
test_df['num_sold'] = pred_test
plt.figure(figsize=(16, 8), dpi=80)
test_df.groupby(series_keys)['num_sold'].plot(legend = True)
plt.show()

In [None]:
# Actual vs in-sample predicted log-target, via the pandas plotting accessor.
train_df.plot.scatter(x='num_sold', y='pred_sold')

In [None]:
# Display the training feature matrix that was passed to the final fit.
X

In [None]:
# Fill the sample submission with the test predictions. The model was trained
# on log(num_sold), so np.exp converts the predictions back to sales.
# NOTE(review): values are assigned by position — this assumes the test rows
# are in the same order as the sample submission; confirm.
submission_template_path = '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv'
output = pd.read_csv(submission_template_path).assign(num_sold=np.exp(pred_test))

output.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission that was just written.
output.head()