![](https://t4.ftcdn.net/jpg/04/60/05/05/360_F_460050532_7JjxeTTaZLyk7RTOayql8iX4O6Zlctjs.jpg)

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Read the competition's training split into a DataFrame and preview its first rows.
train = pd.read_csv(r'../input/tabular-playground-series-jan-2022/train.csv')
train.head()

In [None]:
# Read the competition's test split into a DataFrame and preview its first rows.
test = pd.read_csv(r'../input/tabular-playground-series-jan-2022/test.csv')
test.head()

In [None]:
# Read the submission template; its `num_sold` column is overwritten with model
# predictions when the submission files are written at the end of the notebook.
sample_submission = pd.read_csv(r'../input/tabular-playground-series-jan-2022/sample_submission.csv')
sample_submission.head()

In [None]:
# Report the dimensions of every loaded frame, one line per frame.
for frame_name, frame in (
    ('train', train),
    ('test', test),
    ('sample_submission', sample_submission),
):
    n_rows, n_cols = frame.shape
    print(f'{frame_name} set have {n_rows} rows and {n_cols} columns.')

In [None]:
# Count the missing values in each training column.
train.isna().sum()

In [None]:
# Count the number of distinct values in each training column.
train.nunique()

In [None]:
# Drop the `row_id` column from both feature frames.
# Re-assigning (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the column is already gone.
train = train.drop(columns='row_id', errors='ignore')
test = test.drop(columns='row_id', errors='ignore')

In [None]:
# Row counts per country: print the table, then draw the same numbers as bars.
# The counts are computed once and reused for both the printout and the plot.
country_counts = train['country'].value_counts()
print("country unique values:")
print(country_counts)

fig, ax = plt.subplots(figsize=(14, 5))
# `errcolor` was dropped: it is deprecated in recent seaborn releases and had no
# visible effect here, since each bar is built from a single pre-aggregated value.
sns.barplot(x=country_counts.index, y=country_counts.values,
            linewidth=1.5, edgecolor=".2", ax=ax)
ax.set_title("country unique values", fontdict={'fontsize': 20})
plt.show()

In [None]:
# Row counts per store: print the table, then draw the same numbers as bars.
# The counts are computed once and reused for both the printout and the plot.
store_counts = train['store'].value_counts()
print("store unique values:")
print(store_counts)

fig, ax = plt.subplots(figsize=(14, 5))
# `errcolor` was dropped: it is deprecated in recent seaborn releases and had no
# visible effect here, since each bar is built from a single pre-aggregated value.
sns.barplot(x=store_counts.index, y=store_counts.values,
            linewidth=1.5, edgecolor=".2", ax=ax)
ax.set_title("store unique values", fontdict={'fontsize': 20})
plt.show()

In [None]:
# Row counts per product: print the table, then draw the same numbers as bars.
# The counts are computed once and reused for both the printout and the plot.
product_counts = train['product'].value_counts()
print("product unique values:")
print(product_counts)

fig, ax = plt.subplots(figsize=(14, 5))
# `errcolor` was dropped: it is deprecated in recent seaborn releases and had no
# visible effect here, since each bar is built from a single pre-aggregated value.
sns.barplot(x=product_counts.index, y=product_counts.values,
            linewidth=1.5, edgecolor=".2", ax=ax)
ax.set_title("product unique values", fontdict={'fontsize': 20})
plt.show()

In [None]:
def add_date_features(frame):
    """Parse the `date` column and derive calendar features from it.

    The frame is modified in place: `date` is converted to datetime and the
    columns `year`, `day`, `dayofweek`, `dayofyear` and `weekday` are added.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing a `date` column that `pd.to_datetime` can parse.

    Returns
    -------
    pandas.DataFrame
        The same frame object, returned for convenience.
    """
    frame['date'] = pd.to_datetime(frame['date'])
    date_parts = frame['date'].dt
    frame['year'] = date_parts.year
    frame['day'] = date_parts.day
    frame['dayofweek'] = date_parts.dayofweek
    frame['dayofyear'] = date_parts.dayofyear
    # NOTE: `.dt.weekday` is an alias of `.dt.dayofweek`, so this column repeats
    # the `dayofweek` values. It is kept so the feature set stays unchanged.
    frame['weekday'] = date_parts.weekday
    return frame


# Apply the identical transformation to both splits so their columns stay aligned.
train = add_date_features(train)
test = add_date_features(test)

In [None]:
# Drop the raw `date` column from both feature frames.
# Re-assigning (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the column is already gone.
train = train.drop(columns='date', errors='ignore')
test = test.drop(columns='date', errors='ignore')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns. For each column the encoder is fitted
# on the training values only, and that same mapping is then applied to both
# splits so train and test share identical codes.
categorical_columns = ['country', 'product', 'store']
le = LabelEncoder()
for column in categorical_columns:
    le.fit(train[column])
    train[column] = le.transform(train[column])
    test[column] = le.transform(test[column])

In [None]:
# Preview the training frame in its current state.
train.head()

In [None]:
# Separate the target from the features. `pop` returns the column and removes
# it from `train` in a single step. The membership guard keeps the cell
# re-runnable: once the column has been removed, a second execution leaves the
# existing `y` untouched instead of raising KeyError.
if 'num_sold' in train.columns:
    y = train.pop('num_sold')

In [None]:
# Credit to https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    elements where both values are zero contribute 0. The result lies in
    the range [0, 200].

    Parameters
    ----------
    y_true : array-like
        Observed values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        Mean of the per-element scores (NaN for empty input).
    """
    # Work on plain float arrays so lists and Series with arbitrary indexes
    # behave the same way (positional, not label-based, alignment).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Both magnitudes belong in the denominator; using the signed `y_true`
    # lets values of opposite sign cancel and distorts (or zeroes) the score.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the pre-filled 0.0, which avoids divide-by-zero warnings and NaN/inf.
    diff = np.zeros_like(denominator)
    np.divide(abs_error, denominator, out=diff, where=denominator != 0)
    return float(np.mean(diff))

## XGBOOST

In [None]:
# Time-ordered cross-validation for XGBoost. A fresh model is trained per fold,
# scored with SMAPE on the fold's held-out rows, and its predictions for `test`
# are averaged over all folds.

# Boosting-round budget. It has to be larger than the early-stopping patience:
# with the library default of 100 rounds a patience of 400 can never trigger,
# which silently turns early stopping into a no-op.
XGB_MAX_ROUNDS = 5000
XGB_EARLY_STOPPING_ROUNDS = 400
XGB_SEED = 42  # fixed seed so reruns are reproducible

folds = TimeSeriesSplit()

xgb_predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print(f"Fold: {fold}")
    X_train, X_test = train.iloc[trn_idx], train.iloc[val_idx]
    y_train, y_test = y.iloc[trn_idx], y.iloc[val_idx]

    model_xgb = XGBRegressor(
        tree_method='gpu_hist',
        n_estimators=XGB_MAX_ROUNDS,
        random_state=XGB_SEED,
    )

    # NOTE(review): newer XGBoost releases (2.0+) appear to expect
    # `early_stopping_rounds` on the constructor rather than in `fit`;
    # confirm against the installed version before upgrading.
    model_xgb.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=XGB_EARLY_STOPPING_ROUNDS,
        verbose=False,
    )

    # The fold score is measured on the same rows that drive early stopping,
    # so it is an optimistic estimate of out-of-sample error.
    pred = model_xgb.predict(X_test)
    smape = SMAPE(y_test, pred)
    print(f" smape_value: {smape}")
    print("-"*50)

    # Each fold contributes an equal share to the final test prediction.
    xgb_predictions += model_xgb.predict(test) / folds.n_splits

In [None]:
# Plot feature importance for `model_xgb`, i.e. the model left over from the
# final cross-validation fold.
# The import is aliased because LightGBM exposes a function with the same name;
# a bare `plot_importance` would refer to whichever library was imported last.
from xgboost import plot_importance as xgb_plot_importance

fig, ax = plt.subplots(1, 1, figsize=(20, 12))
xgb_plot_importance(model_xgb, ax=ax, xlabel=None)
ax.set_title('XGB Feature importance')
plt.show()

## LIGHTGBM

In [None]:
# Time-ordered cross-validation for LightGBM. A fresh model is trained per fold,
# scored with SMAPE on the fold's held-out rows, and its predictions for `test`
# are averaged over all folds.

# Boosting-round budget. It has to be larger than the early-stopping patience:
# with the library default of 100 rounds a patience of 400 can never trigger,
# which silently turns early stopping into a no-op.
LGB_MAX_ROUNDS = 5000
LGB_EARLY_STOPPING_ROUNDS = 400
LGB_SEED = 42  # fixed seed so reruns are reproducible

folds = TimeSeriesSplit()

lgb_predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print(f"Fold: {fold}")
    X_train, X_test = train.iloc[trn_idx], train.iloc[val_idx]
    y_train, y_test = y.iloc[trn_idx], y.iloc[val_idx]

    model_lgb = LGBMRegressor(
        device_type='gpu',
        n_estimators=LGB_MAX_ROUNDS,
        random_state=LGB_SEED,
    )

    # NOTE(review): newer LightGBM releases (4.0+) appear to replace the
    # fit-time `early_stopping_rounds` / `verbose` arguments with callbacks;
    # confirm against the installed version before upgrading.
    model_lgb.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=LGB_EARLY_STOPPING_ROUNDS,
        verbose=False,
    )

    # The fold score is measured on the same rows that drive early stopping,
    # so it is an optimistic estimate of out-of-sample error.
    pred = model_lgb.predict(X_test)
    smape = SMAPE(y_test, pred)
    print(f" smape_value: {smape}")
    print("-"*50)

    # Each fold contributes an equal share to the final test prediction.
    lgb_predictions += model_lgb.predict(test) / folds.n_splits

In [None]:
# Plot feature importance for `model_lgb`, i.e. the model left over from the
# final cross-validation fold.
# The import is aliased because XGBoost exposes a function with the same name;
# a bare `plot_importance` would refer to whichever library was imported last.
from lightgbm import plot_importance as lgb_plot_importance

fig, ax = plt.subplots(1, 1, figsize=(20, 12))
lgb_plot_importance(model_lgb, ax=ax, xlabel=None)
ax.set_title('LGB Feature importance')
plt.show()

In [None]:
# Write one submission file per model. The template's `num_sold` column is
# overwritten before each write, so after this cell it holds the LightGBM values.
submission_outputs = (
    ('xgb.csv', xgb_predictions),
    ('lgb.csv', lgb_predictions),
)
for output_name, fold_averaged_preds in submission_outputs:
    sample_submission['num_sold'] = fold_averaged_preds
    sample_submission.to_csv(output_name, index=False)