In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, train_test_split
import seaborn as sns
import optuna
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the competition's training data, test data and sample submission file.
train = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/test.csv")
submission = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv")

In [None]:
# Print the dimensions of the training frame, then display its first rows.
print(train.shape)
train.head()

In [None]:
# Print the dimensions of the test frame, then display its first rows.
print(test.shape)
test.head()

In [None]:
%%time

# Collect the distinct values of each categorical column of the training frame.
country_list, store_list, product_list = (
    train[column].unique() for column in ('country', 'store', 'product')
)

# Report the three value lists, one per line.
for label, values in (('Country', country_list),
                      ('Store', store_list),
                      ('Product', product_list)):
    print(f'{label} list :{values}')

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Count the missing values in each column of the test frame.
test.isnull().sum()

In [None]:
def evaluate_time(df):
    """Print the smallest and largest values of ``df['date']``.

    Args:
        df: Frame with a ``date`` column.

    Returns:
        None. The range is only printed.
    """
    dates = df['date']
    earliest, latest = dates.min(), dates.max()
    print(f'Minimum date: {earliest} // Maximum date: {latest}')
# Report the date range covered by the training frame and by the test frame.
evaluate_time(train)
evaluate_time(test)

In [None]:
def create_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar-derived feature columns to ``df`` and return it.

    The frame is modified in place: its ``date`` column is converted to
    datetime and the feature columns are attached to the same object, which
    is also returned so the call can be used in an assignment.

    Columns added:
        year, month, day, dayofweek, dayofyear, weekday: the matching
            ``Series.dt`` attributes of ``date``.
        dayofmonth: number of days in the month (``dt.days_in_month``); the
            column name is kept as-is for compatibility.
        weekofyear: ISO week number as a plain integer.
        weekend: 1 when the date falls on Saturday or Sunday, else 0.

    Args:
        df: Frame with a ``date`` column parseable by ``pd.to_datetime``.

    Returns:
        The same ``df`` object with the extra columns.

    Raises:
        Whatever ``astype(int)`` raises if ``date`` contains missing values,
        since the ISO week of a missing date cannot be cast to an integer.
    """
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt

    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day
    df['dayofweek'] = dates.dayofweek
    df['dayofmonth'] = dates.days_in_month
    df['dayofyear'] = dates.dayofyear
    # `Series.dt.weekofyear` was removed from pandas; isocalendar() is the
    # replacement. Cast away its nullable UInt32 dtype to a plain integer.
    df['weekofyear'] = dates.isocalendar().week.astype(int)
    df['weekday'] = dates.weekday
    # A day is a weekend day when it is Saturday (5) OR Sunday (6); requiring
    # both at once can never be true and left this flag at 0 for every row.
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)

    return df

In [None]:
# Add the calendar feature columns to both the training and the test frame.
train = create_time_features(train)
test = create_time_features(test)

In [None]:
# Columns that get replaced by one indicator column per category value.
CATEGORICAL = ['country', 'product', 'store']

# Encode both frames the same way.
# NOTE(review): each frame is encoded on its own, so the resulting columns only
# line up if both frames contain the same category values - confirm.
train, test = (pd.get_dummies(frame, columns=CATEGORICAL) for frame in (train, test))

In [None]:
# Display the first rows of the training frame after one-hot encoding.
train.head()

In [None]:
# Pull the target column out of the training frame (this removes it in place).
y = train.pop('num_sold')

# Remove the raw date from the training frame; `row_id` stays because the
# cross-validation cell reads it to label out-of-fold predictions.
del train['date']

# The test frame loses both the raw date and the row identifier.
for unused in ('date', 'row_id'):
    del test[unused]

In [None]:
def smape(a, f):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each pair contributes ``2 * |f - a| / (|a| + |f|) * 100`` and the result
    is the mean over all pairs. A pair where both values are zero counts as
    zero error instead of producing NaN from 0/0.

    Args:
        a: Actual values (array-like, e.g. a Series or ndarray).
        f: Forecast values with the same length, compared by position.

    Returns:
        The SMAPE as a float.

    Raises:
        ZeroDivisionError: If ``a`` is empty.
    """
    actual = np.asarray(a, dtype=float)
    forecast = np.asarray(f, dtype=float)

    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)
    # Only divide where the denominator is non-zero; the pre-filled zeros
    # stand in for the 0/0 pairs.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(denominator),
                      where=denominator != 0)

    return 1 / len(actual) * np.sum(ratio * 100)

In [None]:
# def objective(trial, data=train, target=y):
    
#     train_x, test_x, train_y, test_y = train_test_split(train, y, test_size=0.3, random_state=0, shuffle=False)
#     params = {
#         'max_depth': trial.suggest_int('amx_depth', 6, 15),
#         'eta': trial.suggest_float('eta', 0.005, 0.1),
#         'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 0.9, 0.1),
#         'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 0.9, 0.1),
#         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 1e4),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 1e4),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 1e4),
#         'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e4),
#         'predictor': "gpu_predictor",
#         'eval_metric': 'mape'
#     }
#     model = XGBRegressor(**params,
#                          tree_method='gpu_hist',
#                          random_state=2021)
#     model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)
#     preds = model.predict(test_x)
#     score = smape(test_y, preds)
    
#     return score

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
#optuna.visualization.plot_optimization_history(study)


In [None]:
#optuna.visualization.plot_edf(study)


In [None]:
# Tuned XGBoost hyperparameters.
# The depth entry is keyed 'max_depth', the name XGBoost recognises; it was
# previously stored under the misspelled key 'amx_depth'.
params = {'max_depth': 15,
          'eta': 0.08672149472819503,
          'subsample': 0.9,
          'colsample_bytree': 0.9,
          'min_child_weight': 1.5360787357730001,
          'reg_lambda': 2.829277523741231,
          'reg_alpha': 0.792656975024693,
          'gamma': 0.0002942945050487883}
print(params)

In [None]:
# Seed-averaged, time-ordered cross-validation of the XGBoost model.
tss = TimeSeriesSplit(n_splits=4)
m = 1                  # model index used in the output file names
seeds = 4              # number of differently seeded training runs
seed_valid_preds = {}  # row_id -> out-of-fold prediction, averaged over seeds
seed_test_preds = []   # one fold-averaged test prediction vector per seed
seed_scores = []       # mean fold SMAPE of each seed
features = [c for c in test.columns if c not in ('row_id', 'date')]

# Use the tuned hyperparameters from the `params` cell. The depth may be stored
# under the misspelled key 'amx_depth' (the Optuna trial name); XGBoost only
# understands 'max_depth', so translate that key before passing it on.
model_params = {('max_depth' if key == 'amx_depth' else key): value
                for key, value in params.items()}

# The test matrix is the same for every fold and seed, so build it once.
X_test = test[features]
valid_pred_runs = []   # one out-of-fold prediction vector per seed

for s in range(seeds):
    fold_valid_preds = []
    fold_test_preds = []
    fold_scores = []
    seed_valid_ids = []

    for fold, (i_train, i_valid) in enumerate(tss.split(train)):
        valid_rows = train.iloc[i_valid]
        X_train, y_train = train.iloc[i_train][features], y.iloc[i_train]
        X_valid, y_valid = valid_rows[features], y.iloc[i_valid]

        # Remember which rows these out-of-fold predictions belong to.
        seed_valid_ids += valid_rows.row_id.values.tolist()

        # `random_state=s` is what makes the seeds differ; without it every
        # pass of the outer loop trains the same model.
        model = XGBRegressor(booster="gbtree",
                             tree_method="gpu_hist",
                             predictor="gpu_predictor",
                             random_state=s,
                             **model_params)
        # NOTE(review): n_estimators is left at the library default, which is
        # presumably smaller than this patience, so early stopping is unlikely
        # to cut training short - confirm and raise n_estimators if intended.
        model.fit(X_train,
                  y_train,
                  early_stopping_rounds=200,
                  eval_set=[(X_valid, y_valid)],
                  verbose=False)

        fold_valid_pred = model.predict(X_valid)
        fold_valid_preds.append(fold_valid_pred)
        fold_test_preds.append(model.predict(X_test))

        # Score with the notebook's shared SMAPE helper.
        fold_score = smape(y_valid, fold_valid_pred)
        fold_scores.append(fold_score)
        print(f'Seed {s} fold {fold} SMAPE: {fold_score}')

    print(f'Seed {s} SMAPE {np.mean(fold_scores)}, std {np.std(fold_scores)}')

    # Folds are concatenated in the same order as `seed_valid_ids`.
    valid_pred_runs.append(np.concatenate(fold_valid_preds))
    seed_test_preds.append(np.mean(np.column_stack(fold_test_preds), axis=1))
    seed_scores.append(np.mean(fold_scores))

# The splitter yields the same folds for every seed, so the id order is
# identical across runs and the predictions can be averaged position-wise.
seed_valid_preds = dict(zip(seed_valid_ids,
                            np.mean(np.column_stack(valid_pred_runs), axis=1)))

print(f'SMAPE of {seeds} seeds: {np.mean(seed_scores)}, std {np.std(seed_scores)}')



In [None]:
# Out-of-fold predictions for later use.
# `seed_valid_preds` maps row_id -> prediction, so both halves are taken from
# its items; zipping the ids with the dict itself would pair ids with ids.
valid_preds = pd.DataFrame(list(seed_valid_preds.items()),
                           columns=['row_id', f'CB{m}_pred'])
valid_preds.to_csv(f'CB{m}_valid_pred.csv', index=False)

# Average the per-seed test predictions once and reuse them for both files.
test_pred = np.mean(np.column_stack(seed_test_preds), axis=1)

# Write the predictions through a real column. Assigning to
# `submission.num_sold` after the column has been renamed only sets a Python
# attribute on the frame.
submission.columns = ['row_id', 'num_sold']
submission['num_sold'] = test_pred

# Test predictions for later use
(submission
 .rename(columns={'num_sold': f'CB{m}_pred'})
 .to_csv(f'CB{m}_test_pred.csv', index=False))

# Submission
submission.to_csv('submission.csv', index=False)

In [None]:
# %%time
# folds = TimeSeriesSplit(10)

# preds = np.zeros(len(test))
# scores = []

# for fold,(trn_idx, val_idx) in enumerate(folds.split(train)):
#     X_train, y_train = train.iloc[trn_idx] , y.iloc[trn_idx]
#     X_valid, y_valid = train.iloc[val_idx] , y.iloc[val_idx]
    
#     model = XGBRegressor(booster="gbtree",
#                         tree_method="gpu_hist",
#                         predictor="gpu_predictor")
    
#     model.fit(X_train, y_train, verbose=False)
    
#     preds_valid = model.predict(X_valid)
#     score = smape(y_valid, preds_valid)
#     scores.append(score)
    
#     print(f'<------------ Fold:  {fold+1} --------->')
#     print(f'Score:  {score}')
    
#     preds += model.predict(test) / folds.n_splits

# print(f"\nOverall Validation Score: {np.mean(scores)}")


In [None]:
# submission.num_sold = preds
# submission.to_csv('submission.csv', index=False)
# submission.head()