In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input mount,
# then print them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
import random

from IPython import display as ipd
from tqdm import tqdm
import lightgbm as lgb

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold

from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

import optuna 
from optuna.visualization.matplotlib import plot_optimization_history
from optuna.visualization.matplotlib import plot_param_importances

### Utils

In [None]:
def seeding(SEED, use_tf=False):
    """Seed the Python, NumPy and (optionally) TensorFlow random generators.

    Args:
        SEED: Integer seed applied to every generator.
        use_tf: When True, TensorFlow is imported and seeded as well. The
            import is local so the function works without TensorFlow installed
            as long as this flag stays False.

    Side effects:
        Sets ``PYTHONHASHSEED`` and ``TF_CUDNN_DETERMINISTIC`` in ``os.environ``
        and prints a confirmation message.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    # Changing PYTHONHASHSEED at runtime does not re-seed the running
    # interpreter's hashing; it is only inherited by child processes.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # This variable is an on/off switch, not a seed: it must be '1' (the
    # previous code stored the seed value itself, e.g. '42').
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    if use_tf:
        # Imported here because the notebook never imports TensorFlow at the
        # top level; without this the branch raised NameError.
        import tensorflow as tf
        tf.random.set_seed(SEED)
    print('seeding done!!!')
    
## https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298201
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error on a 0-200 scale.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    elements where both values are zero contribute 0.

    Args:
        y_true: Array-like of observed values (list, ndarray or pandas Series).
        y_pred: Array-like of predicted values, broadcastable against ``y_true``.

    Returns:
        The mean of the per-element errors as a float.
    """
    # Work on plain float arrays so lists and Series with arbitrary indexes
    # are compared positionally.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Both terms need abs(): without it a negative y_true can cancel |y_pred|
    # and the element would wrongly be treated as a 0/0 case.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the pre-filled 0.0 and no divide-by-zero warning is raised.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(abs_error),
        where=denominator != 0,
    )
    return np.mean(diff)

### Data Load

In [None]:
# Notebook-wide switches: the seed shared by every stochastic step, a debug
# flag, and the toggle that enables the Optuna study further down.
RANDOM_SEED = 42
DEBUG = True
TUNING = False

seeding(RANDOM_SEED)

# Read the three competition CSVs in one pass; the tuple order matches the
# names on the left-hand side.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2022/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2022/test.csv',
        '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

## Target distribution plots

In [None]:
## targets distribution by country

f, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(16, 16))
sns.despine(f)

# One histogram panel per country, coloured by product. A single loop replaces
# three copy-pasted plotting statements so the panels cannot drift apart.
for country_ax, country_name in zip((ax1, ax2, ax3), ('Finland', 'Norway', 'Sweden')):
    sns.histplot(
        data=train[train['country'] == country_name],
        x='num_sold',
        hue='product',
        ax=country_ax,
        palette="rainbow",
    )
    country_ax.set_title(country_name)

In [None]:
# Box plot of units sold per country, split by product.
f, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(16, 6))

sns.boxplot(
    x="country",
    y="num_sold",
    hue="product",
    data=train,
    palette="Spectral",
    ax=ax1,
)
sns.despine(left=True)

In [None]:
## Another nice way of showing distribution

f, ax1 = plt.subplots(1, 1, figsize=(16, 6))
product_order = ["Kaggle Mug", "Kaggle Hat", "Kaggle Sticker"]
# Pass the axes explicitly instead of relying on matplotlib's implicit
# "current axes", matching how the other plotting cells target their axes.
sns.boxenplot(
    x="product",
    y="num_sold",
    palette="rainbow",
    hue='country',
    order=product_order,
    scale="linear",
    data=train,
    ax=ax1,
)

### Very simple date-based feature engineering

In [None]:
def process_dates(df):
    """Parse the ``date`` column and add calendar features, in place.

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object, with ``date`` converted to datetime and the
        columns ``month``, ``week``, ``weekday``, ``dayofweek``, ``dayofyear``
        and ``day`` added.
    """
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt
    df['month'] = dates.month
    # Series.dt.week is deprecated and was removed in pandas 2.0;
    # isocalendar().week yields the same ISO week number. It comes back as a
    # nullable UInt32, so cast to int64 to keep a plain integer column
    # (assumes no missing dates - the cast raises on NaT).
    df['week'] = dates.isocalendar().week.astype('int64')
    # weekday and dayofweek are the same quantity (Monday=0); both are kept
    # because later cells may rely on either column name.
    df['weekday'] = dates.weekday
    df['dayofweek'] = dates.dayofweek
    df['dayofyear'] = dates.dayofyear
    df['day'] = dates.day
    return df

# Add the calendar features to both frames (train first, then test).
train, test = process_dates(train), process_dates(test)

In [None]:
# Compare how many distinct values each date feature takes in train vs. test.
# The test figure is read from `test` for every feature; previously the last
# three lines printed the train count under the "test" label.
for label, column in (
    ('days', 'day'),
    ('weeks', 'week'),
    ('dayofweeks', 'dayofweek'),
    ('months', 'month'),
    ('dayofyear', 'dayofyear'),
):
    print(f'Train unique {label}: {train[column].unique().size}, test: {test[column].unique().size}')

In [None]:
# Keep the label aside, then remove identifier, label and raw date columns so
# only model features remain.
target = train['num_sold']
train = train.drop(columns=['row_id', 'num_sold', 'date'])
test = test.drop(columns=['row_id', 'date'])

### Encode category columns 

In [None]:
# One encoder per string column: fitted on train, then reused on test so both
# frames share the same integer codes.
country_encoder = LabelEncoder()
store_encoder = LabelEncoder()
product_encoder = LabelEncoder()

encoding_plan = (
    ('country', 'country_enc', country_encoder),
    ('store', 'store_enc', store_encoder),
    ('product', 'product_enc', product_encoder),
)
for raw_column, encoded_column, encoder in encoding_plan:
    train[encoded_column] = encoder.fit_transform(train[raw_column])
    test[encoded_column] = encoder.transform(test[raw_column])

# The raw string columns are no longer needed once encoded.
for frame in (train, test):
    frame.drop(['country', 'store', 'product'], axis=1, inplace=True)

In [None]:
# Turn every feature column of both frames into a pandas Categorical.
for frame in (train, test):
    for column_name in frame.columns:
        frame[column_name] = pd.Categorical(frame[column_name])

### Tuning

In [None]:
NUM_BOOST_ROUND = 2000
EARLY_STOPPING_ROUNDS = 50
VERBOSE_EVAL = 100

def objective(trial, X, y):
    """Optuna objective: fit one LightGBM model and return its hold-out SMAPE.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature frame; 25% of the rows are held out for validation.
        y: Target values aligned with ``X``.

    Returns:
        SMAPE of the fitted model on the held-out rows (lower is better).
    """
    param_grid = {
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['rmse'],
        'n_estimators': trial.suggest_categorical('n_estimators', [2000]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 50, 2000, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 2000, step=100),
        'max_bin': trial.suggest_int('max_bin', 200, 300),
        'lambda_l1': trial.suggest_int('lambda_l1', 0, 100, step=5),
        'lambda_l2': trial.suggest_int('lambda_l2', 0, 100, step=5),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        # 'min_child_samples' is intentionally not tuned: LightGBM treats it
        # as an alias of 'min_data_in_leaf', so sampling both only added a
        # search dimension that had no effect on the model.
    }

    # Fixed random_state: every trial is scored on the same hold-out split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.25, random_state=RANDOM_SEED, shuffle=True
    )

    model = lgb.train(
        param_grid,
        train_set=lgb.Dataset(X_train, y_train),
        num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[lgb.Dataset(X_valid, y_valid)],
        # One name per entry in valid_sets; with the previous two-name list
        # the validation data was reported under the name "train".
        valid_names=["valid"],
        callbacks=[
            lgb.log_evaluation(VERBOSE_EVAL),
            lgb.early_stopping(EARLY_STOPPING_ROUNDS, first_metric_only=False, verbose=True),
        ],
    )

    oof_pred = model.predict(X_valid)
    return SMAPE(y_valid, oof_pred)

In [None]:
N_TRIALS = 100

if TUNING:
    # Minimise the objective over N_TRIALS Optuna trials, then report the
    # best trial's score and sampled parameters.
    study = optuna.create_study(direction='minimize')

    def objective_func(trial):
        return objective(trial, train, target)

    study.optimize(objective_func, n_trials=N_TRIALS)  # number of iterations

    best_trial = study.best_trial
    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    print(f"  Value: {best_trial.value}")
    print("  Params: ")
    for param_name, param_value in best_trial.params.items():
        print(f"    {param_name}: {param_value}")

### Model and train

In [None]:
def run_train(X, y, run_params, splits, num_boost_round, verbose_eval, early_stopping_rounds):
    """Train one LightGBM model per cross-validation fold and score it with SMAPE.

    Args:
        X: Feature DataFrame (indexed positionally with ``iloc``).
        y: Target Series aligned with ``X``.
        run_params: LightGBM parameter dict passed to ``lgb.train``.
        splits: Number of cross-validation folds.
        num_boost_round: Maximum number of boosting rounds per fold.
        verbose_eval: Log the evaluation metric every this many rounds.
        early_stopping_rounds: Patience of the early-stopping callback.

    Returns:
        Tuple ``(scores, models, eval_results)``: the per-fold SMAPE values,
        the per-fold boosters, and the metric history dict filled by
        ``lgb.record_evaluation``.
    """
    scores = []
    models = []
    # The same dict is handed to record_evaluation in every fold. LightGBM's
    # callback clears the dict it is given, so after the loop it is expected
    # to hold the history of the last fold only.
    eval_results = {}
    # NOTE(review): StratifiedKFold stratifies on the exact target value, which
    # is unusual for a regression target - confirm this is intended rather
    # than KFold or a time-based split.
    folds = StratifiedKFold(n_splits=splits)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print(f'Fold {fold_n+1} started')
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        model = lgb.train(
            run_params,
            train_set=lgb.Dataset(X_train, y_train),
            num_boost_round=num_boost_round,
            valid_sets=[lgb.Dataset(X_valid, y_valid)],
            # One name per entry in valid_sets; with the previous two-name
            # list the validation data was recorded under the name "train".
            valid_names=["valid"],
            callbacks=[
                lgb.log_evaluation(verbose_eval),
                lgb.early_stopping(early_stopping_rounds, first_metric_only=False, verbose=True),
                lgb.record_evaluation(eval_result=eval_results),
            ],
        )

        y_predicted = model.predict(X_valid)
        score = SMAPE(y_valid, y_predicted)
        print(f'SMAPE: {score}')

        models.append(model)
        scores.append(score)
    return scores, models, eval_results


# Settings for the final cross-validated run. These rebind the
# NUM_BOOST_ROUND / EARLY_STOPPING_ROUNDS / VERBOSE_EVAL names defined in the
# tuning section.
TOTAL_SPLITS = 5
NUM_BOOST_ROUND = 8000
EARLY_STOPPING_ROUNDS = 100
VERBOSE_EVAL = 200

run_params = {
    'verbose': -1,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['rmse'],
    'learning_rate': 0.03600124778051181,
    'num_leaves': 1400,
    'max_depth': 9,
    'min_data_in_leaf': 200,
    'max_bin': 240,
    'lambda_l1': 45,
    'lambda_l2': 20,
    'feature_fraction': 0.9033256488572796,
    'bagging_fraction': 0.9728721582350929,
    'bagging_freq': 1,
    # 'min_child_samples': 69 was removed: LightGBM treats it as an alias of
    # 'min_data_in_leaf', so the two entries conflicted and the alias was
    # ignored with a warning.
}

scores, models, eval_results = run_train(
    train,
    target,
    run_params,
    TOTAL_SPLITS,
    NUM_BOOST_ROUND,
    VERBOSE_EVAL,
    EARLY_STOPPING_ROUNDS,
)
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

### Plot metrics

In [None]:
# Plot the RMSE history stored in eval_results (the dict returned by run_train).
ax = lgb.plot_metric(eval_results, metric='rmse')
plt.show()

In [None]:
# Average the test-set predictions of all fold models: accumulate onto a zero
# vector in model order, then divide by the number of models.
fold_predictions = (fold_model.predict(test).reshape(-1) for fold_model in models)
y_pred = sum(fold_predictions, np.zeros(len(test)))

y_pred = y_pred / len(models)

In [None]:
# Round the averaged predictions to whole units, store them in the submission
# frame, write the CSV and preview the first rows.
rounded_num_sold = np.round(y_pred).astype(int)
submission['num_sold'] = rounded_num_sold
submission.to_csv('submission.csv', float_format='%.6f', index=False)
submission.head(20)