In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold
import lightgbm
import xgboost
import catboost
from tensorflow import keras
import tensorflow as tf
import tensorflow_hub as hub
import seaborn as sns
import matplotlib.pyplot as plt
# Folder that holds train.csv, test.csv and sample_submission.csv (Kaggle input path).
directory = '/kaggle/input/tabular-playground-series-may-2021/'

In [None]:
# Read the competition files; the sample submission is indexed by `id`
# so that predictions can later be written straight into its class columns.
train = pd.read_csv(directory + 'train.csv')
test = pd.read_csv(directory + 'test.csv')
submission = pd.read_csv(directory + 'sample_submission.csv', index_col='id')
train.head()

In [None]:
# Separate the raw class labels from the model inputs; `id` carries no signal
# and is dropped from both the train and test matrices.
target = train['target'].values
train_features = train.drop(columns=['target', 'id']).values
test_features = test.drop(columns='id').values
train_features

The histogram below shows a large imbalance between the classes, with Class_2 being significantly more frequent than the others. The highest value among the training features is 66, and it is found in column 38:

In [None]:
# Per-column maxima computed in one vectorised pass instead of a Python-level
# max() over every column. np.asarray keeps the cell working whether
# `train_features` is still an ndarray or has already been wrapped in a DataFrame.
column_maxima = np.asarray(train_features).max(axis=0)
max_nums = column_maxima.tolist()
# Index of the (first) column that holds the overall largest value.
print(int(column_maxima.argmax()))
column_maxima.max()

In [None]:
# Class-frequency histogram of the raw target labels.
sns.histplot(data=target)
# Summary statistics for every feature column (displayed as the cell output).
feature_frame = pd.DataFrame(train_features)
feature_frame.describe(include='all')

The following functions simply map the given targets to numbers by taking the last digit of the target and vice versa:

In [None]:
def class_to_num(classes):
    """Convert class names to integers using the final character of each name.

    Args:
        classes: iterable of strings whose last character is a digit
            (e.g. ``'Class_3'``).

    Returns:
        list[int]: one integer per input name, in the same order.
    """
    numbers = []
    for name in classes:
        last_char = name[-1]
        numbers.append(int(last_char))
    return numbers

def num_to_class(nums):
    """Convert numbers back to class names by prefixing them with ``'Class_'``.

    Args:
        nums: iterable of values; each is rendered with ``str``.

    Returns:
        list[str]: one ``'Class_<num>'`` string per input, in the same order.
    """
    names = []
    for value in nums:
        names.append('Class_' + str(value))
    return names

# Integer-encode the string targets with class_to_num and keep them as an array.
numeric_targets = class_to_num(target)
labels = np.array(numeric_targets)
labels

In [None]:
# Split features and labels in ONE call so the rows are guaranteed to stay
# aligned (two separate calls only line up by relying on an identical
# random_state). The classes are imbalanced, so the 70/30 split is stratified
# to keep the class proportions the same in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features,
    labels,
    test_size=0.3,
    shuffle=True,
    random_state=2021,
    stratify=labels,
)

The three models I use are the LightGBM, XGBoost, and CatBoost classifiers, which seem to be the most powerful in the TPS (Tabular Playground Series). These are the baseline scores with default parameters; the rest of the notebook looks at how to improve upon them.

In [None]:
# Baseline: LightGBM with default hyperparameters, scored by multi-class
# log loss on the hold-out split.
lgb = lightgbm.LGBMClassifier().fit(X_train, y_train)
lgb_pred = lgb.predict_proba(X_valid)
log_loss(y_valid, lgb_pred)

In [None]:
# Baseline: CatBoost with default hyperparameters (training log silenced),
# scored by multi-class log loss on the hold-out split.
ctb = catboost.CatBoostClassifier(verbose=False).fit(X_train, y_train)
ctb_pred = ctb.predict_proba(X_valid)
log_loss(y_valid, ctb_pred)

In [None]:
# Baseline: XGBoost with default hyperparameters, scored by multi-class
# log loss on the hold-out split.
# XGBClassifier has no `verbose` argument; `verbosity=0` is the supported way
# to silence it (an unknown `verbose` is forwarded to the booster and ignored).
xgb = xgboost.XGBClassifier(verbosity=0)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict_proba(X_valid)
log_loss(y_valid, xgb_pred)

The following are the Optuna objective functions used to tune the hyperparameters of each model. Better parameters can probably be found, as I did not run many trials for any of them.

In [None]:
def objective(trial):
    """Optuna objective for XGBoost.

    Samples one hyperparameter set from ``trial``, fits an ``XGBClassifier``
    on the notebook-level ``X_train``/``y_train`` with early stopping on
    ``X_valid``/``y_valid``, and scores it on that same validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: multi-class log loss on the validation split (lower is better).
    """
    params = {
        # Upper bound only; early stopping decides the effective tree count.
        'n_estimators': 20000,
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
        # Each hyperparameter is listed exactly once (no duplicate dict keys).
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 30),
        'subsample': trial.suggest_float('subsample', 0.6, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.3),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1),
    }

    # `verbosity` is the XGBClassifier logging switch; `verbose` is not a
    # constructor parameter.
    xgb = xgboost.XGBClassifier(verbosity=0, **params)
    xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=150, verbose=False)
    xgb_pred = xgb.predict_proba(X_valid)
    return log_loss(y_valid, xgb_pred)

In [None]:
def objective(trial):
    """Optuna objective for LightGBM.

    Samples one hyperparameter set from ``trial``, fits an ``LGBMClassifier``
    on the notebook-level ``X_train``/``y_train`` with early stopping on
    ``X_valid``/``y_valid``, and scores it on that same validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: multi-class log loss on the validation split (lower is better).
    """
    params = {
        # Log-uniform sampling via suggest_float(log=True), the non-deprecated
        # equivalent of suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        # Upper bound only; early stopping decides the effective tree count.
        'n_estimators': 20000,
        # suggest_int already returns an int, so no extra int() cast is needed.
        'max_bin': trial.suggest_int('max_bin', 10, 5000),
        'num_leaves': trial.suggest_int('num_leaves', 10, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 200),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        # NOTE(review): LightGBM only applies `subsample` when `subsample_freq`
        # is > 0, and that is not set here, so this value is likely a no-op -
        # confirm against the LightGBM parameter docs.
        'subsample': trial.suggest_float('subsample', 0.5, 1),
    }

    lgb = lightgbm.LGBMClassifier(verbose=0, **params)
    lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=150, verbose=False)
    lgb_pred = lgb.predict_proba(X_valid)
    return log_loss(y_valid, lgb_pred)

In [None]:
def objective(trial):
    """Optuna objective for CatBoost.

    Samples one hyperparameter set from ``trial``, fits a
    ``CatBoostClassifier`` on the notebook-level ``X_train``/``y_train`` with
    early stopping on ``X_valid``/``y_valid``, and scores it on that same
    validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: multi-class log loss on the validation split (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 8000, 13000),
        'depth': trial.suggest_int('depth', 2, 8),
        # Log-uniform sampling via suggest_float(log=True), the non-deprecated
        # equivalent of suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.008, 0.8, log=True),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.3, 0.7),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0, 15),
    }

    ctb = catboost.CatBoostClassifier(verbose=0, **params)
    ctb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=150, verbose=False)
    ctb_pred = ctb.predict_proba(X_valid)
    return log_loss(y_valid, ctb_pred)

The following parameters are the best ones that were recovered by Optuna. 

In [None]:
# Tuned LightGBM hyperparameters (passed to LGBMClassifier in the CV loop).
lgb_params = dict(
    learning_rate=0.045955784574255566,
    n_estimators=20000,  # upper bound; the CV loop uses early stopping
    max_bin=94,
    num_leaves=10,
    max_depth=27,
    reg_alpha=8.457214771314742,
    reg_lambda=6.853524481506691,
    subsample=0.7497817286847477,
)

In [None]:
# Tuned XGBoost hyperparameters (passed to XGBClassifier in the CV loop).
xgb_params = dict(
    n_estimators=20000,  # upper bound; the CV loop uses early stopping
    max_depth=6,
    learning_rate=0.020120141936692624,
    reg_lambda=29.32699373396152,
    subsample=0.818335257624409,
    colsample_bytree=0.23592240474190632,
    colsample_bynode=0.8200588520341814,
    colsample_bylevel=0.45383415964985685,
)

In [None]:
# Tuned CatBoost hyperparameters (passed to CatBoostClassifier in the CV loop).
# NOTE(review): n_estimators and bagging_temperature lie outside the ranges
# sampled by the CatBoost objective in this notebook - presumably they come
# from a different search configuration; confirm.
ctb_params = dict(
    n_estimators=20000,  # upper bound; the CV loop uses early stopping
    depth=4,
    learning_rate=0.023629454134134822,
    colsample_bylevel=0.6550855840039158,
    bagging_temperature=0.9219975014443456,
    l2_leaf_reg=10.133650161121691,
)

Now that the best parameters are found, we can generate 10 sets of predictions for each model using cv. The following three cells take roughly 45-50 minutes, but could vary based on the device you are using.

In [None]:
# Stratified 10-fold CV for LightGBM. For every fold we keep the predicted
# probabilities on the held-out part and on the full test set.
folds = 10
lgb_train_preds, lgb_test_preds = [], []

# The fold loops (here and in later cells) select rows with .iloc, so both
# the features and the labels are wrapped in DataFrames.
train_features = pd.DataFrame(train_features)
labels = pd.DataFrame(labels)

Nfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=2021)
for fold, (train_index, test_index) in enumerate(Nfold.split(train_features, labels)):

    print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)

    X_train = train_features.iloc[train_index]
    X_valid = train_features.iloc[test_index]
    # Flatten the single-column label frames to 1-D arrays for fitting/scoring.
    y_train = labels.iloc[train_index].to_numpy().ravel()
    y_valid = labels.iloc[test_index].to_numpy().ravel()

    lgb = lightgbm.LGBMClassifier(**lgb_params)
    # Early stopping monitors the same held-out fold that is scored below.
    lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)

    lgb_pred = lgb.predict_proba(X_valid)
    lgb_train_preds.append(lgb_pred)
    lgb_test_preds.append(lgb.predict_proba(test_features))
    print(f'Log Loss for Lightgbm Fold {fold+1}: {log_loss(y_valid, lgb_pred)}')

In [None]:
# Same stratified folds as above, this time for CatBoost: per fold, store the
# held-out probabilities and the test-set probabilities.
ctb_train_preds, ctb_test_preds = [], []

for fold, (train_index, test_index) in enumerate(Nfold.split(train_features, labels)):

    print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)

    X_train = train_features.iloc[train_index]
    X_valid = train_features.iloc[test_index]
    # Flatten the single-column label frames to 1-D arrays for fitting/scoring.
    y_train = labels.iloc[train_index].to_numpy().ravel()
    y_valid = labels.iloc[test_index].to_numpy().ravel()

    ctb = catboost.CatBoostClassifier(**ctb_params)
    # Early stopping monitors the same held-out fold that is scored below.
    ctb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=150, verbose=False)

    ctb_pred = ctb.predict_proba(X_valid)
    ctb_train_preds.append(ctb_pred)
    ctb_test_preds.append(ctb.predict_proba(test_features))
    print(f'Log Loss for Catboost Fold {fold+1}: {log_loss(y_valid, ctb_pred)}')

In [None]:
import warnings

# Same stratified folds as above, this time for XGBoost: per fold, store the
# held-out probabilities and the test-set probabilities.
xgb_train_preds = []
xgb_test_preds = []

# Silence library warnings only while this loop runs. A notebook-wide
# filterwarnings("ignore") would also hide warnings raised by later cells
# (for example, an optimizer reporting that it ignores an argument).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for fold, (train_index, test_index) in enumerate(Nfold.split(train_features, labels)):

        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)

        X_train, X_valid = train_features.iloc[train_index], train_features.iloc[test_index]
        # Flatten the single-column label frames to 1-D arrays for fitting/scoring.
        y_train = np.array(labels.iloc[train_index]).ravel()
        y_valid = np.array(labels.iloc[test_index]).ravel()

        xgb = xgboost.XGBClassifier(**xgb_params)
        # Early stopping monitors the same held-out fold that is scored below.
        xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=150, verbose=False, eval_metric='mlogloss')
        xgb_pred = xgb.predict_proba(X_valid)
        xgb_train_preds.append(xgb_pred)
        xgb_test_preds.append(xgb.predict_proba(test_features))
        print(f'Log Loss for XGBoost Fold {fold+1}: {log_loss(y_valid, xgb_pred)}')

The following cell simply collects the validation targets for each fold, in the same order as the validation sets used above.

In [None]:
# Held-out labels per fold. Nfold has a fixed random_state, so splitting again
# reproduces the folds used by the three CV loops.
y_valids = [
    labels.iloc[test_index]
    for _, test_index in Nfold.split(train_features, labels)
]

The cell underneath will train a stacked model where the meta-estimator is a Calibrated Ridge Classifier, and is trained on each fold of the booster predictions. 

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.calibration import CalibratedClassifierCV

# Meta-estimator: a ridge classifier wrapped in a calibrator so that it can
# output class probabilities. It is re-fitted from scratch for every fold.
rd = CalibratedClassifierCV(RidgeClassifier())
blend_probas = []

fold_predictions = zip(lgb_train_preds, xgb_train_preds, ctb_train_preds,
                       lgb_test_preds, xgb_test_preds, ctb_test_preds)
for i, (lgb_pred, xgb_pred, ctb_pred, lgb_test, xgb_test, ctb_test) in enumerate(fold_predictions):
    # Place the three boosters' class probabilities side by side as meta-features.
    blend_train = np.hstack([lgb_pred, xgb_pred, ctb_pred])
    blend_test = np.hstack([lgb_test, xgb_test, ctb_test])

    fold_targets = y_valids[i][0].values
    rd.fit(blend_train, fold_targets)
    # This score is computed on the very rows the meta-model was just fitted
    # on, so it is an in-sample figure.
    print(log_loss(fold_targets, rd.predict_proba(blend_train)))
    blend_probas.append(rd.predict_proba(blend_test))

The following cell uses scipy to find the optimal weights in a weighted average between the predictions of lightgbm, xgboost, and catboost.

In [None]:
from scipy.optimize import minimize

# For every fold, find the convex combination of the three boosters'
# held-out probabilities that minimises the log loss.
scores = []
weights = []
for y, lgb_pred, xgb_pred, ctb_pred in zip(y_valids, lgb_train_preds, xgb_train_preds, ctb_train_preds):
    preds = [lgb_pred, xgb_pred, ctb_pred]

    def log_weight_loss(w, preds=preds, y=y):
        """Log loss of the weighted sum of the fold's model predictions."""
        weighted_pred = sum(weight * pred for weight, pred in zip(w, preds))
        return log_loss(y, weighted_pred)

    # Start from equal weights, which already satisfy the sum-to-one constraint.
    starting_values = [1 / len(preds)] * len(preds)
    cons = ({'type': 'eq', 'fun': lambda w: 1 - np.sum(w)})
    bounds = [(0, 1)] * len(preds)
    # SLSQP honours both the bounds and the equality constraint. Nelder-Mead
    # does not support constraints, so the weights were never forced to sum to 1.
    res = minimize(log_weight_loss, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

    weights.append(res['x'])
    print(res['fun'])
    scores.append(res['fun'])

Finally, we combine the two sets of predictions (the stacked model and the weighted average) with a simple unweighted mean to generate the final predictions:

In [None]:
# Average the per-fold optimal weights, then renormalise so they sum to 1.
# Without this the blend below is not a proper probability distribution,
# which would skew the later average with the stacked predictions.
final_weights = np.mean(weights, axis=0)
final_weights = final_weights / final_weights.sum()

# Weighted blend of each model's fold-averaged test-set probabilities.
weighted_preds = (
    final_weights[0] * np.mean(lgb_test_preds, axis=0)
    + final_weights[1] * np.mean(xgb_test_preds, axis=0)
    + final_weights[2] * np.mean(ctb_test_preds, axis=0)
)

In [None]:
# Final prediction: plain mean of the fold-averaged stacked probabilities and
# the weighted-average blend, written into the four class columns.
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
stacked_mean = sum(blend_probas) / folds
submission[class_columns] = (stacked_mean + weighted_preds) / 2
submission.to_csv('submission.csv')