# Importing libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
import optuna
from optuna.integration import LightGBMPruningCallback
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Loading the data and creating folds

No preprocessing is needed, as we will be training our models on the out-of-fold predictions of our base models.

In [None]:
# Read the competition train/test sets and the sample submission file.
df_train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
sample_solution = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

# -1 is a placeholder; the loop below writes a real fold id into the validation rows of each split.
df_train['kfold'] = -1

y_train = df_train['claim']
X_train = df_train.drop(columns='claim')

# Stratify on the target so the 5 shuffled folds keep a similar class balance.
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = skf.split(X_train, y_train)
for fold, (i_train, i_valid) in enumerate(fold_splits):
    # Tag each row with the fold in which it is used as validation data.
    df_train.loc[i_valid, 'kfold'] = fold

# Loading the out-of-fold and test predictions

In [None]:
# Merge every base model's out-of-fold (train) and test predictions onto the data,
# then add one averaged column per base model.
N_BASE_MODELS = 20  # prediction files are numbered m1 .. m20
N_RUNS_PER_MODEL = 5  # files s0 .. s4 per model (presumably one per seed -- confirm)
PRED_DIR = '../input/tps-09-2021-base-predictions'

for m in range(1, N_BASE_MODELS + 1):
    for s in range(N_RUNS_PER_MODEL):
        df_train_temp = pd.read_csv(f'{PRED_DIR}/m{m}s{s}_valid_pred.csv')
        df_test_temp = pd.read_csv(f'{PRED_DIR}/m{m}s{s}_test_pred.csv')
        df_train = df_train.merge(df_train_temp, on='id', how='left')
        df_test = df_test.merge(df_test_temp, on='id', how='left')

    # Substring match: 'm1' would also match 'm10'..'m19', but those columns are not
    # merged yet when m == 1, so only this model's columns are picked up. This relies
    # on the cell being run exactly once on freshly loaded frames.
    # NOTE(review): assumes the prediction columns in the CSVs contain f'm{m}' -- confirm.
    preds_temp = [c for c in df_test.columns if f'm{m}' in c]
    df_train[f'm{m}_mean_pred'] = df_train[preds_temp].mean(axis=1)
    df_test[f'm{m}_mean_pred'] = df_test[preds_temp].mean(axis=1)

# Hyperparameter tuning with Optuna

In [None]:
# NOTE: this entire cell is commented out and is kept for reference only.
# It sketches an Optuna search over LightGBM hyperparameters using the '*mean*'
# feature columns. Each trial trains on folds 1-4 and is scored by AUC on fold 0
# only, with LightGBMPruningCallback pruning on the 'auc' metric. The final loop
# would create 10 independent studies of 500 trials each.
# NOTE(review): the hard-coded `params` in the "Model 1".."Model 5" cells presumably
# came from studies like this one -- confirm before relying on that.

# seed = 0

# features = [c for c in df_train.columns if 'mean' in c]
# df_test = df_test[features]

# def objective(trial):
#     fold = 0
#     params = {
#         'num_leaves': trial.suggest_int('num_leaves', 2, 20),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1000, 20000),
#         'max_depth': trial.suggest_int('max_depth', 0, 4),
#         'max_bin': trial.suggest_int('max_bin', 200, 400),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5),
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.00001, 50),
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.00001, 50),
#         'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 4),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.75, 0.9),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 1)        
#     }

#     X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
#     X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)
        
#     y_train = X_train.claim
#     y_valid = X_valid.claim
    
#     X_train = X_train[features]
#     X_valid = X_valid[features]
    
#     model = LGBMClassifier(
#             objective='binary',
#             tree_learner='serial',
#             seed=seed,
#             n_estimators=50000,
#             **params)
    
#     model.fit(X_train,
#               y_train,
#               early_stopping_rounds=500,
#               eval_set=[(X_valid, y_valid)],
#               eval_metric='auc',
#               callbacks=[LightGBMPruningCallback(trial, 'auc')],
#               verbose=1000)
    
#     valid_pred = model.predict_proba(X_valid)[:,1]
        
#     auc = roc_auc_score(y_valid, valid_pred)
#     return auc

# for i in range(10):
#     study = optuna.create_study(direction="maximize")
#     study.optimize(objective, n_trials=500)

# Model training

Model 1

In [None]:
# Level-1 model 1: LightGBM trained on the per-base-model mean predictions.
features = [c for c in df_train.columns if 'mean' in c]
df_test = df_test[features]

# Hyperparameters and the test matrix do not depend on the fold, so build them once.
params = {'num_leaves': 16,
          'min_data_in_leaf': 8056,
          'max_depth': 0,
          'max_bin': 355,
          'learning_rate': 0.008529108246972048,
          'lambda_l1': 9.855426024301865,
          'lambda_l2': 3.584196138812915e-05,
          'min_gain_to_split': 0.42289363518494794,
          'feature_fraction': 0.42929842197137486,
          'bagging_fraction': 0.8422719204115733,
          'bagging_freq': 1}
X_test = df_test[features].copy()

test_preds = []
valid_preds = {}
scores = []

for fold in range(5):
    X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
    X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

    valid_ids = X_valid.id.values.tolist()

    y_train = X_train.claim
    y_valid = X_valid.claim

    X_train = X_train[features]
    X_valid = X_valid[features]

    model = LGBMClassifier(
        objective='binary',
        importance_type='split',  # default=split. try gain
        boosting_type='gbdt',  # default=gbdt. try dart, goss, rf
        tree_learner='serial',
        num_threads=-1,
        random_state=fold,
        n_estimators=5000,
        **params)

    # The validation fold doubles as the early-stopping set.
    model.fit(X_train,
              y_train,
              early_stopping_rounds=500,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              verbose=1000)

    valid_pred = model.predict_proba(X_valid)[:, 1]
    test_pred = model.predict_proba(X_test)[:, 1]

    # Keyed by id so the out-of-fold predictions can be merged back onto the train set.
    valid_preds.update(dict(zip(valid_ids, valid_pred)))
    test_preds.append(test_pred)

    score = roc_auc_score(y_valid, valid_pred)
    scores.append(score)

print(f'Mean auc {np.mean(scores)}, std {np.std(scores)}')

valid_preds = pd.DataFrame.from_dict(valid_preds, orient='index').reset_index()
valid_preds.columns = ['id', 'pred_1']
valid_preds.to_csv('level1_valid_pred_1.csv', index=False)

# Build a fresh frame instead of renaming sample_solution's columns in place, so
# its `claim` column is still there for any later cell that assigns to it.
test_out = sample_solution[['id']].copy()
test_out['pred_1'] = np.mean(np.column_stack(test_preds), axis=1)  # mean over the 5 fold models
test_out.to_csv('level1_test_pred_1.csv', index=False)


Model 2

In [None]:
# Level-1 model 2: LightGBM trained on the per-base-model mean predictions.
# Hyperparameters and the test matrix do not depend on the fold, so build them once.
params = {'num_leaves': 4,
          'min_data_in_leaf': 11557,
          'max_depth': 1,
          'max_bin': 330,
          'learning_rate': 0.009478273001225938,
          'lambda_l1': 1.17273085019482e-05,
          'lambda_l2': 1.1150847201215146,
          'min_gain_to_split': 0.6849430970013546,
          'feature_fraction': 0.17224803018739418,
          'bagging_fraction': 0.8745864558511497,
          'bagging_freq': 1}
X_test = df_test[features].copy()

test_preds = []
valid_preds = {}
scores = []

for fold in range(5):
    X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
    X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

    valid_ids = X_valid.id.values.tolist()

    y_train = X_train.claim
    y_valid = X_valid.claim

    X_train = X_train[features]
    X_valid = X_valid[features]

    model = LGBMClassifier(
        objective='binary',
        importance_type='split',  # default=split. try gain
        boosting_type='gbdt',  # default=gbdt. try dart, goss, rf
        tree_learner='serial',
        num_threads=-1,
        random_state=fold,
        n_estimators=10000,
        **params)

    # The validation fold doubles as the early-stopping set.
    model.fit(X_train,
              y_train,
              early_stopping_rounds=500,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              verbose=1000)

    valid_pred = model.predict_proba(X_valid)[:, 1]
    test_pred = model.predict_proba(X_test)[:, 1]

    # Keyed by id so the out-of-fold predictions can be merged back onto the train set.
    valid_preds.update(dict(zip(valid_ids, valid_pred)))
    test_preds.append(test_pred)

    score = roc_auc_score(y_valid, valid_pred)
    scores.append(score)

print(f'Mean auc {np.mean(scores)}, std {np.std(scores)}')

valid_preds = pd.DataFrame.from_dict(valid_preds, orient='index').reset_index()
valid_preds.columns = ['id', 'pred_2']
valid_preds.to_csv('level1_valid_pred_2.csv', index=False)

# Build the output frame from the id column only. Attribute assignment
# (`sample_solution.claim = ...`) updates a column only if `claim` currently exists;
# otherwise pandas just sets an instance attribute and the saved file would carry
# whatever stale values the frame already held.
test_out = sample_solution[['id']].copy()
test_out['pred_2'] = np.mean(np.column_stack(test_preds), axis=1)  # mean over the 5 fold models
test_out.to_csv('level1_test_pred_2.csv', index=False)

Model 3

In [None]:
# Level-1 model 3: LightGBM trained on the per-base-model mean predictions.
# Hyperparameters and the test matrix do not depend on the fold, so build them once.
params = {'num_leaves': 4,
          'min_data_in_leaf': 1458,
          'max_depth': 2,
          'max_bin': 376,
          'learning_rate': 0.006005480922506577,
          'lambda_l1': 0.00021562065637690435,
          'lambda_l2': 4.87408695501209,
          'min_gain_to_split': 1.97067333984107,
          'feature_fraction': 0.8666273347397625,
          'bagging_fraction': 0.8743916816302966,
          'bagging_freq': 1}
X_test = df_test[features].copy()

test_preds = []
valid_preds = {}
scores = []

for fold in range(5):
    X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
    X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

    valid_ids = X_valid.id.values.tolist()

    y_train = X_train.claim
    y_valid = X_valid.claim

    X_train = X_train[features]
    X_valid = X_valid[features]

    model = LGBMClassifier(
        objective='binary',
        importance_type='split',  # default=split. try gain
        boosting_type='gbdt',  # default=gbdt. try dart, goss, rf
        tree_learner='serial',
        num_threads=-1,
        random_state=fold,
        n_estimators=10000,
        **params)

    # The validation fold doubles as the early-stopping set.
    model.fit(X_train,
              y_train,
              early_stopping_rounds=500,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              verbose=1000)

    valid_pred = model.predict_proba(X_valid)[:, 1]
    test_pred = model.predict_proba(X_test)[:, 1]

    # Keyed by id so the out-of-fold predictions can be merged back onto the train set.
    valid_preds.update(dict(zip(valid_ids, valid_pred)))
    test_preds.append(test_pred)

    score = roc_auc_score(y_valid, valid_pred)
    scores.append(score)

print(f'Mean auc {np.mean(scores)}, std {np.std(scores)}')

valid_preds = pd.DataFrame.from_dict(valid_preds, orient='index').reset_index()
valid_preds.columns = ['id', 'pred_3']
valid_preds.to_csv('level1_valid_pred_3.csv', index=False)

# Build the output frame from the id column only. Attribute assignment
# (`sample_solution.claim = ...`) updates a column only if `claim` currently exists;
# otherwise pandas just sets an instance attribute and the saved file would carry
# whatever stale values the frame already held.
test_out = sample_solution[['id']].copy()
test_out['pred_3'] = np.mean(np.column_stack(test_preds), axis=1)  # mean over the 5 fold models
test_out.to_csv('level1_test_pred_3.csv', index=False)

Model 4

In [None]:
# Level-1 model 4: LightGBM trained on the per-base-model mean predictions.
# Hyperparameters and the test matrix do not depend on the fold, so build them once.
params = {'num_leaves': 15,
          'min_data_in_leaf': 6512,
          'max_depth': 3,
          'max_bin': 310,
          'learning_rate': 0.004125972490699574,
          'lambda_l1': 0.003690793308308078,
          'lambda_l2': 0.3051576539377836,
          'min_gain_to_split': 0.2990942549901452,
          'feature_fraction': 0.918317541041119,
          'bagging_fraction': 0.7574750171832634,
          'bagging_freq': 1}
X_test = df_test[features].copy()

test_preds = []
valid_preds = {}
scores = []

for fold in range(5):
    X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
    X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

    valid_ids = X_valid.id.values.tolist()

    y_train = X_train.claim
    y_valid = X_valid.claim

    X_train = X_train[features]
    X_valid = X_valid[features]

    model = LGBMClassifier(
        objective='binary',
        importance_type='split',  # default=split. try gain
        boosting_type='gbdt',  # default=gbdt. try dart, goss, rf
        tree_learner='serial',
        num_threads=-1,
        random_state=fold,
        n_estimators=10000,
        **params)

    # The validation fold doubles as the early-stopping set.
    model.fit(X_train,
              y_train,
              early_stopping_rounds=500,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              verbose=1000)

    valid_pred = model.predict_proba(X_valid)[:, 1]
    test_pred = model.predict_proba(X_test)[:, 1]

    # Keyed by id so the out-of-fold predictions can be merged back onto the train set.
    valid_preds.update(dict(zip(valid_ids, valid_pred)))
    test_preds.append(test_pred)

    score = roc_auc_score(y_valid, valid_pred)
    scores.append(score)

print(f'Mean auc {np.mean(scores)}, std {np.std(scores)}')

valid_preds = pd.DataFrame.from_dict(valid_preds, orient='index').reset_index()
valid_preds.columns = ['id', 'pred_4']
valid_preds.to_csv('level1_valid_pred_4.csv', index=False)

# Build the output frame from the id column only. Attribute assignment
# (`sample_solution.claim = ...`) updates a column only if `claim` currently exists;
# otherwise pandas just sets an instance attribute and the saved file would carry
# whatever stale values the frame already held.
test_out = sample_solution[['id']].copy()
test_out['pred_4'] = np.mean(np.column_stack(test_preds), axis=1)  # mean over the 5 fold models
test_out.to_csv('level1_test_pred_4.csv', index=False)

Model 5

In [None]:
# Level-1 model 5: LightGBM trained on the per-base-model mean predictions.
# Hyperparameters and the test matrix do not depend on the fold, so build them once.
params = {'num_leaves': 20,
          'min_data_in_leaf': 19960,
          'max_depth': 4,
          'max_bin': 331,
          'learning_rate': 0.0019637756042711558,
          'lambda_l1': 47.21817822491236,
          'lambda_l2': 0.00045229975521610875,
          'min_gain_to_split': 2.1533850359022284,
          'feature_fraction': 0.10469850378121182,
          'bagging_fraction': 0.8956146814653573,
          'bagging_freq': 1}
X_test = df_test[features].copy()

test_preds = []
valid_preds = {}
scores = []

for fold in range(5):
    X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
    X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

    valid_ids = X_valid.id.values.tolist()

    y_train = X_train.claim
    y_valid = X_valid.claim

    X_train = X_train[features]
    X_valid = X_valid[features]

    model = LGBMClassifier(
        objective='binary',
        importance_type='split',  # default=split. try gain
        boosting_type='gbdt',  # default=gbdt. try dart, goss, rf
        tree_learner='serial',
        num_threads=-1,
        random_state=fold,
        n_estimators=10000,
        **params)

    # The validation fold doubles as the early-stopping set.
    model.fit(X_train,
              y_train,
              early_stopping_rounds=500,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              verbose=1000)

    valid_pred = model.predict_proba(X_valid)[:, 1]
    test_pred = model.predict_proba(X_test)[:, 1]

    # Keyed by id so the out-of-fold predictions can be merged back onto the train set.
    valid_preds.update(dict(zip(valid_ids, valid_pred)))
    test_preds.append(test_pred)

    score = roc_auc_score(y_valid, valid_pred)
    scores.append(score)

print(f'Mean auc {np.mean(scores)}, std {np.std(scores)}')

valid_preds = pd.DataFrame.from_dict(valid_preds, orient='index').reset_index()
valid_preds.columns = ['id', 'pred_5']
valid_preds.to_csv('level1_valid_pred_5.csv', index=False)

# Build the output frame from the id column only. Attribute assignment
# (`sample_solution.claim = ...`) updates a column only if `claim` currently exists;
# otherwise pandas just sets an instance attribute and the saved file would carry
# whatever stale values the frame already held.
test_out = sample_solution[['id']].copy()
test_out['pred_5'] = np.mean(np.column_stack(test_preds), axis=1)  # mean over the 5 fold models
test_out.to_csv('level1_test_pred_5.csv', index=False)

# Re-loading the data and adding the meta-model predictions

In [None]:
# Start again from the raw competition files so the level-2 frames carry only the
# original columns plus the five level-1 prediction columns.
df_train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
sample_solution = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

df_train['kfold'] = -1

y_train = df_train.claim
X_train = df_train.drop('claim', axis=1)

# Same splitter settings (5 splits, shuffle, random_state=42) as the first fold
# assignment, so rows receive the same fold ids as during level-1 training.
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (i_train, i_valid) in enumerate(skf.split(X_train, y_train)):
    df_train.loc[i_valid, 'kfold'] = fold

# Level-1 outputs written by the "Model 1".."Model 5" cells.
level1_ids = range(1, 6)
df_train1, df_train2, df_train3, df_train4, df_train5 = (
    pd.read_csv(f'level1_valid_pred_{k}.csv') for k in level1_ids
)
df_test1, df_test2, df_test3, df_test4, df_test5 = (
    pd.read_csv(f'level1_test_pred_{k}.csv') for k in level1_ids
)

for oof_frame in (df_train1, df_train2, df_train3, df_train4, df_train5):
    df_train = df_train.merge(oof_frame, on='id', how='left')

for test_frame in (df_test1, df_test2, df_test3, df_test4, df_test5):
    df_test = df_test.merge(test_frame, on='id', how='left')

# A left merge silently yields NaN for ids missing from a prediction file; fail
# loudly instead of feeding missing level-1 features into the level-2 model.
level1_cols = [f'pred_{k}' for k in level1_ids]
assert df_train[level1_cols].notna().all().all(), 'missing level-1 OOF predictions for some train ids'
assert df_test[level1_cols].notna().all().all(), 'missing level-1 test predictions for some test ids'

# Calculating model weights with Optuna (does not work)

I experimented with calculating the model weights with Optuna, but couldn't get the weights to add up to one.

In [None]:
# NOTE: this entire cell is commented out and is kept for reference only.
# It sketches an Optuna search for blending weights over the five level-1 predictions.
# NOTE(review): each w_i is sampled independently from [0, 1], so nothing forces the
# weights to sum to one, and dividing the blend by 5 does not normalise them; dividing
# by (w_1 + w_2 + w_3 + w_4 + w_5) would. AUC depends only on the ranking of the
# scores, so a positive rescaling of `pred` does not change the objective value.
# NOTE(review): `fold` is never assigned inside objective(); it would be read from
# whatever global `fold` an earlier cell left behind.

# pred_1 = df_test1.pred_1
# pred_2 = df_test2.pred_2
# pred_3 = df_test3.pred_3
# pred_4 = df_test4.pred_4
# pred_5 = df_test5.pred_5

# features = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
# df_test = df_test[features]

# def objective(trial):
    
#     w_1 = trial.suggest_uniform('w1', 0, 1)
#     w_2 = trial.suggest_uniform('w2', 0, 1)
#     w_3 = trial.suggest_uniform('w3', 0, 1)
#     w_4 = trial.suggest_uniform('w4', 0, 1)
#     w_5 = trial.suggest_uniform('w5', 0, 1)

#     X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
#     X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

#     y_train = X_train.claim
#     y_valid = X_valid.claim

#     X_train = X_train[features]
#     X_valid = X_valid[features]

#     pred_1 = X_valid.pred_1
#     pred_2 = X_valid.pred_2
#     pred_3 = X_valid.pred_3
#     pred_4 = X_valid.pred_4
#     pred_5 = X_valid.pred_5

#     pred = (w_1 * pred_1 + w_2 * pred_2 + w_3 * pred_3 + w_4 * pred_4 + w_5 * pred_5) / 5

#     auc = roc_auc_score(y_valid, pred)
    
#     return auc

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=400)

# Hyperparameter tuning the level 2 model

In [None]:
# NOTE: this entire cell is commented out and is kept for reference only.
# It sketches an Optuna search over LightGBM hyperparameters for the level-2 model,
# using the five level-1 prediction columns as features. Each trial trains on
# folds 1-4 and is scored by AUC on fold 0 only, with LightGBMPruningCallback
# pruning on the 'auc' metric. The final loop would run one study of 5000 trials.
# NOTE(review): the hard-coded `params` in the "Level 2 model" cell presumably came
# from a study like this one -- confirm before relying on that.

# seed = 0

# features = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
# df_test = df_test[features]

# def objective(trial):
#     fold = 0
#     params = {
#         'num_leaves': trial.suggest_int('num_leaves', 2, 20),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1000, 20000),
#         'max_depth': trial.suggest_int('max_depth', 0, 4),
#         'max_bin': trial.suggest_int('max_bin', 200, 400),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5),
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.00001, 50),
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.00001, 50),
#         'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 4),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.75, 0.9),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 1)        
#     }

#     X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
#     X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)
        
#     y_train = X_train.claim
#     y_valid = X_valid.claim
    
#     X_train = X_train[features]
#     X_valid = X_valid[features]
    
#     model = LGBMClassifier(
#             objective='binary',
#             tree_learner='serial',
#             seed=seed,
#             n_estimators=50000,
#             **params)
    
#     model.fit(X_train,
#               y_train,
#               early_stopping_rounds=500,
#               eval_set=[(X_valid, y_valid)],
#               eval_metric='auc',
#               callbacks=[LightGBMPruningCallback(trial, 'auc')],
#               verbose=1000)
    
#     valid_pred = model.predict_proba(X_valid)[:,1]
        
#     auc = roc_auc_score(y_valid, valid_pred)
#     return auc

# for i in range(1):
#     study = optuna.create_study(direction="maximize")
#     study.optimize(objective, n_trials=5000)

# Level 2 model

In [None]:
# Level-2 (meta) model: LightGBM trained on the five level-1 prediction columns.
features = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
df_test = df_test[features]

# Fold-independent hyperparameters.
# (A plain LogisticRegression() fitted on X_train / y_train is a possible alternative.)
params = dict(
    num_leaves=20,
    min_data_in_leaf=12875,
    max_depth=0,
    max_bin=268,
    learning_rate=0.00480893584809694,
    lambda_l1=0.00020929737984329072,
    lambda_l2=0.0006179415934221924,
    min_gain_to_split=2.7035405403285804,
    feature_fraction=0.8117162013587371,
    bagging_fraction=0.8175123327757648,
    bagging_freq=1,
)

test_preds, scores = [], []
valid_preds = {}

for fold in range(5):
    # Rows tagged with this fold are held out; everything else is training data.
    in_valid = df_train.kfold == fold
    train_part = df_train[~in_valid].reset_index(drop=True)
    valid_part = df_train[in_valid].reset_index(drop=True)

    X_train, y_train = train_part[features], train_part.claim
    X_valid, y_valid = valid_part[features], valid_part.claim
    X_test = df_test[features].copy()
    valid_ids = valid_part.id.values.tolist()

    model = LGBMClassifier(
        objective='binary',
        importance_type='split',  # default=split. try gain
        boosting_type='gbdt',  # default=gbdt. try dart, goss, rf
        tree_learner='serial',
        num_threads=-1,
        random_state=42,
        n_estimators=10000,
        **params,
    )

    # The held-out fold is also the early-stopping set.
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=500,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        verbose=1000,
    )

    valid_pred = model.predict_proba(X_valid)[:, 1]
    test_pred = model.predict_proba(X_test)[:, 1]

    # Out-of-fold predictions keyed by id; test predictions collected per fold.
    valid_preds.update(dict(zip(valid_ids, valid_pred)))
    test_preds.append(test_pred)

    score = roc_auc_score(y_valid, valid_pred)
    scores.append(score)

print(f'Mean auc {np.mean(scores)}, std {np.std(scores)}')

# Creating submission files

In [None]:
# Average the per-fold test predictions of the level-2 model into the final submission.
# Item assignment is used because attribute assignment (`sample_solution.claim = ...`)
# neither creates nor updates a column when `claim` is not already one, which would
# leave stale values in the file. Selecting the two columns explicitly keeps the
# submission layout fixed whatever else the frame holds.
sample_solution['claim'] = np.mean(np.column_stack(test_preds), axis=1)
sample_solution[['id', 'claim']].to_csv('submission.csv', index=False)

In [None]:
# NOTE: this entire cell is commented out and is kept for reference only.
# Alternative submission: the unweighted row-wise mean of the five level-1 test
# predictions, written to 'stack_blend.csv' instead of using the level-2 model.

# features = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
# df_test = df_test[features]

# df_test['mean'] = df_test[features].mean(axis=1)

# sample_solution.claim = df_test['mean']
# sample_solution.to_csv(f'stack_blend.csv', index=False)