In [None]:
# import necessary libraries
import numpy as np
import pandas as pd
import optuna

from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn import impute

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')

## Create train folds

In [None]:
from pathlib import Path

# Folder holding the competition files (train.csv, test.csv, sample_solution.csv),
# expressed relative to the notebook's working directory.
data_dir = Path('..') / 'input' / 'tabular-playground-series-sep-2021'

In [None]:
# Uncomment this cell to generate your own stratified folds (it writes train_stratfold.csv)
# df_train = pd.read_csv(data_dir / 'train.csv')
# skf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42) #create 10 folds, modify n_splits to change the number of folds

# df_train['kfold'] = -1

# for fold, (train_idx, valid_idx) in enumerate(skf.split(X=df_train, y=df_train['claim'])):
#     df_train.loc[valid_idx, 'kfold'] = fold

# df_train.to_csv('train_stratfold.csv', index=False)

In [None]:
# Training data that already carries a `kfold` column, plus the competition
# test set and the sample submission.
train_path = '../input/tpssept2021skfold10/train_stratfold.csv'
df = pd.read_csv(train_path)
df_test = pd.read_csv(data_dir / 'test.csv')
submission = pd.read_csv(data_dir / 'sample_solution.csv')

# Every column except the row id, the target and the fold label is a feature.
non_feature_cols = ['id', 'claim', 'kfold']
useful_features = [col for col in df.columns if col not in non_feature_cols]

# Restrict the test frame to the feature columns, in training-column order.
df_test = df_test[useful_features]

In [None]:
# Count the missing values in every column. The XGBoost and CatBoost cells below
# fill NaNs with the constant -999 (plus missing-indicator columns); the LightGBM
# cell passes the features through without imputation. Try other imputation
# strategies and check whether the score improves.
df.isna().sum()

## Train model

In [None]:
# Model 1 - XGBClassifier with shallow (depth-2) trees
# Expects `df` (training rows with a `kfold` column) and `df_test` from the
# data-loading cell; only the sample submission is re-read here.
submission = pd.read_csv(data_dir/'sample_solution.csv')

useful_features = [c for c in df.columns if c not in ['id', 'claim', 'kfold']]
df_test = df_test[useful_features]

final_test_predictions = []   # one vector of test probabilities per fold
final_valid_predictions = {}  # row id -> out-of-fold probability
scores = []                   # validation AUC of each fold

for fold in range(10):
    # Rows labelled with the current fold are held out; all others are used for fitting.
    held_out = df.kfold == fold
    fold_train = df[~held_out].reset_index(drop=True)
    fold_valid = df[held_out].reset_index(drop=True)

    valid_ids = fold_valid.id.values.tolist()
    ytrain, yvalid = fold_train['claim'], fold_valid['claim']

    # Replace NaNs with the constant -999 and add missing-indicator columns.
    # The imputer is fitted on the training part of the fold only.
    imputer = impute.SimpleImputer(
        missing_values=np.nan,
        strategy='constant',
        fill_value=-999,
        add_indicator=True,
    )
    xtrain = imputer.fit_transform(fold_train[useful_features])
    xvalid = imputer.transform(fold_valid[useful_features])
    xtest = imputer.transform(df_test)

    # Standardise using statistics of the training part of the fold.
    scaler = preprocessing.StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xvalid = scaler.transform(xvalid)
    xtest = scaler.transform(xtest)

    model = XGBClassifier(
        objective='binary:logistic',
        n_estimators=10000,
        max_depth=2,
        random_state=0,
        use_label_encoder=False,
        tree_method='gpu_hist',  # Use GPU acceleration
        gpu_id=0,
        predictor='gpu_predictor',
    )
    # n_estimators is only an upper bound: training stops once the validation
    # AUC has not improved for 300 rounds.
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        eval_metric='auc',
        early_stopping_rounds=300,
        verbose=1000,
    )

    preds_valid = model.predict_proba(xvalid)[:, 1]
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    final_test_predictions.append(model.predict_proba(xtest)[:, 1])
    scores.append(roc_auc_score(yvalid, preds_valid))

# Mean and standard deviation of the per-fold validation AUC.
print(np.mean(scores), np.std(scores))

# Out-of-fold predictions, saved as level-0 feature `pred_1` for the stacking step.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_1.csv", index=False)

# Test predictions: average of the ten fold models.
submission['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
submission.columns = ["id", "pred_1"]
submission.to_csv("test_pred_1.csv", index=False)

In [None]:
# Model 2 - XGBClassifier with a fixed set of tuned hyper-parameters
# Expects `df` (training rows with a `kfold` column) and `df_test` from the
# data-loading cell; only the sample submission is re-read here.
submission = pd.read_csv(data_dir/'sample_solution.csv')

useful_features = [c for c in df.columns if c not in ['id', 'claim', 'kfold']]
df_test = df_test[useful_features]

final_test_predictions = []   # one vector of test probabilities per fold
final_valid_predictions = {}  # row id -> out-of-fold probability
scores = []                   # validation AUC of each fold

# Identical for every fold, so defined once outside the loop.
params = {
    'lambda': 0.004562711234493688,
    'alpha': 7.268146704546314,
    'colsample_bytree': 0.6468987558386358,
    'colsample_bynode': 0.29113878257290376,
    'colsample_bylevel': 0.8915913499148167,
    'subsample': 0.37130229826185135,
    'learning_rate': 0.021671163563123198,
    'grow_policy': 'lossguide',
    'max_depth': 18,
    'min_child_weight': 215,
    'max_bin': 272,
}

for fold in range(10):
    # Rows labelled with the current fold are held out; all others are used for fitting.
    held_out = df.kfold == fold
    fold_train = df[~held_out].reset_index(drop=True)
    fold_valid = df[held_out].reset_index(drop=True)

    valid_ids = fold_valid.id.values.tolist()
    ytrain, yvalid = fold_train['claim'], fold_valid['claim']

    # Replace NaNs with the constant -999 and add missing-indicator columns.
    # The imputer is fitted on the training part of the fold only.
    imputer = impute.SimpleImputer(
        missing_values=np.nan,
        strategy='constant',
        fill_value=-999,
        add_indicator=True,
    )
    xtrain = imputer.fit_transform(fold_train[useful_features])
    xvalid = imputer.transform(fold_valid[useful_features])
    xtest = imputer.transform(df_test)

    # Standardise using statistics of the training part of the fold.
    scaler = preprocessing.StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xvalid = scaler.transform(xvalid)
    xtest = scaler.transform(xtest)

    model = XGBClassifier(
        **params,
        objective='binary:logistic',
        n_estimators=10000,
        random_state=0,
        use_label_encoder=False,
        tree_method='gpu_hist',  # Use GPU acceleration
        gpu_id=0,
        predictor='gpu_predictor',
    )
    # n_estimators is only an upper bound: training stops once the validation
    # AUC has not improved for 300 rounds.
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        eval_metric='auc',
        early_stopping_rounds=300,
        verbose=1000,
    )

    preds_valid = model.predict_proba(xvalid)[:, 1]
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    final_test_predictions.append(model.predict_proba(xtest)[:, 1])
    scores.append(roc_auc_score(yvalid, preds_valid))

# Mean and standard deviation of the per-fold validation AUC.
# Values noted in the original cell (presumably from an earlier run):
# 0.8038324284174964 0.0011344720245805547
print(np.mean(scores), np.std(scores))

# Out-of-fold predictions, saved as level-0 feature `pred_2` for the stacking step.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("train_pred_2.csv", index=False)

# Test predictions: average of the ten fold models.
submission['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
submission.columns = ["id", "pred_2"]
submission.to_csv("test_pred_2.csv", index=False)

In [None]:
# Model 3 - CatBoostClassifier
# Expects `df` (training rows with a `kfold` column) and `df_test` from the
# data-loading cell; only the sample submission is re-read here.
submission = pd.read_csv(data_dir/'sample_solution.csv')

useful_features = [c for c in df.columns if c not in ['id', 'claim', 'kfold']]
df_test = df_test[useful_features]

final_test_predictions = []   # one vector of test probabilities per fold
final_valid_predictions = {}  # row id -> out-of-fold probability
scores = []                   # validation AUC of each fold

# Identical for every fold, so defined once outside the loop.
params = {
    'verbose': 0,
    'n_estimators': 10000,
    'max_depth': 6,
    'learning_rate': 0.04,
    "grow_policy": "SymmetricTree",
    "l2_leaf_reg": 3.0,
    "random_strength": 1.0,
    'task_type': 'GPU',
}

for fold in range(10):
    # Rows labelled with the current fold are held out; all others are used for fitting.
    held_out = df.kfold == fold
    fold_train = df[~held_out].reset_index(drop=True)
    fold_valid = df[held_out].reset_index(drop=True)

    valid_ids = fold_valid.id.values.tolist()
    ytrain, yvalid = fold_train['claim'], fold_valid['claim']

    # Replace NaNs with the constant -999 and add missing-indicator columns.
    # The imputer is fitted on the training part of the fold only.
    imputer = impute.SimpleImputer(
        missing_values=np.nan,
        strategy='constant',
        fill_value=-999,
        add_indicator=True,
    )
    xtrain = imputer.fit_transform(fold_train[useful_features])
    xvalid = imputer.transform(fold_valid[useful_features])
    xtest = imputer.transform(df_test)

    # Standardise using statistics of the training part of the fold.
    scaler = preprocessing.StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xvalid = scaler.transform(xvalid)
    xtest = scaler.transform(xtest)

    # Define the model; n_estimators is only an upper bound because fitting
    # uses early stopping (300 rounds) on the held-out fold.
    model = CatBoostClassifier(**params, random_state=1, eval_metric='AUC')
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        early_stopping_rounds=300,
        verbose=1000,
    )

    preds_valid = model.predict_proba(xvalid)[:, 1]
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    final_test_predictions.append(model.predict_proba(xtest)[:, 1])
    scores.append(roc_auc_score(yvalid, preds_valid))

# Mean and standard deviation of the per-fold validation AUC.
print(np.mean(scores), np.std(scores))

# Out-of-fold predictions, saved as level-0 feature `pred_3` for the stacking step.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ['id', 'pred_3']
final_valid_predictions.to_csv('train_pred_3.csv', index=False)

# Test predictions: average of the ten fold models.
submission['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
submission.columns = ['id', 'pred_3']
submission.to_csv('test_pred_3.csv', index=False)

In [None]:
# Model 4 - LightGBM
# Expects `df` (training rows with a `kfold` column) and `df_test` from the
# data-loading cell; only the sample submission is re-read here.
submission = pd.read_csv(data_dir/'sample_solution.csv')

useful_features = [c for c in df.columns if c not in ['id', 'claim', 'kfold']]
df_test = df_test[useful_features]

# The fold file holds 10 folds, and the other level-0 models iterate over all
# of them. Looping over fewer folds leaves the remaining rows without an
# out-of-fold prediction, i.e. NaN `pred_4` values in the stacked training set.
N_FOLDS = 10

final_test_predictions = []   # one vector of test probabilities per fold
final_valid_predictions = {}  # row id -> out-of-fold probability
scores = []                   # validation AUC of each fold

# Identical for every fold, so defined once outside the loop.
params = {
    'device_type': 'gpu',  # Use GPU acceleration
    'gpu_device_id': 0,
    'gpu_platform_id': 0,
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 150,
    'learning_rate': 0.001921842542288596,
    'max_depth': 1,
    'reg_alpha': 1.9202032662611252e-08,
    'reg_lambda': 0.00024793231855030215,
    'min_child_weight': 0.9162585787332472,
    'n_estimators': 10000,
}

for fold in range(N_FOLDS):
    # Rows labelled with the current fold are held out; all others are used for fitting.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain['claim']
    yvalid = xvalid['claim']

    # Unlike the other level-0 models, the features are passed to LightGBM
    # without imputation or scaling.
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Define the model; the seed varies with the fold, and n_estimators is only
    # an upper bound because fitting uses early stopping on the held-out fold.
    model = LGBMClassifier(**params, random_state=fold)
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(df_test)[:, 1]
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    auc = roc_auc_score(yvalid, preds_valid)
    scores.append(auc)

# Mean and standard deviation of the per-fold validation AUC.
print(np.mean(scores), np.std(scores))

# Every training row must have received exactly one out-of-fold prediction,
# otherwise the stacking step would see missing `pred_4` values.
assert len(final_valid_predictions) == len(df), (
    "out-of-fold predictions do not cover every training row; "
    "check N_FOLDS against the values in df.kfold"
)

# Out-of-fold predictions, saved as level-0 feature `pred_4` for the stacking step.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ['id', 'pred_4']
final_valid_predictions.to_csv('train_pred_4.csv', index=False)

# Test predictions: average of the fold models.
submission['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
submission.columns = ['id', 'pred_4']
submission.to_csv('test_pred_4.csv', index=False)

In [None]:
## Stack train and test oof predictions
# Start from freshly read inputs so that re-running this cell never merges the
# prediction columns a second time.
df = pd.read_csv('../input/tpssept2021skfold10/train_stratfold.csv')
df_test = pd.read_csv(data_dir/'test.csv')
submission = pd.read_csv(data_dir/'sample_solution.csv')

# Attach the predictions of each level-0 model as columns pred_1 ... pred_4.
# Left joins on `id` keep every original row even if a prediction is missing.
for model_no in range(1, 5):
    train_preds = pd.read_csv(f"train_pred_{model_no}.csv")
    test_preds = pd.read_csv(f"test_pred_{model_no}.csv")
    df = df.merge(train_preds, on="id", how="left")
    df_test = df_test.merge(test_preds, on="id", how="left")

# Persist the stacked frames so the meta-model step can be resumed from disk.
df.to_csv('level0_train_fold.csv', index=False)
df_test.to_csv('level0_test.csv', index=False)

# Last expression of the cell, so the preview is actually displayed.
df.head()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Final predictions: a logistic-regression meta-model on the level-0 predictions
# Expects `df` and `df_test` as produced by the stacking cell (original columns
# plus pred_1 ... pred_4); only the sample submission is re-read here.
submission = pd.read_csv(data_dir/'sample_solution.csv')

useful_features = ['pred_1', 'pred_2', 'pred_3', 'pred_4']
df_test = df_test[useful_features]

final_predictions = []  # one vector of test probabilities per fold
scores = []             # validation AUC of each fold

for fold in range(10):
    # Rows labelled with the current fold are held out; all others are used for fitting.
    held_out = df.kfold == fold
    fold_train = df[~held_out].reset_index(drop=True)
    fold_valid = df[held_out].reset_index(drop=True)

    ytrain = fold_train['claim']
    yvalid = fold_valid['claim']

    # Fill any missing level-0 predictions; the imputer is fitted on the
    # training part of the fold only. The transformed arrays are used directly
    # instead of being written back into column-sliced DataFrames, which avoids
    # pandas' SettingWithCopyWarning while producing the same values.
    imputer = IterativeImputer(random_state=0)
    xtrain = imputer.fit_transform(fold_train[useful_features])
    xvalid = imputer.transform(fold_valid[useful_features])
    xtest = imputer.transform(df_test)

    # NOTE(review): the meta-model is fitted without an intercept - confirm
    # this is intentional.
    model = LogisticRegression(fit_intercept=False)
    model.fit(xtrain, ytrain)

    preds_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]
    final_predictions.append(test_preds)
    auc = roc_auc_score(yvalid, preds_valid)
    print(fold, auc)
    scores.append(auc)

# Mean and standard deviation of the per-fold validation AUC.
print(np.mean(scores), np.std(scores))

In [None]:
# Blend: average the test probabilities of the per-fold meta-models row by row
# (one column per fold) and write the final submission file.
fold_columns = np.column_stack(final_predictions)
submission['claim'] = fold_columns.mean(axis=1)
submission.to_csv('submission_blend.csv', index=False)

In [None]:
!pip install chime --quiet

import chime
import time
chime.success()
time.sleep(0.5)
chime.success()