In [None]:
# /kaggle/input/tabular-playground-series-sep-2021/train.csv
# /kaggle/input/tabular-playground-series-sep-2021/test.csv
# /kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv

#!pip install klib

In [None]:
import pandas as pd
import numpy as np
import klib
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgbm

In [None]:
# Input locations for the competition data, kept in one place so they are easy to repoint.
TRAIN_CSV = '/kaggle/input/tabular-playground-series-sep-2021/train.csv'
TEST_CSV = '/kaggle/input/tabular-playground-series-sep-2021/test.csv'
SAMPLE_SUBMISSION_CSV = '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv'

# Load the training set, the test set and the submission template.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)
submission = pd.read_csv(SAMPLE_SUBMISSION_CSV)

In [None]:
# Run klib's default cleaning on both frames (training set first, then test set).
# NOTE(review): if klib's defaults drop any rows from `test`, the later assignment of
# predictions into `submission` would no longer line up row-for-row — confirm.
train, test = (klib.data_cleaning(frame) for frame in (train, test))

## Feature Engineering

In [None]:
# Separate the target from the predictors. 'id' is removed from both frames so that
# it is never fed to the models as a feature.
y = train['claim']
X = train.drop(columns=['claim', 'id'])
test.drop(columns=['id'], inplace=True)

# Column index of the predictors, reused by the scaling and imputation steps.
features = X.columns

In [None]:
from sklearn.preprocessing import RobustScaler

# Learn the scaling statistics from the training features only, then apply the same
# fitted transform to both the training and the test features.
scaler = RobustScaler().fit(X)

X[features] = scaler.transform(X)
test[features] = scaler.transform(test)

### Filling missing values

In [None]:
# Impute missing values with per-feature medians computed on the training set only,
# and reuse those same medians for the test set.
#
# The result is assigned back explicitly instead of calling
# `X[feat].fillna(..., inplace=True)`: that form is chained in-place assignment, which
# emits a FutureWarning on pandas 2.x and silently leaves the parent frame untouched
# once Copy-on-Write is enabled.
train_medians = X[features].median()

X[features] = X[features].fillna(train_medians)
test[features] = test[features].fillna(train_medians)

In [None]:
# Preview the first five rows of the processed training features.
X.head(n=5)

In [None]:
# Feature selection

# from sklearn.feature_selection import SelectKBest, f_classif
# feat_selector = SelectKBest(f_classif, k=100) # k = hyperparameter
# _ = feat_selector.fit(X, y)

# selected_features = features[feat_selector.get_support()]

# X = X[selected_features]
# test = test[selected_features]

## Modeling

In [None]:
def train_lgbm(X_train, X_val, y_train, y_val):
    """Train a LightGBM binary classifier with early stopping on a validation fold.

    Parameters
    ----------
    X_train, y_train
        Features and labels the booster is fitted on.
    X_val, y_val
        Held-out features and labels; used only to monitor AUC and to trigger
        early stopping.

    Returns
    -------
    The booster returned by ``lgbm.train``. Callers read ``best_iteration`` from it
    to predict with the early-stopped number of trees.

    Notes
    -----
    Early stopping and periodic logging are configured through callbacks rather
    than the ``early_stopping_rounds=`` / ``verbose_eval=`` keyword arguments,
    because those keywords were removed from ``lightgbm.train`` in LightGBM 4.0.
    ``lgbm.log_evaluation`` requires LightGBM >= 3.3.
    """
    lgb_train = lgbm.Dataset(X_train, label=y_train)
    # `reference` ties the validation set to the training set's feature binning.
    lgb_eval = lgbm.Dataset(X_val, label=y_val, reference=lgb_train)

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': 'true',
        'boosting': 'gbdt',
        'num_leaves': 31,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 20,
        'learning_rate': 0.05,
        'verbose': 0,
        'device': 'gpu'
    }

    callbacks = [
        # Stop once validation AUC has not improved for 100 consecutive rounds.
        lgbm.early_stopping(stopping_rounds=100),
        # Print the validation metric every 100 rounds.
        lgbm.log_evaluation(period=100),
    ]

    model = lgbm.train(
        params,
        lgb_train,
        num_boost_round=2000,
        valid_sets=[lgb_eval],
        callbacks=callbacks)

    return model

In [None]:
from catboost import CatBoostClassifier
def train_catboost(X_train, X_val, y_train, y_val):
    """Fit a GPU CatBoostClassifier on one fold, using the validation fold as eval set.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_val, y_val
        Held-out features and labels, passed to ``fit`` as ``eval_set`` together
        with ``early_stopping_rounds=100``.

    Returns
    -------
    The fitted ``CatBoostClassifier``.
    """
    model_options = dict(
        iterations=1000,
        task_type="GPU",
        random_seed=42,
        learning_rate=0.2,
        # NOTE(review): AUC is requested as an additional reported metric here; which
        # metric actually drives early stopping is not set explicitly — confirm.
        custom_loss=['AUC'],
    )
    fit_options = dict(
        eval_set=(X_val, y_val),
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    clf = CatBoostClassifier(**model_options)
    clf.fit(X_train, y_train, **fit_options)
    return clf

In [None]:
from xgboost import XGBClassifier
def train_xgboost(X_train, X_val, y_train, y_val):
    """Fit a GPU XGBClassifier on one fold and monitor AUC on the validation fold.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_val, y_val
        Held-out features and labels. They are passed as ``eval_set`` so that the
        configured ``eval_metric='auc'`` is actually evaluated and logged during
        training, in line with the LightGBM and CatBoost trainers.

    Returns
    -------
    The fitted ``XGBClassifier``.

    Notes
    -----
    No early stopping is configured, so all ``n_estimators`` trees are always
    trained; supplying the evaluation set only adds progress logging and does not
    change the fitted model.
    """
    clf = XGBClassifier(n_estimators=500,
                        use_label_encoder=False,
                        learning_rate=0.2,
                        tree_method='gpu_hist',
                        gpu_id=0,
                        eval_metric='auc')

    # verbose=100 prints the validation metric every 100 boosting rounds.
    clf.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=100)
    return clf

In [None]:
# Number of cross-validation folds; also the divisor used when averaging test predictions.
K = 3

# Stratified, shuffled folds with a fixed integer seed, so each model loop that calls
# split.split(X, y) iterates over the same partitions.
split = StratifiedKFold(n_splits=K, shuffle=True, random_state=1)

## LGBM

In [None]:
# Accumulator for the fold-averaged LightGBM predictions on the test set.
lgbm_submit_preds = np.zeros(len(test))

for i, (train_idx, val_idx) in enumerate(split.split(X, y)):
    # Slice out this fold's training and validation rows.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = train_lgbm(X_train, X_val, y_train, y_val)

    # Predict with the iteration count selected by early stopping.
    best_iter = model.best_iteration
    val_preds = model.predict(X_val, num_iteration=best_iter)
    test_preds = model.predict(test, num_iteration=best_iter)

    val_auc = roc_auc_score(y_val, val_preds)
    print(f'\nAUC score for validation set is {val_auc}\n')

    # Each fold contributes 1/K of the final test prediction.
    lgbm_submit_preds = lgbm_submit_preds + test_preds / K

# Drop the reference to the last fold's model.
del model

## CatBoost

In [None]:
# Accumulator for the fold-averaged CatBoost predictions on the test set.
cat_submit_preds = np.zeros(len(test))

for i, (train_idx, val_idx) in enumerate(split.split(X, y)):
    # Slice out this fold's training and validation rows.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = train_catboost(X_train, X_val, y_train, y_val)

    # Keep only the second probability column (index 1) as the score.
    val_preds = model.predict_proba(X_val)[:, 1]
    test_preds = model.predict_proba(test)[:, 1]

    val_auc = roc_auc_score(y_val, val_preds)
    print(f'\nAUC score for validation set is {val_auc}\n')

    # Each fold contributes 1/K of the final test prediction.
    cat_submit_preds = cat_submit_preds + test_preds / K

# Drop the reference to the last fold's model.
del model

## XGBoost

In [None]:
# Accumulator for the fold-averaged XGBoost predictions on the test set.
xgb_submit_preds = np.zeros(len(test))

for i, (train_idx, val_idx) in enumerate(split.split(X, y)):
    # Slice out this fold's training and validation rows.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = train_xgboost(X_train, X_val, y_train, y_val)

    # Keep only the second probability column (index 1) as the score.
    val_preds = model.predict_proba(X_val)[:, 1]
    test_preds = model.predict_proba(test)[:, 1]

    val_auc = roc_auc_score(y_val, val_preds)
    print(f'\nAUC score for validation set is {val_auc}\n')

    # Each fold contributes 1/K of the final test prediction.
    xgb_submit_preds = xgb_submit_preds + test_preds / K

# Drop the reference to the last fold's model.
del model

## Submission

In [None]:
# Blend the three models with an unweighted arithmetic mean of their test predictions.
blend_total = lgbm_submit_preds + cat_submit_preds + xgb_submit_preds
final_preds = blend_total / 3

# Write the blended scores into the submission template and save it.
submission['claim'] = final_preds
submission.to_csv("output_normal.csv", index=False)

In [None]:
# Alternative blend: unweighted mean of the squared predictions of the three models.
squared_total = (
    np.square(lgbm_submit_preds)
    + np.square(cat_submit_preds)
    + np.square(xgb_submit_preds)
)
final_preds2 = squared_total / 3

# Overwrite the 'claim' column with this blend and save it under a separate file name.
submission['claim'] = final_preds2
submission.to_csv("output_powerof2.csv", index=False)