In [None]:
import numpy as np
import pandas as pd
import optuna

from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
from pathlib import Path

# Directory holding the competition's test rows and sample-submission file.
data_dir = Path('../input/tabular-playground-series-sep-2021/')
test_csv = data_dir / 'test.csv'
sample_csv = data_dir / 'sample_solution.csv'

# Test features and the submission template come from the competition folder.
df_test = pd.read_csv(test_csv)
submission = pd.read_csv(sample_csv)

# The training table is read from a separate dataset; later cells rely on its
# `id`, `claim` and `kfold` columns.
df = pd.read_csv('../input/sep-tsp-folds/train_folds.csv')



In [None]:
# Number of missing values in each column of the training table
# (left as the cell's last expression so the notebook displays it).
df.isnull().sum(axis=0)

In [None]:
# --- Level-1 model "pred_1": XGBoost out-of-fold (OOF) and test predictions ---
# The inputs are re-read here so the cell can be re-run on its own: further
# down it narrows `df_test` to the feature columns and renames the columns of
# `submission`, so it must always start from the files on disk.
df = pd.read_csv('../input/sep-tsp-folds/train_folds.csv')
df_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

# Every column except the row id, the target and the fold label is a feature.
useful_features = [c for c in df.columns if c not in ['id', 'claim', 'kfold']]
df_test = df_test[useful_features]

# The test matrix is the same for every fold and is only passed to
# `predict_proba`, so bind it once instead of deep-copying it per fold.
xtest = df_test

# Take the fold labels from the data instead of assuming exactly five folds,
# so every labelled row of the folds file ends up with an OOF prediction.
fold_ids = sorted(df['kfold'].unique())

final_test_predictions = []   # one array of test probabilities per fold
final_valid_predictions = {}  # row id -> OOF probability of the positive class
scores = []                   # validation AUC per fold

for fold in fold_ids:
    # Rows labelled `fold` are held out for validation; the rest are trained on.
    is_valid = df['kfold'] == fold
    train_part = df[~is_valid].reset_index(drop=True)
    valid_part = df[is_valid].reset_index(drop=True)

    valid_ids = valid_part['id'].values.tolist()

    ytrain = train_part['claim']
    yvalid = valid_part['claim']

    xtrain = train_part[useful_features]
    xvalid = valid_part[useful_features]

    # NOTE(review): the hyper-parameter values look like the output of a search
    # (optuna is imported at the top of the notebook) -- confirm their origin.
    model = XGBClassifier(
        colsample_bytree=0.500798910783641,
        subsample=0.2682967088853332,
        learning_rate=0.01187431306013263,
        max_depth=2,
        min_child_weight=272,
        n_estimators=10000,  # upper bound; early stopping below ends training sooner
        random_state=0,
        use_label_encoder=False,
        objective='binary:logistic',
        tree_method='gpu_hist',  # Use GPU acceleration
        gpu_id=0,
        predictor='gpu_predictor',
    )

    # Monitor AUC on the held-out fold, stop after 300 rounds without
    # improvement, and log the metric every 1000 rounds.
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_metric='auc',
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )

    # Column 1 of `predict_proba` is the probability of the positive class.
    preds_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]

    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    auc = roc_auc_score(yvalid, preds_valid)
    scores.append(auc)

# Mean and standard deviation of the per-fold validation AUC.
print(np.mean(scores), np.std(scores))

# Persist the OOF predictions as an (id, pred_1) table.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_1.csv", index=False)

# Test prediction = per-row average over the fold models. The two-name column
# assignment below only works if `claim` is already the second (and last)
# column of the sample-submission file.
submission['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
submission.columns = ["id", "pred_1"]
submission.to_csv("test_pred_1.csv", index=False)

In [None]:
# --- Level-1 model "pred_2": CatBoost out-of-fold (OOF) and test predictions ---
# The inputs are re-read here so the cell can be re-run on its own: further
# down it narrows `df_test` to the feature columns and renames the columns of
# `submission`, so it must always start from the files on disk.
df = pd.read_csv('../input/sep-tsp-folds/train_folds.csv')
df_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

# Every column except the row id, the target and the fold label is a feature.
useful_features = [c for c in df.columns if c not in ['id', 'claim', 'kfold']]
df_test = df_test[useful_features]

# The test matrix is the same for every fold and is only passed to
# `predict_proba`, so bind it once instead of deep-copying it per fold.
xtest = df_test

# Take the fold labels from the data instead of assuming exactly five folds,
# so every labelled row of the folds file ends up with an OOF prediction.
fold_ids = sorted(df['kfold'].unique())

final_test_predictions = []   # one array of test probabilities per fold
final_valid_predictions = {}  # row id -> OOF probability of the positive class
scores = []                   # validation AUC per fold

for fold in fold_ids:
    # Rows labelled `fold` are held out for validation; the rest are trained on.
    is_valid = df['kfold'] == fold
    train_part = df[~is_valid].reset_index(drop=True)
    valid_part = df[is_valid].reset_index(drop=True)

    valid_ids = valid_part['id'].values.tolist()

    ytrain = train_part['claim']
    yvalid = valid_part['claim']

    xtrain = train_part[useful_features]
    xvalid = valid_part[useful_features]

    params = {
        'verbose': 0,
        'n_estimators': 10000,  # upper bound; early stopping below ends training sooner
        'max_depth': 6,
        'learning_rate': 0.04,
        "grow_policy": "SymmetricTree",
        "l2_leaf_reg": 3.0,
        "random_strength": 1.0,
        'task_type': 'GPU',
    }

    # Define the model
    model = CatBoostClassifier(**params, random_state=1, eval_metric='AUC')

    # Monitor the eval metric on the held-out fold and stop after 300 rounds
    # without improvement; `verbose=1000` is passed to `fit` in addition to
    # the `'verbose': 0` constructor setting above.
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )

    # Column 1 of `predict_proba` is the probability of the positive class.
    preds_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]

    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    auc = roc_auc_score(yvalid, preds_valid)
    scores.append(auc)

# Mean and standard deviation of the per-fold validation AUC.
print(np.mean(scores), np.std(scores))

# Persist the OOF predictions as an (id, pred_2) table.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("train_pred_2.csv", index=False)

# Test prediction = per-row average over the fold models. The two-name column
# assignment below only works if `claim` is already the second (and last)
# column of the sample-submission file.
submission['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
submission.columns = ["id", "pred_2"]
submission.to_csv("test_pred_2.csv", index=False)