In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Input CSV locations, relative to the notebook's working directory.
# The training file is the one later cells read the `kfold` and `claim` columns from.
TRAIN_CSV = "../input/tbs0921/train_10fold.csv"
TEST_CSV = "../input/tabular-playground-series-sep-2021/test.csv"
SAMPLE_CSV = "../input/tabular-playground-series-sep-2021/sample_solution.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)
sample = pd.read_csv(SAMPLE_CSV)

In [None]:
# Row-level summary features, computed over the raw input columns only.
# The engineered columns are excluded from the source list so that re-running
# this cell does not fold a previous run's 'n_missing'/'std' into the new values
# (on a fresh kernel the result is the same as before).
engineered = ("n_missing", "std")
useful_features = [
    col for col in test.columns
    if col != "id" and col not in engineered
]

for frame in (train, test):
    # Column selection with a list returns a new frame, so the assignments
    # below do not change what `raw` contains.
    raw = frame[useful_features]
    # Number of missing values in each row.
    frame['n_missing'] = raw.isna().sum(axis=1)
    # Per-row standard deviation across the raw features.
    frame['std'] = raw.std(axis=1)

In [None]:
# Model inputs: every column currently present in `test` except the row id
# (this picks up any columns added to `test` by earlier cells).
useful_features = [name for name in test.columns if name != "id"]
df_test = test[useful_features]

# Hyper-parameters shared by the model trained in every fold.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# appears to have no effect unless `subsample_freq` is > 0 -- confirm whether
# bagging was meant to be active before changing it (it would alter the scores).
lgbm_params = dict(
    max_depth=3,
    num_leaves=7,
    n_estimators=10000,
    colsample_bytree=0.3,
    subsample=0.5,
    random_state=41,
    reg_alpha=18,
    reg_lambda=17,
    learning_rate=0.095,
    device='gpu',
    objective='binary',
)

preds = []            # one vector of test-set scores per fold
val_score = []        # validation AUC per fold
training_score = []   # training AUC per fold

for i in range(10):
    # Rows whose `kfold` equals i are held out; the rest are used for fitting.
    held_out = train.kfold == i
    fold_train = train[~held_out].reset_index(drop=True)
    fold_valid = train[held_out].reset_index(drop=True)

    ytrain = fold_train['claim']
    yvalid = fold_valid['claim']

    # Mean imputation followed by standardisation. Both transformers are fit on
    # the training rows of this fold only, then applied to validation and test.
    imputer = SimpleImputer(strategy='mean')
    scale = StandardScaler()
    xtrain = scale.fit_transform(imputer.fit_transform(fold_train[useful_features]))
    xvalid = scale.transform(imputer.transform(fold_valid[useful_features]))
    xtest = scale.transform(imputer.transform(df_test.copy()))

    model = LGBMClassifier(**lgbm_params)

    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords were
    # removed in newer LightGBM releases in favour of callbacks -- confirm the
    # installed version before upgrading the environment.
    model.fit(
        xtrain,
        ytrain,
        eval_metric="auc",
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )

    valid_pred = model.predict_proba(xvalid)
    train_pred = model.predict_proba(xtrain)
    test_pred = model.predict_proba(xtest)

    # AUC is computed from the second predict_proba column
    # (presumably P(claim == 1) -- verify against model.classes_).
    valid_score = roc_auc_score(yvalid, valid_pred[:, 1])
    train_score = roc_auc_score(ytrain, train_pred[:, 1])
    val_score.append(valid_score)
    training_score.append(train_score)

    print(f"fold {i} | validation score: {valid_score}")
    print(f"fold {i} | training score:   {train_score}")
    print("------------------------------------------")

    preds.append(test_pred[:, 1])

# Plain arithmetic means of the per-fold AUCs.
mean_valid_auc = sum(val_score) / len(val_score)
mean_train_auc = sum(training_score) / len(training_score)
print("mean validation auc score: ", mean_valid_auc)
print("mean training auc score: ", mean_train_auc)

In [None]:
# Average the per-fold test scores row-wise (one column per fold) and write the
# submission file.
# Bracket assignment is used instead of `sample.claim = ...`: attribute-style
# assignment only updates a column if it already exists; otherwise pandas sets a
# plain Python attribute and the CSV would be written without the predictions.
fold_scores = np.column_stack(preds)
sample["claim"] = np.mean(fold_scores, axis=1)
sample.to_csv("sub5.csv", index=False)