In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from catboost import *
from sklearn.linear_model import LogisticRegression
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Input tables: the training set (its `kfold` column drives the CV loops in
# later cells) and the competition test set.
train_csv = "../input/kfolds/TPS_august_folds10.csv"
test_csv = "../input/tabular-playground-series-sep-2021/test.csv"

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Row-level summary features, added identically to train and test.
#
# The derived columns are excluded from `useful_features` so that re-running
# this cell does not feed `n_missing` / `std` back into their own computation
# (the feature list is rebuilt from `test.columns`, which already contains
# them after the first run).
derived_features = ("n_missing", "std")
useful_features = [col for col in test.columns
                   if col != "id" and col not in derived_features]

for frame in (train, test):
    # Snapshot of the raw feature columns, taken before any column is added.
    raw_values = frame[useful_features]
    # Number of missing raw feature values in each row.
    frame['n_missing'] = raw_values.isna().sum(axis=1)
    # Per-row standard deviation over the raw features (pandas skips NaN by default).
    frame['std'] = raw_values.std(axis=1)

In [None]:
#XGBClassifier
# Level-1 model 1: 10-fold XGBoost.
# Outputs: out-of-fold validation predictions -> train_pred_1.csv
#          fold-averaged test predictions     -> test_pred_1.csv
sample = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# Every test column except the row id is fed to the model.
useful_features = [col for col in test.columns if col != "id"]
df_test = test[useful_features]

# Test predictions are attached to `sample` by row position further down, so
# fail loudly if its id column does not line up with test.csv row for row.
if not np.array_equal(sample.iloc[:, 0].to_numpy(), test["id"].to_numpy()):
    raise ValueError("sample_solution.csv ids are not in the same order as test.csv")

model = XGBClassifier(max_depth=6,
                      n_estimators=5221,
                      learning_rate=0.011614449472389812,
                      gamma=0.4,
                      min_child_weight=1,
                      subsample=0.75,
                      colsample_bytree= 0.919999999,
                      reg_alpha=6.54,
                      reg_lambda=7.88,
                      n_jobs=-1,
                      random_state=41,
                      tree_method='gpu_hist',
                      predictor="gpu_predictor")

val_score = []                 # validation AUC per fold
training_score = []            # training AUC per fold
final_test_predictions = []    # one vector of test probabilities per fold
final_valid_predictions = {}   # row id -> out-of-fold probability
for i in range(10):
    # Rows tagged with fold `i` are held out; all other rows are used for fitting.
    xtrain = train[train.kfold != i].reset_index(drop=True)
    xvalid = train[train.kfold == i].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain['claim']
    yvalid = xvalid['claim']

    # Scaling statistics are learned from the training part of the fold only.
    # No imputer is applied in this cell, so missing values reach the model as-is.
    scale = StandardScaler()
    xtrain = scale.fit_transform(xtrain[useful_features])
    xvalid = scale.transform(xvalid[useful_features])
    # transform() returns a new array, so df_test needs no defensive copy.
    xtest = scale.transform(df_test)

    # Early stopping monitors AUC on the held-out fold.
    model.fit(xtrain, ytrain,
              eval_metric="auc",
              early_stopping_rounds=200,
              eval_set=[(xvalid, yvalid)],
              verbose=1000)

    # Column 1 of predict_proba is the probability of the positive class.
    valid_pred = model.predict_proba(xvalid)[:, 1]
    test_pred = model.predict_proba(xtest)[:, 1]
    train_pred = model.predict_proba(xtrain)[:, 1]

    final_test_predictions.append(test_pred)
    final_valid_predictions.update(dict(zip(valid_ids, valid_pred)))

    valid_score = roc_auc_score(yvalid, valid_pred)
    train_score = roc_auc_score(ytrain, train_pred)

    val_score.append(valid_score)
    training_score.append(train_score)

    print(f"fold {i} | validation score: {valid_score}")
    print(f"fold {i} | training score: {train_score}")

print("mean validation auc score: ",sum(val_score)/len(val_score))
print("mean training auc score: ",sum(training_score)/len(training_score))
print("----------------------------------")

# Out-of-fold predictions keyed by id; consumed later as the `pred_1` column.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]

final_valid_predictions.to_csv("train_pred_1.csv", index=False)

# Test prediction = mean of the per-fold probabilities.
sample['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample.columns = ['id', 'pred_1']

sample.to_csv("test_pred_1.csv", index=False)

In [None]:
#LGBMClassifier
# Level-1 model 2: 10-fold LightGBM.
# Outputs: out-of-fold validation predictions -> train_pred_2.csv
#          fold-averaged test predictions     -> test_pred_2.csv
sample = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# Every test column except the row id is fed to the model.
useful_features = [col for col in test.columns if col != "id"]
df_test = test[useful_features]

# Test predictions are attached to `sample` by row position further down, so
# fail loudly if its id column does not line up with test.csv row for row.
if not np.array_equal(sample.iloc[:, 0].to_numpy(), test["id"].to_numpy()):
    raise ValueError("sample_solution.csv ids are not in the same order as test.csv")

# NOTE(review): LightGBM only performs bagging when `subsample_freq` > 0
# (its default is 0), so `subsample=0.44` is probably inactive as written.
# Confirm before changing it: enabling bagging alters the fitted model.
model = LGBMClassifier(max_depth = 2,
                       num_leaves = 6,
                       n_estimators = 18837,
                       colsample_bytree = 0.49,
                       subsample = 0.44,
                       random_state = 41,
                       reg_alpha=17.45,
                       reg_lambda=18.27,
                       learning_rate = 0.09896394522048331,
                       device = 'gpu',
                       objective= 'binary')

val_score = []                 # validation AUC per fold
training_score = []            # training AUC per fold
final_test_predictions = []    # one vector of test probabilities per fold
final_valid_predictions = {}   # row id -> out-of-fold probability
for i in range(10):
    # Rows tagged with fold `i` are held out; all other rows are used for fitting.
    xtrain = train[train.kfold != i].reset_index(drop=True)
    xvalid = train[train.kfold == i].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain['claim']
    yvalid = xvalid['claim']

    # Missing values are replaced by column means learned from the training
    # part of the fold. transform() returns a new array, so df_test is passed
    # directly without a defensive copy.
    imputer = SimpleImputer(strategy='mean')
    xtrain = imputer.fit_transform(xtrain[useful_features])
    xvalid = imputer.transform(xvalid[useful_features])
    xtest = imputer.transform(df_test)

    # Scaling statistics are likewise learned from the training part only.
    scale = StandardScaler()
    xtrain = scale.fit_transform(xtrain)
    xvalid = scale.transform(xvalid)
    xtest = scale.transform(xtest)

    # Early stopping monitors AUC on the held-out fold.
    model.fit(xtrain, ytrain,
              eval_metric="auc",
              early_stopping_rounds=200,
              eval_set=[(xvalid, yvalid)],
              verbose=1000)

    # Column 1 of predict_proba is the probability of the positive class.
    valid_pred = model.predict_proba(xvalid)[:, 1]
    test_pred = model.predict_proba(xtest)[:, 1]
    train_pred = model.predict_proba(xtrain)[:, 1]

    final_test_predictions.append(test_pred)
    final_valid_predictions.update(dict(zip(valid_ids, valid_pred)))

    valid_score = roc_auc_score(yvalid, valid_pred)
    train_score = roc_auc_score(ytrain, train_pred)

    val_score.append(valid_score)
    training_score.append(train_score)

    print(f"fold {i} | validation score: {valid_score}")
    print(f"fold {i} | training score: {train_score}")

print("mean validation auc score: ",sum(val_score)/len(val_score))
print("mean training auc score: ",sum(training_score)/len(training_score))
print("----------------------------------")

# Out-of-fold predictions keyed by id; consumed later as the `pred_2` column.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]

final_valid_predictions.to_csv("train_pred_2.csv", index=False)

# Test prediction = mean of the per-fold probabilities.
sample['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample.columns = ['id', 'pred_2']

sample.to_csv("test_pred_2.csv", index=False)

In [None]:
#CatboostClassifier
# Level-1 model 3: 10-fold CatBoost.
# Outputs: out-of-fold validation predictions -> train_pred_3.csv
#          fold-averaged test predictions     -> test_pred_3.csv
sample = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# Every test column except the row id is fed to the model.
useful_features = [col for col in test.columns if col != "id"]
df_test = test[useful_features]

# Test predictions are attached to `sample` by row position further down, so
# fail loudly if its id column does not line up with test.csv row for row.
if not np.array_equal(sample.iloc[:, 0].to_numpy(), test["id"].to_numpy()):
    raise ValueError("sample_solution.csv ids are not in the same order as test.csv")

params={'depth': 4,
        'iterations': 5500,
        'learning_rate': 0.009859122495188591,
        'l2_leaf_reg': 4.604960795553966,
        'random_strength': 2.9194118955712285,
        'grow_policy': 'Depthwise',
        'leaf_estimation_method': 'Newton',
        'bootstrap_type': 'Bernoulli',
        'loss_function': 'CrossEntropy',
        'eval_metric': 'AUC',
        'task_type': 'GPU',
        'verbose': 1000}

model = CatBoostClassifier(**params)

val_score = []                 # validation AUC per fold
training_score = []            # training AUC per fold
final_test_predictions = []    # one vector of test probabilities per fold
final_valid_predictions = {}   # row id -> out-of-fold probability
for i in range(10):
    # Rows tagged with fold `i` are held out; all other rows are used for fitting.
    xtrain = train[train.kfold != i].reset_index(drop=True)
    xvalid = train[train.kfold == i].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain['claim']
    yvalid = xvalid['claim']

    # Missing values are replaced by column means learned from the training
    # part of the fold. transform() returns a new array, so df_test is passed
    # directly without a defensive copy.
    imputer = SimpleImputer(strategy='mean')
    xtrain = imputer.fit_transform(xtrain[useful_features])
    xvalid = imputer.transform(xvalid[useful_features])
    xtest = imputer.transform(df_test)

    # Scaling statistics are likewise learned from the training part only.
    scale = StandardScaler()
    xtrain = scale.fit_transform(xtrain)
    xvalid = scale.transform(xvalid)
    xtest = scale.transform(xtest)

    # The held-out fold is the evaluation set for early stopping; the metric
    # (AUC) comes from `params`.
    model.fit(xtrain, ytrain,
              early_stopping_rounds=200,
              eval_set=[(xvalid, yvalid)],
              verbose=1000)

    # Column 1 of predict_proba is the probability of the positive class.
    valid_pred = model.predict_proba(xvalid)[:, 1]
    test_pred = model.predict_proba(xtest)[:, 1]
    train_pred = model.predict_proba(xtrain)[:, 1]

    final_test_predictions.append(test_pred)
    final_valid_predictions.update(dict(zip(valid_ids, valid_pred)))

    valid_score = roc_auc_score(yvalid, valid_pred)
    train_score = roc_auc_score(ytrain, train_pred)

    val_score.append(valid_score)
    training_score.append(train_score)

    print(f"fold {i} | validation score: {valid_score}")
    print(f"fold {i} | training score: {train_score}")

print("mean validation auc score: ",sum(val_score)/len(val_score))
print("mean training auc score: ",sum(training_score)/len(training_score))
print("----------------------------------")

# Out-of-fold predictions keyed by id; consumed later as the `pred_3` column.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]

final_valid_predictions.to_csv("train_pred_3.csv", index=False)

# Test prediction = mean of the per-fold probabilities.
sample['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample.columns = ['id', 'pred_3']

sample.to_csv("test_pred_3.csv", index=False)

In [None]:
# Build the level-2 (stacking) tables: fresh copies of the raw train/test
# files with the three level-1 prediction columns joined on `id`.
train = pd.read_csv("../input/kfolds/TPS_august_folds10.csv")
test = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")

# Out-of-fold predictions for the training rows, one file per level-1 model.
oof_files = ["./train_pred_1.csv", "./train_pred_2.csv", "./train_pred_3.csv"]
df1, df2, df3 = [pd.read_csv(path) for path in oof_files]

# Fold-averaged predictions for the test rows, one file per level-1 model.
test_pred_files = ["./test_pred_1.csv", "./test_pred_2.csv", "./test_pred_3.csv"]
df_test1, df_test2, df_test3 = [pd.read_csv(path) for path in test_pred_files]

# Left joins keep every original row; each join adds one pred_N column.
for oof_frame in (df1, df2, df3):
    train = train.merge(oof_frame, on="id", how="left")

for test_pred_frame in (df_test1, df_test2, df_test3):
    test = test.merge(test_pred_frame, on="id", how="left")

train.head()

In [None]:
#Logistic Regression
# Level-2 model: a logistic regression fitted on the three level-1 prediction
# columns, cross-validated over the same 10 folds.
sample = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

useful_features = ["pred_1", "pred_2", "pred_3"]
df_test = test[useful_features]

final_predictions, val_scores, train_scores = [], [], []

for i in range(10):
    # Boolean mask selecting the rows held out in this fold.
    held_out = train.kfold == i
    xtrain = train[~held_out].reset_index(drop=True)
    xvalid = train[held_out].reset_index(drop=True)

    xtest = df_test.copy()

    # Targets are taken before the frames are narrowed to the feature columns.
    ytrain, yvalid = xtrain.claim, xvalid.claim
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    model = LogisticRegression()
    model.fit(xtrain, ytrain)

    # Keep only the positive-class probability (column 1).
    valid_pred = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]
    train_preds = model.predict_proba(xtrain)[:, 1]

    valid_score = roc_auc_score(yvalid, valid_pred)
    train_score = roc_auc_score(ytrain, train_preds)

    print(f"fold {i} | validation score: {valid_score}")
    print(f"fold {i} | training score: {train_score}")

    val_scores.append(valid_score)
    train_scores.append(train_score)
    final_predictions.append(test_preds)

print("mean validation auc score: ",sum(val_scores)/len(val_scores))
print("mean training auc score: ",sum(train_scores)/len(train_scores))

In [None]:
# Final submission: average the per-fold stacking probabilities row by row.
fold_matrix = np.column_stack(final_predictions)   # one column per fold
sample['claim'] = fold_matrix.mean(axis=1)
sample.to_csv("submission.csv", index=False)