In [None]:
import numpy as np 
import pandas as pd 
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import os
# Print the full path of every file found anywhere under the Kaggle input
# directory, so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

In [None]:
# Training rows; the first CSV column becomes the index. Later cells read the
# `claim` (target) and `kfold` (fold label) columns from this frame.
train = pd.read_csv(
    '../input/tps-sep-train-kfold/train_10_folds.csv',
    index_col=0,
)

# Rows to predict; the first CSV column becomes the index here as well.
test = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv',
    index_col=0,
)

# Submission template, loaded with the default integer index; its `claim`
# column is overwritten with the model predictions at the end of the notebook.
sample_submission = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/sample_solution.csv'
)

In [None]:
# Add `n_missing`: the number of NaN feature values in each row.
#
# The count is restricted to feature columns so that it means the same thing
# in both frames: the target (`claim`) and fold label (`kfold`) exist only in
# `train` and must not contribute, and `n_missing` itself is excluded so that
# re-running this cell never counts the derived column.
# `errors='ignore'` lets the same call work whether or not those columns are
# present (e.g. on the first run `n_missing` does not exist yet).
train['n_missing'] = (
    train.drop(columns=['claim', 'kfold', 'n_missing'], errors='ignore')
    .isna()
    .sum(axis=1)
)
test['n_missing'] = (
    test.drop(columns=['n_missing'], errors='ignore')
    .isna()
    .sum(axis=1)
)

In [None]:
# Show the (rows, columns) shape of both frames, one per line.
print(train.shape, test.shape, sep="\n")

# Model inputs: every training column except the target and the fold label.
useful_features = [
    name for name in train.columns if name not in ("claim", "kfold")
]

# Handle on the test column index; within this notebook it is only referenced
# by a commented-out line in the imputation cell.
test_feature = test.columns

In [None]:
# Replace NaNs in the feature columns with the per-column mean.
# The means are learned from the training frame only and then applied to both
# frames, so test rows are filled with training statistics.
my_imputer = SimpleImputer(strategy='mean').fit(train[useful_features])

train[useful_features] = my_imputer.transform(train[useful_features])
test[useful_features] = my_imputer.transform(test[useful_features])

# test.columns = test_feature

In [None]:
# Preview the first five rows of the imputed test frame.
test.head(5)

In [None]:
# Cross-validated CatBoost training.
#
# For each fold id 0..9: hold the rows with that `kfold` value out for
# validation, standardise the features with statistics from the remaining
# rows only, fit a fresh classifier, score the held-out rows with ROC AUC and
# store the predicted probabilities for the test frame.

# The hyper-parameters do not depend on the fold, so the dict is built once
# instead of on every iteration.
# NOTE(review): `od_wait` configures CatBoost's overfitting detector, which
# presumably only acts when an `eval_set` is passed to `fit`. None is passed
# below, so training likely always runs all `iterations` -- confirm intended.
parameter = {
    'iterations': 15585,
    'objective': 'CrossEntropy',
    'bootstrap_type': 'Bernoulli',
    'od_wait': 1144,
    'learning_rate': 0.023575206684596582,
    'reg_lambda': 36.30433203563295,
    'random_strength': 43.75597655616195,
    'depth': 7,
    'min_data_in_leaf': 11,
    'leaf_estimation_iterations': 1,
    'subsample': 0.8227911142845009,
    'task_type': 'GPU',
    'devices': '0',
    'verbose': 0,
}

final_predictions = []  # one array per fold: column 1 of predict_proba on test
scores = []             # one validation ROC AUC per fold

for fold in range(10):
    is_valid = train['kfold'] == fold
    is_train = train['kfold'] != fold

    # Targets get a fresh 0..n-1 index so they line up with the feature
    # frames built below, which also use a default RangeIndex.
    ytrain = train.loc[is_train, 'claim'].reset_index(drop=True)
    yvalid = train.loc[is_valid, 'claim'].reset_index(drop=True)

    # Scale with statistics learned from this fold's training rows only.
    # New frames are built from the scaled arrays rather than assigning into a
    # column-sliced frame, which avoids pandas' SettingWithCopyWarning.
    my_scaler = StandardScaler()
    xtrain = pd.DataFrame(
        my_scaler.fit_transform(train.loc[is_train, useful_features]),
        columns=useful_features,
    )
    xvalid = pd.DataFrame(
        my_scaler.transform(train.loc[is_valid, useful_features]),
        columns=useful_features,
    )
    # Select exactly the trained feature columns, in training order, so the
    # model never sees a test frame whose columns differ from the fit data.
    xtest = pd.DataFrame(
        my_scaler.transform(test[useful_features]),
        columns=useful_features,
        index=test.index,
    )

    model = CatBoostClassifier(**parameter)
    model.fit(xtrain, ytrain)

    # Column 1 of predict_proba is the probability of the second class.
    preds_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]
    final_predictions.append(test_preds)

    fold_auc = roc_auc_score(yvalid, preds_valid)
    scores.append(fold_auc)
    print(fold, fold_auc)

# Mean and standard deviation of the per-fold validation AUCs.
print(np.mean(scores), np.std(scores))

In [None]:
# Blend the folds: put each fold's test predictions in its own column, then
# take the row-wise mean to get one probability per test row.
preds = np.column_stack(final_predictions).mean(axis=1)

In [None]:
# Fill the submission template with the blended predictions and save it.
# Bracket assignment is used because attribute-style assignment
# (`frame.claim = ...`) only updates a column that already exists; if the
# column were missing it would set a plain Python attribute and the CSV would
# be written without the predictions.
sample_submission["claim"] = preds
sample_submission.to_csv("submission.csv", index=False)

# Preview the first rows of what was written.
sample_submission.head()