In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from catboost import CatBoostClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc

In [2]:
# Read the labelled and unlabelled tables, both keyed by their 'id' column.
train_df, test_df = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)
# The submission template is read with pandas' default index (no index_col).
sample_sub = pd.read_csv('sample_solution.csv')

In [3]:
# Every column except the target ('claim') and the CV bookkeeping column
# ('fold') is used as a model input.
feature_cols = [c for c in train_df.columns if c not in ('claim', 'fold')]
train_df['fold'] = -1  # sentinel: row not yet assigned to a validation fold
fold_numbers = 10

# KFold.split yields *positional* row indices, whereas train_df is indexed by
# the 'id' column. Writing through label-based .loc would only be correct if
# the ids happened to be exactly 0..n-1 in file order, so assign through
# .iloc using the integer position of the 'fold' column instead.
fold_col_pos = train_df.columns.get_loc('fold')
KF = KFold(n_splits = fold_numbers, shuffle = True, random_state = 42)
for fold, (train_indices, valid_indices) in enumerate(KF.split(X = train_df)):
    train_df.iloc[valid_indices, fold_col_pos] = fold

# Each row must have landed in exactly one validation fold; a leftover -1
# would put that row into every training split and no validation split.
assert (train_df['fold'] >= 0).all(), "some rows were never assigned a fold"

In [4]:
# Preprocessing chain: fill missing values with the column mean, then
# standardise each column. The training loop refits it on each fold's
# training split.
pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ]
)

In [5]:
# Two CatBoostClassifier configurations. Both use the CrossEntropy objective
# with Bernoulli bootstrap and train on GPU device '0' with logging silenced.
# NOTE(review): the long-decimal values look like the output of an automated
# hyperparameter search -- confirm provenance before hand-editing them.

# Deeper trees (depth 7) with a small learning rate.
params = dict(
    iterations=15585,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    od_wait=1144,
    learning_rate=0.023575206684596582,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    task_type='GPU',
    devices='0',
    verbose=0,
)

# Depth-1 trees with a much larger learning rate.
params1 = dict(
    iterations=10350,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    od_wait=1490,
    learning_rate=0.38975793913029355,
    reg_lambda=71.45622917064668,
    random_strength=28.12936814985794,
    depth=1,
    min_data_in_leaf=1,
    leaf_estimation_iterations=5,
    subsample=0.8172979703145598,
    task_type='GPU',
    devices='0',
    verbose=0,
)

In [None]:
# Raw Data as input, Mean imputed
#
# K-fold training loop. For each fold: refit the preprocessing pipeline on the
# training split only, train a CatBoost model with `params1`, report ROC AUC
# on the held-out split, and collect that model's test-set probabilities.
# `test_preds` ends up with one probability vector per fold.
test_preds = []
fold_scores = []

# Select the test features once, in the same column order the pipeline is
# fitted on. Passing the whole of test_df would break (or silently misalign
# columns) if the file carried extra or differently ordered columns.
test_features = test_df[feature_cols]

for fold in range(fold_numbers):
    is_valid = train_df.fold == fold
    xtrain = train_df.loc[~is_valid]
    xval = train_df.loc[is_valid]

    ytrain = xtrain.claim
    yval = xval.claim

    xtrain = xtrain[feature_cols]
    xval = xval[feature_cols]

    # fit_transform learns imputation means and scaling statistics from this
    # fold's training rows only; validation and test data are only transformed.
    # The original row index is kept so features stay aligned with the targets.
    xtrain = pd.DataFrame(columns= feature_cols, index=xtrain.index, data=pipeline.fit_transform(xtrain))
    xval = pd.DataFrame(columns= feature_cols, index=xval.index, data=pipeline.transform(xval))

    xtest = pd.DataFrame(columns= feature_cols, index=test_features.index, data=pipeline.transform(test_features))

    model = CatBoostClassifier(**params1)

    # NOTE(review): no eval_set is passed to fit, so the `od_wait` setting in
    # the parameters presumably has no effect and every iteration is run --
    # confirm whether early stopping on (xval, yval) was intended.
    model.fit(xtrain, ytrain)
    predval = model.predict_proba(xval)[:,1]

    fpr, tpr, _ = roc_curve(yval, predval)
    score = auc(fpr, tpr)
    fold_scores.append(score)

    print("Fold : {} Score : {}".format(fold, score))
    print('--'*18)

    test_pred = model.predict_proba(xtest)[:,1]
    test_preds.append(test_pred)

# Summarise cross-validation performance across all folds.
print("Mean Score : {} (std {})".format(np.mean(fold_scores), np.std(fold_scores)))

Fold : 0 Score : 0.7925703935013715
------------------------------------
Fold : 1 Score : 0.7932096978617517
------------------------------------


In [None]:
# Build the submission: average the per-fold test probabilities row-wise and
# write them into the 'claim' column of the submission template.
if not test_preds:
    raise ValueError("test_preds is empty; run the training loop before building the submission")

mean_test_pred = np.mean(np.column_stack(test_preds), axis = 1)

# NOTE(review): predictions follow test_df's row order; this assumes
# sample_solution.csv lists rows in that same order -- confirm.
if len(mean_test_pred) != len(sample_sub):
    raise ValueError(
        "prediction count ({}) does not match submission rows ({})".format(
            len(mean_test_pred), len(sample_sub)
        )
    )

# Bracket assignment always writes a real column. Attribute-style assignment
# (sample_sub.claim = ...) would only set a Python attribute if the column
# were missing, leaving the CSV without predictions.
sample_sub['claim'] = mean_test_pred
sample_sub.to_csv('submission.csv', index = False)