# VotingClassifier over a single CatBoost configuration (14 random seeds)



## Importing the required libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt

import os

In [None]:
import copy
import time
import random

import warnings

from sklearn.preprocessing import RobustScaler    # FOr scaling the data

from catboost import CatBoostClassifier           # Model for classification

from sklearn.ensemble import VotingClassifier     # combine conceptually different machine learning classifiers and use a majority vote or the
                                                  # average predicted probabilities (soft vote) to predict the class labels.

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def ht(df, n=2):
    """Show the first and last ``n`` rows of ``df``, followed by its shape.

    Args:
        df: Object exposing ``head``, ``tail`` and ``shape`` (for example a
            pandas DataFrame).
        n: Number of rows to show from each end. Defaults to 2.
    """
    try:
        # ``display`` is not imported in this file; presumably it is the
        # helper that a notebook session makes available -- confirm.
        show = display
    except NameError:
        # Outside such a session the name does not exist; fall back to plain
        # printing instead of failing.
        show = print
    show(df.head(n))
    show(df.tail(n))
    show(df.shape)
    
target = 'claim'

In [None]:
# Load the training and test sets from CSV files (paths are relative to the
# current working directory); the training file is read first.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

## Splitting the data into independent and dependent variables

In [None]:
# Keep an independent copy of the label column.
y = train.loc[:, target].copy()

# Predictor names: every column of ``train`` except the row id and the label.
# A ValueError is raised if either of the two names is absent.
features = list(train.columns)
del features[features.index('id')]
del features[features.index(target)]

## Preprocessing

In [None]:
# Row-wise summary features, computed before imputation so that they still
# reflect the missing values of each row.
train['n_missing'] = train[features].isna().sum(axis=1)
test['n_missing'] = test[features].isna().sum(axis=1)

train['std'] = train[features].std(axis=1)
test['std'] = test[features].std(axis=1)

# Copy of the per-row missing counts of the training set (taken before the
# column is scaled below).
n_missing = train['n_missing'].copy()

# Impute BOTH sets with the training-column means. The scaler below is fitted
# on the training set only; using the same training statistics here gives the
# test set exactly the same transformation as the training set.
train_means = train[features].mean()
train[features] = train[features].fillna(train_means)
test[features] = test[features].fillna(train_means)

# From here on the two engineered columns are treated as regular features.
features += ['n_missing', 'std']

# Fit the scaler on the training set and reuse its statistics for the test set.
scaler = RobustScaler()
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])

# Last expression of the cell: evaluates to the pair of frame shapes.
train.shape, test.shape

((957919, 122), (493474, 121))

## Creating the classifiers

In [None]:
# Keyword arguments unpacked into every CatBoostClassifier built in this file.
catb_params = dict(
    eval_metric='AUC',
    iterations=15585,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    od_wait=1144,
    learning_rate=0.023575206684596582,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    devices='0',
    verbose=0,
)

In [None]:
# Fourteen classifiers that share ``catb_params`` and differ only in their
# ``random_state``; they are created in the order of the seeds listed below.
(
    cat_clf0, cat_clf1, cat_clf2, cat_clf3, cat_clf4, cat_clf5, cat_clf6,
    cat_clf7, cat_clf8, cat_clf9, cat_clf10, cat_clf11, cat_clf12, cat_clf13,
) = (
    CatBoostClassifier(**catb_params, random_state=seed)
    for seed in (
        17, 43, 2021, 31, 19, 77, 177,
        200, 205, 210, 215, 220, 555, 998,
    )
)

In [None]:
# Detach the label column from the training frame if it is still present
# (it is already gone when this cell is re-run).
if target in train.columns:
    y = train.pop(target)
print(train.shape, test.shape)

# Pair every classifier with the name 'cat<i>', in order.
classifiers = (
    cat_clf0, cat_clf1, cat_clf2, cat_clf3, cat_clf4, cat_clf5, cat_clf6,
    cat_clf7, cat_clf8, cat_clf9, cat_clf10, cat_clf11, cat_clf12, cat_clf13,
)
estimators = [(f'cat{i}', clf) for i, clf in enumerate(classifiers)]

start = time.time()
print('fitting ...')
model = VotingClassifier(estimators=estimators, voting='soft', verbose=True)
# Train on the feature columns only: 'id' was removed from ``features`` but is
# still a column of both frames, and a row identifier must not be a predictor.
model.fit(train[features], y)

print('predicting ...')
# Keep the last column of the probability matrix, i.e. the probability of the
# last class in the fitted model's class ordering.
model_pred = model.predict_proba(test[features])[:, -1]

elapsed = time.time() - start
print(f'elapsed time: {elapsed:.2f}sec\n')

(957919, 121) (493474, 121)
fitting ...
[Voting] .................... (1 of 14) Processing cat0, total= 5.9min
[Voting] .................... (2 of 14) Processing cat1, total= 4.9min
[Voting] .................... (3 of 14) Processing cat2, total= 4.8min
[Voting] .................... (4 of 14) Processing cat3, total= 4.9min
[Voting] .................... (5 of 14) Processing cat4, total= 4.9min
[Voting] .................... (6 of 14) Processing cat5, total= 4.8min
[Voting] .................... (7 of 14) Processing cat6, total= 4.8min
[Voting] .................... (8 of 14) Processing cat7, total= 4.9min
[Voting] .................... (9 of 14) Processing cat8, total= 4.8min
[Voting] ................... (10 of 14) Processing cat9, total= 4.8min
[Voting] .................. (11 of 14) Processing cat10, total= 4.9min
[Voting] .................. (12 of 14) Processing cat11, total= 4.9min
[Voting] .................. (13 of 14) Processing cat12, total= 4.9min
[Voting] .................. (14 of 14

In [None]:
# Read the sample submission and set its target column to the ensemble's
# predictions; the assignment is positional, so row order is taken as-is.
sample_solution = pd.read_csv('sample_solution.csv').assign(**{target: model_pred})

# Preview both ends of the frame, then write it without the index column.
ht(sample_solution)
sample_solution.to_csv('submission.csv', index=False)

# Blank line followed by the completion banner.
print('', '==================== R E A D Y ====================', sep='\n')

Unnamed: 0,id,claim
0,957919,0.549421
1,957920,0.123483


Unnamed: 0,id,claim
493472,1451391,0.126697
493473,1451392,0.783256


(493474, 2)


