In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt

import os

# VotingClassifier with only one model  

I started the competition with a model-free [notebook](https://www.kaggle.com/martynovandrey/prediction-without-model) just for fun, but its public score of **0.79946** is good for a start :)

Then I tried a number of solutions with different feature engineering (FE) and models, and the best was LGBMClassifier with very simple FE (the same as in this notebook), reaching a public score of **0.81800**. See the [notebook](https://www.kaggle.com/martynovandrey/tps-september-lgbm) if you are interested.

Well, it's time for blending. I used VotingClassifier (and StackingClassifier) with LGBMClassifier, CatBoostClassifier and XGBClassifier, as usual. The result of voting was not very good: one estimator has the highest single score, but the others are worse ...

What if we use only the best model for all estimators? But there is no point in blending a model with itself. Maybe use one model with different parameters? It works, but only the best parameters are the best.

The idea is to use one best model with the same parameters, but different `random_state`s. 

> Thanks [Towhidul.Tonmoy](https://www.kaggle.com/towhidultonmoy) for [random_state: Things you need to know](https://www.kaggle.com/c/tabular-playground-series-sep-2021/discussion/271694) post.

Let's try!

The result surprised me, public score jumped to **0.81837**!

In [None]:
import copy
import time
import random

import warnings

from sklearn.preprocessing import RobustScaler

from lightgbm import LGBMClassifier

from sklearn.ensemble import VotingClassifier

# Silence every warning for the rest of the session to keep cell output short.
warnings.filterwarnings(action='ignore')

# Remove pandas' column/row display limits so frames are rendered in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

def ht(df, n=2):
    """Display the first ``n`` rows, the last ``n`` rows and the shape of ``df``.

    Parameters
    ----------
    df : object exposing ``head``, ``tail`` and ``shape`` (e.g. a DataFrame)
    n : int, default 2
        Number of rows taken from each end.
    """
    for view in (df.head(n), df.tail(n), df.shape):
        display(view)


# Name of the label column.
target = 'claim'

In [None]:
# Global seed for this notebook.
SEED = 2021


def seed_everything(seed=42):
    """Seed the random number sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(SEED)

In [None]:
# Competition files as mounted in the Kaggle kernel.
TRAIN_CSV = '../input/tabular-playground-series-sep-2021/train.csv'
TEST_CSV = '../input/tabular-playground-series-sep-2021/test.csv'

train, test = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

In [None]:
# Keep an independent copy of the label column.
y = train[target].copy()

# Every column except the row identifier and the label is a model input.
features = list(train.columns)
for non_feature in ('id', target):
    features.remove(non_feature)

## Preprocessing

In [None]:
# Row-level summary features. They are computed on the raw columns *before*
# imputation, so they still reflect each row's missing-value pattern.
train['n_missing'] = train[features].isna().sum(axis=1)
test['n_missing'] = test[features].isna().sum(axis=1)

train['std'] = train[features].std(axis=1)
test['std'] = test[features].std(axis=1)

n_missing = train['n_missing'].copy()

# Learn the imputation values from the training data only and reuse them for
# the test set. Previously the test set was filled with its own column means,
# which is inconsistent with the scaler below (fit on train, applied to test)
# and makes the transform depend on which test rows happen to be present.
feature_means = train[features].mean()
train[features] = train[features].fillna(feature_means)
test[features] = test[features].fillna(feature_means)

# From here on the two engineered columns are regular model inputs.
features += ['n_missing', 'std']

# Fit the scaler on train and apply the same fitted transform to test.
scaler = RobustScaler()
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])

train.shape, test.shape

> Thanks [BIZEN](https://www.kaggle.com/hiro5299834) for **lgbm_params** from this [notebook](https://www.kaggle.com/hiro5299834/tps-sep-2021-single-lgbm)

In [None]:
# Keyword arguments shared by every LGBMClassifier built in this notebook.
# `random_state` is intentionally absent: it is supplied per estimator so the
# ensemble members differ only by their seed.
lgbm_params = dict(
    objective='binary',
    n_estimators=20000,
    learning_rate=5e-3,
    subsample=0.6,
    subsample_freq=1,
    colsample_bytree=0.4,
    reg_alpha=10.0,
    reg_lambda=1e-1,
    min_child_weight=256,
    min_child_samples=20,
    importance_type='gain',
)

In [None]:
# Seven LGBMClassifier instances built from the same `lgbm_params`; the only
# difference between them is the `random_state` each one receives.
(
    lgbm_clf0,
    lgbm_clf1,
    lgbm_clf2,
    lgbm_clf3,
    lgbm_clf4,
    lgbm_clf5,
    lgbm_clf6,
) = (
    LGBMClassifier(**lgbm_params, random_state=seed)
    for seed in (17, 42, 2021, 31, 19, 77, 119)
)

In [None]:
# Detach the label column from the training frame. The membership guard keeps
# the cell re-runnable once the column has already been popped.
if target in train.columns:
    y = train.pop(target)
print(train.shape, test.shape)

# The ensemble members are the seven LGBMClassifier instances defined above.
estimators = [
    ('lgbm0', lgbm_clf0),
    ('lgbm1', lgbm_clf1),
    ('lgbm2', lgbm_clf2),
    ('lgbm3', lgbm_clf3),
    ('lgbm4', lgbm_clf4),
    ('lgbm5', lgbm_clf5),
    ('lgbm6', lgbm_clf6),
]

start = time.time()
print('fitting ...')
model = VotingClassifier(estimators=estimators, voting='soft', verbose=True)
# Train on the engineered feature columns only. Passing the whole frame (as
# before) also fed the raw, unscaled `id` column to every estimator, although
# `features` deliberately excludes it.
model.fit(train[features], y)

print('predicting ...')
# Take the last probability column as the score to submit; for a 0/1 label
# this is presumably the probability of class 1 (see `model.classes_`).
model_pred = model.predict_proba(test[features])[:, -1]

elapsed = time.time() - start
print(f'elapsed time: {elapsed:.2f}sec\n')

In [None]:
# Start from the competition's sample file and put the ensemble's predictions
# into its label column.
sample_solution = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/sample_solution.csv'
).assign(**{target: model_pred})

# Quick visual check of both ends of the frame before writing it out.
ht(sample_solution)

sample_solution.to_csv('submission.csv', index=False)

print()
print('==================== R E A D Y ====================')

The same method increased the score of CatBoost from 0.81751 to 0.81816

#### Thanks for reading. Don't forget to upvote if you find it useful.