This notebook uses code from the excellent notebook "TPS Feb 2022 ExtraTreeClassifier" by BIZEN.

<h3>Upvote if this notebook was helpful</h3>

<h1>Libraries</h1>

In [None]:
import pandas as pd
import numpy as np
import plotly as py
from statistics import mean
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns
import random
import time
import os

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedKFold, KFold

from sklearn.ensemble import VotingClassifier

from tqdm.notebook import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

from scipy.stats import mode

import warnings
warnings.simplefilter('ignore')

<h2>Dataset</h2>

In [None]:
# Only the first 30,000 training rows are used. Passing `nrows` stops the
# parser there instead of loading the full file and slicing it afterwards.
train = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv", nrows=30000)
test = pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv")
# The sample submission is read later, in the Submission section.

In [None]:
# Column holding the raw label, and the column that will hold its integer encoding.
target, target_encoded = 'target', 'target_encoded'
# Model inputs: every test-set column whose name does not contain the substring 'id'.
features = list(filter(lambda name: 'id' not in name, test.columns))

<h2>Preprocessing</h2>

In [None]:
# Fit the encoder on the raw labels, then store the integer codes alongside them.
# `le` is kept so predictions can be mapped back to the original labels later.
le = LabelEncoder().fit(train[target])
train[target_encoded] = le.transform(train[target])

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Integer columns (int16/int32/int64) are cast to the smallest of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive). Float columns (float16/float32/float64) are cast to
    float32 when their min and max fit the float32 range, otherwise float64.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the memory usage after optimisation and the
        percentage saved. When False, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # narrowest first
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a value equal to the dtype limit still fits.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f32 = np.finfo(np.float32)
            # NaN/inf extremes fail this test and fall through to float64.
            if f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the ratio so a zero-byte frame cannot raise a division error.
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Shrinks the numeric columns of `train` in place; the returned frame is shown as the cell output.
reduce_mem_usage(df=train)

In [None]:
# Shrinks the numeric columns of `test` in place; the returned frame is shown as the cell output.
reduce_mem_usage(df=test)

<h2>XGB</h2>

In [None]:
# Hyperparameters passed to XGBClassifier.
paramsXGB = dict(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=400,
    min_child_weight=2,
    gamma=0,
    colsample_bytree=0.6,
    subsample=0.5,
    # tree_method='gpu_hist',
    booster='gbtree',
)

<h2>ExtraTreesClassifier</h2>

In [None]:
# Two ExtraTreesClassifier configurations that differ only in their random seed.
extratree_params = dict(n_estimators=1000, random_state=42, verbose=1, n_jobs=-1)
extratree_params1 = dict(n_estimators=1000, random_state=64, verbose=1, n_jobs=-1)

<h2>Voting</h2>

In [None]:
# Unfitted base estimators, built from the parameter dictionaries defined above.
xgb_model = XGBClassifier(**paramsXGB)
extratree_model, extratree_model1 = (
    ExtraTreesClassifier(**cfg) for cfg in (extratree_params, extratree_params1)
)

In [None]:
# Training inputs and encoded labels, plus the test inputs restricted to the same feature columns.
X, y = train[features], train[target_encoded]
X_test = test[features]
# The source frames are dropped once the needed columns have been extracted.
del train
del test

In [None]:
pred = []        # one array of encoded test-set predictions per fold
val_scores = []  # hold-out accuracy per fold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (trn_idx, val_idx) in enumerate(tqdm(skf.split(X, y), total=skf.get_n_splits())):
    X_train = X.iloc[trn_idx]
    y_train = y.iloc[trn_idx]
    X_valid = X.iloc[val_idx]
    y_valid = y.iloc[val_idx]

    start = time.time()
    # Soft voting: class probabilities of the base models are averaged with the given weights.
    model = VotingClassifier(
            estimators = [
                ('xgb', xgb_model),
                ('extratree', extratree_model),
                # Adding ('extratree1', extratree_model1) here also requires a third weight below.
            ],
            voting = 'soft',
            weights = [0.4, 0.6],
            n_jobs = -1
        )
    model.fit(X_train, y_train)

    # Score the held-out fold so each split gives a validation signal.
    fold_score = accuracy_score(y_valid, model.predict(X_valid))
    val_scores.append(fold_score)

    pred.append(model.predict(X_test))

    elapsed = time.time() - start
    print('Fold {}: validation accuracy = {:.5f} ({:.1f}s)'.format(fold, fold_score, elapsed))

print('Mean validation accuracy: {:.5f}'.format(np.mean(val_scores)))

<p>The voting classifier above can combine multiple models (e.g. XGBoost, CatBoost, LightGBM, ExtraTrees, etc.).</p>

In [None]:
# Release the last fold's fitted ensemble and its train/validation feature slices.
del model
del X_train
del X_valid

<h2>Submission</h2>

In [None]:
# Load the sample submission; its target column is overwritten with the predictions.
submission = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-feb-2022/sample_submission.csv"
)

In [None]:
# Majority vote across folds: stack the per-fold predictions as rows and take
# the most frequent encoded label in each column (one column per test row).
# Depending on the SciPy version, `.mode` is shaped (1, n) or (n,);
# `np.ravel` normalises both to (n,) so no version-specific indexing is needed.
fold_votes = np.asarray(pred)
majority = np.ravel(mode(fold_votes, axis=0).mode)

# Map the integer codes back to the original labels.
pred_decoded = le.inverse_transform(majority)
submission[target] = pred_decoded
submission.to_csv("submission.csv", index=False)
submission.head()

<h1>If you found this notebook interesting and helpful, please consider upvoting!</h1>