# VotingClassifier ensemble (LightGBM, CatBoost, LogisticRegression)

## 1. read datasets and initialize random seed.

In [None]:
import pandas as pd
import numpy as np
import random

# Seed Python's and NumPy's global random generators. SEED is also handed to
# the models further down as their random_state.
SEED = 2147483647
random.seed(SEED)
np.random.seed(SEED)

# Load the three competition CSV files from the Kaggle input directory.
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2021/train.csv'
)
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2021/test.csv'
)
sample_submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2021/sample_submission.csv'
)

## 2. preprocessing datasets.

### 2.1. convert the 'target' labels 'Class_1', ..., 'Class_9' to the integers 0,1,2,...,8.

In [None]:
# Lookup table from the string labels 'Class_1' ... 'Class_9' to the integer
# codes 0 ... 8 (label number minus one).
target_dict = {
    'Class_{}'.format(code + 1): code
    for code in range(9)
}

# Replace every label in the 'target' column by its integer code. Values that
# are not keys of target_dict become NaN (standard Series.map behaviour).
train['target'] = train['target'].map(target_dict)

## 3. define my transformers.

In [None]:
from sklearn.base import TransformerMixin

# Names of the 75 input columns: 'feature_0' through 'feature_74'.
features = list(map('feature_{}'.format, range(75)))

class MyTransformer1(TransformerMixin):
    """Stateless step that keeps only the columns named in ``features``."""

    def transform(self, X):
        """Return a new frame holding just the ``features`` columns of ``X``.

        ``X`` itself is left untouched: the selection is made on a copy.
        """
        duplicate = X.copy()
        return duplicate[features]

    def fit_transform(self, X, y=None, **fit_params):
        """Return ``self.transform(X)``; ``y`` and ``fit_params`` are unused."""
        return self.transform(X)

class MyTransformer2(TransformerMixin):
    """Select ``features``, cap them at 4 and append per-row value counts.

    The transformer keeps no state: ``fit_transform`` just delegates to
    ``transform``.
    """

    def clip(self, X):
        """Cap every ``features`` column of ``X`` at 4.

        ``X`` is modified in place and also returned.
        """
        X[features] = X[features].clip(upper=4)
        return X

    def setup_new_feature(self, X):
        """Add the columns ``count_0`` ... ``count_3`` to ``X``.

        ``count_v`` holds, for each row, how many of the ``features`` columns
        equal ``v``. ``X`` is modified in place and also returned.
        """
        # Take the feature columns once; the count_* columns added below are
        # not part of `features`, so this snapshot stays valid for every pass.
        feature_frame = X[features]
        for feature_value in [0, 1, 2, 3]:
            new_feature = 'count_{}'.format(feature_value)
            # Whole-frame comparison instead of calling a Python lambda per
            # row; it yields the same integer counts far more cheaply.
            X[new_feature] = (feature_frame == feature_value).sum(axis=1)
        return X

    def fit_transform(self, X, y=None, **fit_params):
        """Return ``self.transform(X)``; ``y`` and ``fit_params`` are unused."""
        return self.transform(X)

    def transform(self, X):
        """Return a new frame with the capped features plus the count columns.

        The caller's ``X`` is not modified.
        """
        # Select first, then copy: the helpers below write columns in place,
        # so they must operate on an independent frame rather than directly
        # on the result of a column selection (which pandas may flag with
        # SettingWithCopyWarning).
        X = X[features].copy()
        X = self.setup_new_feature(self.clip(X))
        return X

class MyTransformer3(TransformerMixin):
    """Select ``features``, cap them at 4 and one-hot encode every column.

    ``fit_transform`` remembers the one-hot column layout it produced in
    ``columns_`` so that later ``transform`` calls return exactly the same
    columns, even when the new data lacks (or adds) some feature values.
    """

    def clip(self, X):
        """Cap every ``features`` column of ``X`` at 4.

        ``X`` is modified in place and also returned.
        """
        X[features] = X[features].clip(upper=4)
        return X

    def _encode(self, X):
        """Return the capped, one-hot encoded ``features`` columns of ``X``."""
        # Select first, then copy: clip() writes columns in place, so it must
        # operate on an independent frame rather than directly on the result
        # of a column selection (which pandas may flag with
        # SettingWithCopyWarning).
        X = X[features].copy()
        return pd.get_dummies(self.clip(X), columns=features)

    def fit_transform(self, X, y=None, **fit_params):
        """Encode ``X`` and record the resulting column layout.

        ``y`` and ``fit_params`` are unused. The returned frame is the same
        as the plain encoding of ``X``.
        """
        encoded = self._encode(X)
        # Remember the training layout so transform() can reproduce it.
        self.columns_ = encoded.columns
        return encoded

    def transform(self, X):
        """Encode ``X``, aligned to the columns seen by ``fit_transform``.

        pd.get_dummies derives its columns from the values present in the
        data, so without alignment a batch could yield a different column set
        than the one the downstream model was fitted on. Indicator columns
        missing from ``X`` are added filled with 0; columns unseen at fit
        time are dropped. If ``fit_transform`` has never been called the
        plain encoding is returned unchanged.
        """
        encoded = self._encode(X)
        fitted_columns = getattr(self, 'columns_', None)
        if fitted_columns is not None:
            encoded = encoded.reindex(columns=fitted_columns, fill_value=0)
        return encoded

class MyTransformer4(TransformerMixin):
    """Cap ``features`` at 4, add per-row value counts, then one-hot encode.

    The ``count_*`` columns are kept numeric; only the ``features`` columns
    are one-hot encoded. ``fit_transform`` remembers the produced column
    layout in ``columns_`` so that later ``transform`` calls return exactly
    the same columns.
    """

    def clip(self, X):
        """Cap every ``features`` column of ``X`` at 4.

        ``X`` is modified in place and also returned.
        """
        X[features] = X[features].clip(upper=4)
        return X

    def setup_new_feature(self, X):
        """Add the columns ``count_0`` ... ``count_3`` to ``X``.

        ``count_v`` holds, for each row, how many of the ``features`` columns
        equal ``v``. ``X`` is modified in place and also returned.
        """
        # Take the feature columns once; the count_* columns added below are
        # not part of `features`, so this snapshot stays valid for every pass.
        feature_frame = X[features]
        for feature_value in [0, 1, 2, 3]:
            new_feature = 'count_{}'.format(feature_value)
            # Whole-frame comparison instead of calling a Python lambda per
            # row; it yields the same integer counts far more cheaply.
            X[new_feature] = (feature_frame == feature_value).sum(axis=1)
        return X

    def _encode(self, X):
        """Return capped features with counts, features one-hot encoded."""
        # Select first, then copy: the helpers below write columns in place,
        # so they must operate on an independent frame rather than directly
        # on the result of a column selection (which pandas may flag with
        # SettingWithCopyWarning).
        X = X[features].copy()
        X = self.setup_new_feature(self.clip(X))
        return pd.get_dummies(X, columns=features)

    def fit_transform(self, X, y=None, **fit_params):
        """Encode ``X`` and record the resulting column layout.

        ``y`` and ``fit_params`` are unused. The returned frame is the same
        as the plain encoding of ``X``.
        """
        encoded = self._encode(X)
        # Remember the training layout so transform() can reproduce it.
        self.columns_ = encoded.columns
        return encoded

    def transform(self, X):
        """Encode ``X``, aligned to the columns seen by ``fit_transform``.

        pd.get_dummies derives its columns from the values present in the
        data, so without alignment a batch could yield a different column set
        than the one the downstream model was fitted on. Indicator columns
        missing from ``X`` are added filled with 0; columns unseen at fit
        time are dropped. If ``fit_transform`` has never been called the
        plain encoding is returned unchanged.
        """
        encoded = self._encode(X)
        fitted_columns = getattr(self, 'columns_', None)
        if fitted_columns is not None:
            encoded = encoded.reindex(columns=fitted_columns, fill_value=0)
        return encoded

    

## 4. define my classifier.

In [None]:
from sklearn.base import ClassifierMixin
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# LightGBM settings for the first model, together with the number of boosting
# rounds to train. Going by the variable names these presumably came out of an
# Optuna search -- the search itself is not part of this notebook.
optuna_recommended_params1 = dict(
    objective='multiclass',
    num_classes=9,
    metric='multi_logloss',
    verbosity=0,
    boosting_type='gbdt',
    feature_pre_filter=False,
    lambda_l1=8.934626150848016,
    lambda_l2=1.3751929899381281e-08,
    num_leaves=8,
    feature_fraction=0.4,
    bagging_fraction=0.8781081160423493,
    bagging_freq=4,
    min_child_samples=50,
)
optuna_recommended_best_iteration1 = 123

# LightGBM settings and boosting-round count for the second model. Compared
# with the first set: smaller trees (num_leaves=4), bagging switched off
# (bagging_freq=0), a different lambda_l2 and a lower min_child_samples.
optuna_recommended_params2 = dict(
    objective='multiclass',
    num_classes=9,
    metric='multi_logloss',
    verbosity=0,
    boosting_type='gbdt',
    feature_pre_filter=False,
    lambda_l1=8.934626150848016,
    lambda_l2=6.580392901707003e-06,
    num_leaves=4,
    feature_fraction=0.4,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=20,
)
optuna_recommended_best_iteration2 = 276

# Two LightGBM classifiers, one per parameter set. Each trains for the
# matching "best iteration" number of boosting rounds and shares the seed.
my_model1 = LGBMClassifier(
    **optuna_recommended_params1,
    random_state=SEED,
    n_estimators=optuna_recommended_best_iteration1,
)

my_model2 = LGBMClassifier(
    **optuna_recommended_params2,
    random_state=SEED,
    n_estimators=optuna_recommended_best_iteration2,
)

# Logistic regression with the iteration cap raised to 2000.
my_model3 = LogisticRegression(max_iter=2000, random_state=SEED)

# CatBoost hyper-parameters; per the variable name they presumably come from
# a grid search that is not part of this notebook.
grid_search_recommended_params4 = dict(
    min_data_in_leaf=50,
    depth=4,
    iterations=300,
    learning_rate=0.1,
)

# Multi-class CatBoost model, seeded like the others and with logging off.
my_model4 = CatBoostClassifier(
    loss_function='MultiClass',
    random_state=SEED,
    verbose=0,
    **grid_search_recommended_params4
)

# Pair each preprocessing transformer with the model that consumes its
# output: transformer N feeds model N.
pipeline1, pipeline2, pipeline3, pipeline4 = (
    make_pipeline(transformer, model)
    for transformer, model in (
        (MyTransformer1(), my_model1),
        (MyTransformer2(), my_model2),
        (MyTransformer3(), my_model3),
        (MyTransformer4(), my_model4),
    )
)


## 5. train the VotingClassifier.

In [None]:
from sklearn.ensemble import VotingClassifier,StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss,accuracy_score

# Training inputs (the feature columns) and the integer-encoded labels.
X = train[features]
y = train['target']

# Name the four pipelines 'mod1' ... 'mod4' for the ensemble.
voting_estimators = [
    ('mod{}'.format(position), member)
    for position, member in enumerate(
        (pipeline1, pipeline2, pipeline3, pipeline4), start=1
    )
]

# Soft voting: the ensemble combines the members' predicted probabilities.
mod_vot = VotingClassifier(estimators=voting_estimators, voting='soft')
mod_vot.fit(X, y)



## 6. create my submission file.

In [None]:
# Class probabilities for the test rows; column k of the result is written
# out as 'Class_{k+1}'.
y_pred_test = mod_vot.predict_proba(test[features])

# Start from the 'id' column and append one probability column per class,
# 'Class_1' through 'Class_9', in that order.
submission = test[['id']].assign(**{
    'Class_{}'.format(class_index + 1): y_pred_test[:, class_index]
    for class_index in range(9)
})

submission.to_csv('submission.csv', index=False)