 # November TPS - Building an Ensemble

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler
from xgboost import XGBClassifier

#### Intel Scikit-learn Patch
Source: https://intel.github.io/scikit-learn-intelex/

In [None]:
# Use this line to install Intel's update to library if needed
!pip install scikit-learn-intelex --progress-bar off >> /tmp/pip_sklearnex.log

# Can give a small (or large) boost to speed depending on the available processor
from sklearnex import patch_sklearn
patch_sklearn()

### Getting Competition Data

The target column contains 'True' and 'False' values instead of 1 and 0, so LabelBinarizer is used to process it. Also the features and target columns are separated into the variables X and y.

In [None]:
# --- Training data -----------------------------------------------------------
# Read the training CSV and discard the 'id' column right away.
df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-nov-2021/train.csv',
    index_col=False,
).drop(columns=['id'])

# --- Test data ---------------------------------------------------------------
# Keep the ids aside for the submission files, then remove them from the
# feature frame so it has the same columns as the training features.
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/test.csv')
test_ids = test_df['id']
test_df = test_df.drop(columns=['id'])

# --- Target encoding ---------------------------------------------------------
# LabelBinarizer maps the two target labels onto 0/1. Its output is a column
# vector, so it is flattened to 1-d before being written back to the frame.
lb = LabelBinarizer()
df['target'] = lb.fit_transform(df['target']).ravel()

# Every column except the last one ('target', the binary label) is a
# continuous feature.
continuous_cols = df.columns[:-1].tolist()

# --- Feature matrix / label vector -------------------------------------------
y = df['target']
X = df.drop(columns='target').to_numpy()

Using Cross Validation will give us a better idea of whether a parameter change or preprocessing step has improved the model. The evaluate_model function performs multiple cross validations, which gives us more consistent scores to compare models and sets of parameters.

In [None]:
def evaluate_model(X, y, model, n_splits=5, n_repeats=2, random_state=42):
    """Score ``model`` with repeated stratified k-fold cross validation.

    The splitter is seeded so that every model (or parameter set) passed to
    this function is scored on the same folds; otherwise differences between
    two scores could come from the random splits rather than from the models.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Binary target labels.
    model : estimator
        Unfitted scikit-learn compatible estimator or pipeline. It is handed
        to ``cross_val_score``, which fits it once per fold.
    n_splits : int, default=5
        Number of folds per repetition.
    n_repeats : int, default=2
        Number of times the k-fold split is repeated.
    random_state : int or None, default=42
        Seed for the splitter. Pass ``None`` for a different random split on
        every call.

    Returns
    -------
    scores : array of float
        ROC AUC of each fold, ``n_splits * n_repeats`` values in total.

    Raises
    ------
    Exception
        Any error raised while fitting or scoring is propagated
        (``error_score='raise'``) instead of being turned into a NaN score.
    """
    cv_method = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv_method, error_score='raise')
    return scores

#### Baseline XGBoost

In [None]:
# Baseline: an XGBoost classifier with 100 trees and no preprocessing,
# trained on GPU device 0 and evaluated with AUC.
xgb = XGBClassifier(
    n_estimators=100,
    tree_method='gpu_hist',
    gpu_id=0,
    eval_metric='auc',
    use_label_encoder=False,
    verbosity=0,
)

# Report the mean ROC AUC over all cross-validation folds.
baseline_scores = evaluate_model(X, y, xgb)
print("Cross Validation Score: ", np.mean(baseline_scores))

### Adding scaling and tuning parameters

The XGB model for the ensemble has the same parameters as the one defined in this next code cell.

In [None]:
# Tuned XGBoost: more trees, shallower depth, and L1/L2 regularisation.
tuned_xgb = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    reg_lambda=60,
    reg_alpha=60,
    tree_method='gpu_hist',
    gpu_id=0,
    eval_metric='auc',
    use_label_encoder=False,
)

# Min-max scale the features before they reach the model.
p = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('model', tuned_xgb),
])

# Mean ROC AUC across the cross-validation folds.
tuned_scores = evaluate_model(X, y, p)
print("Score: ", np.mean(tuned_scores))

Best params so far:

XGBClassifier(n_estimators=200, max_depth=4, reg_lambda=60, reg_alpha=60,
                            tree_method='gpu_hist', gpu_id=0, eval_metric='auc', use_label_encoder=False)

## Building the Ensemble

#### Logistic Regression

In [None]:
# Logistic regression fitted with the 'sag' solver, using every available core.
logr_model = LogisticRegression(solver='sag', n_jobs=-1)

# Min-max scale the features before they reach the model.
p_2 = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('logr', logr_model),
])

# Mean ROC AUC across the cross-validation folds.
logr_scores = evaluate_model(X, y, p_2)
print("Score: ", np.mean(logr_scores))

The 'sag' solver is used because the scikit-learn documentation recommends it for medium to large datasets; the default solver is much slower to train. With otherwise default parameters, this model performs the best.

In [None]:
# Refit the logistic regression pipeline on the full training set.
p_2.fit(X, y)

# The submission uses the second column of predict_proba (class index 1).
logr_test_proba = p_2.predict_proba(test_df)[:, 1]

# Pair each test id with its predicted probability and write the file.
logr_submission = pd.DataFrame({'id': test_ids, 'target': logr_test_proba})
logr_submission.to_csv('logregr_submission.csv', index=False)

#### Naive Bayes

I experimented with adding Naive Bayes, but it did not improve the ensemble's score. I may experiment with its parameters, but it may be more worthwhile to find other models that can score above 0.73 without tuning.

In [None]:
# Multinomial Naive Bayes candidate for the ensemble.
p_3 = Pipeline([
    # Naive Bayes' input can't contain negative values, so scale to the
    # non-negative range [0, 1]. feature_range is passed by keyword as a tuple:
    # scikit-learn releases that validate parameters reject a list here.
    ('scale', MinMaxScaler(feature_range=(0, 1))),
    ('nb', MultinomialNB())
])

# Mean ROC AUC across the cross-validation folds.
print("Score: ", np.mean(evaluate_model(X, y, p_3)))

### Voting Ensemble

First the list of estimators needs to be created. Add more models to this list to try different ensembles.

In [None]:
# XGBoost member of the ensemble.
ensemble_xgb = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    reg_lambda=60,
    reg_alpha=60,
    tree_method='gpu_hist',
    gpu_id=0,
    eval_metric='auc',
    use_label_encoder=False,
)

# Logistic regression member of the ensemble.
ensemble_logr = LogisticRegression(solver='sag', n_jobs=-1)

# (name, estimator) pairs for VotingClassifier; append more pairs here to try
# other ensembles.
to_ens = [
    ('xgb', ensemble_xgb),
    ('logr', ensemble_logr),
]

In [None]:
# Soft voting over the estimators listed in to_ens.
voting_ensemble = VotingClassifier(to_ens, voting='soft')

# All ensemble members receive the same min-max scaled features.
final_pipe = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('ensemble', voting_ensemble),
])

# Mean ROC AUC across the cross-validation folds.
ensemble_scores = evaluate_model(X, y, final_pipe)
print("Score: ", np.mean(ensemble_scores))

The ensemble of estimators is not an improvement over all individual estimators. Next step: tuning XGBoost and adding more models to the ensemble. 

#### The predictions for LogisticRegression are the current submission for this version of this notebook. 

### Thanks for reading!

In [None]:
# Refit the ensemble pipeline on the full training set.
final_pipe.fit(X, y)

# The submission uses the second column of predict_proba (class index 1).
ensemble_test_proba = final_pipe.predict_proba(test_df)[:, 1]

# Pair each test id with its predicted probability and write the file.
ensemble_submission = pd.DataFrame({'id': test_ids, 'target': ensemble_test_proba})
ensemble_submission.to_csv('ensemble_submission.csv', index=False)