# Problem Statement

In this competition, we predict whether or not an email is spam.

We are going to cover the following steps:
1. Install Vaex
2. Adjust Matplotlib Parameters
3. Load Data
4. Shuffling
5. Split into Train and Validation
6. Sanity Checks
7. Modeling (Part 1): Gradient Boosting Trees
8. Performance on Training Set
9. Performance on Validation Set
10. Feature Importance
11. Modeling (Part 2): Linear Models and Ensembles
12. Ensemble
13. References

Let's get started.

# Install Vaex

In [None]:
# Install Vaex from within the notebook. -I is pip's --ignore-installed flag,
# so the package is (re)installed even if a copy is already present.
!pip install -I vaex

In [None]:
# Load Libraries
import vaex
# Set the default number of threads Vaex uses
vaex.multithreading.thread_count_default = 8
import vaex.ml

import numpy as np
# NOTE: pylab is imported under the name `plt`; the cells below use it for all plotting calls
import pylab as plt
import time
from pathlib import Path
import pprint
import pandas
from IPython.core.interactiveshell import InteractiveShell  # for printing all outputs of a cell
# Echo the value of every top-level expression in a cell, not just the last one
InteractiveShell.ast_node_interactivity = "all" # to revert to original setting set InteractiveShell.ast_node_interactivity = "last_expr"

import warnings
# Suppress all warnings for the rest of the notebook (this also hides deprecation notices)
warnings.filterwarnings("ignore")

# Adjusting matplotlib parameters

Let's modify some of the matplotlib default settings, just to make the plots a bit more legible.

In [None]:
# Font sizes shared by every plot in this notebook
SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

# One entry per matplotlib rc group, mapping the group to the settings to override
_rc_overrides = (
    ('font', {'size': SMALL_SIZE}),                # default text size
    ('axes', {'titlesize': SMALL_SIZE,             # axes title
              'labelsize': MEDIUM_SIZE}),          # x and y labels
    ('xtick', {'labelsize': SMALL_SIZE}),          # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),          # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),          # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),        # figure title
)
for _rc_group, _rc_settings in _rc_overrides:
    plt.rc(_rc_group, **_rc_settings)

# Load Data

In [None]:
# Read the train and test CSV files with Vaex and print the elapsed time in seconds
data_dir = Path('../input/tabular-playground-series-nov-2021/')

start = time.time()
vaex_train, vaex_test = [
    vaex.read_csv(data_dir / csv_name) for csv_name in ("train.csv", "test.csv")
]
end = time.time()

print(end - start)

In [None]:
# See the description: show Vaex's summary of the training DataFrame
vaex_train.info()

# Shuffling

If required, we can shuffle the dataset.

In [None]:
# Shuffle the rows of the training data. A fixed random_state is passed so the
# shuffle gives the same row order on every run.
vaex_train = vaex_train.shuffle(random_state=31)

# Split into train and Validation

Once the data is shuffled, let’s split it into train and validation sets. The validation set will comprise 20% of the training data.

In [None]:
# Train and validation split. No shuffling occurs inside train_test_split, which is
# why the data was shuffled beforehand. test_size=0.2 holds out 20% of the rows
# as the validation set.
df_train, df_validation = vaex_train.ml.train_test_split(test_size=0.2, verbose=False)

# Sanity Checks

Let’s verify that our train and validation sets are “similar” enough.

Let us check the fraction of the target variable.

In [None]:
# Inspect the target variable
# Count how many rows carry each target value, separately for the two splits
train_spam_value_counts = df_train.target.value_counts()
validation_spam_value_counts = df_validation.target.value_counts()


# One figure with two side-by-side panels (train on the left, validation on the right)
plt.figure(figsize=(12, 4))

# Left panel: class counts in the training set
plt.subplot(121)
train_spam_value_counts.plot.bar()
# Count for target value 1 divided by the count for target value 0, i.e. a
# class ratio rather than the fraction of all rows.
# NOTE(review): assumes the counts are indexed by the integer labels 0 and 1 - confirm
train_spam_ratio = train_spam_value_counts[1]/train_spam_value_counts[0]
plt.title(f'Train set: spam ratio: {train_spam_ratio:.2f}')
plt.ylabel('Number of Emails')

# Right panel: the same view for the validation set
plt.subplot(122)
validation_spam_value_counts.plot.bar()
validation_spam_ratio = validation_spam_value_counts[1]/validation_spam_value_counts[0]
plt.title(f'Validation set: spam ratio: {validation_spam_ratio:.2f}')
plt.ylabel('Number of Emails')

plt.tight_layout()
plt.show()

# Modeling (part 1): gradient boosted trees

In [None]:
import xgboost
import vaex.ml.sklearn

# Feature columns: everything except the first column (id) and the last one (target)
features = vaex_train.column_names[1:-1]

# Hyperparameters passed to the booster. Commented-out alternatives kept for
# reference: max_depth=11, n_estimators=500, subsample=0.75, colsample_bylevel=1,
# colsample_bytree=1, scale_pos_weight=1.5, n_jobs=8.
xgb_params = dict(
    learning_rate=0.1,
    reg_lambda=1.5,
    reg_alpha=5,
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
)

# Build the xgboost classifier through its scikit-learn API
xgb_model = xgboost.sklearn.XGBClassifier(**xgb_params)

# Wrap it in a Vaex predictor (for the automagic pipeline and lazy predictions)
vaex_xgb_model = vaex.ml.sklearn.Predictor(
    model=xgb_model,
    features=features,
    target='target',
    prediction_name='prediction_xgb',
)

# Fit on the training data only
vaex_xgb_model.fit(df_train)

# Add the model's predictions on the training data as the 'prediction_xgb' column
df_train = vaex_xgb_model.transform(df_train)

# Preview the resulting train dataframe that contains the predictions
df_train

# Performance on training set

Let’s see how the model performs on the training set. First, let’s create a convenience function that will help us get multiple metrics at once.

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
def binary_metrics(y_true, y_pred):
    """Print the accuracy, F1 score and ROC-AUC of binary predictions.

    Args:
        y_true: The true labels.
        y_pred: The predicted labels. The same values are also passed to
            ``roc_auc_score`` as its ``y_score`` argument.

    Returns:
        None. The three scores are printed, one per line, with three decimals.
    """
    # All three scores are computed before anything is printed
    acc, f1 = (
        metric(y_true=y_true, y_pred=y_pred) for metric in (accuracy_score, f1_score)
    )
    roc = roc_auc_score(y_true=y_true, y_score=y_pred)
    print(
        f'Accuracy: {acc:.3f}',
        f'f1 score: {f1:.3f}',
        f'roc-auc: {roc:.3f}',
        sep='\n',
    )

Now let’s check the performance of the model on the training set.

In [None]:
# Score the XGBoost predictions against the true labels of the training set
print('Metrics for the training set:')
y_true_train = df_train.target.values
y_pred_train = df_train.prediction_xgb.values
binary_metrics(y_true=y_true_train, y_pred=y_pred_train)

# Performance on validation set

Let's check the model performance on the validation set.

In [None]:
# Do NOT fit the model again here: it was already trained on the training set.
# Re-fitting on the validation data would leak the validation labels into the
# model and turn the scores below into training scores instead of an estimate
# of performance on unseen data.

# Get the prediction of the already-trained model on the validation data
df_validation = vaex_xgb_model.transform(df_validation)

# Preview the resulting validation dataframe that contains the predictions
df_validation

In [None]:
# Score the XGBoost predictions against the true labels of the validation set
print('Metrics for the validation set:')
y_true_validation = df_validation.target.values
y_pred_validation = df_validation.prediction_xgb.values
binary_metrics(y_true=y_true_validation, y_pred=y_pred_validation)

# Feature importance

Let’s now look at the feature importance of the xgboost model.

In [None]:
plt.figure(figsize=(6, 9))

# Rank the features from the most to the least important
importances = xgb_model.feature_importances_
ind = np.argsort(importances)[::-1]
features_sorted = np.array(features)[ind]
importances_sorted = importances[ind]

# One horizontal bar per feature, labelled with the feature name
bar_positions = range(len(features))
plt.barh(y=bar_positions, width=importances_sorted, height=0.2)
plt.title('Gain')
plt.yticks(ticks=bar_positions, labels=features_sorted)
# Flip the y axis so that the first (most important) feature is drawn at the top
plt.gca().invert_yaxis()
plt.show()

# Modeling (part 2): Linear models & Ensembles

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Arguments shared by both Vaex predictors: same features, same target
shared_predictor_args = dict(features=features, target='target')

# The Support Vector Classifier
vaex_svc = vaex.ml.sklearn.Predictor(
    model=SVC(max_iter=1000, random_state=42),
    prediction_name='prediction_svc',
    **shared_predictor_args,
)

# Logistic Regression
vaex_logistic = vaex.ml.sklearn.Predictor(
    model=LogisticRegression(max_iter=1000, random_state=42, solver='liblinear'),
    prediction_name='prediction_lr',
    **shared_predictor_args,
)

# Fit each new model on the train dataframe, then add its predictions to it
for predictor in (vaex_svc, vaex_logistic):
    predictor.fit(df_train)
    df_train = predictor.transform(df_train)

# Preview of the train DataFrame
df_train.head(5)

# Ensemble

The predictions from the SVC and the LogisticRegression classifiers are added as virtual columns in the training dataset. This is quite powerful, since now we can easily use them to create an ensemble! For example, let’s do a weighted mean.

In [None]:
# Weighted mean of the classes predicted by the three models. The weights sum
# to 1: 0.3 for XGBoost, 0.5 for the SVC and 0.2 for the logistic regression.
# The last term uses prediction_lr; using prediction_xgb there a second time
# would leave the logistic regression out of the ensemble.
prediction_final = (df_train.prediction_xgb.astype('int') * 0.3 +
                    df_train.prediction_svc.astype('int') * 0.5 +
                    df_train.prediction_lr.astype('int') * 0.2)
# Get the predicted class: positive when the weighted vote reaches 0.5
prediction_final = (prediction_final >= 0.5)

# Add the expression to the train DataFrame
df_train['prediction_final'] = prediction_final

# Preview every column whose name starts with 'predict'
df_train[df_train.get_column_names(regex='^predict')]

Let’s check the performance of each individual model, as well as of the ensemble, on the validation set.

In [None]:
# The SVC, logistic-regression and ensemble prediction columns were only added
# to the train DataFrame. Transfer the train DataFrame's state to the validation
# DataFrame so that the same prediction columns are defined there too.
df_validation.state_set(df_train.state_get())

# Report the metrics of every prediction column on the validation set
pred_columns = df_validation.get_column_names(regex='^prediction_')
for column in pred_columns:
    print(column)
    binary_metrics(y_true=df_validation.target.values, y_pred=df_validation[column].values)
    print(' ')

# References

1. Thanks to the Vaex documentation for showing [how to use Vaex](https://vaex.io/docs/example_ml_titanic.html#).
2. This [Stack Overflow link](https://stackoverflow.com/questions/65682019/attributeerror-str-object-has-no-attribute-decode-in-fitting-logistic-regre) was used to resolve an error.