In [1]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score, precision_score, confusion_matrix
from pprint import pprint
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from sklearn.model_selection import RandomizedSearchCV

from venn_abers import VennAbersCalibrator

In [31]:
# Names of the six binary target columns predicted for every comment.
TARGET_LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read the train split, the test comments and the separately stored test labels.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
test_labels = pd.read_csv("data/test_labels.csv")

# Keep only test rows in which none of the target labels equals -1,
# then attach those labels to the test comments via the shared "id" column.
valid_test_mask = test_labels[TARGET_LABELS].ne(-1).all(axis=1)
test_data = pd.merge(test_data, test_labels.loc[valid_test_mask], on="id")

# Features are the raw comment text; targets are the label columns.
X_train, Y_train = train_data["comment_text"], train_data[TARGET_LABELS]
X_test, Y_test = test_data["comment_text"], test_data[TARGET_LABELS]

In [32]:
# Two-stage text classifier: TF-IDF features feeding one ComplementNB per label
# (OneVsRestClassifier fits the wrapped estimator once per target column).
pipeline = Pipeline(
    steps=[
        ("vect", TfidfVectorizer()),
        ("clf", OneVsRestClassifier(estimator=ComplementNB())),
    ]
)
pipeline

0,1,2
,steps,"[('vect', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,estimator,ComplementNB()
,n_jobs,
,verbose,0

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,
,norm,False


In [29]:
# Search space for the pipeline; keys follow the `<step>__<param>` convention.
parameter_grid = dict(
    # unigrams only, or unigrams plus bigrams
    vect__ngram_range=((1, 1), (1, 2)),
    vect__norm=("l1", "l2"),
    # five smoothing values, log-spaced from 1e-3 to 1e1
    clf__estimator__alpha=np.logspace(-3, 1, num=5),
)

In [21]:
def multilabel_macro_auc(estimator, X, y_true):
    """Score a fitted multilabel classifier by macro-averaged ROC AUC.

    The signature follows scikit-learn's scorer-callable convention
    ``scorer(estimator, X, y)``, so the function can be passed directly as
    ``scoring=`` to a search or cross-validation helper.

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict_proba``
        In this notebook: the Pipeline (vect -> OneVsRest(ComplementNB)).
    X : inputs accepted by ``estimator.predict_proba``
    y_true : ground-truth labels, one column per label

    Returns
    -------
    The value of ``roc_auc_score(y_true, y_proba, average="macro")``.
    """
    y_proba = estimator.predict_proba(X)  # shape (n_samples, n_labels)
    return roc_auc_score(y_true, y_proba, average="macro")


# `multilabel_macro_auc` is already a scorer (estimator, X, y). Wrapping it in
# `make_scorer` would be wrong: `make_scorer` expects a metric with signature
# (y_true, y_pred), and its `needs_proba` flag is gone from recent scikit-learn.
# Keep the `auc_scorer` name as a plain alias so either name works for `scoring=`.
auc_scorer = multilabel_macro_auc

In [10]:
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Materialise the folds once so every candidate is evaluated on the same splits.
cv_splits = list(mskf.split(X_train, Y_train))

# Every entry of `parameter_grid` is a finite sequence, so the search space is the
# product of their lengths. Asking RandomizedSearchCV for more iterations than
# that makes scikit-learn warn and silently fall back to an exhaustive search;
# capping `n_iter` keeps the same candidates without the warning.
n_candidates = int(np.prod([len(values) for values in parameter_grid.values()]))

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter_grid,
    n_iter=min(40, n_candidates),
    scoring=multilabel_macro_auc,  # scorer callable: (estimator, X, y) -> macro ROC AUC
    random_state=0,
    cv=cv_splits,
    n_jobs=-1,
    verbose=1,
    refit=True,  # refit the best candidate on all of X_train for later prediction
)

print("Performing grid search...")
print("Hyperparameters to be evaluated:")
pprint(parameter_grid)

Performing grid search...
Hyperparameters to be evaluated:
{'clf__estimator__alpha': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01]),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__norm': ('l1', 'l2')}


In [11]:

from time import time

print("Searching best params (macro ROC AUC)...")

# Time the full hyper-parameter search, including the final refit.
t0 = time()
random_search.fit(X_train, Y_train)
print(f"Done in {time() - t0:.3f}s")

print("Best params:", random_search.best_params_)
print(f"Best CV macro AUC: {random_search.best_score_:.4f}")
# `random_search.score` applies the `scoring` callable given to the search
# (macro ROC AUC), not accuracy, so label the number accordingly.
print(f"Macro ROC AUC on test set: {random_search.score(X_test, Y_test):.3f}")

Searching best params (macro ROC AUC)...
Fitting 5 folds for each of 20 candidates, totalling 100 fits




Done in 142.386s
Best params: {'vect__norm': 'l1', 'vect__ngram_range': (1, 2), 'clf__estimator__alpha': 0.001}
Best CV macro AUC: 0.9553
Accuracy on test set: 0.956


In [12]:
# --- Evaluate on the held-out test set
# `random_search.score` only reports the single `scoring` value (macro ROC AUC).
# Here the refit best model is additionally broken down per label.
best_model = random_search.best_estimator_

Y_test_proba = best_model.predict_proba(X_test)  # shape (n_samples, n_labels)
# Hard predictions use a fixed 0.5 probability cut-off for every label.
Y_test_pred = (Y_test_proba >= 0.5).astype(int)

macro_auc = roc_auc_score(Y_test, Y_test_proba, average="macro")
print(f"Test macro ROC AUC: {macro_auc:.4f}")

for i, label in enumerate(TARGET_LABELS):
    # Column i of the probability matrix corresponds to TARGET_LABELS[i].
    y_true = Y_test[label].values
    y_pred = Y_test_pred[:, i]
    y_proba = Y_test_proba[:, i]
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    # Pin labels so the matrix is always 2x2 ([[TN, FP], [FN, TP]]),
    # even if a label has only one class among the truths and predictions.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    auc = roc_auc_score(y_true, y_proba)
    print(f"{label:13s}  AUC={auc:.4f}  Acc={acc:.4f}  Prec={prec:.4f}\nConfusion matrix:\n{cm}\n")


Validation macro ROC AUC: 0.9556
toxic          AUC=0.9524  Acc=0.9369  Prec=0.6544
Confusion matrix:
[[27714  1155]
 [  859  2187]]

severe_toxic   AUC=0.9646  Acc=0.9797  Prec=0.2676
Confusion matrix:
[[31068   542]
 [  107   198]]

obscene        AUC=0.9609  Acc=0.9561  Prec=0.5596
Confusion matrix:
[[29292   961]
 [  441  1221]]

threat         AUC=0.9637  Acc=0.9938  Prec=0.2059
Confusion matrix:
[[31682   135]
 [   63    35]]

insult         AUC=0.9590  Acc=0.9539  Prec=0.5238
Confusion matrix:
[[29300  1039]
 [  433  1143]]

identity_hate  AUC=0.9332  Acc=0.9805  Prec=0.2010
Confusion matrix:
[[31171   485]
 [  137   122]]



In [13]:
import pandas as pd


def shorten_param(param_name):
    """Return the part of ``param_name`` after its last ``"__"`` separator.

    Names without a ``"__"`` are returned unchanged.
    """
    # rpartition yields ("", "", name) when the separator is absent, so the
    # last element is the right answer in both cases.
    return param_name.rpartition("__")[2]


# Tabulate the search results and strip the pipeline prefixes from column names.
cv_results = pd.DataFrame(random_search.cv_results_).rename(columns=shorten_param)

In [16]:
import plotly.express as px

# Short hyper-parameter names, matching the renamed `cv_results` columns.
param_names = list(map(shorten_param, parameter_grid))

# Human-readable axis titles for the two plotted columns.
labels = dict(
    mean_score_time="CV Score time (s)",
    mean_test_score="CV score (AUC ROC)",
)

# One point per candidate: scoring time against mean CV score, with the
# standard deviations as error bars and the hyper-parameters shown on hover.
fig = px.scatter(
    data_frame=cv_results,
    x="mean_score_time",
    y="mean_test_score",
    error_x="std_score_time",
    error_y="std_test_score",
    hover_data=param_names,
    labels=labels,
)
fig.update_layout(
    title=dict(
        text="trade-off between scoring time and mean test score",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
)
fig

In [17]:
import math

column_results = [*param_names, "mean_test_score", "mean_score_time"]

# Per-column transforms that make every axis numeric for the plot.
# Columns default to the identity; two categorical parameters are overridden:
#   - norm:        "l1" -> 1, "l2" -> 2
#   - ngram_range: upper n-gram bound, so unigrams -> 1 and bigrams -> 2
# alpha stays on its raw (linear) scale; assign math.log10 to
# transform_funcs["alpha"] to plot it on a logarithmic scale instead.
transform_funcs = {
    **dict.fromkeys(column_results, lambda value: value),
    "norm": lambda norm: 2 if norm == "l2" else 1,
    "ngram_range": lambda ngram_range: ngram_range[1],
}

plot_frame = cv_results[column_results].apply(transform_funcs)

fig = px.parallel_coordinates(
    plot_frame,
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis_r,
    labels=labels,
)
fig.update_layout(
    title=dict(
        text="Parallel coordinates plot of text classifier pipeline",
        y=0.99,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
)
fig