In [10]:
import numpy as np
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer

In [None]:
!pip install optuna

In [None]:
import optuna

In [11]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [12]:
train['isHate'] = train['isHate'].astype(int)
test['isHate'] = test['isHate'].astype(int)

In [13]:
nlp = spacy.load("en_core_web_sm")

In [14]:
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier

In [77]:
def objective(trial):
    # preprocessing
    remove_punct = trial.suggest_categorical('remove_punct', [0, 1])
    remove_like_num = trial.suggest_categorical('remove_like_num', [0, 1])
    remove_like_email = trial.suggest_categorical('remove_like_email', [0, 1])
    remove_stop = trial.suggest_categorical('remove_stop', [0, 1])

    # sentence-transformers
    normalize_embeddings = trial.suggest_categorical('normalize_embeddings', [0, 1])
    model_kind = trial.suggest_categorical('model_kind', [
        'all-mpnet-base-v2',
        'multi-qa-mpnet-base-dot-v1',
        'all-distilroberta-v1',
        'all-MiniLM-L12-v2',
        'multi-qa-distilbert-cos-v1',
        'all-MiniLM-L6-v2',
        'paraphrase-albert-small-v2',
    ])

    metric = trial.suggest_categorical('metric', ['manhattan', 'euclidean'])

    train_dataframe = train.copy()
    
    def callee(t):
        result = True
        result = (result and not t.is_punct) if remove_punct else result
        result = (result and not t.like_num) if remove_like_num else result
        result = (result and not t.like_email) if remove_like_email else result
        result = (result and not t.is_stop) if remove_stop else result
        return result

    train_dataframe['comment'] = train_dataframe['comment'].apply(
        lambda x: ' '.join([
            tok.lemma_.lower() for tok in nlp(x) if callee(tok)
        ])
    )

    model = SentenceTransformer(model_kind)
    train_embeddings = model.encode(train_dataframe['comment'].values, normalize_embeddings=normalize_embeddings)
    
    clf = NearestCentroid(metric=metric)
    
    return cross_val_score(clf, train_embeddings, train_dataframe['isHate'], cv=5).mean()

In [78]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, n_jobs=1)

[I 2023-06-06 18:45:20,298] A new study created in memory with name: no-name-54cb0233-1079-43a1-af1d-d85d8fa13ae0
[I 2023-06-06 18:45:24,356] Trial 0 finished with value: 0.7092688679245283 and parameters: {'remove_punct': 0, 'remove_like_num': 1, 'remove_like_email': 0, 'remove_stop': 0, 'normalize_embeddings': 1, 'model_kind': 'paraphrase-albert-small-v2', 'metric': 'euclidean'}. Best is trial 0 with value: 0.7092688679245283.
[I 2023-06-06 18:45:28,007] Trial 1 finished with value: 0.7155738993710692 and parameters: {'remove_punct': 1, 'remove_like_num': 1, 'remove_like_email': 0, 'remove_stop': 0, 'normalize_embeddings': 1, 'model_kind': 'all-MiniLM-L6-v2', 'metric': 'euclidean'}. Best is trial 1 with value: 0.7155738993710692.
[I 2023-06-06 18:45:31,733] Trial 2 finished with value: 0.7368396226415095 and parameters: {'remove_punct': 0, 'remove_like_num': 0, 'remove_like_email': 1, 'remove_stop': 0, 'normalize_embeddings': 0, 'model_kind': 'all-MiniLM-L6-v2', 'metric': 'manhattan'

In [None]:
!pip install plotly

In [2]:
from optuna.visualization import plot_param_importances, plot_parallel_coordinate

In [6]:
plot_param_importances(study).write_html('importances.html')

In [7]:
plot_parallel_coordinate(study).write_html('parallel.html')

In [8]:
study.best_params

{'metric': 'manhattan',
 'model_kind': 'all-distilroberta-v1',
 'normalize_embeddings': 0,
 'remove_like_email': 1,
 'remove_like_num': 0,
 'remove_punct': 1,
 'remove_stop': 1}

In [15]:
train['comment'] = train['comment'].apply(
    lambda x: ' '.join([
        tok.lemma_.lower() for tok in nlp(x)
        if not tok.like_email 
        and not tok.is_punct
        and not tok.is_stop
    ])
)

In [16]:
test['comment'] = test['comment'].apply(
    lambda x: ' '.join([
        tok.lemma_.lower() for tok in nlp(x)
        if not tok.like_email 
        and not tok.is_punct
        and not tok.is_stop
    ])
)

In [17]:
model = SentenceTransformer('all-distilroberta-v1')
train_embeddings = model.encode(train['comment'].values)
test_embeddings = model.encode(test['comment'].values)

clf = NearestCentroid(metric='manhattan')

In [18]:
clf.fit(train_embeddings, train['isHate'])

In [19]:
roc_auc_score(test['isHate'], clf.predict(test_embeddings))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0.7375249500998003