In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
random_state = 42
np.random.seed(random_state)

### Dataset

In [3]:
df = pd.read_csv("data/Ethos_Dataset_Binary.csv", sep=";")
df

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0
4,Women are made for making babies and cooking d...,1.0
...,...,...
993,From the midnight sun where the hot springs blow,0.0
994,Don't say I'm not your type,0.0
995,And therefore never send to know for whom the...,0.0
996,And I can't stand another day,0.0


In [4]:
df["isHate"].unique()

array([1.        , 0.98387097, 0.98360656, 0.97826087, 0.97333333,
       0.96666667, 0.95454545, 0.94545455, 0.9375    , 0.90384615,
       0.85714286, 0.8490566 , 0.84615385, 0.83333333, 0.82142857,
       0.75      , 0.72222222, 0.67857143, 0.66666667, 0.60344828,
       0.53061224, 0.5       , 0.4       , 0.33333333, 0.30232558,
       0.296875  , 0.25      , 0.2       , 0.16666667, 0.16071429,
       0.15254237, 0.11111111, 0.10344828, 0.09090909, 0.03896104,
       0.03773585, 0.03174603, 0.03030303, 0.02985075, 0.02631579,
       0.01886792, 0.01639344, 0.        ])

In [5]:
train_size = 0.8
train_df, test_df = train_test_split(
    df, train_size=train_size, random_state=random_state
)
len(train_df), len(test_df)

(798, 200)

In [6]:
n_embs_dim = 300
train_corpora = train_df["comment"].to_list()
train_y = train_df["isHate"].to_numpy()

In [7]:
threshold = 0.5
train_y[train_y < threshold] = 0
train_y[train_y > threshold] = 1
t_size = len(train_y[train_y == threshold])
train_y[train_y == threshold] = np.random.randint(0, 2, size=(t_size))

In [8]:
test_corpora = test_df["comment"].to_list()
vectorizer = TfidfVectorizer(max_features=n_embs_dim)
train_emb = vectorizer.fit_transform(train_corpora).toarray()
test_emb = vectorizer.transform(test_corpora).toarray()
test_y = test_df["isHate"].to_numpy()

In [9]:
test_y[test_y < threshold] = 0
test_y[test_y > threshold] = 1
t_size = len(test_y[test_y == threshold])
test_y[test_y == threshold] = np.random.randint(0, 2, size=(t_size))

### Model Selection

Будем использовать случайный лес<br>
В качестве фичей возьмем эмбеддинги текста на основе `tf-idf` векторизации 

In [10]:
forest = RandomForestClassifier()
forest.fit(train_emb, train_y);

In [11]:
forest.score(test_emb, test_y)

0.62

### Hyperparameter Optimization

In [12]:
import optuna
from optuna.trial import Trial
from dvclive import Live

In [None]:
with Live("experiments/dvc-optuna", save_dvc_exp=True) as live:
    def objective(trial: Trial):
        n_embs_dim = trial.suggest_int("n_embs_dim", 100, 768)
        vectorizer = TfidfVectorizer(max_features=n_embs_dim)
        train_emb = vectorizer.fit_transform(train_corpora).toarray()
        test_emb = vectorizer.transform(test_corpora).toarray()
        n_estimators = trial.suggest_int("n_estimators", 1, 500)
        criterion = trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])
        forest = RandomForestClassifier(
            criterion=criterion, n_estimators=n_estimators, random_state=random_state
        )
        forest.fit(train_emb, train_y);
        score = forest.score(test_emb, test_y)
        live.log_metric("n_embs_dim", n_embs_dim)
        live.log_metric("n_estimators", n_estimators)
        live.log_metric("accuracy", score)
        live.next_step()
        return score
    live.log_param("seed", random_state)
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(sampler=sampler, direction='maximize')
    study.optimize(objective, n_trials=50)
    best_params, best_value = study.best_params, study.best_value
print(best_params, best_value)

### Model evaluation

In [None]:
vectorizer = TfidfVectorizer(max_features=best_params["n_embs_dim"])
train_emb = vectorizer.fit_transform(train_corpora).toarray()
test_emb = vectorizer.transform(test_corpora).toarray()
forest = RandomForestClassifier(
    criterion=best_params["criterion"],
    n_estimators=best_params["n_estimators"],
    random_state=random_state
)
forest.fit(train_emb, train_y)
forest.score(test_emb, test_y)

0.69

In [None]:
test_y_pred = forest.predict(test_emb)
pd.DataFrame(classification_report(test_y, test_y_pred, output_dict=True))

Unnamed: 0,0.0,1.0,accuracy,macro avg,weighted avg
precision,0.69863,0.666667,0.69,0.682648,0.685845
recall,0.85,0.45,0.69,0.65,0.69
f1-score,0.766917,0.537313,0.69,0.652115,0.675076
support,120.0,80.0,0.69,200.0,200.0


In [None]:
train_y_pred = forest.predict(train_emb)
pd.DataFrame(classification_report(train_y, train_y_pred, output_dict=True))

Unnamed: 0,0.0,1.0,accuracy,macro avg,weighted avg
precision,0.997942,1.0,0.998747,0.998971,0.998749
recall,1.0,0.996805,0.998747,0.998403,0.998747
f1-score,0.99897,0.9984,0.998747,0.998685,0.998747
support,485.0,313.0,0.998747,798.0,798.0
