In [1]:
import sys
sys.path.insert(0, '..')
import os
import fasttext
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from src.data_collection import get_data

In [2]:
# Folder (resolved against the current working directory) where the
# fastText-formatted train/test files are written and read back.
DATA_PATH = os.path.join(os.pardir, "data", "fasttext")

In [3]:
# Materialise the dataset as an array of rows; save_data_to_fasttext later
# unpacks each row as a (text, category) pair.
data = get_data().to_numpy()
# Pin the shuffle seed so a fresh "Restart & Run All" reproduces the same
# 80/20 partition (and therefore comparable metrics) instead of drawing a new
# random split on every run.
train_data, test_data = train_test_split(data, train_size=0.80, random_state=42)

Fetching data...


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd
Reusing dataset parquet (/Users/badr/.cache/huggingface/datasets/parquet/ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)
100%|██████████| 1/1 [00:00<00:00, 105.14it/s]


Processing...
Done!


In [4]:
def save_data_to_fasttext(data, saving_path):
    """Write labelled examples to ``saving_path`` in fastText's supervised format.

    Every example is emitted as exactly one line: ``__label__no <text>`` when
    its category equals 0 and ``__label__yes <text>`` otherwise.

    Args:
        data: Iterable of ``(text, category)`` pairs; ``text`` must be a ``str``.
        saving_path: Destination file. It is overwritten if it already exists,
            and missing parent directories are created.
    """
    parent_dir = os.path.dirname(saving_path)
    if parent_dir:
        # A fresh checkout may not have the output folder yet.
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8: the platform default codec may be unable to encode
    # emoji / non-Latin text, and the file is meant to be consumed by fastText.
    with open(saving_path, "w", encoding="utf-8") as f:
        for text, category in data:
            label = "__label__no" if category == 0 else "__label__yes"
            # The format is line-oriented, so a line break inside the text
            # would split one example in two and leave the tail without a
            # label. Flatten any embedded line break into a single space.
            single_line = (
                text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
            )
            f.write(label + " " + single_line + "\n")


def _gold_and_predicted_labels(model, test_path):
    """Read a fastText-formatted file and pair gold labels with predictions.

    Each non-blank line is split at its first space: the part before it is the
    gold label, the remainder (stripped of surrounding whitespace) is the text
    passed to ``model.predict``. The first label of the prediction result
    (``model.predict(text)[0][0]``) is recorded as the system label.

    Args:
        model: Object exposing ``predict(text)``.
        test_path: Path of the file to evaluate on.

    Returns:
        A ``(gold, system)`` tuple of two equally long lists of label strings.
    """
    gold = []
    system = []

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(test_path, encoding="utf-8") as f:
        for raw_line in f:
            # Drop the line terminator first so that a line holding only a
            # label does not end up with "\n" glued to the gold label.
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                # Blank lines carry no example; skip them.
                continue

            label, _, text = line.partition(" ")
            gold.append(label)
            system.append(model.predict(text.strip())[0][0])

    return gold, system


def f1_score_fasttext(model, test_path):
    """Compute the F1 score of ``model`` on a fastText-formatted file.

    Args:
        model: Object exposing ``predict(text)``.
        test_path: Path of the labelled evaluation file.

    Returns:
        The result of ``f1_score`` with ``"__label__yes"`` as the positive label.
    """
    gold, system = _gold_and_predicted_labels(model, test_path)
    return f1_score(gold, system, pos_label="__label__yes")


def classification_report_fasttext(model, test_path):
    """Build a per-label classification report for ``model`` on a fastText file.

    Args:
        model: Object exposing ``predict(text)``.
        test_path: Path of the labelled evaluation file.

    Returns:
        The result of ``classification_report`` for the gold and predicted labels.
    """
    gold, system = _gold_and_predicted_labels(model, test_path)
    return classification_report(gold, system)

In [5]:
# Persist both partitions as fastText-formatted files under DATA_PATH
# (training split first, then the held-out split).
for split_rows, file_name in ((train_data, "data.train"), (test_data, "data.test")):
    save_data_to_fasttext(split_rows, os.path.join(DATA_PATH, file_name))

In [6]:
# Hyperparameters for the supervised fastText classifier, gathered in one
# place so they are easy to tweak.
# NOTE(review): `ws` is a context-window size and is presumably only relevant
# to the unsupervised objectives — confirm whether it affects train_supervised.
train_params = dict(epoch=50, lr=0.01, wordNgrams=1, dim=200, ws=10)

model = fasttext.train_supervised(
    input=os.path.join(DATA_PATH, "data.train"),
    **train_params,
)


Read 0M words
Number of words:  83227
Number of labels: 2
Progress: 100.0% words/sec/thread: 2562774 lr:  0.000000 avg.loss:  0.351826 ETA:   0h 0m 0s


In [7]:
# Evaluate on the held-out file; element 1 of the result is used as precision.
# NOTE(review): presumably model.test returns (n_examples, precision, recall) — confirm.
test_metrics = model.test(os.path.join(DATA_PATH, "data.test"))
precision = test_metrics[1]
precision

0.7545810691267535

In [8]:
# F1 on the held-out file, with "__label__yes" treated as the positive label.
test_file = os.path.join(DATA_PATH, "data.test")
print(f1_score_fasttext(model, test_file))

0.5776424532405392


In [9]:
# Per-label precision / recall / F1 breakdown on the held-out file.
test_file = os.path.join(DATA_PATH, "data.test")
report = classification_report_fasttext(model, test_file)
print(report)

              precision    recall  f1-score   support

 __label__no       0.78      0.88      0.83      5270
__label__yes       0.68      0.50      0.58      2643

    accuracy                           0.75      7913
   macro avg       0.73      0.69      0.70      7913
weighted avg       0.75      0.75      0.74      7913

