Data source: Toxic Comment Classification Challenge on Kaggle — see this [EDA notebook that uses the dataset](https://www.kaggle.com/code/akkefa/eda-toxic-comment-classification-challenge)

### Text Preprocessing

In [1]:
import pandas as pd

# Load your dataset (assumed to be in CSV format).
# "train.csv" is read relative to the current working directory. The cells below expect it
# to provide a `comment_text` column plus the six label columns named in `label_columns`,
# where a value of 1 marks the label as present.
data = pd.read_csv("train.csv")

# The six target classes; each one is a separate column in the data
label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]


def convert_labels(row):
    """Build the FastText label prefix for one row of label flags.

    Args:
        row: iterable of flags, positionally aligned with ``label_columns``.

    Returns:
        A space-separated string such as ``"__label__toxic __label__obscene"``
        containing one entry per flag equal to 1, or ``""`` when no flag is set.
    """
    active = []
    for name, flag in zip(label_columns, row):
        if flag == 1:
            active.append("__label__" + name)
    return " ".join(active)

# Apply the conversion and create the FastText formatted training data
data['labels'] = data[label_columns].apply(convert_labels, axis=1)

# Write one example per line in FastText supervised format: "<labels> <text>".
# The encoding is pinned to UTF-8 so the output does not depend on the platform's
# default locale (pandas reads the CSV as UTF-8 by default).
# NOTE(review): rows without any positive label are skipped, so train.txt only
# contains comments that carry at least one label; no "clean" examples are written.
with open("train.txt", "w", encoding="utf-8") as f:
    # Iterate over the two needed columns directly instead of building a Series per row.
    for labels, text in zip(data['labels'], data['comment_text']):
        if labels:
            text = text.replace("\n", " ")  # keep each example on a single line
            f.write(f"{labels} {text}\n")


In [16]:
import pandas
csv_file = 'test.csv'
txt_file = 'test.txt'

# Only the raw comment text is exported; no label prefix is written for the test data.
comments = pd.read_csv(csv_file)['comment_text']

# Series.replace("\n", " ") would only swap cells whose *entire* value is "\n";
# .str.replace works on substrings, matching the newline handling used for train.txt.
# Missing comments become empty lines so row order stays aligned with test.csv.
text = comments.fillna("").str.replace("\n", " ", regex=False)

# Write plain text, one comment per line (to_csv would add CSV quoting around
# comments containing commas or quotes). The `with` block closes the file itself.
with open(txt_file, "w", encoding="utf-8") as my_output_file:
    for line in text:
        my_output_file.write(f"{line}\n")

In [2]:
! wc train.txt

  16225  890406 5515291 train.txt


### Text Classification

In [3]:
# Positional split of train.txt: the first 11225 lines go to the training file and the
# last 5000 lines to the validation file (11225 + 5000 = 16225, the line count reported by `wc`).
# NOTE(review): the line counts are hard-coded and the file is not shuffled first, so
# both numbers must be updated by hand if train.txt changes.
!head -n 11225 train.txt > hspeech.train
!tail -n 5000 train.txt > hspeech.valid

### Initial Model Creation

In [4]:
import fasttext

# Train a FastText supervised classifier on the training split with default settings.
# NOTE(review): no `loss` is passed here, so the library default is used. The prediction
# cell below prints probabilities that sum to ~1 across the six labels, which suggests a
# softmax-style (single-label) objective rather than independent per-label scores — confirm
# against the fastText docs before describing this model as multilabel.
model = fasttext.train_supervised(input="hspeech.train")

# Save the model to disk so it can be reloaded later with fasttext.load_model
model.save_model("fasttext_multilabel.bin")

Read 0M words
Number of words:  61479
Number of labels: 6
Progress: 100.0% words/sec/thread: 5874837 lr:  0.000000 avg.loss:  1.178751 ETA:   0h 0m 0s


In [5]:
# A sample sentence to run through the classifier
comment = "I hate you and you're awful!"

# Request every label together with its score (k=-1 means "no limit on the number of labels")
labels, probabilities = model.predict(comment, k=-1)

# Emit one "label: probability" line per returned prediction
for label, prob in zip(labels, probabilities):
    print(label, prob, sep=": ")

__label__toxic: 0.7589246034622192
__label__obscene: 0.12835420668125153
__label__insult: 0.10290884226560593
__label__identity_hate: 0.005275001749396324
__label__severe_toxic: 0.0034729638136923313
__label__threat: 0.001124377828091383


In [5]:
model.test("hspeech.valid", k=1)

(5000, 0.9382, 0.43195211786372006)

### Using subword features

In [33]:
# Retrain on the same training split with subword features enabled: minn=2 / maxn=5 set the
# character n-gram length range and dim=300 the vector size (see fastText docs for the options).
# This rebinds `model`, replacing the classifier trained earlier.
model = fasttext.train_supervised(input="hspeech.train",minn=2, maxn=5, dim=300)

Read 0M words
Number of words:  61479
Number of labels: 6
Progress: 100.0% words/sec/thread:  289259 lr:  0.000000 avg.loss:  1.242902 ETA:   0h 0m 0s


In [8]:
model.test("hspeech.valid", k=1)

(5000, 0.9386, 0.4321362799263352)

In [34]:
def evaluate(model, path='hspeech.valid', k=1):
    """Run ``model.test`` on a labelled file and return the metrics as a dict.

    Args:
        model: object exposing ``test(path, k=...)`` that returns a
            ``(N, precision, recall)`` triple, e.g. a trained FastText classifier.
        path: FastText-formatted file to evaluate on. Defaults to the
            validation split used throughout this notebook.
        k: number of top predictions considered per example. Defaults to 1.

    Returns:
        ``{"N": ..., "precision": ..., "recall": ...}`` built from the triple.
    """
    n_examples, precision, recall = model.test(path, k=k)
    return {"N": n_examples, "precision": precision, "recall": recall}

In [39]:
# NOTE(review): this cell's execution count (In [39]) is later than the hierarchical-softmax
# training cell (In [38]), and its output is identical to the one shown in the hierarchical
# softmax section. The numbers below therefore presumably belong to that model, not to the
# subword model trained in In [33] — restart the kernel and run all cells in order to confirm.
evaluate(model)

{'N': 5000, 'precision': 0.9404, 'recall': 0.43296500920810316}

### Using hierarchical softmax

In [38]:
# Same subword settings as the previous training cell, but with loss="hs"
# (hierarchical softmax, per this section's heading). Rebinds `model` again.
model = fasttext.train_supervised(input="hspeech.train",minn=2, maxn=5, dim=300,loss="hs")

Read 0M words
Number of words:  61479
Number of labels: 6
Progress: 100.0% words/sec/thread:  299334 lr:  0.000000 avg.loss:  1.218488 ETA:   0h 0m 0s


In [40]:
evaluate(model)

{'N': 5000, 'precision': 0.9404, 'recall': 0.43296500920810316}

In [9]:
result=model.test("hspeech.valid", k=1)

In [17]:
# Print the (N, precision, recall) triple stored in `result`.
# NOTE(review): `result` comes from a cell with an early execution count (In [9]), and the
# values printed below equal the first subword-model test output rather than the
# evaluate(model) output of this section — this looks stale. Consider evaluate(model) instead.
print("N",result[0])
print("precision",result[1])
print("recall",result[2])

N 5000
precision 0.9386
recall 0.4321362799263352


### Using multiple features for classification

In [41]:
# Combine several training options: learning rate 0.5, 25 epochs, word bigrams (wordNgrams=2),
# 200000 hash buckets, 50-dimensional vectors and loss='ova' (one-vs-all).
# NOTE(review): the evaluate(model) call that follows still scores only the top-1 label (k=1);
# for a one-vs-all model, check the fastText docs on using k=-1 with a threshold — confirm.
model = fasttext.train_supervised(input="hspeech.train", lr=0.5, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='ova')

Read 0M words
Number of words:  61479
Number of labels: 6
Progress: 100.0% words/sec/thread: 4891737 lr:  0.000000 avg.loss:  0.366729 ETA:   0h 0m 0s


In [42]:
evaluate(model)

{'N': 5000, 'precision': 0.9082, 'recall': 0.4181399631675875}

## Word Representation

In [22]:
model.get_word_vector("were")

array([ 0.33102927, -0.00505708, -0.06864711,  0.06713303, -0.26286536,
       -0.17741205, -0.16304077, -0.4402635 ,  0.4674509 , -0.06525651,
       -0.40054867, -0.12056362, -0.067091  , -0.33751154, -0.5043269 ,
        0.03185381, -0.2323712 , -0.02592087,  0.23117666, -0.01893312,
        0.42657536,  0.20391884, -0.17958076, -0.0933025 , -0.05848107,
        0.05749403,  0.24501762, -0.3789084 , -0.18331921,  0.15757018,
        0.10749961,  0.4476397 , -0.04886856, -0.2935044 ,  0.00423955,
        0.1513073 , -0.1794089 ,  0.024643  , -0.1008096 , -0.03077056,
        0.33447737,  0.280899  ,  0.0217999 , -0.092246  ,  0.04432036,
       -0.26113498,  0.28969112,  0.13137189,  0.28717273,  0.34986067],
      dtype=float32)

In [24]:
# Reload the model that the initial training cell saved as "fasttext_multilabel.bin".
# NOTE(review): the very next cell rebinds `model` to a newly trained unsupervised model,
# so the model loaded here is not used by any later cell shown in this notebook.
model=fasttext.load_model("fasttext_multilabel.bin")

In [25]:
# Train unsupervised CBOW word vectors on the training split; `model` now holds word
# vectors instead of a classifier.
# NOTE(review): hspeech.train is the supervised training file, so each line still starts with
# its __label__ prefixes, and the text was written without lowercasing or punctuation
# stripping — presumably why neighbours such as 'dumb,' and 'dumb.' show up as distinct tokens.
model = fasttext.train_unsupervised("hspeech.train", "cbow")

Read 0M words
Number of words:  8013
Number of labels: 6
Progress: 100.0% words/sec/thread:  370445 lr:  0.000000 avg.loss:  2.103318 ETA:   0h 0m 0s


In [29]:
model.get_nearest_neighbors("dumb")

[(0.9932989478111267, 'dumb,'),
 (0.9895120859146118, 'dumb.'),
 (0.9768179655075073, 'loser.'),
 (0.9762691855430603, 'lol.'),
 (0.9748433232307434, 'ugly,'),
 (0.9736857414245605, 'cunts.'),
 (0.9719087481498718, 'job'),
 (0.9717002511024475, 'slut.'),
 (0.9713154435157776, 'boy,'),
 (0.9704710245132446, 'punk')]