# Fast Text Embeddings + LSTM Head

In [5]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-3.0.0-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.0-py3-none-any.whl (292 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4508432 sha256=8c8ed13da6122d9204882dc1523fc5fb78c5ec51c462176706ea6e2e452efda8
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a513

In [1]:
import pandas as pd
import fasttext
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import class_weight

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

## Load Model for Embeddings

In [6]:
# pretrained english embeddings from facebook's fast text
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gunzip cc.en.300.bin.gz

--2025-08-12 21:03:38--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.25, 13.226.210.15, 13.226.210.111, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2025-08-12 21:04:55 (56.0 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [3]:
# Load pretrained FastText model (takes a few seconds)
ft = fasttext.load_model('cc.en.300.bin')

## Load Data

In [4]:
train = pd.read_csv('comments_train.csv')
test = pd.read_csv('comments_test.csv')

In [5]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,moderation_label
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0


In [6]:
test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,moderation_label
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,0
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,0
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,0
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,0
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,0


## Make Embeddings

In [7]:
def get_embedding(text):
    words = str(text).lower().split()
    vectors = [ft.get_word_vector(w) for w in words]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(ft.get_dimension())

In [8]:
train['ft_embedding'] = train['comment_text'].apply(get_embedding)
test['ft_embedding'] = test['comment_text'].apply(get_embedding)

In [12]:
train['ft_embedding'].head()

Unnamed: 0,ft_embedding
0,"[-0.0016522653, 0.044181224, -0.0013345327, 0...."
1,"[0.0033707179, 0.0063960715, 0.0066807913, 0.0..."
2,"[-0.0003818978, 0.0012663372, 0.00684883, 0.03..."
3,"[-0.0045231674, -0.031273425, 0.005112849, 0.0..."
4,"[0.005956408, 0.042522732, -0.002054709, 0.076..."


In [13]:
test['ft_embedding'].head()

Unnamed: 0,ft_embedding
0,"[0.0098184375, -0.03694679, 0.001696917, 0.033..."
1,"[-0.021565393, -0.011511437, 0.008615208, 0.06..."
2,"[-0.005951238, -0.04057545, 0.016621577, 0.055..."
3,"[-0.009195016, -0.056916747, -0.0020851076, 0...."
4,"[-0.009587349, -0.025252663, 0.004535417, 0.03..."


## NN Model

### Without Class Imbalance

In [10]:
X_train = np.vstack(train['ft_embedding'].values)  # shape: (num_samples, 300)
y_train = train['moderation_label'].values

In [11]:
X_test = np.vstack(test['ft_embedding'].values)
y_test = test['moderation_label'].values

In [17]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

In [19]:
model.fit(X_train,
          y_train,
          epochs = 10,
          batch_size = 32,
          validation_split = 0.1)

Epoch 1/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - accuracy: 0.9285 - loss: 0.2032 - val_accuracy: 0.9481 - val_loss: 0.1438
Epoch 2/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9441 - loss: 0.1517 - val_accuracy: 0.9472 - val_loss: 0.1417
Epoch 3/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.9451 - loss: 0.1487 - val_accuracy: 0.9495 - val_loss: 0.1387
Epoch 4/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9469 - loss: 0.1450 - val_accuracy: 0.9511 - val_loss: 0.1364
Epoch 5/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.9471 - loss: 0.1419 - val_accuracy: 0.9503 - val_loss: 0.1373
Epoch 6/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9495 - loss: 0.1383 - val_accuracy: 0.9497 - val_loss: 0.1353
Epoch 7/10

<keras.src.callbacks.history.History at 0x7ac0d9f18910>

In [20]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy:.4f}')

[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9263 - loss: 0.2028
Test accuracy: 0.9260


In [22]:
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
Confusion Matrix:
[[55318  2417]
 [ 2316  3927]]


In [33]:
report = classification_report(y_test, y_pred, target_names=['Not Toxic', 'Toxic'])
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.96      0.96      0.96     57735
       Toxic       0.62      0.63      0.62      6243

    accuracy                           0.93     63978
   macro avg       0.79      0.79      0.79     63978
weighted avg       0.93      0.93      0.93     63978



### With Class Imbalance Handled

In [12]:
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes = np.array([0, 1]),
                                                 y = y_train)
class_weights_dict = dict(enumerate(class_weights))

In [13]:
class_weights_dict

{0: 0.5565938358935721, 1: 4.917442218798151}

In [14]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [16]:
model.fit(X_train,
          y_train,
          epochs = 10,
          batch_size = 32,
          validation_split = 0.1)

Epoch 1/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 5ms/step - accuracy: 0.9292 - loss: 0.2040 - val_accuracy: 0.9461 - val_loss: 0.1446
Epoch 2/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.9442 - loss: 0.1530 - val_accuracy: 0.9493 - val_loss: 0.1424
Epoch 3/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.9465 - loss: 0.1453 - val_accuracy: 0.9494 - val_loss: 0.1393
Epoch 4/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 0.9475 - loss: 0.1425 - val_accuracy: 0.9506 - val_loss: 0.1369
Epoch 5/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.9480 - loss: 0.1423 - val_accuracy: 0.9510 - val_loss: 0.1350
Epoch 6/10
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.9494 - loss: 0.1394 - val_accuracy: 0.9507 - val_loss: 0.1364
Epoch 7/10

<keras.src.callbacks.history.History at 0x795a26590810>

In [17]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy:.4f}')

[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9216 - loss: 0.2205
Test accuracy: 0.9219


In [18]:
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Confusion Matrix:
[[54947  2788]
 [ 2208  4035]]


In [19]:
report = classification_report(y_test, y_pred, target_names=['Not Toxic', 'Toxic'])
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.96      0.95      0.96     57735
       Toxic       0.59      0.65      0.62      6243

    accuracy                           0.92     63978
   macro avg       0.78      0.80      0.79     63978
weighted avg       0.93      0.92      0.92     63978



Accuracy and precision decreased but recall improved which is important.