# **BiLSTM Model Training**

In [1]:
!pip install torchtext==0.10.0
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 14.9 MB/s eta 0:00:01
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.0 kB/s ta 0:00:0101��█▊              | 459.5 MB 77 kB/s eta 1:19:57��████████████████████████▋   | 742.9 MB 65 kB/s eta 0:22:3108.9 MB 1.3 MB/s eta 0:00:18
[31mERROR: kornia 0.3.1 has requirement torch==1.5.0, but you'll have torch 1.9.0 which is incompatible.[0m
[31mERROR: allennlp 1.0.0 has requirement torch<1.6.0,>=1.5.0, but you'll have torch 1.9.0 which is incompatible.[0m
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.5.1
    Uninstalling torch-1.5.1:
      Successfully uninstalled torch-1.5.1
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.6.0
    Uninstalli

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
from torchtext.legacy import data
import pandas as pd
import spacy
import re
import random
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

nltk.download("stopwords")
spacy_en = spacy.load("en_core_web_sm")
SEED = 32
torch.manual_seed(SEED)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load dataset
df = pd.read_csv("/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv")
df = df[["tweet", "class"]]
df["label"] = df["class"].apply(lambda x: 0 if x == 2 else 1)
df = df[["tweet", "label"]]  # Keep only needed columns


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Device: cuda


In [4]:
STOP_WORDS = set(stopwords.words("english"))

def tokenize_and_clean(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    text = text.lower()
    return [token.lemma_ for token in spacy_en.tokenizer(text)
            if token.text not in STOP_WORDS and token.text.strip() != ""]


In [5]:
TEXT = data.Field(tokenize=tokenize_and_clean, lower=True, batch_first=True, fix_length=100)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

class DataFrameDataset(data.Dataset):
    def __init__(self, df, text_field, label_field, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = [data.Example.fromlist([row['tweet'], row['label']], fields) for _, row in df.iterrows()]
        super().__init__(examples, fields, **kwargs)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)
train_dataset = DataFrameDataset(train_df, TEXT, LABEL)
train_data, val_data = train_dataset.split(split_ratio=0.8, random_state=random.seed(SEED))
test_dataset = DataFrameDataset(test_df, TEXT, LABEL)


In [6]:
TEXT.build_vocab(train_data, min_freq=3, vectors="glove.6B.100d")
LABEL.build_vocab(train_data)

print("Vocab size:", len(TEXT.vocab))
print("Common tokens:", TEXT.vocab.freqs.most_common(10))


.vector_cache/glove.6B.zip: 862MB [02:58, 4.83MB/s]                               
100%|█████████▉| 399999/400000 [00:14<00:00, 27774.50it/s]


Vocab size: 4697
Common tokens: [('bitch', 7167), ('rt', 4881), ('not', 2961), ('hoe', 2741), ('get', 2208), ('like', 1811), ('pussy', 1484), ('fuck', 1471), ('go', 1003), ('ass', 993)]


In [7]:
BATCH_SIZE = 64

train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_dataset),
    batch_size=BATCH_SIZE,
    device=device,
    sort=False
)


In [8]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding.weight.data.copy_(TEXT.vocab.vectors)
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.dropout(torch.cat((lstm_out[:, -1, :self.lstm.hidden_size],
                                      lstm_out[:, 0, self.lstm.hidden_size:]), dim=1))
        return torch.sigmoid(self.fc(out))

model = BiLSTM(len(TEXT.vocab), 100, 128, 1, TEXT.vocab.stoi[TEXT.pad_token])
model.to(device)


BiLSTM(
  (embedding): Embedding(4697, 100, padding_idx=1)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [11]:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCELoss()
criterion.to(device)

def train(model, iterator):
    model.train()
    epoch_loss, all_preds, all_labels = 0, [], []

    for batch in iterator:
        optimizer.zero_grad()
        pred = model(batch.text).squeeze()
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        all_preds += torch.round(pred).cpu().tolist()
        all_labels += batch.label.cpu().tolist()

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    return epoch_loss / len(iterator), acc, f1

def evaluate(model, iterator):
    model.eval()
    epoch_loss, all_preds, all_labels = 0, [], []

    with torch.no_grad():
        for batch in iterator:
            pred = model(batch.text).squeeze()
            loss = criterion(pred, batch.label)
            epoch_loss += loss.item()
            all_preds += torch.round(pred).cpu().tolist()
            all_labels += batch.label.cpu().tolist()

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    return epoch_loss / len(iterator), acc, f1


In [12]:
EPOCHS = 50
for epoch in range(EPOCHS):
    train_loss, train_acc, train_f1 = train(model, train_iterator)
    val_loss, val_acc, val_f1 = evaluate(model, val_iterator)

    print(f"Epoch {epoch+1}")
    print(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
    print(f"Val   Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")


Epoch 1
Train Loss: 0.0764 | Acc: 0.9691 | F1: 0.9080
Val   Loss: 0.1612 | Acc: 0.9415 | F1: 0.8307
Epoch 2
Train Loss: 0.0653 | Acc: 0.9753 | F1: 0.9258
Val   Loss: 0.1810 | Acc: 0.9377 | F1: 0.8113
Epoch 3
Train Loss: 0.0599 | Acc: 0.9782 | F1: 0.9347
Val   Loss: 0.1789 | Acc: 0.9400 | F1: 0.8208
Epoch 4
Train Loss: 0.0518 | Acc: 0.9818 | F1: 0.9452
Val   Loss: 0.2104 | Acc: 0.9415 | F1: 0.8368
Epoch 5
Train Loss: 0.0466 | Acc: 0.9841 | F1: 0.9521
Val   Loss: 0.2033 | Acc: 0.9377 | F1: 0.8139
Epoch 6
Train Loss: 0.0377 | Acc: 0.9875 | F1: 0.9623
Val   Loss: 0.2265 | Acc: 0.9397 | F1: 0.8264
Epoch 7
Train Loss: 0.0362 | Acc: 0.9883 | F1: 0.9648
Val   Loss: 0.2247 | Acc: 0.9339 | F1: 0.8045
Epoch 8
Train Loss: 0.0367 | Acc: 0.9873 | F1: 0.9616
Val   Loss: 0.2116 | Acc: 0.9372 | F1: 0.8109
Epoch 9
Train Loss: 0.0277 | Acc: 0.9907 | F1: 0.9721
Val   Loss: 0.2562 | Acc: 0.9359 | F1: 0.8157
Epoch 10
Train Loss: 0.0334 | Acc: 0.9888 | F1: 0.9662
Val   Loss: 0.2393 | Acc: 0.9367 | F1: 0.8214