<a href="https://colab.research.google.com/github/reginafeles/transformer/blob/main/model/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import re
from collections import Counter
from tqdm import tqdm

In [None]:
# Load the raw tweet dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="cyberbullying_tweets.csv")
# Show the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [None]:
# Alphabetically ordered class names give a stable name -> integer id mapping.
class_names = sorted(df['cyberbullying_type'].unique())
class_to_idx = dict(zip(class_names, range(len(class_names))))
num_classes = len(class_to_idx)
# Attach the integer target next to the original string label.
df['label'] = df['cyberbullying_type'].map(class_to_idx)

In [None]:
# Preview again to confirm the integer `label` column was added alongside `cyberbullying_type`.
df.head()

Unnamed: 0,tweet_text,cyberbullying_type,label
0,"In other words #katandandre, your food was cra...",not_cyberbullying,3
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,3
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,3
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,3
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,3


In [None]:
def tokenize(text):
    """Split ``text`` into lowercase alphanumeric tokens.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted (not replaced by a space, so ``don't`` -> ``dont``),
    and the remainder is split on runs of whitespace.

    Args:
        text: Input string.

    Returns:
        List of token strings; empty if nothing survives the filtering.
    """
    cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
    return cleaned.split()

In [None]:
# Flatten every tweet into one long token list for frequency counting.
all_tokens = [tok for tweet in df['tweet_text'] for tok in tokenize(tweet)]
# Ids 0-2 are reserved for the special tokens; the 10000 most frequent corpus
# tokens follow, numbered from 3 in descending frequency order.
vocab = {'<PAD>': 0, '<UNK>': 1, '<CLS>': 2}
vocab.update(
    (word, idx)
    for idx, (word, _count) in enumerate(Counter(all_tokens).most_common(10000), start=3)
)

In [None]:
def encode(text, max_len=32):
    """Convert ``text`` into a fixed-length list of vocabulary ids.

    A ``<CLS>`` token is prepended, the text is tokenized with ``tokenize``,
    tokens missing from ``vocab`` map to the ``<UNK>`` id, and the result is
    truncated or right-padded with the ``<PAD>`` id to ``max_len`` entries.

    Args:
        text: Input string.
        max_len: Target length of the returned list.

    Returns:
        List of integer token ids.
    """
    pieces = ['<CLS>', *tokenize(text)]
    ids = [vocab.get(piece, vocab['<UNK>']) for piece in pieces]
    # Slicing is a no-op for short sequences, so truncate first and pad after.
    ids = ids[:max_len]
    shortfall = max_len - len(ids)
    if shortfall > 0:
        ids += [vocab['<PAD>']] * shortfall
    return ids


In [None]:
class BullyDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Texts are kept as strings and encoded with ``encode`` on every access.
    """

    def __init__(self, texts, labels):
        """Store the parallel ``texts`` and ``labels`` sequences as given."""
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_id_tensor, label_tensor)`` for the sample at ``idx``."""
        token_ids = torch.tensor(encode(self.texts[idx]))
        target = torch.tensor(self.labels[idx])
        return token_ids, target

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['tweet_text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

train_dataset, val_dataset = (
    BullyDataset(train_texts, train_labels),
    BullyDataset(val_texts, val_labels),
)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [None]:
class TransformerClassifier(nn.Module):
    """Transformer-encoder text classifier that reads the first sequence position.

    Token ids are embedded, passed through a stack of
    ``nn.TransformerEncoderLayer`` blocks, and the encoder output at sequence
    position 0 (where ``encode`` places ``<CLS>``) is projected to
    ``num_classes`` logits.

    Padding positions are hidden from self-attention via
    ``src_key_padding_mask`` so that the logits do not depend on how much
    padding a sequence carries.

    Args:
        vocab_size: Number of rows in the embedding table.
        num_classes: Size of the output logit vector.
        d_model: Embedding / encoder width.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward sub-block.
        pad_idx: Token id treated as padding (0 is the ``<PAD>`` id in this
            notebook's vocabulary). Pass ``None`` to disable masking and get
            the previous unmasked behaviour.

    NOTE(review): the embeddings are fed to the encoder without any positional
    encoding, so the model has no token-order signal. Adding one would change
    the parameter set / state_dict, so it is intentionally not done here.
    """

    def __init__(self, vocab_size, num_classes, d_model=64, nhead=4, num_layers=2,
                 dim_feedforward=128, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor indexed as ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.

        Note:
            With masking enabled every row should contain at least one
            non-padding token (``encode`` guarantees this through ``<CLS>``);
            a fully padded row leaves attention with nothing to attend to.
        """
        # True marks positions attention must ignore; shape (batch, seq_len).
        pad_mask = x.eq(self.pad_idx) if self.pad_idx is not None else None
        # The encoder layers are built with the default sequence-first layout,
        # so move the batch axis to position 1: (seq_len, batch, d_model).
        emb = self.embedding(x).permute(1, 0, 2)
        encoded = self.encoder(emb, src_key_padding_mask=pad_mask)
        # Index 0 along the sequence axis is the <CLS> position.
        return self.classifier(encoded[0])

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Build the classifier for the current vocabulary and label set, then move it to the device.
model = TransformerClassifier(vocab_size=len(vocab), num_classes=num_classes)
model = model.to(device)
# Adam with a 1e-3 learning rate, trained against cross-entropy on the logits.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()



In [None]:
def evaluate(loader):
    """Compute the accuracy of the global ``model`` over ``loader``.

    Switches ``model`` to eval mode (and leaves it there), runs every batch
    without gradient tracking, and compares the arg-max prediction with the
    target.

    Args:
        loader: Iterable yielding ``(inputs, targets)`` tensor pairs.

    Returns:
        Fraction of correctly classified samples as a float.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()
    n_hits = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            predicted = model(inputs).argmax(dim=1)
            n_hits += (predicted == targets).sum().item()
            n_seen += targets.size(0)
    return n_hits / n_seen

In [None]:
# Train for 5 epochs. After each epoch report the mean per-batch training
# loss (not the raw sum, which grows with the number of batches and is not
# comparable across batch sizes or dataset sizes) and the validation accuracy.
for epoch in range(5):
    model.train()  # evaluate() switches the model to eval mode, so re-enable training mode
    total_loss = 0.0
    num_batches = 0
    for x, y in tqdm(train_loader):
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = loss_fn(logits, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError here.
    avg_loss = total_loss / max(num_batches, 1)
    val_acc = evaluate(val_loader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_loss:.4f}, Val Accuracy: {val_acc:.4f}")

100%|██████████| 1193/1193 [00:57<00:00, 20.74it/s]


Epoch 1, Train Loss: 766.5009, Val Accuracy: 0.7957


100%|██████████| 1193/1193 [00:53<00:00, 22.51it/s]


Epoch 2, Train Loss: 557.0082, Val Accuracy: 0.8046


100%|██████████| 1193/1193 [00:52<00:00, 22.94it/s]


Epoch 3, Train Loss: 492.3482, Val Accuracy: 0.8212


100%|██████████| 1193/1193 [00:51<00:00, 22.95it/s]


Epoch 4, Train Loss: 449.9923, Val Accuracy: 0.8168


100%|██████████| 1193/1193 [00:55<00:00, 21.61it/s]


Epoch 5, Train Loss: 415.2251, Val Accuracy: 0.8218


In [None]:
# Inverse of class_to_idx: integer id -> class name.
idx_to_class = {index: label for label, index in class_to_idx.items()}

def predict(text):
    """Classify a single text with the global ``model``.

    The text is encoded with ``encode``, run through the model as a batch of
    one in eval mode without gradient tracking, and the logits are turned
    into probabilities with a softmax over the class axis.

    Args:
        text: Raw input string.

    Returns:
        Tuple ``(class_name, confidence)`` where ``confidence`` is the softmax
        probability of the predicted class as a Python float.
    """
    model.eval()
    batch = torch.tensor(encode(text)).unsqueeze(0).to(device)
    with torch.no_grad():
        probs = F.softmax(model(batch), dim=1)
        pred_idx = probs.argmax(dim=1).item()
        confidence = probs[0, pred_idx].item()
    return idx_to_class[pred_idx], confidence

In [None]:
# Interactive demo: read one text from the user, classify it with predict(),
# and print the predicted class name followed by its softmax confidence
# formatted to two decimals.
# The prompt and the output label are in Russian ("Enter text to classify" / "class").
text = input("Введите текст для классификации")
label, conf = predict(text)
print(f"класс: {label} ({conf:.2f})")


In [None]:
# Persist everything needed to rebuild the classifier for inference:
# the weights, the token -> id vocabulary, and the class-name -> id mapping.
torch.save(
    dict(
        model_state_dict=model.state_dict(),
        vocab=vocab,
        class_to_idx=class_to_idx,
    ),
    'transformer_classifier.pt',
)

NameError: name 'torch' is not defined