In [1]:
from google.colab import drive
# Attach Google Drive to the Colab runtime; the dataset CSVs are read from
# paths under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

In [3]:
# Load train and validation datasets.
# Both splits are read the same way, and any row that contains a missing value
# in any column is discarded.
# NOTE(review): with pandas' default NA parsing, literal cells such as "NA",
# "null" or "nan" are read as missing and would be dropped here as well --
# confirm that this is intended for the WORD column.
train_df, validation_df = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (
        '/content/drive/MyDrive/seq2seq/train_dataset.csv',
        '/content/drive/MyDrive/seq2seq/validation_dataset.csv',
    )
)

In [4]:
# Extract text and labels: pull the model inputs ('WORD') and the targets
# ('LABEL') out of each frame as plain Python lists.
train_texts = train_df['WORD'].tolist()
train_labels = train_df['LABEL'].tolist()
validation_texts = validation_df['WORD'].tolist()
validation_labels = validation_df['LABEL'].tolist()

In [5]:
# Encode labels: the label -> integer mapping is learned from the training
# labels only and then applied unchanged to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
train_labels_encoded = label_encoder.transform(train_labels)
validation_labels_encoded = label_encoder.transform(validation_labels)

In [6]:
# Parameters
# batch_size: examples per DataLoader batch.
# max_length: every encoded example is truncated/padded to this many tokens.
batch_size, max_length = 8, 128
# One output class per distinct label seen when fitting the encoder.
num_classes = len(label_encoder.classes_)

In [7]:
# Load tokenizer and model from the same pretrained checkpoint.
bert_model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# The classification head is sized to the number of encoded label classes.
model = BertForSequenceClassification.from_pretrained(
    bert_model_name,
    num_labels=num_classes,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Create dataset function
def create_dataset(texts, labels, tokenizer, max_length):
    """Tokenize ``texts`` one by one and pair each encoding with its label.

    Args:
        texts: Iterable of raw inputs; each item is converted with ``str()``
            before tokenization.
        labels: Iterable of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_length, return_tensors='pt')``
            that returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors carrying a leading batch axis of
            size 1.
        max_length: Length every example is truncated/padded to.

    Returns:
        A list of ``(input_ids, attention_mask, label)`` tuples. The first two
        entries are the tokenizer tensors with their leading batch axis
        removed; ``label`` is a 0-d ``torch.long`` tensor. Iteration stops at
        the shorter of ``texts`` and ``labels`` (``zip`` semantics).
    """
    dataset = []
    for text, label in zip(texts, labels):
        encoding = tokenizer(
            str(text),
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )
        dataset.append((
            # squeeze(0) removes only the batch axis. A bare squeeze() would
            # also drop the sequence axis when max_length == 1, producing
            # 0-d tensors that no longer collate to (batch, seq_len).
            encoding['input_ids'].squeeze(0),
            encoding['attention_mask'].squeeze(0),
            torch.tensor(label, dtype=torch.long),
        ))
    return dataset

# Tokenize both splits up front with the shared tokenizer and sequence length.
train_dataset, validation_dataset = (
    create_dataset(split_texts, split_labels, tokenizer, max_length)
    for split_texts, split_labels in (
        (train_texts, train_labels_encoded),
        (validation_texts, validation_labels_encoded),
    )
)

# Only the training batches are shuffled; the validation loader uses the
# DataLoader default (no shuffling).
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(dataset=validation_dataset, batch_size=batch_size)

In [9]:
# Set up training
# `criterion` is kept so the name stays defined, but the training loop reads
# the loss the model computes itself when `labels` are passed to it.
criterion = nn.CrossEntropyLoss()
# Use PyTorch's AdamW: the `AdamW` class shipped with transformers is
# deprecated and has been removed from recent releases. `eps` and
# `weight_decay` are set explicitly to the values the transformers
# implementation used by default (PyTorch defaults differ: eps=1e-8,
# weight_decay=0.01), so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [10]:
# Training loop
# Fine-tunes the model for `num_epochs` passes over `train_loader`, showing the
# running mean loss and running accuracy of the current epoch on the bar.
num_epochs = 5
print("Start training")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0

    # Label each bar with its epoch so the per-epoch summaries can be told apart.
    with tqdm(train_loader, unit="batch", desc=f"Epoch {epoch + 1}/{num_epochs}") as t:
        # Count batches explicitly. While iterating, tqdm only syncs `t.n` when
        # it refreshes the display (and never when the bar is disabled), so
        # dividing by `t.n + 1` can report a wrong running average.
        for batches_seen, batch in enumerate(t, start=1):
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            # Passing `labels` makes the model return its own loss alongside the logits.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()

            total_loss += loss.item()
            predicted = logits.argmax(dim=1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

            t.set_postfix({'loss': total_loss / batches_seen, 'accuracy': correct_train / total_train})

Start training


100%|██████████| 4139/4139 [14:46<00:00,  4.67batch/s, loss=0.484, accuracy=0.875]
100%|██████████| 4139/4139 [14:46<00:00,  4.67batch/s, loss=0.33, accuracy=0.923]
100%|██████████| 4139/4139 [14:46<00:00,  4.67batch/s, loss=0.268, accuracy=0.939]
100%|██████████| 4139/4139 [14:46<00:00,  4.67batch/s, loss=0.223, accuracy=0.951]
100%|██████████| 4139/4139 [14:46<00:00,  4.67batch/s, loss=0.198, accuracy=0.959]


In [11]:
# Evaluation
# Run the model over the validation loader without gradient tracking and
# collect the true and predicted class ids for the metrics computed later.
model.eval()
y_true, y_pred = [], []

with torch.no_grad(), tqdm(validation_loader, unit="batch") as t:
    for batch in t:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        predicted = torch.max(logits, 1).indices

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

100%|██████████| 1768/1768 [01:36<00:00, 18.34batch/s]


In [12]:
# Metrics
# Plain accuracy, plus precision / recall / F1 computed with
# average='weighted' over the collected validation predictions.
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    score_fn(y_true, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8980
Precision: 0.8956
Recall: 0.8980
F1 Score: 0.8957
