In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW  # <--- Import AdamW from torch.optim
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import f1_score


In [3]:
df = pd.read_csv('smile-annotations-final.csv', names=['id', 'text', 'category'])
df.set_index('id', inplace=True)

In [4]:
df = df[~df.category.str.contains('\|')]
df = df[df.category != 'nocode']

  df = df[~df.category.str.contains('\|')]


In [5]:
label_dict = {label: i for i, label in enumerate(df.category.unique())}
df['label'] = df.category.replace(label_dict)


  df['label'] = df.category.replace(label_dict)


In [6]:
train_idx, val_idx = train_test_split(df.index, test_size=0.15, stratify=df.label, random_state=17)
df.loc[train_idx, 'data_type'] = 'train'
df.loc[val_idx, 'data_type'] = 'val'

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
def encode_data(texts):
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors='pt'
    )

In [9]:
train_texts = df[df.data_type == 'train'].text.values
val_texts = df[df.data_type == 'val'].text.values

train_encodings = encode_data(train_texts)
val_encodings = encode_data(val_texts)

train_labels = torch.tensor(df[df.data_type == 'train'].label.values)
val_labels = torch.tensor(df[df.data_type == 'val'].label.values)

In [10]:
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)

batch_size = 8

In [11]:
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [12]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_dict))
model = model.to('cpu')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3
num_training_steps = epochs * len(train_loader)

scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [14]:
def compute_f1(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [15]:
device = torch.device('cpu')
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    loop = tqdm(train_loader, leave=True)
    total_loss = 0

    for batch in loop:
        batch = [item.to(device) for item in batch]
        input_ids, attention_mask, labels = batch

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        loop.set_description(f"Loss {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_preds = []
    val_labels_list = []

    with torch.no_grad():
        for batch in val_loader:
            batch = [item.to(device) for item in batch]
            input_ids, attention_mask, labels = batch

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            val_preds.append(logits.detach().cpu().numpy())
            val_labels_list.append(labels.detach().cpu().numpy())

    val_preds = np.concatenate(val_preds, axis=0)
    val_labels = np.concatenate(val_labels_list, axis=0)

    f1 = compute_f1(val_preds, val_labels)
    print(f"Validation F1 Score: {f1:.4f}")

    model.train()

Epoch 1/3


  0%|          | 0/158 [00:00<?, ?it/s]

Average training loss: 0.7713
Validation F1 Score: 0.7876
Epoch 2/3


  0%|          | 0/158 [00:00<?, ?it/s]

Average training loss: 0.4035
Validation F1 Score: 0.8452
Epoch 3/3


  0%|          | 0/158 [00:00<?, ?it/s]

Average training loss: 0.2574
Validation F1 Score: 0.8545


In [17]:
# Save the model
model_save_path = "bert_sentiment_model.pt"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Save the tokenizer
tokenizer.save_pretrained("bert_tokenizer/")
print("Tokenizer saved to bert_tokenizer/")


Model saved to bert_sentiment_model.pt
Tokenizer saved to bert_tokenizer/
