In [1]:
# Mount Google Drive into the Colab filesystem so files under
# /content/drive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Annotation CSV on the mounted Drive. The file has no header row, so
# pandas is told not to treat the first line as column names.
file_path = "/content/drive/MyDrive/dl_course/lesson-21/data/smile-annotations-final.csv"
df = pd.read_csv(filepath_or_buffer=file_path, header=None)

In [None]:
# Name the three positional columns of the header-less CSV.
column_names = ["id", "text", "label"]
df.columns = column_names

In [None]:
# Labels that are excluded from training: the two "no usable annotation"
# markers plus every multi-label combination (joined with "|").
# (A bare `df['label'].drop_duplicates()` used to precede this; its result was
# discarded because it was not the cell's last expression, so it was removed.)
rm_values = ["nocode", "not-relevant", "disgust|angry", "happy|surprise", "happy|sad", "sad|disgust", "sad|angry", "sad|disgust|angry"]

# Keep only rows whose label is not in the exclusion list.
df = df[~df['label'].isin(rm_values)]

# Show the class distribution of what remains.
df['label'].value_counts()

In [None]:
# Distinct labels in order of first appearance.
labels = [*df["label"].drop_duplicates()]

# index -> label and label -> index lookup tables.
itos = dict(enumerate(labels))
stoi = {name: idx for idx, name in enumerate(labels)}
stoi

In [None]:
# Raw texts as a plain Python list (the tokenizer is fed this list later).
X = df['text'].to_list()

# Integer class ids, obtained by looking each label up in `stoi`.
encoded_labels = df["label"].apply(stoi.__getitem__)
Y = encoded_labels.values

In [None]:
from sklearn.model_selection import train_test_split

# Options shared by both splits so the partition is reproducible.
split_options = {"shuffle": True, "random_state": 42}

# 70% for training; the remaining 30% is held out. Stratifying on the
# targets keeps the class proportions the same in both parts.
Xtr, Xtemp, Ytr, Ytemp = train_test_split(
    X, Y, train_size=0.7, stratify=Y, **split_options
)

# Halve the hold-out into validation and test parts.
Xval, Xts, Yval, Yts = train_test_split(
    Xtemp, Ytemp, train_size=0.5, stratify=Ytemp, **split_options
)


In [None]:
from transformers import BertTokenizer

# Pretrained tokenizer used by `tokenize` below.
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)
def tokenize(texts, max_length=128):
  """Encode a batch of strings with the notebook-level ``tokenizer``.

  Args:
    texts: List of strings to encode.
    max_length: Upper bound on the number of tokens per sequence; longer
      inputs are truncated. Defaults to 128, the value that used to be
      hard-coded, so existing one-argument calls behave as before.

  Returns:
    The tokenizer output with PyTorch tensors (``return_tensors='pt'``),
    padded to the longest sequence in ``texts``.
  """
  return tokenizer(
      texts,
      padding=True,
      truncation=True,
      max_length=max_length,
      return_tensors='pt'
  )

# Encode the train, validation and test texts (in that order).
Xtr_tokenized, Xval_tokenized, Xts_tokenized = (
    tokenize(split_texts) for split_texts in (Xtr, Xval, Xts)
)

In [None]:
# Sanity check: display the token ids of the first training example.
Xtr_tokenized['input_ids'][0]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
  """Pairs pre-tokenized encodings with integer class labels.

  Args:
    X: Mapping from field name to an indexable batch; it must contain
      ``'input_ids'``, whose length defines the dataset size.
    Y: Indexable sequence of integer labels, one per sample.

  Raises:
    ValueError: If ``X['input_ids']`` and ``Y`` have different lengths.
  """

  def __init__(self, X, Y):
    super().__init__()
    # Fail early on misaligned inputs instead of silently pairing the wrong
    # label with a text (or hitting an IndexError mid-epoch).
    n_samples = len(X['input_ids'])
    if len(Y) != n_samples:
      raise ValueError(
          f"X has {n_samples} samples but Y has {len(Y)} labels"
      )
    self.X = X
    self.Y = Y

  def __len__(self):
    """Return the number of encoded samples."""
    return len(self.X['input_ids'])

  def __getitem__(self, id):
    """Return ``(fields, label)`` for the sample at position ``id``.

    ``fields`` maps every key of ``X`` to that sample's entry; ``label`` is a
    scalar int64 tensor.
    """
    # The parameter name `id` shadows the builtin; it is kept so that keyword
    # callers are not broken.
    message = {k: v[id] for k, v in self.X.items()}
    # Cast explicitly: PyTorch classification losses expect int64 targets, and
    # the source array may be int32 depending on platform / how it was built.
    label = torch.tensor(self.Y[id], dtype=torch.long)
    return message, label

# Wrap each split's encodings and targets in a dataset (train, val, test).
Dtr, Dval, Dts = (
    CustomDataset(encodings, targets)
    for encodings, targets in (
        (Xtr_tokenized, Ytr),
        (Xval_tokenized, Yval),
        (Xts_tokenized, Yts),
    )
)

In [None]:
# Sanity check: display the first (fields, label) pair of the training set.
Dtr[0]

In [None]:
BATCH_SIZE = 32

# Only the training loader is shuffled. Evaluation loaders keep a fixed order
# so that batch composition, and therefore any batch-averaged metric, is
# identical from one run to the next.
DLtr = DataLoader(Dtr, batch_size=BATCH_SIZE, shuffle=True)
DLval = DataLoader(Dval, batch_size=BATCH_SIZE, shuffle=False)
DLts = DataLoader(Dts, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
from transformers import AutoModelForSequenceClassification


# The classification head gets one output per entry in `labels`.
n_classes = len(labels)
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased", num_labels=n_classes
)

In [None]:
# Use the GPU when Colab provides one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `Module.to` returns the module itself. Binding the result (instead of leaving
# the call as the cell's last expression) stops the notebook from echoing the
# entire model architecture as cell output.
model = model.to(device)

In [None]:
from torch.optim import AdamW

# Fine-tuning a pretrained BERT needs a very small step size. The values
# recommended for BERT fine-tuning are in the 2e-5 .. 5e-5 range; the previous
# 0.03 is ~1000x larger and wipes out the pretrained weights within a few
# updates (loss diverges or the model collapses to one class).
LEARNING_RATE = 2e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
def train_epoch(model, dataloader):
    """Run one optimisation pass over ``dataloader``.

    Uses the notebook-level ``optimizer`` and ``device``.

    Args:
        model: Module called as ``model(**inputs, labels=labels)`` whose
            output exposes a ``loss`` attribute.
        dataloader: Iterable of ``(inputs, labels)`` batches, where
            ``inputs`` is a dict of tensors; must support ``len()``.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []

    for batch, targets in dataloader:
        # Move every tensor of the batch to the compute device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        targets = targets.to(device)

        # Forward pass; the model computes the loss itself when given labels.
        batch_loss = model(**batch, labels=targets).loss

        # Clear stale gradients, back-propagate, update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)


In [None]:
def eval_epoch(model, dataloader, device=None):
    """Evaluate ``model`` on ``dataloader`` without updating weights.

    Args:
        model: Module called as ``model(**inputs, labels=labels)`` whose
            output exposes ``loss`` and ``logits`` attributes.
        dataloader: Iterable of ``(inputs, labels)`` batches, where
            ``inputs`` is a dict of tensors.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none), so the
            function no longer depends on a notebook-level ``device``.

    Returns:
        ``(mean_loss, accuracy)`` where both values are averaged over
        samples, not over batches.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)

            outputs = model(**inputs, labels=labels)
            batch_size = labels.size(0)

            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += batch_size

            # Weight each batch loss by its size so a short final batch does
            # not count as much as a full one. This assumes `outputs.loss` is
            # the mean over the batch (the default for Hugging Face
            # sequence-classification heads).
            loss_sum += outputs.loss.item() * batch_size

    if total == 0:
        raise ValueError("eval_epoch received a dataloader with no samples")

    return loss_sum / total, correct / total


In [None]:
EPOCHS = 3

for epoch in range(EPOCHS):
    # One optimisation pass, then score the model on the validation loader.
    train_loss = train_epoch(model, DLtr)
    val_metrics = eval_epoch(model, DLval)
    val_loss, val_acc = val_metrics

    # Per-epoch summary (epoch number shown 1-based).
    epoch_report = f"""
    Epoch {epoch+1}
    Train loss: {train_loss:.4f}
    Val loss:   {val_loss:.4f}
    Val acc:    {val_acc:.4f}
    """
    print(epoch_report)
