<a href="https://colab.research.google.com/github/saparbayev-azizbek-12/bi-and-ai-talents-dl/blob/main/lesson-21/lesson_21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset path used below lives under it.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the annotations CSV on the mounted Drive.
file_path = "/content/drive/MyDrive/dl_course/lesson-21/data/smile-annotations-final.csv"
# header=None: the first row is treated as data, so columns get integer names
# until they are renamed.
df = pd.read_csv(file_path, header=None)

In [None]:
# Give the three header-less columns meaningful names (mutates `df` in place).
df.columns = ["id", "text", "label"]

In [None]:
# Inspect the raw texts of every row annotated as 'sad'.
df1 = df.loc[df['label'].eq('sad')]
df1['text'].values

In [None]:
# Labels to discard: the non-emotion markers and every multi-label combination.
# (The former `df['label'].drop_duplicates()` line was removed: its result was
# neither stored nor displayed, so it had no effect.)
rm_values = ["nocode", "not-relevant", "disgust|angry", "happy|surprise", "happy|sad", "sad|disgust", "sad|angry", "sad|disgust|angry"]
# Keep only rows whose label is NOT in the removal list.
df = df[~df['label'].isin(rm_values)]
# Show the remaining class distribution.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
happy,1137
angry,57
surprise,35
sad,32
disgust,6


In [None]:
# Sorted unique labels define a stable id for every class.
labels = sorted(df["label"].unique())
# index -> label
itos = dict(enumerate(labels))
# label -> index (inverse of itos)
stoi = {name: idx for idx, name in itos.items()}
stoi

{'angry': 0, 'disgust': 1, 'happy': 2, 'sad': 3, 'surprise': 4}

In [None]:
import json

# Persist the index -> label map in the current working directory.
# NOTE: JSON object keys are always strings, so the integer keys of `itos`
# are written as "0", "1", ... and must be converted back to int after loading.
with open("label_map.json", "w") as f:
    json.dump(itos, f)

In [None]:
# Texts as a plain Python list; labels encoded to their integer ids via `stoi`.
X = df['text'].tolist()
Y = df["label"].apply(stoi.__getitem__).values

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: 70% train, 30% held out. Stratifying on Y keeps the class
# proportions the same in both parts; random_state makes the split repeatable.
Xtr, Xtemp, Ytr, Ytemp = train_test_split(
    X, Y,
    train_size=0.7,
    stratify=Y,
    shuffle=True,
    random_state=42
  )

# Stage 2: halve the held-out 30% into validation and test (about 15% each),
# again stratified on the labels.
Xval, Xts, Yval, Yts = train_test_split(
    Xtemp, Ytemp,
    train_size=0.5,
    stratify=Ytemp,
    shuffle=True,
    random_state=42
)


In [None]:
# Per-class sample counts (keyed by integer label id) for the train split,
# then for the validation split.
print(pd.DataFrame(Ytr).value_counts())
print(pd.DataFrame(Yval).value_counts())

0
2    795
0     40
4     25
3     22
1      4
Name: count, dtype: int64
0
2    171
0      8
3      5
4      5
1      1
Name: count, dtype: int64


In [None]:
# `torch` must be imported before its first use so the notebook survives
# Restart & Run All (previously the first import came in a later cell).
import torch

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torch
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# "balanced" weights are n_samples / (n_classes * count_per_class), computed on
# the training labels only, then moved to the training device as float32.
class_weights = torch.tensor(
    compute_class_weight(class_weight='balanced', classes=np.unique(Ytr), y=Ytr),
    dtype=torch.float,
).to(device)
print(f"Calculated Class Weights: {class_weights}")

Calculated Class Weights: tensor([ 4.4300, 44.3000,  0.2229,  8.0545,  7.0880], device='cuda:0')


In [None]:
from torch.nn import CrossEntropyLoss

# Cross-entropy with per-class weights, so each class's contribution to the
# loss is scaled by its entry in `class_weights`.
criterion = CrossEntropyLoss(weight=class_weights)

In [None]:
from transformers import AutoModelForSequenceClassification

# Pretrained BERT-large (uncased) with a sequence-classification head that
# emits one logit per emotion label.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-large-uncased",
    num_labels=len(labels)
)
# Move the weights to the selected device (as the cell's last expression this
# also displays the module structure).
model.to(device)

In [None]:
from transformers import BertTokenizer

# Load the tokenizer from the same checkpoint as the model so the two can never
# drift apart (previously a bert-base tokenizer was paired with the bert-large model).
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")


def encode_texts(texts, max_length=128):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Args:
        texts: list of raw input strings.
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        The tokenizer output (input ids, attention mask, ...) as 'pt' tensors.
    """
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


# Same encoding settings for all three splits.
Xtr_tokenized = encode_texts(Xtr)
Xval_tokenized = encode_texts(Xval)
Xts_tokenized = encode_texts(Xts)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
  """Pairs tokenized inputs with their integer class labels.

  Args:
    X: mapping of field name -> tensor whose first dimension indexes samples;
       must contain the key 'input_ids'.
    Y: indexable sequence of integer labels, one per sample.

  Raises:
    ValueError: if X and Y do not describe the same number of samples.
  """

  def __init__(self, X, Y):
    super().__init__()
    n_inputs, n_labels = len(X['input_ids']), len(Y)
    # Fail early instead of hitting an IndexError (or silently ignoring
    # surplus labels) in the middle of training.
    if n_inputs != n_labels:
      raise ValueError(
        f"X has {n_inputs} samples but Y has {n_labels} labels"
      )
    self.X = X
    self.Y = Y

  def __len__(self):
    """Number of samples, taken from the 'input_ids' field."""
    return len(self.X['input_ids'])

  def __getitem__(self, id):
    """Return (dict of per-sample tensors, label tensor) for position `id`."""
    message = {k: v[id] for k, v in self.X.items()}
    # CrossEntropyLoss with class-index targets requires int64 labels.
    label = torch.tensor(self.Y[id], dtype=torch.long)
    return message, label

# One dataset per split: tokenized inputs paired with their integer labels.
Dtr = CustomDataset(X=Xtr_tokenized, Y=Ytr)
Dval = CustomDataset(X=Xval_tokenized, Y=Yval)
Dts = CustomDataset(X=Xts_tokenized, Y=Yts)

In [None]:
# Shuffle only the training batches. Evaluation loaders keep a fixed order:
# shuffling brings no benefit there, and because the weighted loss is averaged
# per batch, random batch composition would make the reported validation loss
# vary between runs for the very same model.
DLtr = DataLoader(Dtr, batch_size=16, shuffle=True)
DLval = DataLoader(Dval, batch_size=16, shuffle=False)
DLts = DataLoader(Dts, batch_size=16, shuffle=False)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters (encoder and classification head) with a
# learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
def train_epoch(model, dataloader, criterion, optim=None):
    """Run one optimisation pass over `dataloader`.

    Args:
        model: module whose forward accepts the batch dict as keyword arguments
            and returns an object with a `.logits` attribute.
        dataloader: sized iterable of (inputs dict, labels tensor) batches.
        criterion: loss callable taking (logits, labels).
        optim: optimizer to step. When omitted, the notebook-level `optimizer`
            is used, so existing three-argument calls keep working.

    Returns:
        (mean loss per batch, accuracy over all seen samples).
    """
    # Resolve the optimizer explicitly instead of silently reading a global.
    opt = optimizer if optim is None else optim
    # Send batches to wherever the model lives rather than to a global `device`.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for inputs, labels in dataloader:
        inputs = {k: v.to(run_device) for k, v in inputs.items()}
        labels = labels.to(run_device)

        # Clear gradients left over from the previous step before backprop.
        opt.zero_grad()
        logits = model(**inputs).logits
        loss = criterion(logits, labels)
        loss.backward()
        opt.step()

        # Accuracy uses the logits computed before this step's weight update.
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        total_loss += loss.item()

    acc = correct / total
    return total_loss / len(dataloader), acc

def eval_epoch(model, dataloader, criterion):
    """Evaluate `model` on `dataloader` without updating any weights.

    Args:
        model: module whose forward accepts the batch dict as keyword arguments
            and returns an object with a `.logits` attribute.
        dataloader: sized iterable of (inputs dict, labels tensor) batches.
        criterion: loss callable taking (logits, labels).

    Returns:
        (mean loss per batch, accuracy over all seen samples).
    """
    # Send batches to wherever the model lives rather than to a global `device`.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # eval() switches layers such as dropout to inference behaviour.
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = {k: v.to(run_device) for k, v in inputs.items()}
            labels = labels.to(run_device)

            logits = model(**inputs).logits
            loss = criterion(logits, labels)
            preds = torch.argmax(logits, dim=1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)
            total_loss += loss.item()

    acc = correct / total
    return total_loss / len(dataloader), acc

In [None]:
# Number of full passes over the training set.
EPOCHS = 10

for epoch in range(EPOCHS):
    # One optimisation pass over the training batches, then a gradient-free
    # pass over the validation batches with the same loss function.
    train_loss, train_acc = train_epoch(model, DLtr, criterion)
    val_loss, val_acc = eval_epoch(model, DLval, criterion)

    # Per-epoch report (epochs are shown 1-based).
    # NOTE(review): the classes are heavily imbalanced, so plain accuracy may
    # look good while rare classes are missed; consider per-class metrics.
    print(f"""
    Epoch {epoch+1}
    Train loss: {train_loss:.4f}
    Train acc:  {train_acc:.4f}
    Val loss:   {val_loss:.4f}
    Val acc:    {val_acc:.4f}
    """)

In [None]:
def predict_text(text):
  """Predict the emotion label for a single piece of text.

  Uses the notebook-level `model`, `tokenizer`, `device` and `itos`.

  Args:
    text: one input string.

  Returns:
    The predicted label name looked up in `itos`.
  """
  model.eval()
  # Truncate to the same 128-token limit used for the training data so long
  # inputs are handled like training samples rather than passed at full length.
  inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
  inputs = {k:v.to(device) for k,v in inputs.items()}

  with torch.no_grad():
    logits = model(**inputs).logits
  # argmax over the class dimension; softmax is monotonic, so it is not needed
  # just to pick the winning class.
  pred = torch.argmax(logits, dim=1)
  return itos[pred.item()]

In [None]:
# Quick manual check of the fine-tuned model on an unseen sentence.
predict_text("This man lose his keys. He is so upset")

'sad'

In [None]:
# Save only the weights (state dict), not the architecture: to reload, first
# rebuild the same model class and then call load_state_dict on it.
torch.save(model.state_dict(), "emotional_model.pt")