In [24]:
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [25]:
import pandas as pd

In [26]:
train_path = '/drive/MyDrive/ml/sequora/datasets/internal_data/sentiment/sentiment_train.csv'
valid_path = '/drive/MyDrive/ml/sequora/datasets/internal_data/sentiment/sentiment_valid.csv'
test_path  = '/drive/MyDrive/ml/sequora/datasets/internal_data/sentiment/sentiment_test.csv'

In [27]:
train_data = pd.read_csv(train_path)
valid_data = pd.read_csv(valid_path)
test_data  = pd.read_csv(test_path)

In [28]:
train_data = train_data.drop(columns=['Unnamed: 0'])
valid_data = valid_data.drop(columns=['Unnamed: 0'])
test_data  = test_data.drop(columns=['Unnamed: 0'])

In [29]:
train_data.columns

Index(['text', 'label'], dtype='object')

In [30]:
!pip install transformers sentencepiece



In [31]:
import torch
# from transformers import AutoTokenizer, AutoModel

In [32]:
from transformers import BertTokenizer, BertForSequenceClassification

In [33]:
tokenizer = BertTokenizer.from_pretrained("cointegrated/rubert-tiny")
model = BertForSequenceClassification.from_pretrained("cointegrated/rubert-tiny")

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

In [34]:
model.classifier = torch.nn.Linear(312, 3)

In [35]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [36]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, element

In [37]:
# Dataset
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):

  def __init__(self, texts, targets, tokenizer, max_len=256):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    target = self.targets[idx]

    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
      'text': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [38]:
train_set = CustomDataset(
    texts=list(train_data['text']),
    targets=list(train_data['label']),
    tokenizer=tokenizer
)

In [39]:
valid_set = CustomDataset(
    texts=list(valid_data['text']),
    targets=list(valid_data['label']),
    tokenizer=tokenizer
)

In [40]:
test_set = CustomDataset(
    texts=list(test_data['text']),
    targets=list(test_data['label']),
    tokenizer=tokenizer
)

In [41]:
# DataLoader
train_loader = DataLoader(
        train_set,
        batch_size=2,
        shuffle=True,
        num_workers=0
    )

In [42]:
valid_loader = DataLoader(
        valid_set,
        batch_size=2,
        shuffle=True,
        num_workers=0
    )

In [43]:
test_loader = DataLoader(
        test_set,
        batch_size=1,
        shuffle=False,
        num_workers=0
    )

In [44]:
EPOCHS = 10

In [45]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

In [46]:
total_steps = len(train_loader) * EPOCHS

In [47]:
from transformers import get_linear_schedule_with_warmup

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [48]:
loss_fn = torch.nn.CrossEntropyLoss().to(device)

In [49]:
import numpy as np

def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
    ):

    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
            )

        preds = torch.argmax(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, targets)

        correct_predictions += torch.sum(preds == targets)

        losses.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

In [50]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

In [52]:
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_data)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        valid_loader,
        loss_fn,
        device,
        len(valid_data)
      )

    print(f'Val loss {val_loss} accuracy {val_acc}')

    if val_acc > best_accuracy:
        torch.save(model, '/drive/MyDrive/ml/sequora/sentiment/model/sent_bert.pt')
        best_accuracy = val_acc

Epoch 1/10
----------




Train loss 0.5245558921321145 accuracy 0.8816385343762866
Val loss 0.3511570269340001 accuracy 0.9282325029655991
Epoch 2/10
----------
Train loss 0.391467561904569 accuracy 0.9186908192671881
Val loss 0.2939057175957824 accuracy 0.9454329774614473
Epoch 3/10
----------
Train loss 0.291646966267689 accuracy 0.9425689584191025
Val loss 0.2148947766041582 accuracy 0.9612495057334915
Epoch 4/10
----------
Train loss 0.20362107071837554 accuracy 0.962536023054755
Val loss 0.1299458675006126 accuracy 0.9754843811783314
Epoch 5/10
----------
Train loss 0.16139746729072435 accuracy 0.9715932482503088
Val loss 0.11157251601026229 accuracy 0.9816132858837485
Epoch 6/10
----------
Train loss 0.11382813374177335 accuracy 0.9796212433100041
Val loss 0.08122409733226932 accuracy 0.9859628311585608
Epoch 7/10
----------
Train loss 0.08462926000179344 accuracy 0.9839440098806093
Val loss 0.08149966069587805 accuracy 0.9875444839857651
Epoch 8/10
----------
Train loss 0.055582685270760435 accuracy 0.9

In [53]:
test_acc, _ = eval_model(
  model,
  test_loader,
  loss_fn,
  device,
  len(test_data)
)



In [54]:
test_acc

tensor(0.8600, device='cuda:0', dtype=torch.float64)

In [55]:
def get_predictions(model, data_loader):
    model = model.eval()
    target_texts = []
    predictions = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            preds = torch.argmax(outputs.logits, dim=1)
            target_texts.extend(texts)
            predictions.extend(preds)

            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    real_values = torch.stack(real_values).cpu()

    return target_texts, predictions, real_values

In [56]:
target_texts, predictions, real_values = get_predictions(model, test_loader)



In [57]:
model = torch.load('/drive/MyDrive/ml/sequora/sentiment/model/sent_bert.pt')

In [58]:
torch.save(model, '/drive/MyDrive/ml/sequora/sentiment/model/sent_bert.pt')

https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03

https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b