In [None]:
# get data from github
%%capture
!wget https://raw.githubusercontent.com/shitkov/bert4classification/main/train.csv
!wget https://raw.githubusercontent.com/shitkov/bert4classification/main/valid.csv
!wget https://raw.githubusercontent.com/shitkov/bert4classification/main/test.csv

In [None]:
import pandas as pd

In [None]:
# Read the three CSV splits into DataFrames (train, validation, test order).
train_data, valid_data, test_data = map(
    pd.read_csv,
    ('/content/train.csv', '/content/valid.csv', '/content/test.csv'),
)

In [None]:
# install libraries
%%capture
!pip install transformers sentencepiece

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Fetch the tokenizer first and then the sequence-classification model;
# both come from the same Hugging Face checkpoint.
tokenizer, model = (
    loader.from_pretrained("cointegrated/rubert-tiny")
    for loader in (BertTokenizer, BertForSequenceClassification)
)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

In [None]:
# Swap in a fresh linear head for binary classification.
# The input width is taken from the model config rather than hard-coded
# (it was written as 312 for this checkpoint), so a different BERT
# checkpoint can be used without editing this cell.
model.classifier = torch.nn.Linear(model.config.hidden_size, 2)

In [None]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Send the model to the selected device.
# The trailing semicolon hides the long module repr that `.to()` returns; it
# replaces `%%capture`, which is only honoured as the first line of a cell.
model.to(device);

In [None]:
# create class for custom dataset
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  """Map-style dataset that tokenizes one text per item for classification.

  Args:
    texts: indexable sequence of input texts (each item is passed through ``str``).
    targets: indexable sequence of class labels, aligned with ``texts``.
    tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
    max_len: sequence length every example is padded or truncated to.
  """

  def __init__(self, texts, targets, tokenizer, max_len=256):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of examples."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Tokenize example ``idx`` and return it as a dict.

    Returns:
      dict with keys ``text`` (str), ``input_ids`` and ``attention_mask``
      (flattened tensors from the tokenizer) and ``targets`` (long tensor).
    """
    text = str(self.texts[idx])
    target = self.targets[idx]

    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        # Padding alone does not shorten long inputs. Without explicit
        # truncation, texts longer than `max_len` tokens produce longer
        # tensors, which cannot be stacked into a batch by the DataLoader.
        truncation=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
      'text': text,
      # The tokenizer returns (1, seq) tensors; flatten drops the batch axis.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [None]:
# Wrap each split's `text` / `label` columns in a CustomDataset that shares
# the same tokenizer (train, validation, test order).
train_set, valid_set, test_set = (
    CustomDataset(
        texts=list(frame['text']),
        targets=list(frame['label']),
        tokenizer=tokenizer,
    )
    for frame in (train_data, valid_data, test_data)
)

In [None]:
# Create dataloaders
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(
        train_set,
        batch_size=2,
        shuffle=True,
        num_workers=0
    )

# Evaluation gains nothing from shuffling. A fixed order keeps the batches
# (and therefore the mean of per-batch losses) identical from run to run.
valid_loader = DataLoader(
        valid_set,
        batch_size=2,
        shuffle=False,
        num_workers=0
    )

# One example per batch, in file order, so predictions line up with the rows
# of the test CSV.
test_loader = DataLoader(
        test_set,
        batch_size=1,
        shuffle=False,
        num_workers=0
    )

In [None]:
# `transformers.AdamW` is deprecated and is no longer exported by recent
# transformers releases, so the import fails on a fresh install. Use the
# PyTorch implementation instead.
# weight_decay=0.0 keeps the default of the old transformers optimizer.
# NOTE: torch.optim.AdamW always applies Adam bias correction; the earlier
# `correct_bias=False` setting has no equivalent here.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

In [None]:
# Number of passes over the training loader; it also sizes the LR schedule
# (total_steps) and bounds the training loop.
EPOCHS = 1

In [None]:
# One optimizer/scheduler step per training batch, repeated for every epoch.
total_steps = EPOCHS * len(train_loader)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Linear learning-rate schedule spanning `total_steps` steps, with zero
# warm-up steps (arguments: optimizer, num_warmup_steps, num_training_steps).
scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)

In [None]:
# Cross-entropy loss over the class logits, moved to the selected device.
loss_fn = torch.nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

In [None]:
import numpy as np

def train_epoch(
        model,
        data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        n_examples
    ):
    """Run one training pass over ``data_loader``, updating ``model`` in place.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes ``.logits``.
        data_loader: iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors).
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar loss.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor equal to
        correct predictions / ``n_examples``; mean_loss is the mean of the
        per-batch losses (NaN when the loader yields no batches).
    """
    model = model.train()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below also works
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
            )

        preds = torch.argmax(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Clear gradients *before* backward so anything left over from an
        # earlier backward pass cannot leak into this update.
        optimizer.zero_grad()
        loss.backward()
        # Cap the global gradient norm at 1.0 before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes ``.logits``.
        data_loader: iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors).
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar loss.
        device: device the batch tensors are moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor equal to
        correct predictions / ``n_examples``; mean_loss is the mean of the
        per-batch losses (NaN when the loader yields no batches).
    """
    model = model.eval()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below also works
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
# Best validation accuracy seen so far; a checkpoint is written whenever an
# epoch beats it.
best_accuracy = 0

for epoch in range(EPOCHS):

    # Epoch banner.
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One optimisation pass over the training split.
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(train_data),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Score the updated model on the validation split.
    val_acc, val_loss = eval_model(
        model=model,
        data_loader=valid_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(valid_data),
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')

    # Nothing to save unless this epoch improved on the best accuracy.
    if not (val_acc > best_accuracy):
        continue

    torch.save(model, '/content/bert.pt')
    best_accuracy = val_acc

Epoch 1/1
----------
Train loss 0.7889281882888523 accuracy 0.6810990678939072
Val loss 0.8146469163302729 accuracy 0.7057793345008757


In [None]:
# Accuracy of the current model on the held-out test split; the mean loss
# returned alongside it is discarded.
test_acc, _ = eval_model(
  model=model,
  data_loader=test_loader,
  loss_fn=loss_fn,
  device=device,
  n_examples=len(test_data),
)

In [None]:
# Display the test-split accuracy returned by eval_model.
test_acc

tensor(0.7103, device='cuda:0', dtype=torch.float64)

In [None]:
# Persist the whole model object as it stands after training.
# NOTE(review): this is the same path the training loop writes its
# best-validation checkpoint to, so that file is replaced by the current model.
torch.save(model, '/content/bert.pt')

In [None]:
# Reload the fully pickled model object written by `torch.save(model, ...)`.
# Recent PyTorch releases default `torch.load` to `weights_only=True`, which
# refuses to unpickle a whole nn.Module, so opt out explicitly.
# NOTE: `weights_only=False` executes arbitrary pickle code. Only load
# checkpoint files you created yourself.
# `map_location` puts the weights on the device selected earlier, even if the
# file was saved from a different one.
model = torch.load('/content/bert.pt', map_location=device, weights_only=False)

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect texts, predictions and labels.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes ``.logits``.
        data_loader: iterable of batches (dicts with ``text``, ``input_ids``,
            ``attention_mask`` and ``targets``).
        device: device the inputs are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none), so the
            function no longer depends on a global ``device`` variable.

    Returns:
        ``(target_texts, predictions, real_values)``: a list of the input
        texts, and two 1-D CPU tensors with the predicted class index and the
        true label for each text. Both tensors are empty when the loader
        yields no batches.
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    target_texts = []
    batch_predictions = []
    batch_targets = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            preds = torch.argmax(outputs.logits, dim=1)

            target_texts.extend(d["text"])
            # Keep one tensor per batch (already on CPU) rather than one
            # 0-dim tensor per example.
            batch_predictions.append(preds.cpu())
            batch_targets.append(d["targets"].cpu())

    if batch_predictions:
        predictions = torch.cat(batch_predictions)
        real_values = torch.cat(batch_targets)
    else:
        # torch.cat / torch.stack reject an empty list; return empty tensors.
        predictions = torch.empty(0, dtype=torch.long)
        real_values = torch.empty(0, dtype=torch.long)

    return target_texts, predictions, real_values

In [None]:
# Collect the texts, predicted classes and true labels for the test split.
target_texts, predictions, real_values = get_predictions(
    model=model,
    data_loader=test_loader,
)

In [None]:
# Put texts, true labels and model predictions side by side.
# The `prediction` column previously received `real_values`, which made it an
# exact copy of `labels`; it now holds the model's predictions.
df = pd.DataFrame({
    'text': target_texts,
    'labels': real_values.numpy(),
    'prediction': predictions.numpy(),
})

In [None]:
# Preview the first rows of the text / labels / prediction table.
df.head()

Unnamed: 0,text,labels,prediction
0,раз он знает кто такой блейн он таки смотрел г...,1,1
1,явган явж даврахг й ш х лд иргэдээ м лх д явца...,0,0
2,ахах в каждом фандоме свои извращения у кого т...,1,1
3,почему в инстаграмме видео не воспроизводтся т...,0,0
4,мне ни разу так не было обидно за вещи а за пе...,0,0
