In [None]:
!pip install transformers
!pip install datasets

In [2]:
import torch
import pandas as pd
import os
import numpy as np

from torch.utils.data import Subset, DataLoader
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModel, TrainingArguments, 
                          Trainer, BertForSequenceClassification)
from google.colab import drive

drive.mount('/content/drive')
os.chdir('drive/MyDrive/hack4ukraine')

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
class BertDataset(torch.utils.data.Dataset):
    """
    Dataset compatible with BERT transformer from hugging face library
    """
    def __init__(self, vector_len: int = 130) -> None:
        df = pd.read_csv('lemmatized.csv')

        # Oversampling of class with less recores
        ones_count = df['label'].sum()
        zeros_count = df.shape[0] - ones_count
        ones_Xs = df.loc[df['label'] == 1]
        t = ones_count
        while t < zeros_count:
          df = pd.concat([df, ones_Xs])
          t += ones_count
        df = df.sample(frac=1, replace=False)

        # Tokenize
        tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base', 
                                                  normalize=True)
        self.texts = df['text']
        self.texts = tokenizer(self.texts.tolist(),
                               padding='max_length',
                               max_length=vector_len,
                               truncation=True)

        # Process tokenized texts
        self.ids = self.texts['input_ids']
        self.token_type_ids = self.texts['token_type_ids']
        self.attention_mask = self.texts['attention_mask']
        
        # Save labels
        self.labels = df['label']
        self.labels = torch.Tensor(self.labels.values)

    def __len__(self) -> int:
        return self.labels.shape[0]

    def get_batch_labels(self, idx: slice) -> torch.Tensor:
        return self.labels[idx]

    def get_batch_texts(self, idx):
        return {'input_ids': self.ids[idx],
                'token_type_ids': self.token_type_ids[idx],
                'attention_mask': self.attention_mask[idx]}

    def __getitem__(self, idx: slice):
        item = self.get_batch_texts(idx)
        item['labels'] = self.get_batch_labels(idx).long()
        return item

In [None]:
dataset = BertDataset()

In [23]:
train_indices, test_indices = train_test_split(list(range(len(dataset))), test_size = .4)
test_indices, val_indices = train_test_split(test_indices, test_size = .5)

train_dataset = Subset(dataset, train_indices)
val_dataset = Subset(dataset, val_indices)
test_dataset = Subset(dataset, test_indices)

train_dataloader = DataLoader(train_dataset, batch_size = 32)
val_dataloader = DataLoader(val_dataset, batch_size = 32)
test_dataloader = DataLoader(test_dataset, batch_size = 32)

In [None]:
accuracy = load_metric("accuracy")
recall = load_metric('recall')
precision = load_metric('precision')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy.compute(predictions=predictions, references=labels)['accuracy'],
            'recall': recall.compute(predictions=predictions, references=labels)['recall'],
            'precision': precision.compute(predictions=predictions, references=labels)['precision']}

model = BertForSequenceClassification.from_pretrained('vinai/bertweet-base', 
                                                      num_labels = 2)

model.to(device)

training_args = TrainingArguments("bert_trainer", 
                                  num_train_epochs = 7,
                                  per_device_train_batch_size = 32,
                                  per_device_eval_batch_size = 32,
                                  evaluation_strategy = 'steps',
                                  eval_steps = 40,
                                  logging_steps = 40,
                                  )

In [27]:
trainer = Trainer(model=model, 
                  args = training_args, 
                  train_dataset = train_dataset, 
                  eval_dataset = val_dataset, 
                  compute_metrics = compute_metrics)
trainer.train()

***** Running training *****
  Num examples = 1363
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 301


Step,Training Loss,Validation Loss,Accuracy,Recall,Precision
40,0.753,0.701349,0.468132,1.0,0.468132
80,0.7002,0.695721,0.468132,1.0,0.468132
120,0.6995,0.671547,0.569231,0.70892,0.529825
160,0.5507,1.36603,0.569231,0.948357,0.521964
200,0.3515,0.595161,0.745055,0.755869,0.715556
240,0.2718,0.620995,0.797802,0.755869,0.800995
280,0.2168,0.650634,0.764835,0.873239,0.699248


***** Running Evaluation *****
  Num examples = 455
  Batch size = 32
***** Running Evaluation *****
  Num examples = 455
  Batch size = 32
***** Running Evaluation *****
  Num examples = 455
  Batch size = 32
***** Running Evaluation *****
  Num examples = 455
  Batch size = 32
***** Running Evaluation *****
  Num examples = 455
  Batch size = 32
***** Running Evaluation *****
  Num examples = 455
  Batch size = 32
***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=301, training_loss=0.4813436146986445, metrics={'train_runtime': 242.2308, 'train_samples_per_second': 39.388, 'train_steps_per_second': 1.243, 'total_flos': 637391670497400.0, 'train_loss': 0.4813436146986445, 'epoch': 7.0})

In [31]:
model.to('cpu')
torch.save(model, 'model.pt')

In [None]:
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base', 
                                           normalize=True)
torch.save(tokenizer, 'preprocessing.pt')