In [None]:
!pip3 install transformers

In [None]:
import os
import time
from enum import Enum

from tqdm import tqdm
import yaml

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [None]:
# Display the kernel's current working directory (the recorded output is '/content').
os.getcwd()

'/content'

In [None]:
# Check whether PyTorch can see a CUDA device; Evaluator selects 'cpu' when this is False.
torch.cuda.is_available()

True

In [None]:
# Mode identifiers passed to Trainer.epoch(); the value also appears in the progress-bar
# description and in the TensorBoard tags 'Loss/<mode>' and 'Accuracy/<mode>'.
TRAIN_CONST = 'train'
VAL_CONST = 'val'

In [None]:
def load_config(config_file):
    """Read a YAML configuration file and return its parsed content.

    :param config_file: path to the YAML file; it is resolved to an absolute path before opening
    :return: the object produced by ``yaml.safe_load`` for that file
    """
    absolute_path = os.path.abspath(config_file)
    with open(absolute_path) as stream:
        parsed = yaml.safe_load(stream)
    return parsed


def one_hot(a, num_classes=None):
    """One-hot encode a 1-D array of non-negative integer class ids.

    :param a: array-like of integer labels, one per sample
    :param num_classes: width of the encoding. When ``None`` (the default) the width is
        ``max(a) + 1``, which means two arrays with different maximum labels get different
        widths; pass an explicit value to keep several splits consistent.
    :return: float array of shape ``(len(a), width)`` with a single 1 per row
    :raises ValueError: if a label is negative or does not fit into ``num_classes``
    """
    labels = np.asarray(a)
    sample_count = labels.shape[0]

    # An empty input has no maximum; return an empty encoding instead of failing in max().
    if labels.size == 0:
        return np.zeros((sample_count, int(num_classes) if num_classes is not None else 0))

    # Negative ids would silently wrap around to the last columns when used as indices.
    if labels.min() < 0:
        raise ValueError('one_hot expects non-negative labels, got min = {}'.format(labels.min()))

    width = int(labels.max()) + 1 if num_classes is None else int(num_classes)
    if labels.max() >= width:
        raise ValueError(
            'label {} does not fit into num_classes = {}'.format(labels.max(), width)
        )

    encoded = np.zeros((sample_count, width))
    encoded[np.arange(sample_count), labels] = 1
    return encoded

In [None]:
class TokenizerType(Enum):
    """Supported tokenizers; Trainer looks a member up by name from config['tokenizer']['type']."""
    bert_tokenizer = 'bert_tokenizer'


# Registry mapping each TokenizerType member to the tokenizer class it selects.
TOKENIZERS_CLASSES = {
    TokenizerType.bert_tokenizer: BertTokenizer
}


class ModelType(Enum):
    """Supported model architectures; Trainer looks a member up by name from config['model']['type']."""
    bert_for_sequence_classification = 'bert_for_sequence_classification'


# Registry mapping each ModelType member to the model class it selects.
MODELS_CLASSES = {
    ModelType.bert_for_sequence_classification: BertForSequenceClassification
}

In [None]:
class OptimizerType(Enum):
    """Supported optimizers; Trainer looks a member up by name from config['eval']['optimizer']."""
    adam_w = 'adam_w'


# Registry mapping each OptimizerType member to the optimizer class it selects.
OPTIMIZERS_CLASSES = {
    OptimizerType.adam_w: AdamW
}


class SchedulerType(Enum):
    """Supported LR schedulers; Trainer looks a member up by name from config['eval']['scheduler']."""
    linear_schedule_with_warmup = 'linear_schedule_with_warmup'


# Registry mapping each SchedulerType member to the factory function that builds the scheduler.
SCHEDULERS_CLASSES = {
    SchedulerType.linear_schedule_with_warmup: get_linear_schedule_with_warmup
}

In [None]:
class Evaluator:
    """Base class that reads a YAML config, picks the torch device and runs the setup hooks.

    Subclasses override the ``_setup_*_config`` hooks; the base implementations do nothing.
    """

    def __init__(self, config_file):
        """Load ``config_file`` and initialise the device and the three setup hooks.

        :param config_file: path to the YAML configuration file
        """
        # Default column names; the setup hooks may replace them.
        self.input_label = 'input'
        self.target_label = 'target'
        self.prediction_label = 'predictions'

        config = load_config(config_file)

        # CUDA is used only when it is available and the config asks for 'gpu'.
        requested_device = config['model']['device']
        use_cuda = torch.cuda.is_available() and requested_device == 'gpu'
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        print('Device = {}'.format(self.device))
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        # Hook order matters: output first, then data, then model.
        self._setup_output_config(config)
        self._setup_data_config(config)
        self._setup_model_config(config)

    def _setup_data_config(self, config):
        """Hook for dataset-related setup; no-op in the base class."""
        return None

    def _setup_model_config(self, config):
        """Hook for model-related setup; no-op in the base class."""
        return None

    def _setup_output_config(self, config):
        """Hook for output-related setup; no-op in the base class."""
        return None

    def _save_prediction(self, input, targets, predictions, output_path):
        """Write inputs, targets and predictions side by side to a comma-separated file.

        :param input: values for the input column
        :param targets: values for the target column
        :param predictions: values for the prediction column
        :param output_path: destination CSV path
        """
        columns = {
            self.input_label: input,
            self.target_label: targets,
            self.prediction_label: predictions,
        }
        pd.DataFrame(columns).to_csv(output_path, sep=',', index=False)

In [None]:
class Trainer(Evaluator):
    """Fine-tunes a sequence-classification model described by a YAML config.

    Reads the train/validation CSV files, tokenizes the input column, builds the model,
    optimizer and scheduler named in the config, and runs train/validation epochs while
    logging mean loss and accuracy to TensorBoard and saving checkpoints and predictions.
    """

    def __init__(self, config_file):
        """Run the base-class initialisation, which calls the setup hooks below.

        :param config_file: path to the YAML configuration file
        """
        super(Trainer, self).__init__(config_file)

    def _build_loader(self, df, config, shuffle):
        """Tokenize one data frame and wrap it in a DataLoader.

        Each sample carries its row position as a fourth tensor, so predictions produced
        from a shuffled loader can be put back into the order of ``df``.

        :param df: data frame holding the input and target columns
        :param config: full configuration dictionary
        :param shuffle: use a RandomSampler when True, a SequentialSampler otherwise
        :return: DataLoader yielding (input_ids, attention_mask, one-hot labels, row ids)
        """
        tokenizer_config = config['tokenizer']
        encoded = self.tokenizer.batch_encode_plus(
            df[self.input_label].values,
            add_special_tokens=tokenizer_config['add_special_tokens'],
            return_attention_mask=tokenizer_config['return_attention_mask'],
            pad_to_max_length=tokenizer_config['pad_to_max_length'],
            # Explicit truncation: the library otherwise warns and falls back to
            # 'longest_first' because max_length is given.
            truncation=tokenizer_config.get('truncation', True),
            max_length=tokenizer_config['seq_length'],
            return_tensors=tokenizer_config['return_tensors']
        )
        # NOTE: one_hot sizes the encoding from the largest label in this frame, so every
        # split has to contain the highest class id for the widths to agree.
        labels = torch.tensor(one_hot(df[self.target_label].values))
        row_ids = torch.arange(len(df))
        dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'], labels, row_ids)
        sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
        return DataLoader(dataset, sampler=sampler, batch_size=config['dataset']['batch_size'])

    def _setup_data_config(self, config):
        """Load both CSV splits, resolve the column names and build the data loaders."""
        self.train_df = pd.read_csv(config['dataset']['train_path'], sep=',')
        self.val_df = pd.read_csv(config['dataset']['val_path'], sep=',')
        self.input_label = config['dataset'].get('input_label', self.train_df.columns[0])
        self.target_label = config['dataset'].get('target_label', self.train_df.columns[-1])
        # Read the prediction column name from its own key. Reading it from 'target_label'
        # made predictions overwrite the target column in the saved CSV files.
        self.prediction_label = config['dataset'].get('prediction_label', 'prediction')

        self.tokenizer = TOKENIZERS_CLASSES[TokenizerType[config['tokenizer']['type']]].from_pretrained(
            config['tokenizer']['name'], do_lower_case=config['tokenizer']['do_lower_case']
        )
        self.train_loader = self._build_loader(self.train_df, config, shuffle=True)
        self.val_loader = self._build_loader(self.val_df, config, shuffle=False)

    def _setup_model_config(self, config):
        """Instantiate the model, save its initial weights, and build optimizer and scheduler."""
        all_targets = np.concatenate(
            (self.train_df[self.target_label].values, self.val_df[self.target_label].values),
            axis=0
        )
        self.model = MODELS_CLASSES[ModelType[config['model']['type']]].from_pretrained(
            config['model']['name'],
            # Falls back to the number of distinct labels seen across both splits.
            num_labels=config['model'].get('num_label', len(np.unique(all_targets))),
            output_attentions=False,
            output_hidden_states=False
        )
        # The untrained weights are stored before the model is moved to the device.
        torch.save(self.model.state_dict(), os.path.join(self.model_path, 'init.pt'))
        self.model.to(self.device)
        self.epoch_count = config['eval']['epoch_count']
        self.optimizer = OPTIMIZERS_CLASSES[OptimizerType[config['eval']['optimizer']]](
            self.model.parameters(),
            lr=config['eval']['lr'],
            eps=config['eval']['eps']
        )
        self.scheduler = SCHEDULERS_CLASSES[SchedulerType[config['eval']['scheduler']]](
            self.optimizer,
            num_warmup_steps=0,
            # One scheduler step is taken per training batch.
            num_training_steps=len(self.train_loader) * self.epoch_count
        )

    def _setup_output_config(self, config):
        """Create the output directories and the TensorBoard writer."""
        self.model_path = config['res']['model_path']
        os.makedirs(self.model_path, exist_ok=True)
        self.tb_path = config['res']['tb_path']
        os.makedirs(self.tb_path, exist_ok=True)
        self.writer = SummaryWriter(log_dir=self.tb_path)
        self.prediction_path = config['res']['prediction_path']
        os.makedirs(self.prediction_path, exist_ok=True)
        self.all_train_outputs = []
        self.all_val_outputs = []
        self.mean_epoch_loss = None

    def epoch(self, epoch_num, mode):
        """Run one pass over the train or validation data.

        :param epoch_num: zero-based epoch index, used for logging
        :param mode: ``TRAIN_CONST`` to update the weights, ``VAL_CONST`` to only evaluate
        :return: predicted class ids as a numpy array, in the row order of the matching
            data frame, even when the loader shuffles
        :raises ValueError: if ``mode`` is unknown or the loader yields no batches
        """
        if mode == TRAIN_CONST:
            is_train = True
            loader = self.train_loader
            target = self.train_df[self.target_label].values
        elif mode == VAL_CONST:
            is_train = False
            loader = self.val_loader
            target = self.val_df[self.target_label].values
        else:
            raise ValueError('Unknown mode: {!r}'.format(mode))
        self.model.train(is_train)

        epoch_losses = list()
        batch_preds = list()
        batch_row_ids = list()
        epoch_start = time.perf_counter()
        for batch in tqdm(loader, desc='Epoch {}, {}'.format(epoch_num + 1, mode)):
            input_ids, attention_mask, labels, row_ids = batch
            input_ids = input_ids.to(self.device)
            attention_mask = attention_mask.to(torch.uint8).to(self.device)
            # Labels are one-hot vectors and are passed to the model as float32.
            labels = labels.to(torch.float32).to(self.device)

            if is_train:
                self.model.zero_grad()
            # Gradients are tracked only while training.
            with torch.set_grad_enabled(is_train):
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            batch_loss = outputs.loss
            epoch_losses.append(batch_loss.item())
            preds = torch.argmax(outputs.logits, dim=1)
            batch_preds.append(preds.detach().cpu().numpy())
            batch_row_ids.append(row_ids.cpu().numpy())

            if is_train:
                batch_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                self.optimizer.step()
                self.scheduler.step()
        epoch_end = time.perf_counter()

        if not batch_preds:
            raise ValueError('No batches were produced for mode {!r}'.format(mode))

        # Undo the sampler's permutation: the training loader shuffles, while `target` and
        # the saved CSV are in frame order. Without this, accuracy and saved predictions
        # compare unrelated rows.
        visited_preds = np.concatenate(batch_preds, axis=0)
        visited_rows = np.concatenate(batch_row_ids, axis=0)
        all_outputs = np.empty_like(visited_preds)
        all_outputs[visited_rows] = visited_preds

        self.mean_epoch_loss = np.mean(epoch_losses)
        self.writer.add_scalar('Loss/{}'.format(mode), self.mean_epoch_loss, epoch_num)

        accuracy = accuracy_score(target, all_outputs)
        self.writer.add_scalar('Accuracy/{}'.format(mode), accuracy, epoch_num)

        print('Mean loss = {0}, accuracy = {1:.3f},  time: {2:.7f}'.format(
            self.mean_epoch_loss, accuracy,
            epoch_end - epoch_start)
        )
        return all_outputs

    def train(self):
        """Alternate training and validation epochs, saving a checkpoint and predictions each time."""
        for epoch_num in range(self.epoch_count):
            self.all_train_outputs = self.epoch(epoch_num, mode=TRAIN_CONST)
            # The checkpoint name carries the training loss: mean_epoch_loss has not yet
            # been overwritten by the validation pass.
            torch.save(
                self.model.state_dict(),
                os.path.join(
                    self.model_path,
                    'epoch={0}_loss={1:.7f}.pt'.format(epoch_num, self.mean_epoch_loss)
                )
            )
            self._save_prediction(
                self.train_df[self.input_label].values,
                self.train_df[self.target_label].values,
                self.all_train_outputs,
                os.path.join(self.prediction_path, 'train_epoch={}.csv'.format(epoch_num))
            )
            self.all_val_outputs = self.epoch(epoch_num, mode=VAL_CONST)
            self._save_prediction(
                self.val_df[self.input_label].values,
                self.val_df[self.target_label].values,
                self.all_val_outputs,
                os.path.join(self.prediction_path, 'val_epoch={}.csv'.format(epoch_num))
            )

In [None]:
# Build a Trainer from the YAML config and run all train/validation epochs.
# NOTE(review): the path is absolute and Colab-specific; adjust it when running elsewhere.
config_file = '/content/config(colab).yml'
trainer = Trainer(config_file)
trainer.train()

Device = cuda


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1, train: 100%|██████████| 926/926 [21:10<00:00,  1.37s/it]


Mean loss = 0.32844114071864544, accuracy = 0.339,  time: 1270.7917517


Epoch 1, val: 100%|██████████| 164/164 [01:23<00:00,  1.96it/s]


Mean loss = 0.2883703247984735, accuracy = 0.644,  time: 83.5038896


Epoch 2, train: 100%|██████████| 926/926 [21:20<00:00,  1.38s/it]


Mean loss = 0.2634850988088107, accuracy = 0.317,  time: 1280.7973652


Epoch 2, val: 100%|██████████| 164/164 [01:23<00:00,  1.96it/s]


Mean loss = 0.2841940712092853, accuracy = 0.650,  time: 83.6330375


Epoch 3, train: 100%|██████████| 926/926 [21:20<00:00,  1.38s/it]


Mean loss = 0.22448270273099194, accuracy = 0.304,  time: 1280.2892721


Epoch 3, val: 100%|██████████| 164/164 [01:23<00:00,  1.96it/s]


Mean loss = 0.30258184312502057, accuracy = 0.656,  time: 83.6512787


Epoch 4, train: 100%|██████████| 926/926 [21:19<00:00,  1.38s/it]


Mean loss = 0.18313788907445533, accuracy = 0.310,  time: 1279.7670448


Epoch 4, val: 100%|██████████| 164/164 [01:23<00:00,  1.96it/s]


Mean loss = 0.3169262011694472, accuracy = 0.655,  time: 83.7160004


Epoch 5, train: 100%|██████████| 926/926 [21:20<00:00,  1.38s/it]


Mean loss = 0.1479131047720528, accuracy = 0.314,  time: 1280.4052771


Epoch 5, val: 100%|██████████| 164/164 [01:23<00:00,  1.95it/s]


Mean loss = 0.3384379159386565, accuracy = 0.655,  time: 83.9707626
