In [1]:
# Enumerate GPUs in PCI bus order and expose only device index 5 to this kernel.
# These variables are read when CUDA is initialised, so they are set in the very
# first cell, before `torch` is imported.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=5


In [2]:
import os
import sys

# Absolute path of the parent of the current working directory. The later cells
# import the project-local `modules` package, presumably from this location.
module_path = os.path.abspath(os.pardir)

# Register the path only when it is missing, so re-running the cell does not
# pile up duplicate entries in sys.path.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
# Seed passed to both `random.seed` and `torch.manual_seed` further down.
SEED: int = 42
# Base output location; the training loop appends "/<model label>." to it and
# hands the result to Trainer as `out_dir`.
OUT_PATH: str = '../results/training/raw'

In [4]:
# Value of the 'hid_size' entry in the config of every Model built below.
HID_SIZE: int = 256
# Value of the 'dropout' entry in the same Model config.
DROPOUT: float = 0.4

In [5]:
import random

import torch

# Reproducibility setup. First restrict cuDNN to deterministic algorithms and
# turn off its benchmark-based algorithm selection ...
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# ... then seed PyTorch's and Python's random number generators from SEED.
torch.manual_seed(SEED)
random.seed(SEED)

In [6]:
# Common path prefix of the dataset files; the loading cell builds each file
# name as f"{DATA_PATH}.{suffix}.csv".
DATA_PATH: str = '../data/imdb'
# (split label, file-name suffix) pairs. The label becomes the key in the
# `datasets` dict; 'train' is the split collation_fn reads label info from.
DATASETS: list = [
    ('train', '_prepped.train'),
    ('test', '_prepped.test')
]

In [7]:
# (short label, model reference) pairs.
#   * label: key in the `encoders` dict, 'name' of the trained Model and part of
#     the Trainer output prefix.
#   * reference: passed to Encoder as its 'model' setting (these look like
#     Hugging Face model identifiers -- confirm in Encoder).
# Order matters: collation_fn takes the encoded column name from the encoder of
# the FIRST entry.
MODELS: list = [
    ('base', 'bert-base-uncased'),
    ('textattack', 'textattack/bert-base-uncased-imdb'),
    ('fabriceyhc', 'fabriceyhc/bert-base-uncased-imdb'),
    ('wakaka', 'Wakaka/bert-finetuned-imdb')
]

In [8]:
# Keyword arguments shared by every Data(...) instance created below.
data_config: dict = {
    # Sentiment label -> integer class id. The number of entries is also used as
    # the Model's 'out_size'. Presumably what Data.encode_label applies -- confirm.
    'polarities': {
        "negative": 0,
        "positive": 1
    },
    # Name of the text column; read back as `dataset.data_label` when encoding.
    'data_label': 'text',
    # Name of the label column; read back as `target_label` in collation_fn.
    'target_label': 'sentiment'
}

In [9]:
from typing import Dict
from modules import Data

# One Data object per configured split, keyed by the split label. Each one is
# pointed at "<DATA_PATH>.<suffix>.csv" and receives the shared data_config.
datasets: Dict[str, Data] = dict(
    (split, Data(data_path=f"{DATA_PATH}.{suffix}.csv", **data_config))
    for split, suffix in DATASETS
)

In [10]:
# Settings merged into every Encoder config next to the per-model 'model' entry.
# NOTE(review): 'layers': [11] presumably selects the last hidden layer of a
# 12-layer BERT-base (0-indexed) -- confirm against Encoder.
encoder_config: dict = {
    'layers': [11]
}

In [11]:
from modules import Encoder

# One Encoder per entry in MODELS, keyed by the short model label. Every config
# is the model reference plus the shared encoder_config (entries in
# encoder_config win on a key clash, exactly as in a chained dict merge).
encoders: Dict[str, Encoder] = dict(
    (name, Encoder({'model': checkpoint, **encoder_config}))
    for name, checkpoint in MODELS
)

In [12]:
from modules.util import get_device


def collation_fn(batch: list, encoded_column: str = None) -> tuple:
    """Collate a batch of samples into an ``(inputs, targets)`` tensor pair.

    Every sample must support ``sample[column].values[0]`` for both the encoded
    column and the target column (e.g. a one-row DataFrame slice -- assumption,
    confirm against the Trainer that feeds this function).

    Args:
        batch: The samples to collate.
        encoded_column: Name of the column holding the pre-computed encoding
            tensor. When omitted, the ``col_name`` of the encoder belonging to
            the first entry of ``MODELS`` is used, which keeps the behaviour of
            a plain ``collation_fn(batch)`` call unchanged. Pass it explicitly
            (e.g. via ``functools.partial``) to collate another encoder's column.

    Returns:
        ``(inputs, targets)`` where ``inputs`` is the stack of the encoding
        tensors and ``targets`` is a ``torch.long`` tensor of the encoded
        labels; both live on the device returned by ``get_device()``.
    """
    if encoded_column is None:
        # NOTE(review): this default is only correct for every model if all
        # encoders share the same `col_name` -- confirm in Encoder.
        encoded_column = encoders[MODELS[0][0]].col_name

    # Label column name and label encoding both come from the training split.
    train_data = datasets['train']
    target_column: str = train_data.target_label

    # Resolve the target device once per batch instead of once per tensor.
    device = get_device()

    inputs = torch.stack([sample[encoded_column].values[0] for sample in batch])
    targets = [
        train_data.encode_label(sample[target_column].values[0])
        for sample in batch
    ]

    return (
        inputs.to(device),
        torch.tensor(targets, dtype=torch.long, device=device),
    )

In [13]:
from modules import Model, Trainer

# Train one Model per encoder, in the order the encoders were registered.
for label, encoder in encoders.items():

    # Run the current encoder over the text column of every split (presumably
    # the encodings are stored on the frame itself -- confirm in df_encode).
    # NOTE(review): collation_fn always reads the column named by the FIRST
    # encoder's `col_name`; if `col_name` differs between encoders, every model
    # would be trained on the first encoder's output -- confirm in Encoder.
    for dataset in datasets.values():
        encoder.df_encode(dataset.data, col=dataset.data_label)

    # Input size follows the encoder; output size is the number of polarities.
    model = Model(dict(
        name=label,
        in_size=encoder.dim,
        hid_size=HID_SIZE,
        out_size=len(data_config['polarities']),
        dropout=DROPOUT,
    ))

    # Build the trainer and invoke it right away; the trailing "." in `out_dir`
    # makes it a per-model prefix (presumably for output file names).
    Trainer(
        model,
        datasets,
        collation_fn,
        out_dir=f'{OUT_PATH}/{label}.',
        user_config={},
    )()

                                                                                                                                                                                                                                                                                                                                                                          