In [1]:
import os
import pickle
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Any

import pandas as pd
import torch
import yaml
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer, IntervalStrategy, AutoModelForSequenceClassification, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# FIXME: get rid of this sys.path hack once `enginora` can be imported as an
# installed package.
import os
import sys

# The checkout location can be overridden through the ENGINORA_SRC environment
# variable; the original machine-specific path stays as the fallback, so the
# notebook behaves exactly as before when the variable is not set.
ENGINORA_SRC = os.environ.get(
    "ENGINORA_SRC",
    "C:/Users/Windows 10/NINA/UNI/EngineeringThesis/enginora/src/",
)
# Guarded append: re-running this cell must not pile up duplicate entries.
if ENGINORA_SRC not in sys.path:
    sys.path.append(ENGINORA_SRC)
import enginora

In [3]:
# Sets the WANDB_DISABLED switch before any TrainingArguments are created.
# NOTE(review): the TrainingArguments cell output further down reports this
# environment variable as deprecated (removal announced for v5) and points to
# `report_to` as the replacement.
os.environ['WANDB_DISABLED'] = 'true'

---
# Model

In [4]:
@dataclass
class ModelConfig:
    """Typed view of the ``model`` section of the configuration.

    After construction the numeric and boolean fields are coerced with the
    matching builtin constructor (``int`` / ``bool``).
    """
    model_name: str
    num_labels: int
    max_length: int
    truncation: bool
    padding: str
    return_tensors: str
    output_attentions: bool
    output_hidden_states: bool

    def __post_init__(self):
        # Integers first, then flags -- the same order the fields are declared in.
        for int_field in ('num_labels', 'max_length'):
            setattr(self, int_field, int(getattr(self, int_field)))
        for flag_field in ('truncation', 'output_attentions', 'output_hidden_states'):
            setattr(self, flag_field, bool(getattr(self, flag_field)))

    def create_tokenizer(self):
        """Load the pretrained tokenizer for ``model_name`` and return a callable
        that applies this config's padding / length / truncation / tensor options."""
        pretrained = AutoTokenizer.from_pretrained(self.model_name)

        def tokenize(input_tokens):
            # Options are read from the config at call time, not at creation time.
            return pretrained(
                input_tokens,
                padding=self.padding,
                max_length=self.max_length,
                truncation=self.truncation,
                return_tensors=self.return_tensors
            )

        return tokenize

    def create_model(self):
        """Instantiate the pretrained sequence-classification model for ``model_name``."""
        model_options = {
            'num_labels': self.num_labels,
            'output_attentions': self.output_attentions,
            'output_hidden_states': self.output_hidden_states,
        }
        return AutoModelForSequenceClassification.from_pretrained(self.model_name, **model_options)

---
# Dataset selectors

In [5]:
class Selector(ABC):
    """Abstract base class for dataset selectors.

    A concrete selector implements ``select``, which takes a DataFrame and
    returns a DataFrame.
    """

    def __init__(self, **kwargs):
        """Accept arbitrary keyword arguments; the base class ignores them."""

    @abstractmethod
    def select(self, dataset: pd.DataFrame) -> pd.DataFrame:
        """Return the selection computed from ``dataset`` (must be overridden)."""


@dataclass
class SelectorConfig(yaml.YAMLObject):
    """Configuration entry describing a single selector."""
    name: str  # key looked up in the selector registries (SELECTORS / SELECTORS_TESTING)
    args: Dict  # unpacked as keyword arguments into the selector class constructor

In [6]:
class DummySelector(Selector):
    """No-op selector: hands the dataset back unchanged."""

    def select(self, dataset: pd.DataFrame) -> pd.DataFrame:
        """Return ``dataset`` itself, without filtering or copying it."""
        return dataset

In [7]:
# Registry of training-set selectors: maps a SelectorConfig `name` to the
# selector class that TrainingConfig.load_dataset instantiates.
SELECTORS = {
    'DUMMY': DummySelector,
}

---
# Train Set

In [8]:
@dataclass
class TrainSetConfig:
    """Location of the training data plus the selectors applied to it.

    Attributes:
        path: path of the CSV file holding the training set.
        selectors: selector entries. Usually given as a list of mappings with
            the ``SelectorConfig`` fields (straight from the parsed YAML);
            they are converted to ``SelectorConfig`` instances after
            construction.
    """
    path: str
    selectors: List[SelectorConfig]

    def __post_init__(self):
        # An empty `selectors:` key is parsed from YAML as None; treat it as
        # "no selectors", mirroring how MetricsConfig handles a None `args`.
        raw_selectors = self.selectors if self.selectors is not None else []
        # Entries that already are SelectorConfig objects are kept as they are,
        # so the config can also be built (or rebuilt) programmatically.
        self.selectors = [
            entry if isinstance(entry, SelectorConfig) else SelectorConfig(**entry)
            for entry in raw_selectors
        ]

In [9]:
@dataclass
class TrainingConfig:
    """Typed view of the ``training`` section of the configuration."""
    dataset: TrainSetConfig
    batch_size: int
    epochs: int
    learning_rate: float
    output_dir: str

    def __post_init__(self):
        # The nested dataset mapping becomes a TrainSetConfig ...
        dataset_section = self.dataset
        self.dataset = TrainSetConfig(**dataset_section)
        # ... and the scalar hyper-parameters are cast to their declared types.
        for field_name, cast in (('batch_size', int), ('epochs', int), ('learning_rate', float)):
            setattr(self, field_name, cast(getattr(self, field_name)))

    def load_dataset(self) -> pd.DataFrame:
        """Read the headerless training CSV (columns ``id``, ``text``, ``label``)
        and pass it through every configured selector, in order."""
        frame = pd.read_csv(self.dataset.path, header=None, names=['id', 'text', 'label'])

        for selector_config in self.dataset.selectors:
            selector_cls = SELECTORS[selector_config.name]
            frame = selector_cls(**selector_config.args).select(frame)

        return frame

---
# Metrics

In [10]:
# FIXME: solarski
# About the metrics: I'm not sure we should have separate metrics for each step (validation, test, control).
# I would rather go with a single metrics config that is common to all of those steps.
# Then we would not need the strange test_set_select (etc.) methods, which are redundant anyway, since the metrics
# are calculated during the call to trainer.predict().
# The only thing we are interested in is saving those metrics for the test and control stages - in training too for some

In [11]:
from enginora.metrics.slicing_scoring import slicing_scores
from dataclasses import field
from enginora.utils import ignore_unmatched_kwargs
# Metric registry: metric name -> scoring callable. Every callable is wrapped
# with ignore_unmatched_kwargs before it is stored.
METRIC_FUNCTIONS = dict(
    accuracy=ignore_unmatched_kwargs(accuracy_score),
    slicing_scores=ignore_unmatched_kwargs(slicing_scores),
)

@dataclass
class MetricsConfig:
    """Name of a metric together with the extra keyword arguments configured for it."""
    name: str
    args: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # Anything other than an explicit None is kept untouched.
        if self.args is not None:
            return
        # An explicit None is normalised to a fresh empty dict.
        self.args = {}

---
# Validation set

In [12]:
@dataclass
class ValidationSetConfig:
    """Location of the validation data."""
    path: str  # CSV file read by ValidationConfig.load_dataset

In [13]:
@dataclass
class ValidationConfig:
    """Typed view of the ``validation`` section of the configuration.

    Attributes:
        dataset: validation set location (mapping converted to ``ValidationSetConfig``).
        batch_size: evaluation batch size, cast to ``int``.
        metrics: metric entries (mappings converted to ``MetricsConfig``).
        metric_for_best_model: metric name passed on to ``TrainingArguments``
            as ``metric_for_best_model``.
    """
    dataset: ValidationSetConfig
    batch_size: int
    metrics: List[MetricsConfig]
    metric_for_best_model: str

    def __post_init__(self):
        self.dataset = ValidationSetConfig(**self.dataset)
        self.batch_size = int(self.batch_size)
        self.metrics = [MetricsConfig(**m) for m in self.metrics]

    def load_dataset(self) -> pd.DataFrame:
        """Read the headerless validation CSV into ``id`` / ``text`` / ``label`` columns."""
        return pd.read_csv(self.dataset.path, header=None, names=['id', 'text', 'label'])

    # TODO: rename ???
    def validation_set_select(self, predictions) -> dict:
        """Compute every configured metric for one evaluation pass.

        Used as the ``compute_metrics`` callback of the ``Trainer``.

        Args:
            predictions: indexable pair whose item 0 holds the model outputs
                and item 1 the true labels.

        Returns:
            dict mapping each configured metric name to its value.
        """
        model_outputs, true_labels = predictions[0], predictions[1]
        # When the model returns extra outputs (attentions / hidden states) the
        # outputs arrive as a tuple with the logits first; when it returns the
        # logits only, they arrive as a bare array. Accept both layouts instead
        # of indexing into the batch dimension by accident.
        logits = model_outputs[0] if isinstance(model_outputs, (tuple, list)) else model_outputs
        predicted_labels = logits.argmax(1)

        # Forward the per-metric arguments from the config, as
        # TestingConfig.test_set_select does; the registry callables are
        # wrapped with ignore_unmatched_kwargs.
        return {
            metric.name:
                METRIC_FUNCTIONS[metric.name](true_labels, predicted_labels, **metric.args)
            for metric in self.metrics
        }

---
# Predictions

In [14]:
def save_predictions(object, predictions_filename):
    """Pickle ``object`` into the file at ``predictions_filename``.

    Args:
        object: any picklable value (the notebook passes the result of
            ``trainer.predict``).
        predictions_filename: path of the file to create or overwrite.

    Raises:
        OSError: if the file cannot be opened for writing.
        Exception: whatever ``pickle.dump`` raises for an unpicklable value.
    """
    # The context manager closes the handle even when pickling fails part-way,
    # which the previous open()/close() pair did not guarantee.
    with open(predictions_filename, 'wb') as file:
        pickle.dump(object, file)

In [15]:
def load_predictions(predictions_filename):
    """Load and return the object pickled in ``predictions_filename``.

    Args:
        predictions_filename: path of a file written by ``save_predictions``.

    Returns:
        The unpickled object.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    # NOTE: unpickling executes code embedded in the file -- only load result
    # files produced by this notebook, never files from an untrusted source.
    # The context manager closes the handle even when unpickling fails.
    with open(predictions_filename, 'rb') as file:
        return pickle.load(file)

---
# Test set

In [16]:
@dataclass
class TestSetConfig:
    """Location of the test data plus the selectors applied to it.

    Attributes:
        path: path of the CSV file holding the test set.
        selectors: selector entries. Usually given as a list of mappings with
            the ``SelectorConfig`` fields (straight from the parsed YAML);
            they are converted to ``SelectorConfig`` instances after
            construction.
    """
    path: str
    selectors: List[SelectorConfig]

    def __post_init__(self):
        # An empty `selectors:` key is parsed from YAML as None; treat it as
        # "no selectors", mirroring how MetricsConfig handles a None `args`.
        raw_selectors = self.selectors if self.selectors is not None else []
        # Entries that already are SelectorConfig objects are kept as they are,
        # so the config can also be built (or rebuilt) programmatically.
        self.selectors = [
            entry if isinstance(entry, SelectorConfig) else SelectorConfig(**entry)
            for entry in raw_selectors
        ]

In [17]:
from enginora.selector.slicing.slicing import SlicingSelector

In [18]:
# Registry of test-set selectors: maps a SelectorConfig `name` to the selector
# class that TestingConfig.test_set_select instantiates.
SELECTORS_TESTING = {
    'SLICING': SlicingSelector,
}

In [19]:
@dataclass
class TestingConfig:
    """Typed view of the ``testing`` section of the configuration.

    Attributes:
        dataset: test set location and selectors (mapping converted to ``TestSetConfig``).
        metrics: metric entries (mappings converted to ``MetricsConfig``).
        results_file: path of the pickle holding the saved test predictions.
    """
    dataset: TestSetConfig
    metrics: List[MetricsConfig]
    results_file: str

    def __post_init__(self):
        self.dataset = TestSetConfig(**self.dataset)
        self.metrics = [MetricsConfig(**m) for m in self.metrics]

    def load_dataset(self) -> pd.DataFrame:
        """Read the headerless test CSV into ``id`` / ``text`` / ``label`` columns."""
        return pd.read_csv(self.dataset.path, header=None, names=['id', 'text', 'label'])

    # TODO: rename ???
    def test_set_select(self) -> dict:
        """Run the test selectors and compute the configured test metrics.

        Loads the predictions pickled in ``results_file``, runs every
        configured selector over the test CSV (each selector's result is
        dumped to that selector's ``result_file``), then evaluates each metric.

        Returns:
            dict mapping each configured metric name to its value.
        """
        saved = load_predictions(self.results_file)
        model_outputs, true_labels = saved[0], saved[1]
        # When the model returns extra outputs (attentions / hidden states) the
        # stored outputs are a tuple with the logits first; when it returns the
        # logits only, they are a bare array. Accept both layouts.
        logits = model_outputs[0] if isinstance(model_outputs, (tuple, list)) else model_outputs
        predicted_labels = logits.argmax(1)

        # Reuse load_dataset() rather than repeating the read_csv call.
        df = self.load_dataset()
        for t in self.dataset.selectors:
            selector = SELECTORS_TESTING[t.name](**t.args)
            selected = selector.select(df)
            selected.dump(selector.result_file)  # FIXME: probably change

        # `proba_predictions` receives the raw first model output as stored;
        # no normalisation is applied here.
        return {
            metric.name:
                METRIC_FUNCTIONS[metric.name](
                    true_labels, predicted_labels, **metric.args, proba_predictions=logits
                )
            for metric in self.metrics
        }

---
# Control set

In [20]:
@dataclass
class ControlSetConfig:
    """Location of the control data."""
    path: str  # CSV file read by ControlConfig.load_dataset

In [21]:
@dataclass
class ControlConfig:
    """Typed view of the ``control`` section of the configuration.

    Attributes:
        dataset: control set location (mapping converted to ``ControlSetConfig``).
        metrics: metric entries (mappings converted to ``MetricsConfig``).
        results_file: path of the pickle holding the saved control predictions.
    """
    dataset: ControlSetConfig
    metrics: List[MetricsConfig]
    results_file: str

    def __post_init__(self):
        self.dataset = ControlSetConfig(**self.dataset)
        self.metrics = [MetricsConfig(**m) for m in self.metrics]

    def load_dataset(self) -> pd.DataFrame:
        """Read the headerless control CSV into ``id`` / ``text`` / ``label`` columns."""
        return pd.read_csv(self.dataset.path, header=None, names=['id', 'text', 'label'])

    # TODO: rename ???
    def control_set_select(self) -> dict:
        """Compute the configured control metrics from the pickled predictions.

        Returns:
            dict mapping each configured metric name to its value.
        """
        # Each intermediate value gets its own name; the earlier version
        # rebound `predictions` three times (the old "variable mutation" FIXME).
        saved = load_predictions(self.results_file)
        model_outputs, true_labels = saved[0], saved[1]
        # When the model returns extra outputs (attentions / hidden states) the
        # stored outputs are a tuple with the logits first; when it returns the
        # logits only, they are a bare array. Accept both layouts.
        logits = model_outputs[0] if isinstance(model_outputs, (tuple, list)) else model_outputs
        predicted_labels = logits.argmax(1)

        # Forward the per-metric arguments from the config, as
        # TestingConfig.test_set_select does; the registry callables are
        # wrapped with ignore_unmatched_kwargs.
        return {
            metric.name:
                METRIC_FUNCTIONS[metric.name](true_labels, predicted_labels, **metric.args)
            for metric in self.metrics
        }

---
# Configuration

In [22]:
# Parse the run configuration. Its top-level sections ('model', 'training',
# 'validation', 'testing', 'control') are turned into typed config objects in
# the following cells.
with open('./config.yaml', 'r') as stream:
    configuration = yaml.safe_load(stream)

In [23]:
# Build the typed model configuration from the 'model' section and display it.
model_config = ModelConfig(**configuration['model'])
model_config

ModelConfig(model_name='bert-base-cased', num_labels=12, max_length=256, truncation=True, padding='max_length', return_tensors='pt', output_attentions=True, output_hidden_states=True)

In [24]:
# Build the typed training configuration from the 'training' section and display it.
training_config = TrainingConfig(**configuration['training'])
training_config

TrainingConfig(dataset=TrainSetConfig(path='../../data/thedeep.subset.train.txt', selectors=[SelectorConfig(name='DUMMY', args={})]), batch_size=2, epochs=1, learning_rate=0.001, output_dir='ClassificationBERT')

In [25]:
# Build the typed validation configuration from the 'validation' section and display it.
validation_config = ValidationConfig(**configuration['validation'])
validation_config

ValidationConfig(dataset=ValidationSetConfig(path='../../data/thedeep.subset.validation.txt'), batch_size=2, metrics=[MetricsConfig(name='accuracy', args={})], metric_for_best_model='accuracy')

In [26]:
# Build the typed testing configuration from the 'testing' section and display it.
test_config = TestingConfig(**configuration['testing'])
test_config

TestingConfig(dataset=TestSetConfig(path='../../data/thedeep.subset.test.txt', selectors=[SelectorConfig(name='SLICING', args={'result_file': '../../data/slicing.pickle'})]), metrics=[MetricsConfig(name='accuracy', args={}), MetricsConfig(name='slicing_scores', args={'slice_file': '../../data/slicing.pickle'})], results_file='./file_test_results.pickle')

In [27]:
# Build the typed control configuration from the 'control' section and display it.
control_config = ControlConfig(**configuration['control'])
control_config

ControlConfig(dataset=ControlSetConfig(path='../../data/thedeep.subset.control.txt'), metrics=[MetricsConfig(name='accuracy', args={})], results_file='./file_control_results.pickle')

---
# Notebook flow

In [28]:
# `tokenizer` is the configured callable returned by ModelConfig.create_tokenizer;
# `model` is the pretrained sequence-classification model for the same model name.
tokenizer = model_config.create_tokenizer()
model = model_config.create_model()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [29]:
# Load every split and keep only a small head of each (20 training rows,
# 10 rows for the other splits).
# NOTE(review): TestingConfig.test_set_select re-reads the test CSV without this
# slice when running its selectors, while the saved predictions cover only the
# 10 rows kept here -- confirm the slices and the predictions line up as intended.
data = {
    'train': training_config.load_dataset()[:20],
    'validation': validation_config.load_dataset()[:10],
    'test': test_config.load_dataset()[:10],
    'control': control_config.load_dataset()[:10],
}

In [30]:
# Tokenize the 'text' column of every split with the configured tokenizer callable.
tokens = {
    dataset_type: tokenizer(dataset['text'].tolist())
    for dataset_type, dataset in data.items()
}

# Turn the 'label' column of every split into a tensor, keyed like `tokens`.
labels = {
    dataset_type: torch.tensor(dataset['label'].tolist())
    for dataset_type, dataset in data.items()
}

In [31]:
class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer output with labels.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    plus ``token_type_ids`` when the tokenizer output provides them.

    Args:
        tokens: tokenizer output exposing ``input_ids`` and ``attention_mask``
            (and optionally ``token_type_ids``) as attributes, each indexable
            by sample position.
        labels: one label per sample.
    """
    def __init__(self, tokens, labels: torch.Tensor):
        self.input_ids = tokens.input_ids
        self.attention_mask = tokens.attention_mask
        # The tokenizer is chosen by model name, and not every tokenizer emits
        # segment ids; store None instead of failing when they are absent.
        self.token_type_ids = getattr(tokens, 'token_type_ids', None)
        self.y = labels

    def __len__(self):
        """Number of samples, taken from the labels."""
        return len(self.y)

    def __getitem__(self, i):
        """Return the feature dict for sample ``i``."""
        item = {
            'input_ids': self.input_ids[i],
            'attention_mask': self.attention_mask[i],
        }
        # Only hand segment ids to the model when the tokenizer produced them.
        if self.token_type_ids is not None:
            item['token_type_ids'] = self.token_type_ids[i]
        item['labels'] = self.y[i]
        return item

In [32]:
# Wrap the tokens and labels of every split in a TextDataset, keyed by split name.
datasets = {
    dataset_type: TextDataset(tokens[dataset_type], labels[dataset_type])
    for dataset_type in data.keys()
}

In [33]:
# Trainer settings assembled from the typed configs. Evaluation, checkpoint
# saving and logging all use the per-epoch strategy, and the best checkpoint
# (judged by `metric_for_best_model`) is loaded once training ends.
training_args = TrainingArguments(
    output_dir=training_config.output_dir,
    learning_rate=training_config.learning_rate,
    evaluation_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    logging_strategy=IntervalStrategy.EPOCH,
    per_device_train_batch_size=training_config.batch_size,
    per_device_eval_batch_size=validation_config.batch_size,
    load_best_model_at_end=True,
    metric_for_best_model=validation_config.metric_for_best_model,
    num_train_epochs=training_config.epochs,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [34]:
# Wire model, train/validation datasets and arguments together. Metrics are
# computed by ValidationConfig.validation_set_select, which returns a dict
# keyed by metric name.
trainer = Trainer(
    model=model,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    compute_metrics=validation_config.validation_set_select,
    args=training_args
)

In [35]:
# Train on the 'train' split; evaluation on the 'validation' split shows up in the log below.
trainer.train()

***** Running training *****
  Num examples = 20
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 10


100%|██████████| 10/10 [00:35<00:00,  3.19s/it]***** Running Evaluation *****
  Num examples = 10
  Batch size = 2


{'loss': 2.5514, 'learning_rate': 0.0, 'epoch': 1.0}


                                               
100%|██████████| 10/10 [00:41<00:00,  3.19s/it]Saving model checkpoint to ClassificationBERT\checkpoint-10
Configuration saved in ClassificationBERT\checkpoint-10\config.json


{'eval_loss': 2.5990102291107178, 'eval_accuracy': 0.3, 'eval_runtime': 5.7028, 'eval_samples_per_second': 1.754, 'eval_steps_per_second': 0.877, 'epoch': 1.0}


Model weights saved in ClassificationBERT\checkpoint-10\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ClassificationBERT\checkpoint-10 (score: 0.3).
100%|██████████| 10/10 [00:43<00:00,  4.36s/it]

{'train_runtime': 43.6141, 'train_samples_per_second': 0.459, 'train_steps_per_second': 0.229, 'train_loss': 2.5514015197753905, 'epoch': 1.0}





TrainOutput(global_step=10, training_loss=2.5514015197753905, metrics={'train_runtime': 43.6141, 'train_samples_per_second': 0.459, 'train_steps_per_second': 0.229, 'train_loss': 2.5514015197753905, 'epoch': 1.0})

In [36]:
# Predict on the 'test' split and display the metrics attached to the result.
test_result = trainer.predict(datasets['test'])
test_result.metrics

***** Running Prediction *****
  Num examples = 10
  Batch size = 2
100%|██████████| 5/5 [00:04<00:00,  1.10s/it]

{'test_loss': 2.7577528953552246,
 'test_accuracy': 0.1,
 'test_runtime': 6.1194,
 'test_samples_per_second': 1.634,
 'test_steps_per_second': 0.817}

In [37]:
# Pickle the test predictions. The path is read from the raw configuration dict
# under the same 'results_file' key that TestingConfig was built from, and
# test_set_select later reads it back through test_config.results_file.
save_predictions(test_result, configuration['testing']['results_file'])

In [38]:
# Run the configured test selectors and compute the test metrics from the pickled predictions.
test_config.test_set_select()

100%|██████████| 2585/2585 [00:02<00:00, 895.79it/s]


{'accuracy': 0.1,
 'slicing_scores':                    accuracy
 overall                 0.1
 short                   0.0
 textblob_polarity       0.5}

In [39]:
# Predict on the 'control' split and display the metrics attached to the result.
# Note that the output below still reports them under the `test_` key prefix.
control_result = trainer.predict(datasets['control'])
control_result.metrics

***** Running Prediction *****
  Num examples = 10
  Batch size = 2
10it [00:16,  1.77s/it]                      

{'test_loss': 3.062967300415039,
 'test_accuracy': 0.2,
 'test_runtime': 7.3063,
 'test_samples_per_second': 1.369,
 'test_steps_per_second': 0.684}

In [40]:
# Pickle the control predictions. The path is read from the raw configuration
# dict under the same 'results_file' key that ControlConfig was built from, and
# control_set_select later reads it back through control_config.results_file.
save_predictions(control_result, configuration['control']['results_file'])

In [41]:
# Compute the control metrics from the pickled control predictions.
control_config.control_set_select()

{'accuracy': 0.2}