In [1]:
import os
import pickle
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Optional

import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer, IntervalStrategy, AutoModelForSequenceClassification, AutoTokenizer

In [2]:
# Set the WANDB_DISABLED switch before any Trainer objects are created.
# NOTE(review): the TrainingArguments cell output reports this variable as
# deprecated in favour of `report_to` - consider migrating.
os.environ['WANDB_DISABLED'] = 'true'

# Pick the GPU when CUDA is available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any later cell visible here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

---
# Dataset transformations

In [3]:
class Transformation(ABC):
    """Abstract base for a single dataset transformation step.

    Concrete subclasses may be constructed with arbitrary keyword arguments
    and have to provide :meth:`transform`.
    """

    def __init__(self, **kwargs):
        # The base class stores nothing; keyword arguments are accepted and ignored.
        return None

    @abstractmethod
    def transform(self, dataset: pd.DataFrame) -> pd.DataFrame:
        """Return the transformed version of ``dataset``."""
        ...


@dataclass
class TransformationConfig:
    """Describes one transformation to apply to the train set."""
    # Key used to look the transformation class up in TRANSFORMATIONS.
    name: str
    # Arguments handed to the transformation's constructor.
    kwargs: Dict

In [4]:
class DummyTransformation(Transformation):
    """No-op transformation that hands the dataset back untouched."""

    def transform(self, dataset: pd.DataFrame) -> pd.DataFrame:
        # Identity: the very same DataFrame object is returned, no copy is made.
        return dataset

In [5]:
# Registry mapping a transformation name (TransformationConfig.name) to its class.
TRANSFORMATIONS = {
    'DUMMY': DummyTransformation,
}

---
# Train set

In [6]:
@dataclass
class TrainSetConfig:
    """Settings for loading and transforming the train set."""
    # Location of the CSV file read by train_set_select.
    path: str
    # Transformations applied to the loaded frame, in list order.
    transformations: List[TransformationConfig]

In [7]:
def train_set_select(config: TrainSetConfig, max_rows: Optional[int] = 100) -> pd.DataFrame:
    """Load the train set and run the configured transformations over it.

    The file at ``config.path`` is read as a header-less CSV whose columns
    are named ``id``, ``text`` and ``label``.

    Args:
        config: Train set settings (file path and transformation list).
        max_rows: Keep only the first ``max_rows`` rows. The default of 100
            matches the limit that used to be hard-coded; pass ``None`` to
            keep every row.

    Returns:
        The (possibly truncated) DataFrame after every transformation has
        been applied in list order.

    Raises:
        KeyError: If a transformation name is not registered in
            ``TRANSFORMATIONS``.
    """
    df = pd.read_csv(config.path, header=None, names=['id', 'text', 'label'])[:max_rows]

    for t in config.transformations:
        # ``kwargs`` is a mapping of keyword arguments, so it must be unpacked
        # with ``**``. A single ``*`` passes the dict *keys* as positional
        # arguments, which Transformation.__init__(**kwargs) rejects.
        transformation = TRANSFORMATIONS[t.name](**t.kwargs)
        df = transformation.transform(df)

    return df

---
# Metrics

In [8]:
# Registry mapping a metric name (MetricsConfig.name) to the function that computes it.
METRIC_FUNCTIONS = {'accuracy': accuracy_score}


@dataclass
class MetricsConfig:
    """Names one metric to compute."""
    # Key used to look the metric function up in METRIC_FUNCTIONS.
    name: str

---
# Validation set

In [9]:
@dataclass
class ValidationSetConfig:
    """Settings for the validation set."""
    # Location of the CSV file read by load_validation_set.
    path_set: str
    # Metrics reported during evaluation (iterated by compute_metrics).
    metrics: List[MetricsConfig]

In [10]:
def load_validation_set(config: ValidationSetConfig, max_rows: Optional[int] = 10) -> pd.DataFrame:
    """Read the validation set from ``config.path_set``.

    The file is parsed as a header-less CSV whose columns are named ``id``,
    ``text`` and ``label``.

    Args:
        config: Validation set settings; only ``path_set`` is used here.
        max_rows: Keep only the first ``max_rows`` rows. The default of 10
            matches the limit that used to be hard-coded; pass ``None`` to
            load the complete file.

    Returns:
        The (possibly truncated) validation DataFrame.
    """
    df = pd.read_csv(config.path_set, header=None, names=['id', 'text', 'label'])
    # Slicing with ``None`` as the upper bound keeps every row.
    return df[:max_rows]

---
# Predictions

In [11]:
def save_predictions(object, predictions_filename):
    """Pickle ``object`` into the file ``predictions_filename``.

    Args:
        object: Any picklable value. (The name shadows the builtin; it is
            kept so that keyword callers continue to work.)
        predictions_filename: Destination path, opened in binary write mode,
            so an existing file is overwritten.
    """
    # The context manager closes the handle even when pickling raises, which
    # the former explicit open()/close() pair did not guarantee.
    with open(predictions_filename, 'wb') as file:
        pickle.dump(object, file)

In [12]:
def load_predictions(predictions_filename):
    """Load and return the object pickled in ``predictions_filename``.

    NOTE(security): ``pickle.load`` can execute arbitrary code while
    deserialising. Only open files that were produced by a trusted source
    (e.g. by ``save_predictions`` in this notebook).

    Args:
        predictions_filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # ``with`` guarantees the handle is closed even if unpickling fails.
    with open(predictions_filename, 'rb') as file:
        return pickle.load(file)

---
# Test set

In [13]:
@dataclass
class TestSetConfig:
    """Settings for the test set and for scoring its stored predictions."""
    # Location of the CSV file read by load_test_set.
    path_set: str
    # Pickle file with the predictions, read by test_set_select.
    path_predictions: str
    # Metrics computed by test_set_select.
    metrics: List[MetricsConfig]

In [14]:
def load_test_set(config: TestSetConfig, max_rows: Optional[int] = 10) -> pd.DataFrame:
    """Read the test set from ``config.path_set``.

    The file is parsed as a header-less CSV whose columns are named ``id``,
    ``text`` and ``label``.

    Args:
        config: Test set settings; only ``path_set`` is used here.
        max_rows: Keep only the first ``max_rows`` rows. The default of 10
            matches the limit that used to be hard-coded; pass ``None`` to
            load the complete file.

    Returns:
        The (possibly truncated) test DataFrame.
    """
    df = pd.read_csv(config.path_set, header=None, names=['id', 'text', 'label'])
    # Slicing with ``None`` as the upper bound keeps every row.
    return df[:max_rows]

In [15]:
def test_set_select(config: TestSetConfig) -> dict:
    """Score the pickled test-set predictions with every configured metric.

    The pickle at ``config.path_predictions`` must hold a sequence whose
    first element is the model output and whose second element is the true
    labels (the layout of the prediction result saved by this notebook).

    Args:
        config: Test set settings (``path_predictions`` and ``metrics``).

    Returns:
        A dict mapping each metric name to its value.

    Raises:
        KeyError: If a metric name is not registered in ``METRIC_FUNCTIONS``.
    """
    result = load_predictions(config.path_predictions)
    model_output, true_labels = result[0], result[1]

    # When the model returns extra outputs (hidden states / attentions) the
    # model output is a tuple with the logits first; otherwise - per the
    # Trainer.predict documentation - it is the logits array itself. Indexing
    # a bare array with [0] would pick the first sample's row and break
    # argmax(1), so only real tuples are unwrapped.
    logits = model_output[0] if isinstance(model_output, (tuple, list)) else model_output
    predictions = logits.argmax(1)

    return {
        metric.name:
            METRIC_FUNCTIONS[metric.name](true_labels, predictions)
        for metric in config.metrics
    }

---
# Control set

In [16]:
@dataclass
class ControlSetConfig:
    """Settings for the control set and for scoring its stored predictions."""
    # Location of the CSV file read by load_control_set.
    path_set: str
    # Pickle file with the predictions, read by control_set_select.
    path_predictions: str
    # Metrics computed by control_set_select.
    metrics: List[MetricsConfig]

In [17]:
def load_control_set(config: ControlSetConfig, max_rows: Optional[int] = None) -> pd.DataFrame:
    """Read the control set from ``config.path_set``.

    The file is parsed as a header-less CSV whose columns are named ``id``,
    ``text`` and ``label``.

    Args:
        config: Control set settings; only ``path_set`` is used here.
        max_rows: Optional cap on the number of leading rows to keep, offered
            for parity with the other set loaders. The default ``None`` keeps
            every row, which is what this loader has always done.

    Returns:
        The control DataFrame.
    """
    df = pd.read_csv(config.path_set, header=None, names=['id', 'text', 'label'])
    # Slicing with ``None`` as the upper bound keeps every row.
    return df[:max_rows]

In [18]:
def control_set_select(config: ControlSetConfig) -> dict:
    """Score the pickled control-set predictions with every configured metric.

    The pickle at ``config.path_predictions`` must hold a sequence whose
    first element is the model output and whose second element is the true
    labels (the layout of the prediction result saved by this notebook).

    Args:
        config: Control set settings (``path_predictions`` and ``metrics``).

    Returns:
        A dict mapping each metric name to its value.

    Raises:
        KeyError: If a metric name is not registered in ``METRIC_FUNCTIONS``.
    """
    result = load_predictions(config.path_predictions)
    model_output, true_labels = result[0], result[1]

    # When the model returns extra outputs (hidden states / attentions) the
    # model output is a tuple with the logits first; otherwise - per the
    # Trainer.predict documentation - it is the logits array itself. Indexing
    # a bare array with [0] would pick the first sample's row and break
    # argmax(1), so only real tuples are unwrapped.
    logits = model_output[0] if isinstance(model_output, (tuple, list)) else model_output
    predictions = logits.argmax(1)

    return {
        metric.name:
            METRIC_FUNCTIONS[metric.name](true_labels, predictions)
        for metric in config.metrics
    }

---
# Model

In [19]:
@dataclass
class ModelConfig:
    """Model and tokenizer settings consumed by ModelLoader."""
    # Checkpoint name handed to both the tokenizer's and the model's from_pretrained.
    model_name: str
    # Number of labels handed to the model's from_pretrained.
    num_labels: int
    # The next four fields are forwarded unchanged on every tokenizer call.
    max_length: int
    # Previously annotated `int`; the notebook configuration supplies a boolean (True).
    truncation: bool
    # Previously annotated `int`; the notebook configuration supplies a string ('max_length').
    padding: str
    return_tensors: str
    # The two flags below are forwarded to the model's from_pretrained.
    output_attentions: bool = False
    output_hidden_states: bool = False

In [20]:
class ModelLoader:
    """Builds the tokenizer callable and the classification model described by a ModelConfig."""

    def __init__(self, config: ModelConfig):
        self.config = config

    def create_tokenizer(self):
        """Return a one-argument callable that tokenizes with the configured options."""
        pretrained_tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

        def tokenize(input_tokens):
            # Options are looked up on self.config at call time, not captured once.
            settings = self.config
            return pretrained_tokenizer(
                input_tokens,
                padding=settings.padding,
                max_length=settings.max_length,
                truncation=settings.truncation,
                return_tensors=settings.return_tensors,
            )

        return tokenize

    def create_model(self):
        """Load the sequence-classification model named in the configuration."""
        settings = self.config
        model_options = {
            'num_labels': settings.num_labels,
            'output_attentions': settings.output_attentions,
            'output_hidden_states': settings.output_hidden_states,
        }
        return AutoModelForSequenceClassification.from_pretrained(
            settings.model_name,
            **model_options,
        )

In [21]:
# TODO pipeline - e.g.
# class MyTokenClassificationPipeline(TokenClassificationPipeline):
#     def preprocess(self, sentence, offset_mapping=None):
#         truncation = False
#         padding = 'longest'
#         model_inputs = self.tokenizer(
#             sentence,
#             return_tensors=self.framework,
#             truncation=truncation,
#             padding=padding,
#             return_special_tokens_mask=True,
#             return_offsets_mapping=self.tokenizer.is_fast,
#         )
#         if offset_mapping:
#             model_inputs["offset_mapping"] = offset_mapping

#         model_inputs["sentence"] = sentence
#         return model_inputs

---
# Configuration

In [22]:
# Single dictionary holding every setting of the run; the cells below turn
# its sections into the config dataclasses.
custom_configuration = {

    # Train set file and the transformations to run on it.
    'train_set': {
        'path': './data/thedeep.subset.train.txt',
        'transformations': [
            {
                'name': 'DUMMY',
                # Stored under 'args' here, read into TransformationConfig.kwargs.
                'args': {}
            },
        ]
    },

    # Data files of the remaining splits.
    'validation_set': {
        'path': './data/thedeep.subset.validation.txt',
    },

    'test_set': {
        'path': './data/thedeep.subset.test.txt',
    },

    'control_set': {
        'path': './data/thedeep.subset.control.txt',
    },

    # NOTE(review): this section is not read by any cell visible in this notebook.
    'labels': {
        'path': './data/thedeep.labels.txt'
    },

    # Keys must match the field names of ModelConfig (unpacked with **).
    'model': {
        'model_name': 'bert-base-cased',
        'num_labels': 12,
        'max_length': 256,
        'truncation': True,
        'padding': 'max_length',
        'return_tensors': 'pt',
        'output_attentions': True,
        'output_hidden_states': True,
    },

    # Values forwarded to TrainingArguments.
    'training': {
        'batch_size': 2,
        'epochs': 1,
        # NOTE(review): 1e-3 looks high for fine-tuning a pretrained BERT; the
        # prediction outputs below show near-identical logits for every sample.
        # Confirm this value is intended.
        'learning_rate': 1e-3,
        'output_dir': 'ClassificationBERT',
        'metric_for_best_model': 'accuracy',
    },

    # Metric names must be keys of METRIC_FUNCTIONS.
    'validation': {
        'metrics': [
            {
                'name': 'accuracy',
            },
        ],
    },

    # 'results_file' is where the pickled predictions are written and read back.
    'testing': {
        'results_file': './file_test_results.pickle',
        'metrics': [
            {
                'name': 'accuracy',
            },
        ],
    },

    'control': {
        'results_file': './file_control_results.pickle',
        'metrics': [
            {
                'name': 'accuracy',
            },
        ],
    }
}

In [23]:
# Build the model settings straight from the 'model' section; its keys are
# unpacked as keyword arguments, so they must match ModelConfig's field names.
model_config = ModelConfig(**custom_configuration['model'])

In [24]:
# Train set settings: file path plus one TransformationConfig per configured
# entry (the entry's 'args' value becomes TransformationConfig.kwargs).
train_set_config = TrainSetConfig(
    custom_configuration['train_set']['path'],
    [TransformationConfig(t['name'], t['args']) for t in custom_configuration['train_set']['transformations']]
)

In [25]:
# Validation settings combine the 'validation_set' path with the 'validation' metrics.
validation_set_config = ValidationSetConfig(
    custom_configuration['validation_set']['path'],
    [MetricsConfig(metric['name']) for metric in custom_configuration['validation']['metrics']]
)

In [26]:
# Test settings combine the 'test_set' path with the 'testing' results file and metrics.
test_set_config = TestSetConfig(
    custom_configuration['test_set']['path'],
    custom_configuration['testing']['results_file'],
    [MetricsConfig(metric['name']) for metric in custom_configuration['testing']['metrics']]
)

In [27]:
# Control settings combine the 'control_set' path with the 'control' results file and metrics.
control_set_config = ControlSetConfig(
    custom_configuration['control_set']['path'],
    custom_configuration['control']['results_file'],
    [MetricsConfig(metric['name']) for metric in custom_configuration['control']['metrics']]
)

---
# Notebook flow

In [28]:
# Load the four splits as DataFrames; only the train set goes through the
# transformation pipeline.
train_set = train_set_select(train_set_config)
validation_set = load_validation_set(validation_set_config)
test_set = load_test_set(test_set_config)
control_set = load_control_set(control_set_config)

In [29]:
# Create the tokenizer callable and the classification model from model_config.
model_loader = ModelLoader(model_config)

tokenizer = model_loader.create_tokenizer()
model = model_loader.create_model()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [30]:
# Splits keyed by name; the same keys are reused for tokens, labels and datasets below.
data = {
    'train': train_set,
    'validation': validation_set,
    'test': test_set,
    'control': control_set
}

In [31]:
# Tokenize the whole 'text' column of every split in one tokenizer call per split.
tokens = {
    dataset_type: tokenizer(dataset['text'].tolist())
    for dataset_type, dataset in data.items()
}

# Collect the 'label' column of every split as a tensor.
labels = {
    dataset_type: torch.tensor(dataset['label'].tolist())
    for dataset_type, dataset in data.items()
}

In [32]:
class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer output with labels.

    Args:
        tokens: Tokenizer output exposing ``input_ids`` and
            ``attention_mask`` attributes and, optionally,
            ``token_type_ids``.
        labels: One label per sample; its length defines the dataset size.
    """

    def __init__(self, tokens, labels: torch.Tensor):
        self.input_ids = tokens.input_ids
        self.attention_mask = tokens.attention_mask
        # Not every tokenizer emits token_type_ids; keep None when they are
        # absent instead of failing with AttributeError, so that checkpoints
        # other than BERT-style ones can be configured.
        self.token_type_ids = getattr(tokens, 'token_type_ids', None)
        self.y = labels

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        """Return the model inputs and the label of sample ``i`` as a dict."""
        item = {
            'input_ids': self.input_ids[i],
            'attention_mask': self.attention_mask[i],
        }
        # Only include token_type_ids when the tokenizer provided them.
        if self.token_type_ids is not None:
            item['token_type_ids'] = self.token_type_ids[i]
        item['labels'] = self.y[i]
        return item

In [33]:
# Wrap tokens and labels of every split into a TextDataset for the Trainer.
datasets = {
    dataset_type: TextDataset(tokens[dataset_type], labels[dataset_type])
    for dataset_type in data.keys()
}

In [34]:
# Trainer settings taken from the 'training' section. Evaluation, checkpoint
# saving and logging all use the per-epoch strategy, and the same batch size
# is used for training and evaluation.
training_args = TrainingArguments(
    output_dir=custom_configuration['training']['output_dir'],
    learning_rate=custom_configuration['training']['learning_rate'],
    evaluation_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    logging_strategy=IntervalStrategy.EPOCH,
    per_device_train_batch_size=custom_configuration['training']['batch_size'],
    per_device_eval_batch_size=custom_configuration['training']['batch_size'],
    # Reload the best checkpoint (by metric_for_best_model) once training ends.
    load_best_model_at_end=True,
    metric_for_best_model=custom_configuration['training']['metric_for_best_model'],
    num_train_epochs=custom_configuration['training']['epochs']
)

Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception (azure-identity 1.12.0 (c:\users\solarsan\appdata\roaming\python\python39\site-packages), Requirement.parse('azure-identity==1.7.0'), {'azureml-dataprep'}).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [35]:
def compute_metrics(p):
    """Compute the configured validation metrics for one evaluation pass.

    Args:
        p: Pair ``(model_output, true_labels)`` as handed over by the Trainer.
            ``model_output`` is either the logits array or a tuple whose
            first element is the logits array.

    Returns:
        A dict mapping each metric name in ``validation_set_config.metrics``
        to its value.

    Raises:
        KeyError: If a metric name is not registered in ``METRIC_FUNCTIONS``.
    """
    model_output, true_labels = p

    # When the model returns extra outputs (hidden states / attentions) the
    # model output is a tuple with the logits first; otherwise - per the
    # Trainer documentation - it is the logits array itself. Indexing a bare
    # array with [0] would pick the first sample's row and break argmax(1),
    # so only real tuples are unwrapped.
    logits = model_output[0] if isinstance(model_output, (tuple, list)) else model_output
    predictions = logits.argmax(1)

    return {
        metric.name:
            METRIC_FUNCTIONS[metric.name](true_labels, predictions)
        for metric in validation_set_config.metrics
    }

In [36]:
# Train on the train split, evaluate on the validation split and report the
# metrics produced by compute_metrics.
trainer = Trainer(
    model=model,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    compute_metrics=compute_metrics,
    args=training_args
)

In [37]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,2.5568,2.785387,0.1


TrainOutput(global_step=50, training_loss=2.5567893981933594, metrics={'train_runtime': 18.2972, 'train_samples_per_second': 5.465, 'train_steps_per_second': 2.733, 'total_flos': 13156733952000.0, 'train_loss': 2.5567893981933594, 'epoch': 1.0})

In [38]:
# Run inference on the test split and display the returned PredictionOutput.
test_result = trainer.predict(datasets['test'])
test_result

PredictionOutput(predictions=(array([[-1.9110947 , -1.2210466 , -1.6205574 , -0.27681816,  1.2025404 ,
         0.04812513, -1.3607601 , -1.9584048 , -0.8596275 ,  0.20346878,
        -0.8614781 , -1.3320965 ],
       [-1.9110947 , -1.2210466 , -1.6205573 , -0.27681807,  1.2025404 ,
         0.04812505, -1.36076   , -1.9584045 , -0.8596274 ,  0.20346878,
        -0.8614782 , -1.3320965 ],
       [-1.9110947 , -1.2210466 , -1.6205575 , -0.27681798,  1.2025405 ,
         0.04812507, -1.36076   , -1.9584044 , -0.8596275 ,  0.20346873,
        -0.8614782 , -1.3320965 ],
       [-1.9110947 , -1.2210466 , -1.6205575 , -0.27681795,  1.2025404 ,
         0.04812511, -1.36076   , -1.9584045 , -0.8596274 ,  0.20346873,
        -0.8614782 , -1.3320965 ],
       [-1.9110947 , -1.2210466 , -1.6205572 , -0.27681822,  1.2025404 ,
         0.04812514, -1.3607601 , -1.9584045 , -0.8596275 ,  0.20346875,
        -0.8614782 , -1.3320962 ],
       [-1.9110947 , -1.2210463 , -1.6205572 , -0.2768182 ,  1.20

In [39]:
# Persist the test predictions so test_set_select can read them back from disk.
save_predictions(test_result, custom_configuration['testing']['results_file'])

In [40]:
# Reload the pickled test predictions and compute the configured metrics.
test_set_select(test_set_config)

{'accuracy': 0.5}

In [41]:
# Run inference on the control split and display the returned PredictionOutput.
control_result = trainer.predict(datasets['control'])
control_result

PredictionOutput(predictions=(array([[-1.9110949 , -1.2210466 , -1.6205575 , -0.27681777,  1.2025405 ,
         0.04812502, -1.36076   , -1.9584044 , -0.8596275 ,  0.2034687 ,
        -0.8614782 , -1.3320965 ],
       [-1.9110947 , -1.2210466 , -1.6205572 , -0.27681822,  1.2025404 ,
         0.04812512, -1.3607599 , -1.9584048 , -0.8596275 ,  0.20346875,
        -0.86147815, -1.3320962 ],
       [-1.9110948 , -1.2210466 , -1.6205575 , -0.27681795,  1.2025405 ,
         0.04812505, -1.3607602 , -1.9584044 , -0.8596275 ,  0.20346878,
        -0.8614783 , -1.3320965 ],
       [-1.9110947 , -1.2210466 , -1.6205573 , -0.27681807,  1.2025404 ,
         0.04812511, -1.36076   , -1.9584048 , -0.85962737,  0.2034687 ,
        -0.8614781 , -1.3320963 ],
       [-1.9110947 , -1.2210466 , -1.6205573 , -0.27681807,  1.2025404 ,
         0.04812508, -1.36076   , -1.9584045 , -0.85962737,  0.20346873,
        -0.8614782 , -1.3320965 ],
       [-1.9110947 , -1.2210466 , -1.6205574 , -0.2768179 ,  1.20

In [42]:
# Persist the control predictions so control_set_select can read them back from disk.
save_predictions(control_result, custom_configuration['control']['results_file'])

In [43]:
# Reload the pickled control predictions and compute the configured metrics.
control_set_select(control_set_config)

{'accuracy': 0.0}