# Gyafc

In [9]:
import shutil
import os
from glob import glob

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizerFast,
    XLNetTokenizer,
    XLNetForSequenceClassification,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [30]:
from transformers import (DebertaTokenizer,
                         DebertaForSequenceClassification)
import sklearn
# import torch_xla.distributed.xla_multiprocessing as xmp

In [3]:
def data_read(data_path):
    data = []
    for file_name in glob(data_path):
        with open(file_name) as f:
            tmp_data = f.read()
            data.extend(tmp_data.split('\n'))
    return data

In [4]:
path_formal = 'GYAFC_Corpus/*/{}/formal*'
path_inform = 'GYAFC_Corpus/*/{}/informal*'

In [5]:
data_train_form = data_read(path_formal.format('train'))
data_train_inform = data_read(path_inform.format('train'))

data_valid_form = data_read(path_formal.format('test'))
data_valid_inform = data_read(path_inform.format('test'))

data_test_form = data_read(path_formal.format('tune'))
data_test_inform = data_read(path_inform.format('tune'))

In [18]:
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

In [15]:
def prep_dataset(formal, informal):
    tuples = []
    data = []
    labels = []
    formal = list(set(formal))
    for sentence in formal:
        data.append(sentence)
        labels.append(0)
    informal = list(set(informal))
    for sentence in informal:
        data.append(sentence)
        labels.append(1)
    return data, labels

In [16]:
train_texts, train_labels = prep_dataset(data_train_form, data_train_inform)
val_texts, val_labels = prep_dataset(data_valid_form, data_valid_inform)
test_texts, test_labels = prep_dataset(data_test_form, data_test_inform)

In [19]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=24)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=24)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=24)

In [21]:
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels = 2)
# model.eval()

# WRAPPED_MODEL = xmp.MpModelWrapper(model)

Downloading pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

NameError: name 'xmp' is not defined

In [26]:
class Formal_informal(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [31]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """
    Compute metrics for Trainer
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    #_, _, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")

    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        #'macro f1': macro_f1,
        'precision': precision,
        'recall': recall
    }

def train_nli(model, epochs=10, warmup_steps=200):
    """
    This contains everything that must be done to train our models
    """

    print("Loading datasets... ", end="")
    train_dataset = Formal_informal(train_encodings, train_labels)
    val_dataset = Formal_informal(val_encodings, val_labels)
    test_dataset = Formal_informal(test_encodings, test_labels)

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=500,
        evaluation_strategy = 'steps',
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    results = []

    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,             # evaluation dataset
        compute_metrics=compute_metrics
    )
    trainer.place_model_on_device = False
    trainer.train()

    trainer.save_model("nli_model/")
    tokenizer.save_pretrained("nli_model/")

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [36]:
model = model.to(device)

In [38]:
train_nli(model)

Loading datasets... 



Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.3798,0.398239,0.851169,0.844504,0.877991,0.839614
1000,0.2719,0.44849,0.866711,0.862301,0.883573,0.857685
1500,0.262,0.33134,0.877038,0.875172,0.879868,0.872962


KeyboardInterrupt: 