In [27]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, logging
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn

In [28]:
# load the original echr data
def load_echr(task="binary_cls", anon=False):
    if anon == False:
        train_df = pd.read_pickle("data/echr/non-anon_train.pkl")
        val_df = pd.read_pickle("data/echr/non-anon_valid.pkl")
        test_df = pd.read_pickle("data/echr/non-anon_test.pkl")
    else:
        train_df = pd.read_pickle("data/echr/anon_train.pkl")
        val_df = pd.read_pickle("data/echr/anon_valid.pkl")
        test_df = pd.read_pickle("data/echr/anon_test.pkl")

    if task == "binary_cls":
        train_texts, train_labels = train_df["text"].tolist(), train_df["violated"].astype(int).tolist()
        val_texts, val_labels = val_df["text"].tolist(), val_df["violated"].astype(int).tolist()
        test_texts, test_labels = test_df["text"].tolist(), test_df["violated"].astype(int).tolist()

    return train_texts, train_labels, val_texts, val_labels, test_texts, test_labels

In [29]:
# use gpu
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [30]:
# binary classification on non-anon echr data
train_texts, train_labels, val_texts, val_labels, test_texts, test_labels = load_echr(task="binary_cls", anon=False)

In [31]:
# remove warning
logging.set_verbosity_error()

# use bert
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=False)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [32]:
# tokenize texts
def generate_tokens(tokenizer, texts, max_length=512):
    tokens = tokenizer.batch_encode_plus(texts, 
        return_tensors = "pt", 
        padding = "max_length",
        truncation = True, 
        max_length = max_length, 
        pad_to_max_length = True, 
        return_token_type_ids = False
    )

    return tokens

In [33]:
max_length = 512

train_tokens = generate_tokens(tokenizer, train_texts, max_length)
val_tokens = generate_tokens(tokenizer, val_texts, max_length)
test_tokens = generate_tokens(tokenizer, test_texts, max_length)

In [34]:
# create dataloaders
def create_dataloader(tokens, labels, type, batch_size=64):
    data = TensorDataset(tokens.input_ids, tokens.attention_mask, torch.tensor(labels))

    if type == "train":
        sampler = RandomSampler(data)
        dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    elif type == "val":
        sampler = SequentialSampler(data)
        dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    elif type == "test":
        dataloader = DataLoader(data, batch_size=batch_size)

    return dataloader

In [35]:
train_loader = create_dataloader(train_tokens, train_labels, type="train", batch_size=64)
val_loader = create_dataloader(val_tokens, val_labels, type="val", batch_size=32)
test_loader = create_dataloader(test_tokens, test_labels, type="test", batch_size=32)

In [56]:
class BERT(nn.Module):
    def __init__(self, bert):
      super(BERT, self).__init__()
      self.bert = bert 
      # add a sigmoid layer
      self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
      _, logits = self.bert(input_ids, attention_mask)
      outputs = self.sigmoid(logits)

      return outputs

In [63]:
def train(train_loader, model, model_name, task="binary_cls", lr=1e-3):
    running_loss = 0

    model = BERT(model)
    optimizer = AdamW(model.parameters(), lr=lr) 

    if task == "binary_cls":
        loss_func = nn.BCEWithLogitsLoss()

    model.train()

    # iterate over batches
    for i, batch in enumerate(train_loader):
        # progress update after every 50 batches.
        if i % 50 == 0:
            print("--> batch {:} of {:}.".format(i, len(train_loader)))

        # push the batch to gpu
        batch = [r.to(device) for r in batch]
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()

        # forward pass  
        preds = model(input_ids, attention_mask)
        # compute the loss between actual and predicted values
        loss = loss_func(preds, labels)
        # add on to the total loss
        running_loss += loss.item()
        # backward pass to calculate the gradients
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        # update parameters
        optimizer.step()
    print()

    # save trained model
    torch.save(model.state_dict(), f'models/{str(model_name)}.pt')
    print('model saved')
    print()

    return running_loss

In [64]:
def evaluate(eval_loader, model, model_name, task="binary_cls"):
    running_loss = 0

    model = BERT(model)
    model.load_state_dict(torch.load(f'models/{str(model_name)}.pt'))

    if task == "binary_cls":
        loss_func = nn.BCEWithLogitsLoss()

    model.train()

    # iterate over batches
    for i, batch in enumerate(eval_loader):
        # progress update after every 50 batches.
        if i % 50 == 0:
            print("--> batch {:} of {:}.".format(i, len(eval_loader)))

        # push the batch to gpu
        batch = [r.to(device) for r in batch]
        input_ids, attention_mask, labels = batch

        with torch.no_grad():
            # forward pass  
            preds = model(input_ids, attention_mask)
            # compute the loss between actual and predicted values
            loss = loss_func(preds, labels)
            # add on to the total loss
            running_loss += loss.item()
            # backward pass to calculate the gradients
    print()

    return running_loss

In [65]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [62]:
num_epochs = 10

train_losses=[]
val_losses=[]

task = "binary_cls"
model_name = "bert_binary-1"

for epoch in range(num_epochs):
    print('epoch {:} / {:}'.format(epoch + 1, num_epochs))
    
    #train model
    train_loss = train(train_loader, model, model_name, task, lr=1e-3)
    #evaluate model
    val_loss = evaluate(val_loader, model, model_name, task)

    # append training and validation loss
    train_losses.append(train_loss)
    val_losses.append(val_loss)

epoch 1 / 10
--> batch 0 of 111.
torch.Size([64, 512]) torch.Size([64, 512])


KeyboardInterrupt: 