In [None]:
!pip install datasets

In [None]:
!pip install transformers

In [None]:
#!pip install allennlp==2.1.0 allennlp-models==2.1.0

In [None]:
from datasets import list_datasets, load_dataset, list_metrics, load_metric
import numpy as np

In [None]:
# Fetch the FewRel dataset and turn its "train_wiki" split into a pandas DataFrame.
fd = load_dataset("few_rel")
train_wiki_split = fd['train_wiki']
fr = train_wiki_split.to_pandas()

In [None]:
# Preview the first rows of the raw DataFrame.
fr.head()

In [None]:
from sklearn import preprocessing
import pandas as pd

# Quick size check of the data.
n_records = len(fr)
n_relations = len(fr.relation.unique())
print("Total number of records", n_records)
print("Unique relations", n_relations)

# Rename the entity columns; presumably so that attribute access (fr.head_c /
# fr.tail_c) does not clash with DataFrame.head() / DataFrame.tail().
fr = fr.rename(columns={"head": "head_c", "tail": "tail_c"})

# Map each relation name to an integer class id stored in `relation_ord`.
le = preprocessing.LabelEncoder()
le.fit(fr.relation)
fr['relation_ord'] = le.transform(fr.relation)

In [None]:
# Print tokens plus head/tail annotations for the first five rows.
for row_no in range(5):
    print(fr["tokens"][row_no], fr["head_c"][row_no], fr["tail_c"][row_no])
    print("")

Input: Merpati flight 106 departed Jakarta (CGK) on a domestic flight to Tanjung Pandan (TJQ)

Output: <TAIL>Merpati</TAIL> flight 106 departed Jakarta (CGK) on a domestic flight to <HEAD>Tanjung Pandan</HEAD> (TJQ)


In [None]:
from transformers import BertTokenizer

# Load the pretrained tokenizer of the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
def _mark_entities(tokens, head_span, tail_span):
    """Join `tokens` into one string with the head/tail entity spans wrapped in markers.

    Args:
        tokens: sequence of token strings for one sentence.
        head_span: token indices of the head entity; only the first and last
            entries are used.
        tail_span: token indices of the tail entity; only the first and last
            entries are used.

    Returns:
        The sentence with "[HEAD]"/"[/HEAD]" and "[TAIL]"/"[/TAIL]" glued to
        the boundary tokens, every token followed by a space, and light
        spacing clean-up around commas and parentheses.
    """
    pieces = []
    for idx, token in enumerate(tokens):
        # Head and tail are checked independently: with an if/elif chain the
        # tail marker is silently lost whenever both spans start (or end) on
        # the same token.
        opening = ""
        if idx == head_span[0]:
            opening += "[HEAD]"
        if idx == tail_span[0]:
            opening += "[TAIL]"

        closing = ""
        if idx == tail_span[-1]:
            closing += "[/TAIL]"
        if idx == head_span[-1]:
            closing += "[/HEAD]"

        pieces.append(opening + token + closing + " ")

    sent = "".join(pieces)
    sent = sent.replace(" ,", ",")
    sent = sent.replace("( ", " (")
    sent = sent.replace(" )", ") ")
    return sent


sentlen = []   # character length of every marked-up sentence
train = []     # marked-up sentences
targets = []   # integer relation id per sentence

for row in fr.to_dict(orient="records"):
    # Only the first listed mention of each entity is marked.
    head_span = row['head_c']['indices'].tolist()[0]
    tail_span = row['tail_c']['indices'].tolist()[0]

    sent = _mark_entities(row['tokens'], head_span, tail_span)

    train.append(sent)
    targets.append(row['relation_ord'])
    sentlen.append(len(sent))


In [None]:
# Inspect the last ten marked-up sentences.
train[-10:]

In [None]:
# Sequence length (in tokens) that every encoded example is padded or truncated to.
max_len=256

In [None]:
import torch
import time

# The entity markers are not single vocabulary entries; the detection below
# looks for the five word pieces "[", "H", "##EA", "##D", "]" (and the
# equivalent for TAIL) and compares their concatenation.
HEAD_MARKER = "[H##EA##D]"
TAIL_MARKER = "[T##A##IL]"
MARKER_WIDTH = 5


def _find_marker(tokens, marker, width=MARKER_WIDTH):
    """Return the position of the "[" that opens the first `marker` in `tokens`, or None."""
    for pos, tok in enumerate(tokens):
        if tok == "[" and pos + width - 1 < len(tokens) and "".join(tokens[pos:pos + width]) == marker:
            return pos
    return None


input_ids = []
attn_masks = []
labels = []
head_idx = []
tail_idx = []
missing_markers = 0  # examples where a marker was not found after tokenization

t0 = time.time()


print('Encoding {:,} training examples...'.format(len(train)))

for row in train:

    # Report progress.
    if ((len(input_ids) % 15000) == 0):
        print('  Encoded {:,} examples.'.format(len(input_ids)))

    # Convert the sentence to input IDs, with an attention mask.
    encoded_dict = tokenizer.encode_plus(row,  # The text to encode.
                                        max_length=max_len,    # Pad or truncate to this length.
                                        padding="max_length",
                                        truncation=True,
                                        return_tensors='pt')   # Return objects as PyTorch tensors.
    decoded = tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0].tolist())

    # Record exactly ONE head and ONE tail position per example. Appending
    # once per match (zero times when a marker is truncated away or absent,
    # several times when it repeats) would shift every later index relative
    # to input_ids / labels.
    head_pos = _find_marker(decoded, HEAD_MARKER)
    tail_pos = _find_marker(decoded, TAIL_MARKER)
    if head_pos is None or tail_pos is None:
        missing_markers += 1
    # Fall back to position 0 (the first token of the sequence) when a marker
    # is missing, so that all tensors keep the same length.
    head_idx.append(0 if head_pos is None else head_pos)
    tail_idx.append(0 if tail_pos is None else tail_pos)

    # Add this example to our lists.
    input_ids.append(encoded_dict['input_ids'])
    attn_masks.append(encoded_dict['attention_mask'])

print('\nDONE. {:,} examples.'.format(len(input_ids)))
if missing_markers:
    print('WARNING: {:,} examples had no detectable [HEAD]/[TAIL] marker; '
          'position 0 was used instead.'.format(missing_markers))

# ======== List of Examples --> Tensor ========

# Convert each Python list of Tensors into a 2D Tensor matrix.
input_ids = torch.cat(input_ids, dim=0)
attn_masks = torch.cat(attn_masks, dim=0)

# ======== Prepare Labels ========

labels = np.array(targets).astype(int)

# Cast the labels array to a 1D LongTensor.
labels = torch.LongTensor(labels)

head_idx = torch.LongTensor(head_idx)

tail_idx = torch.LongTensor(tail_idx)

# Every per-example tensor must line up row for row.
assert len(input_ids) == len(attn_masks) == len(labels) == len(head_idx) == len(tail_idx), \
    "encoded tensors are misaligned"

# ======== Summary ========

print('\nData structure shapes:')
print('   input_ids:  {:}'.format(str(input_ids.shape)))
print('  attn_masks:  {:}'.format(str(attn_masks.shape)))
print('      labels:  {:}'.format(str(labels.shape)))
print('      head_idx:  {:}'.format(str(head_idx.shape)))
print('      tail_idx:  {:}'.format(str(tail_idx.shape)))
print('\nEncoding took {:.0f} seconds'.format(time.time() - t0))

In [None]:
# Sanity check: the three tensors should report the same number of rows.
print(*(len(tensor) for tensor in (input_ids, attn_masks, labels)))

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attn_masks, labels, head_idx, tail_idx)

# Create a 90-10 train-validation split. Calculate the number of samples to
# include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples. The split happens before
# the notebook seeds its global RNGs, so use a dedicated seeded generator to
# get the same train/validation partition on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size],
                                          generator=split_generator)

In [None]:
from torch import nn
from torch.nn import CrossEntropyLoss#, softmax_cross_entropy_with_logits
import torch.nn as nn
from transformers import BertPreTrainedModel, BertModel

class BertForRelationExtraction(BertPreTrainedModel):
    '''
    BERT encoder with a single-label (multiclass) relation classifier on top.

    For every example the encoder output at position `head_idx` and at
    position `tail_idx` is concatenated and passed through a two-layer MLP
    that produces `config.num_labels` logits. When `labels` are given the
    cross-entropy loss is returned as well.
    '''

    def __init__(self, config):
        '''
        Build the encoder and the classification head.

        Args:
            config: model configuration; `hidden_size`, `hidden_dropout_prob`
                and `num_labels` are read from it.
        '''

        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        print("dropout = ", config.hidden_dropout_prob)
        # MLP head: the input is twice the hidden size because the head and
        # tail vectors are concatenated.
        self.classifier = nn.Sequential(
            nn.Linear(2 * config.hidden_size, config.hidden_size),
            nn.ReLU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.num_labels)
        )

        # Initialize model weights (inherited function).
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        head_idx=None,
        tail_idx=None
    ):
        '''
        Run the encoder and classify the relation between the marked entities.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds, output_attentions, output_hidden_states: forwarded
                unchanged to the underlying `BertModel`.
            labels: optional class ids; when given, the cross-entropy loss is
                computed against the logits.
            head_idx: per-example sequence position whose encoder output
                represents the head entity (required).
            tail_idx: per-example sequence position whose encoder output
                represents the tail entity (required).

        Returns:
            `(loss, logits) + outputs[2:]` when `labels` is given, otherwise
            `(logits,) + outputs[2:]`, where `outputs` is the encoder output.

        Raises:
            ValueError: if `head_idx` or `tail_idx` is missing.
        '''
        if head_idx is None or tail_idx is None:
            raise ValueError("head_idx and tail_idx must both be provided")

        outputs = self.bert(
            input_ids,                      # The input sequence
            attention_mask=attention_mask,  # Mask out any [PAD] tokens.
            token_type_ids=token_type_ids,  # Identify segment A vs. B
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        # First encoder output: one vector per token of every sequence.
        final_embeddings = outputs[0]

        # Row selector built from the encoder output rather than `input_ids`,
        # so the model also works when only `inputs_embeds` is supplied.
        batch_rows = torch.arange(final_embeddings.size(0), device=final_embeddings.device)
        head_emb = final_embeddings[batch_rows, head_idx]
        tail_emb = final_embeddings[batch_rows, tail_idx]
        h = torch.cat((head_emb, tail_emb), dim=-1)

        logits = self.classifier(h)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.float(), labels)
            return ((loss, logits) + outputs[2:])

        return ((logits,) + outputs[2:])

In [None]:
import torch

# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if gpu_available:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
try:
    from transformers import AdamW, BertConfig
except ImportError:
    # transformers' own AdamW is deprecated in favour of torch.optim.AdamW;
    # fall back to the PyTorch class if this release no longer exports it.
    from torch.optim import AdamW
    from transformers import BertConfig

model = BertForRelationExtraction.from_pretrained(
    # Must be the same checkpoint family as the tokenizer ('bert-base-cased'):
    # token ids produced by the cased vocabulary do not index the right rows
    # of the uncased model's embedding matrix.
    "bert-base-cased",
    num_labels = 64,    # NOTE(review): should equal the number of distinct relation ids — confirm against the "Unique relations" printout.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device chosen earlier (GPU when available, else CPU)
# instead of calling .cuda() unconditionally, which fails on CPU-only hosts.
desc = model.to(device)

print ("Model loaded.")

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Number of examples per batch, shared by training and validation.
batch_size = 16

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation order is irrelevant, so the examples are simply read in sequence.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [None]:
# AdamW = Adam with decoupled weight decay. The PyTorch implementation is used
# because the copy shipped with transformers is deprecated.
# weight_decay is set to 0.0 explicitly: that was the default of the
# transformers class, whereas torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 2e-5,  # learning rate
                              eps = 1e-8, # numerical-stability term
                              weight_decay = 0.0
                             )

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training set.
epochs = 5

# One optimizer step is taken per batch, so the schedule spans
# (batches per epoch) * (epochs) steps.
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * epochs

# Linear learning-rate schedule with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


In [None]:
# Mount Google Drive at /content/drive (Colab only); force_remount=True
# requests a fresh mount even if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
def calc_f1(labels, predictions, no_relation_label=0):
    """Compute micro-averaged precision, recall and F1 over relation classes.

    Args:
        labels: gold class ids (array-like of ints).
        predictions: predicted class ids, same shape as `labels`.
        no_relation_label: class id treated as "no relation" and excluded
            from the guessed/gold counts. Defaults to 0, the historical
            behaviour. Pass None when every class id is a real relation and
            nothing should be excluded.

    Returns:
        Tuple `(precision, recall, f1)` of floats. Precision and recall
        default to 1.0 when their denominator is zero; F1 is 0.0 when both
        precision and recall are 0.

    Raises:
        ValueError: if `labels` and `predictions` differ in shape.
    """
    # Accept plain Python lists as well as numpy arrays.
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    if labels.shape != predictions.shape:
        raise ValueError("labels and predictions must have the same shape")

    # Print the class distribution of both arrays.
    for x, x_name in [(labels, "labels"), (predictions, "predictions")]:
        unique, counts = np.unique(x, return_counts=True)
        print (x_name, np.asarray((unique, counts)))

    if no_relation_label is None:
        guessed_mask = np.ones(predictions.shape, dtype=bool)
        gold_mask = np.ones(labels.shape, dtype=bool)
    else:
        guessed_mask = predictions != no_relation_label
        gold_mask = labels != no_relation_label

    correct_by_relation = int(((labels == predictions) & guessed_mask).sum())
    guessed_by_relation = int(guessed_mask.sum())
    gold_by_relation = int(gold_mask.sum())

    prec_micro = 1.0
    if guessed_by_relation > 0:
        prec_micro = float(correct_by_relation) / float(guessed_by_relation)
    recall_micro = 1.0
    if gold_by_relation > 0:
        recall_micro = float(correct_by_relation) / float(gold_by_relation)
    f1_micro = 0.0
    if prec_micro + recall_micro > 0.0:
        f1_micro = 2.0 * prec_micro * recall_micro / (prec_micro + recall_micro)
    return prec_micro, recall_micro, f1_micro


In [None]:
update_interval = 100
import random
import numpy as np
from sklearn.metrics import roc_auc_score
from torch.nn import Softmax
from scipy.special import softmax

# Seed every RNG in play so that training runs are repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Per-epoch record of losses, validation metrics and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        if (step % update_interval) == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Batch layout follows the TensorDataset: ids, mask, labels, head idx, tail idx.
        b_input_ids, b_input_mask, b_labels, b_head_idx, b_tail_idx = (t.to(device) for t in batch)

        model.zero_grad()

        # Index instead of tuple-unpacking: the model appends extra entries
        # (outputs[2:]) when attentions / hidden states are requested.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels,
                        head_idx=b_head_idx,
                        tail_idx=b_tail_idx)
        loss, logits = outputs[0], outputs[1]

        total_train_loss += loss.item()

        # Backward pass, gradient clipping, then parameter and LR updates.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Tracking variables
    total_eval_loss = 0

    predictions, true_labels = [], []

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels, b_head_idx, b_tail_idx = (t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels,
                            head_idx=b_head_idx,
                            tail_idx=b_tail_idx)
        loss, logits = outputs[0], outputs[1]

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Predicted class = arg-max logit; move results to the CPU.
        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.extend(preds.tolist())
        true_labels.extend(label_ids.tolist())

    # Calculate the average validation loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("  Average validation loss: {0:.2f}".format(avg_val_loss))

    # Combine the results across all batches.
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)

    val_prec, val_recall, val_f1 = calc_f1(true_labels, predictions)
    # calc_f1 leaves class id 0 out of its counts; plain accuracy is reported
    # as well because it scores every class.
    val_accuracy = float((predictions == true_labels).mean())

    print("Prec, Recall, F1 :", (val_prec, val_recall, val_f1))
    print("  Accuracy: {0:.4f}".format(val_accuracy))

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': val_accuracy,
            'Valid. Precision': val_prec,
            'Valid. Recall': val_recall,
            'Valid. F1': val_f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
import os

# Directory (on the mounted Google Drive) that receives the checkpoint.
CP_DIR = "/content/drive/My Drive/dev/relext/checkpoints/"
os.makedirs(CP_DIR, exist_ok=True)  # torch.save does not create missing directories

# Snapshot everything needed to resume: weights, optimizer and LR-schedule state.
model_state, optimizer_state, scheduler_state = model.state_dict(), optimizer.state_dict(), scheduler.state_dict()
cp = {'model_state': model_state, 'optimizer_state': optimizer_state, 'scheduler': scheduler_state}

# File name carries date and hour, e.g. "20240131.17.pth.tar".
filename = datetime.datetime.now().strftime("%Y%m%d.%H") + ".pth.tar"
torch.save(cp, os.path.join(CP_DIR, filename))