In [2]:
import numpy as np
import pandas as pd
import torch
import random

from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from torch.nn import CrossEntropyLoss, MSELoss
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn import metrics

In [3]:
# read data with head entity, relation and tail entity
data = pd.read_csv('./Data/processed/train.csv',index_col=False)

In [4]:
### --- Initialize necessary model parameters and hyperparameters --- ###
max_seq_length = 128
train_batch_size = 8
eval_batch_size = 8
num_train_epochs = 6
num_train_steps = int(
        len(data) / train_batch_size * num_train_epochs) # number of batches * number of epochs
learning_rate = 2e-5 
warmup_proportion = 0.1 

seed_val = 42

output_dir = './model/scibert_1/'


In [5]:
# load tokenizer
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig

tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = BertForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased", num_labels = 2, output_attentions = True,
                                         output_hidden_states = True)

### ---- Uncomment following lines to add special tokens ----###
# new_tokens = data['head'].unique().tolist()
# num_added_toks = tokenizer.add_tokens(new_tokens)

# print('We have added', num_added_toks, 'tokens')
# # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
# model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [6]:
### ---- Code block to generate input token representations, attention masks and segment embeddings---- ###
input_ids_list = []
input_mask_list = []
segment_ids_list = []
label_id_list = []

def create_seq(row, data = 'train'):
    head = str(row['head']).lower()
    relation = str(row['relation']).lower()
    tail = str(row['tail']).lower()
    tokens_head = tokenizer.tokenize(head)
    tokens_relation = tokenizer.tokenize(relation)
    tokens_tail = tokenizer.tokenize(tail)
    
    tokens = ["[CLS]"] + tokens_head + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    
    # segment id is different for relation tokens as suggested in KG-BERT
    tokens += tokens_relation + ["[SEP]"]
    segment_ids += [1] * (len(tokens_relation) + 1)
    
    tokens += tokens_tail + ["[SEP]"]
    segment_ids += [1] * (len(tokens_tail) + 1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    if len(input_ids)>max_seq_length:
        input_ids = input_ids[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = row['label']
    
    input_ids_list.append(input_ids)
    input_mask_list.append(input_mask)
    segment_ids_list.append(segment_ids)
    label_id_list.append(label_id)
    return

In [7]:
%%time
data.apply(lambda row: create_seq(row), axis =1)

Wall time: 256 ms


0      None
1      None
2      None
3      None
4      None
       ... 
491    None
492    None
493    None
494    None
495    None
Length: 496, dtype: object

In [8]:
# Use train_test_split to split our data into train and validation sets fortraining
from sklearn.model_selection import train_test_split
# Use 95% for training and 5% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids_list, label_id_list, 
                                                            random_state=2018, test_size=0.05, stratify=label_id_list)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(input_mask_list, label_id_list,
                                             random_state=2018, test_size=0.05, stratify=label_id_list)

# do the same for segment ids
train_segment_ids, validation_segment_ids, _, _ = train_test_split(segment_ids_list, label_id_list,
                                             random_state=2018, test_size=0.05, stratify=label_id_list)


In [9]:
all_input_ids = torch.tensor([input_ids for input_ids in train_inputs], dtype=torch.long)
all_input_mask = torch.tensor([input_mask for input_mask in train_masks], dtype=torch.long)
all_segment_ids = torch.tensor([segment_ids for segment_ids in train_segment_ids], dtype=torch.long)

all_label_ids = torch.tensor([label_id for label_id in train_labels], dtype=torch.long)

In [10]:
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

In [11]:
val_input_ids = torch.tensor([input_ids for input_ids in validation_inputs], dtype=torch.long)
val_input_mask = torch.tensor([input_mask for input_mask in validation_masks], dtype=torch.long)
val_segment_ids = torch.tensor([segment_ids for segment_ids in validation_segment_ids], dtype=torch.long)

val_label_ids = torch.tensor([label_id for label_id in validation_labels], dtype=torch.long)

In [12]:
val_data = TensorDataset(val_input_ids, val_input_mask, val_segment_ids, val_label_ids)

In [13]:
len(train_data), len(val_data)

(471, 25)

In [29]:
# use cuda and gpu if available
device = None
n_gpu = 0
if torch.cuda.is_available():
    device = torch.device("cuda") 
    n_gpu = torch.cuda.device_count()
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

In [30]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# The DataLoader needs to know our batch size for training, so we specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
# Create the DataLoader for our training set.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)
# Create the DataLoader for our validation set.
validation_sampler = SequentialSampler(val_data)
validation_dataloader = DataLoader(val_data, sampler=validation_sampler, batch_size=eval_batch_size)

In [31]:
import random
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [32]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())
print('The SciBERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The SciBERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (31090, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias            

In [33]:
no_decay = ['bias', 'gamma', 'beta']
optimizer_parameters = [
     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
     ]

optimizer = AdamW(optimizer_parameters,
                         lr=learning_rate)

from transformers import get_linear_schedule_with_warmup
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = num_train_steps/10, 
                                            num_training_steps = num_train_steps)

In [34]:
import numpy as np
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

import time
import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [35]:
# Store the average loss after each epoch so we can plot them.
more_train_loss = []
train_loss_values = []
eval_loss_values = []

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, num_train_epochs):
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_train_epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_train_loss = 0
    total_eval_loss = 0
    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            step_train_loss = total_train_loss/step
            more_train_loss.append(step_train_loss)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.    Train loss: {:}.'.format(step, len(train_dataloader), elapsed, step_train_loss))
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        if device:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_segment_id = batch[2].to(device)
            b_labels = batch[3].to(device)
        else: 
            b_input_ids = batch[0]
            b_input_mask = batch[1]
            b_segment_id = batch[2]
            b_labels = batch[3]
        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        
        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                        token_type_ids=b_segment_id, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)

        # The call to `model` always returns a tuple, so we need to pull the loss value out of the tuple.
        train_loss = outputs[0]
        if n_gpu > 1:
            train_loss = train_loss.mean() # mean() to average on multi-gpu.
        # Accumulate the training loss over all of the batches so that we can calculate the average loss at the end. 
        # `loss` is a Tensor containing a single value; the `.item()` function just returns the Python value from the tensor.
        total_train_loss += train_loss.item()
        
        # Perform a backward pass to calculate the gradients.
        train_loss.backward()
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update parameters and take a step using the computed gradient. The optimizer dictates the "update rule"
        # --how the parameters are modified based on their gradients, the learning rate, etc.
        optimizer.step()
        
        # Update the learning rate.
        scheduler.step()
    # Calculate the average loss over the training data.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    train_loss_values.append(avg_train_loss)
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
    model.eval()
    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # batch = tuple(t for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_segment_id, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():        
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=b_segment_id, 
                            attention_mask=b_input_mask,
                            labels=b_labels)
        
        eval_loss = outputs[0]
        total_eval_loss += eval_loss.mean().item()
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        
        logits = outputs[1]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.detach().cpu().numpy()
        
        # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
        
#         # Accumulate the total accuracy.
#         eval_accuracy += tmp_eval_accuracy
#         # Track the number of batches
#         nb_eval_steps += 1
    # Calculate the average loss over the training data.
    avg_eval_loss = total_eval_loss / len(validation_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    eval_loss_values.append(avg_eval_loss)
    
    # Report the final accuracy for this validation run.
#     print("  Validation Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Validation Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
    
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of     59.    Elapsed: 0:03:30.    Train loss: 0.6550302602350712.

  Average training loss: 0.62
  Training epcoh took: 0:05:14

Running Validation...
  Validation Accuracy: 0.78
  Validation Loss: 0.51
  Validation took: 0:00:06

Training...
  Batch    40  of     59.    Elapsed: 0:03:55.    Train loss: 0.27876019002869723.

  Average training loss: 0.25
  Training epcoh took: 0:05:49

Running Validation...
  Validation Accuracy: 0.94
  Validation Loss: 0.24
  Validation took: 0:00:06

Training...
  Batch    40  of     59.    Elapsed: 0:03:37.    Train loss: 0.16997265766840428.

  Average training loss: 0.16
  Training epcoh took: 0:05:23

Running Validation...
  Validation Accuracy: 0.72
  Validation Loss: 0.71
  Validation took: 0:00:06

Training...
  Batch    40  of     59.    Elapsed: 0:03:39.    Train loss: 0.08228996702819132.

  Average training loss: 0.06
  Training epcoh took: 0:05:21

Running Validation...
  Validation Accuracy: 0.97
  Validation

### Save Model

In [36]:
import os

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir) 
tokenizer.save_pretrained(output_dir)


Saving model to ./model/scibert_1/


('./model/scibert_1/tokenizer_config.json',
 './model/scibert_1/special_tokens_map.json',
 './model/scibert_1/vocab.txt',
 './model/scibert_1/added_tokens.json')