In [None]:
!pip -q install transformers
import torch
import torch.nn as nn
import numpy as np
import math
import torch.nn.functional as F
import copy
import json
import time
from transformers import AutoTokenizer, AutoModel

[K     |████████████████████████████████| 2.3MB 8.0MB/s 
[K     |████████████████████████████████| 3.3MB 54.3MB/s 
[K     |████████████████████████████████| 901kB 58.5MB/s 
[?25h

In [None]:
# Citations
# https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [None]:
# Mount the user's Google Drive inside the Colab runtime; the dataset and
# checkpoint paths used later in this notebook all live under /content/drive.
from google.colab import drive 
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook still runs on machines/runtimes without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
### Preprocessing ### 

In [None]:
# Parser states: which kind of line the reader expects to see next.
BLANK = 0
ORIGINAL = 1
REPHRASED = 2
TRANSLATED = 3

def process_data(filename, test=False):
    """Parse a sentence-aligned corpus file into parallel lists.

    The file is a sequence of records separated by exactly one blank line.
    With ``test=False`` a record is two lines (original, translated); with
    ``test=True`` it is three lines (original, rephrased, translated).

    Args:
        filename: path of the text file to read.
        test: whether records carry the extra "rephrased" line.

    Returns:
        dict with keys ``'original'``, ``'translated'`` and ``'rephrased'``,
        each a list of stripped sentences (``'rephrased'`` is empty when
        ``test`` is False).

    Raises:
        AssertionError: if a separator line is not exactly ``'\\n'`` or the
            lists end up with different lengths (truncated last record).
        ValueError: if the state machine reaches an unknown state.
    """
    state = ORIGINAL
    original_sents, translated_sents, rephrased_sents = [], [], []
    # The only difference between the two layouts is what follows the original.
    after_original = REPHRASED if test else TRANSLATED

    # `with` guarantees the handle is closed even when an assertion fires.
    with open(filename, 'r') as file:
        for line in file:
            if state == ORIGINAL: # Original sentence
                original_sents.append(line.strip())
                state = after_original
            elif state == REPHRASED: # Rephrased sentence (test files only)
                rephrased_sents.append(line.strip())
                state = TRANSLATED
            elif state == TRANSLATED: # Translated sentence
                translated_sents.append(line.strip())
                state = BLANK
            elif state == BLANK: # Blank separator line
                assert line == '\n', 'expected a blank separator line' # If assertion fails, something wrong with the file format
                state = ORIGINAL
            else:
                raise ValueError('Unexpected state encountered.')

    assert len(original_sents) == len(translated_sents) # all sents should be paired
    if test:
        assert len(original_sents) == len(rephrased_sents)

    return {'original': original_sents, 'translated': translated_sents, 'rephrased': rephrased_sents}

In [None]:
%%time
# Parse the training corpus (two-line records) and the test corpus
# (three-line records, hence test=True) from the mounted Drive folder.
# NOTE: the name `train` is rebound to a function by a later cell
# (`def train(model, ...)`), so this dict is only available until that cell runs.
train = process_data('/content/drive/MyDrive/MedLane/train(12809)_new.txt')
test = process_data('/content/drive/MyDrive/MedLane/test(2030)_new.txt', test=True)

CPU times: user 31.6 ms, sys: 11.2 ms, total: 42.8 ms
Wall time: 1.65 s


In [None]:
# Drop the first two characters of every translated test sentence.
# NOTE(review): presumably the test file prefixes these lines with a fixed
# 2-character marker — confirm against the data.
# This overwrites test['translated'] in place, so re-running the cell strips two
# more characters each time; run it exactly once after loading.
test['translated'] = [sentence[2:] for sentence in test['translated']]

In [None]:
train['translated'][1222]

'She was also noted to have some bleeding around her epidural site .'

In [None]:
from collections import Counter

In [None]:
%%time
bert_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

def tokenize_and_build_vocab(corpus, tokenization_func, special_tokens, min_freq = 0):
    """Tokenize every sentence and derive a frequency-ranked vocabulary.

    Args:
        corpus: iterable of raw sentences.
        tokenization_func: callable mapping one sentence to a list of tokens.
        special_tokens: dict of reserved token -> index; regular tokens are
            numbered after ``len(special_tokens)``.
        min_freq: tokens seen fewer times than this get no index.

    Returns:
        (list of token lists, dict token -> count ordered by frequency,
         dict token -> index including the special tokens).
    """
    counts = Counter()
    tokenized_sent = []
    for position, sentence in enumerate(corpus):
        # Coarse progress log, one line per 200 sentences.
        if position % 200 == 0:
            print('reached corpus sentence ' + str(position))
        tokens = tokenization_func(sentence)
        tokenized_sent.append(tokens)
        counts.update(tokens)

    ranked = counts.most_common()
    offset = len(special_tokens)

    vocab_to_ix = {}
    for rank, (word, freq) in enumerate(ranked):
        if freq >= min_freq:
            vocab_to_ix[word] = rank + offset

    # Reserved tokens are written last so they win any name collision.
    vocab_to_ix.update(special_tokens.items())

    return tokenized_sent, dict(ranked), vocab_to_ix


# The tokenizer's bound method already has the (sentence -> tokens) shape we need.
tokenization_func = bert_tokenizer.tokenize
special_tokens = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
print('created tokenizer')

# Only the token lists (first return value) are kept; the counts and the
# index mapping are discarded because ids come from the BERT tokenizer later.
train_src_sent = tokenize_and_build_vocab(train['original'], tokenization_func, special_tokens)[0]
train_tgt_sent = tokenize_and_build_vocab(train['translated'], tokenization_func, special_tokens)[0]

test_src_sent = tokenize_and_build_vocab(test['original'], tokenization_func, special_tokens)[0]
test_tgt_sent = tokenize_and_build_vocab(test['translated'], tokenization_func, special_tokens)[0]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…


created tokenizer
reached corpus sentence 0
reached corpus sentence 200
reached corpus sentence 400
reached corpus sentence 600
reached corpus sentence 800
reached corpus sentence 1000
reached corpus sentence 1200
reached corpus sentence 1400
reached corpus sentence 1600
reached corpus sentence 1800
reached corpus sentence 2000
reached corpus sentence 2200
reached corpus sentence 2400
reached corpus sentence 2600
reached corpus sentence 2800
reached corpus sentence 3000
reached corpus sentence 3200
reached corpus sentence 3400
reached corpus sentence 3600
reached corpus sentence 3800
reached corpus sentence 4000
reached corpus sentence 4200
reached corpus sentence 4400
reached corpus sentence 4600
reached corpus sentence 4800
reached corpus sentence 5000
reached corpus sentence 5200
reached corpus sentence 5400
reached corpus sentence 5600
reached corpus sentence 5800
reached corpus sentence 6000
reached corpus sentence 6200
reached corpus sentence 6400
reached corpus sentence 6600
re

In [None]:
test_tgt_sent[0]

['we',
 '##ane',
 '##d',
 'off',
 'vent',
 'to',
 'c',
 '##pa',
 '##p',
 '[',
 'continuous',
 'positive',
 'air',
 '##way',
 'pressure',
 ']',
 'and',
 'was',
 'ex',
 '##tub',
 '##ated',
 'in',
 'the',
 'afternoon',
 'on',
 '9',
 '-',
 '2',
 'by',
 'the',
 'lung',
 'specialist',
 'team',
 '.']

In [None]:
from torch.utils import data

# These IDs are reserved.
# PAD_INDEX = 0
# UNK_INDEX = 1
# SOS_INDEX = 2
# EOS_INDEX = 3

class MTDatasetForBERT(data.Dataset):
    """Parallel-sentence dataset that emits fixed-length BERT id tensors.

    Each item is ``(src_ids, src_len, tgt_ids, tgt_len)`` where the id tensors
    are ``[CLS] tokens... [SEP]`` right-padded with ``[PAD]`` to the configured
    maximum length and the lengths count the unpadded ids (markers included).

    Args:
        bert_tokenizer: object exposing ``convert_tokens_to_ids`` for both a
            single token string and a list of tokens.
        src_sents: list of already-tokenized source sentences.
        tgt_sents: list of already-tokenized target sentences.
        num_samples: fraction of ``src_sents`` (from the start) to keep.
        max_src_seq_len: padded length of every source tensor.
        max_tgt_seq_len: padded length of every target tensor.

    Raises:
        ValueError: if a maximum length cannot hold the two marker tokens.
        AssertionError: if source and target end up with different sizes.
    """
    def __init__(self, bert_tokenizer, src_sents, tgt_sents, num_samples=1.,
                 max_src_seq_len=201, max_tgt_seq_len=201):
        if max_src_seq_len < 2 or max_tgt_seq_len < 2:
            raise ValueError('max sequence lengths must be >= 2 to fit [CLS] and [SEP]')

        keep = int(len(src_sents) * num_samples)
        self.src_sents = src_sents[:keep]
        self.tgt_sents = tgt_sents[:keep]
        self.bert_tokenizer = bert_tokenizer

        self.max_src_seq_len = max_src_seq_len
        self.max_tgt_seq_len = max_tgt_seq_len

        # BERT's own marker tokens play the roles of SOS / EOS / UNK / PAD.
        self.SOS = self.bert_tokenizer.convert_tokens_to_ids('[CLS]')
        self.EOS = self.bert_tokenizer.convert_tokens_to_ids('[SEP]')
        self.UNK = self.bert_tokenizer.convert_tokens_to_ids('[UNK]')
        self.PAD = self.bert_tokenizer.convert_tokens_to_ids('[PAD]')

        assert len(self.src_sents) == len(self.tgt_sents)

    def _encode(self, tokens, max_len):
        """Return (padded id tensor of length max_len, unpadded length)."""
        # Truncate so that markers + tokens never exceed max_len; without this
        # an over-long sentence yields a longer tensor and batching fails.
        body = list(self.bert_tokenizer.convert_tokens_to_ids(tokens))[:max_len - 2]
        ids = [self.SOS] + body + [self.EOS]
        length = len(ids) # including <SOS> <EOS>
        ids = ids + [self.PAD] * (max_len - length) # Padding
        return torch.tensor(ids), length

    def __getitem__(self, index):
        src_id, src_len = self._encode(self.src_sents[index], self.max_src_seq_len)
        tgt_id, tgt_len = self._encode(self.tgt_sents[index], self.max_tgt_seq_len)
        return src_id, src_len, tgt_id, tgt_len

    def __len__(self):
        return len(self.src_sents)

In [None]:
# Training mini-batch size; used by the training loader below.
batch_size = 8
train_set = MTDatasetForBERT(bert_tokenizer, train_src_sent, train_tgt_sent, 1)
test_set  = MTDatasetForBERT(bert_tokenizer, test_src_sent, test_tgt_sent, 1)

train_data_loader = data.DataLoader(train_set, batch_size=batch_size, num_workers = 8, shuffle=True)
# The test loader stays at batch_size=1: greedy_decode and compute_BLEU below
# decode one sentence at a time (they index element 0 of each batch).
test_data_loader = data.DataLoader(test_set, batch_size=1, num_workers = 8, shuffle=False)

  cpuset_checked))


In [None]:
print(train_set.max_src_seq_len)
print(train_set.max_tgt_seq_len)
print(test_set.max_src_seq_len)
print(test_set.max_tgt_seq_len)

201
201
201
201


In [None]:
# Model Code

In [None]:
# Masking functions
def subsequent_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Position i may attend to positions 0..i only, which hides future tokens.
    """
    attn_shape = (1, size, size)
    # Ones strictly above the diagonal mark the forbidden (future) positions.
    future = torch.triu(torch.ones(attn_shape), diagonal=1)
    return future == 0

def make_std_mask(tgt, pad):
    "Create a mask to hide padding and future words."
    # (batch, 1, seq): True where the target token is not padding.
    tgt_mask = (tgt != pad).unsqueeze(-2)
    # Broadcasting against (1, seq, seq) gives (batch, seq, seq); type_as also
    # moves the causal mask to the same device as the padding mask.
    tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask)
    return tgt_mask

In [None]:
# Loss function/perplexity
import math
import time

class SimpleLossCompute:
  """Callable that projects decoder output, scores it and optionally steps the optimizer.

  Args:
    generator: callable turning decoder output into the criterion's input.
    criterion: loss callable taking (flattened predictions, flattened targets).
    opt: optimizer to step after backward; None means evaluation only.
  """

  def __init__(self, generator, criterion, opt=None):
    self.generator = generator
    self.criterion = criterion
    self.opt = opt

  def __call__(self, x, y, norm):
    """Return the un-normalised loss as a Python float.

    The loss is divided by `norm` before backward (so gradients are scaled)
    and multiplied back for the returned value.
    """
    projected = self.generator(x)
    width = projected.size(-1)
    flat_pred = projected.contiguous().view(-1, width)
    flat_gold = y.contiguous().view(-1)
    scaled = self.criterion(flat_pred, flat_gold) / norm

    training = self.opt is not None
    if training:
      scaled.backward()
      self.opt.step()
      self.opt.zero_grad()

    return scaled.data.item() * norm

def run_epoch(data_loader, model, loss_compute, print_every = 10):
    """Run one pass over `data_loader` and return the corpus perplexity.

    Args:
        data_loader: yields (src_ids, src_lens, tgt_ids, tgt_lens) batches.
        model: callable as model(src, tgt_in, src_mask, tgt_mask).
        loss_compute: callable (out, gold, norm=...) returning the summed loss
            of the batch as a float; it also owns any optimizer step.
        print_every: log every this many steps while ``model.training``.

    Returns:
        exp(total summed loss / number of non-pad target tokens).

    Raises:
        ZeroDivisionError: if the loader yields no non-pad target tokens.
    """
    pad = train_set.PAD
    total_tokens = 0
    total_loss = 0
    for i, batch in enumerate(data_loader):
        srcs, _, tgts, _ = batch

        # Teacher forcing: the decoder reads tokens [0, n-1) and is scored
        # against tokens [1, n).
        tgt_in, tgt_gold = tgts[:, :-1], tgts[:, 1:]

        srcs_mask = srcs.unsqueeze(-2) != pad
        tgts_mask = make_std_mask(tgt_in, pad)

        # Call the module itself rather than .forward so registered hooks run.
        out = model(srcs.to(device), tgt_in.to(device), srcs_mask.to(device), tgts_mask.to(device))

        loss = loss_compute(out, tgt_gold.to(device), norm=srcs.size(0))
        total_loss += loss
        total_tokens += (tgt_gold != pad).sum().item()

        if model.training and i % print_every == 0:
          print("Epoch Step: %d Loss: %f" % (i, loss / srcs.size(0)))
          print("Epoch Step: %d Perplexity: %f" % (i, math.exp(total_loss / float(total_tokens))))

    return math.exp(total_loss / float(total_tokens))

In [None]:
def train(model, num_epochs, learning_rate, print_every):
  """Train `model` with Adam and return the per-epoch evaluation perplexities.

  Every epoch runs one training pass over `train_data_loader` followed by a
  gradient-free pass over `test_data_loader`.

  NOTE: defining this function rebinds the module-level name `train`, which an
  earlier cell assigns to the parsed training-data dict.

  Args:
    model: model exposing `.generator`, `.parameters()`, `.train()`, `.eval()`.
    num_epochs: number of epochs to run.
    learning_rate: Adam learning rate.
    print_every: logging interval forwarded to run_epoch.

  Returns:
    list with one perplexity per epoch.
  """
  # Pad positions are excluded from the summed loss via ignore_index.
  criterion = nn.NLLLoss(reduction="sum", ignore_index=train_set.PAD)
  optim = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

  # One loss helper that steps the optimizer, one that only measures.
  stepping_loss = SimpleLossCompute(model.generator, criterion, optim)
  measuring_loss = SimpleLossCompute(model.generator, criterion, None)

  dev_ppls = []
  epoch = 0
  while epoch < num_epochs:
    print("Epoch", epoch)

    model.train()
    run_epoch(data_loader=train_data_loader, model=model,
              loss_compute=stepping_loss, print_every=print_every)

    model.eval()
    with torch.no_grad():
      dev_ppl = run_epoch(data_loader=test_data_loader, model=model,
                          loss_compute=measuring_loss, print_every=print_every)
    print("Validation perplexity: %f" % dev_ppl)
    dev_ppls.append(dev_ppl)
    epoch += 1

  return dev_ppls

In [None]:
# Greedy decode algorithm:
def greedy_decode(model, src, src_mask, max_len, start_symbol, end_symbol=None):
    """Greedily decode a single source sentence.

    Args:
        model: exposes encode(src, src_mask), decode(ys, memory, src_mask,
            tgt_mask) and generator(hidden).
        src: source id tensor; the decoder prefix is built with batch size 1.
        src_mask: mask passed through to encode/decode.
        max_len: at most ``max_len - 1`` tokens are generated.
        start_symbol: id that seeds the decoder input.
        end_symbol: id that stops decoding; defaults to ``train_set.EOS``.

    Returns:
        list of 0-dim tensors with the generated ids (start and end symbols
        are not included).
    """
    if end_symbol is None:
        end_symbol = train_set.EOS

    output = []
    # Decoding never needs gradients; disabling autograd avoids keeping the
    # encoder/decoder graphs alive for the whole generation loop.
    with torch.no_grad():
        encoder_out = model.encode(src, src_mask)
        ys = torch.full((1, 1), start_symbol, dtype=src.dtype, device=src.device)

        for _ in range(max_len - 1):
            tgt_mask = subsequent_mask(ys.size(1)).type_as(src)
            out = model.decode(ys, encoder_out, src_mask, tgt_mask)
            # Only the last position predicts the next token.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim = 1)
            next_word = next_word[0]
            if next_word == end_symbol:
                break
            output.append(next_word)
            ys = torch.cat([ys, next_word.view(1, 1).type_as(ys)], dim=1)

    return output

In [None]:
# Model Implementation

In [None]:
from torch.autograd import Variable

class PositionalEncoder(nn.Module):
  """Add fixed sinusoidal position information to a (batch, seq, d_model) input.

  The table follows the standard Transformer definition:
    pe[pos, 2k]   = sin(pos / 10000 ** (2k / d_model))
    pe[pos, 2k+1] = cos(pos / 10000 ** (2k / d_model))
  The earlier loop doubled the exponent (the loop index already stepped by 2)
  and gave the cosine a different frequency from its paired sine.

  Args:
    d_model: size of the last dimension of the input.
    max_seq_len: number of positions in the precomputed table.
  """
  def __init__(self, d_model, max_seq_len=201):
    super().__init__()
    self.d_model = d_model

    # Column vector of positions and row vector of inverse frequencies, so
    # their product is the full (max_seq_len, ceil(d_model / 2)) angle table.
    position = torch.arange(max_seq_len, dtype=torch.float).unsqueeze(1)
    inv_freq = torch.exp(
        torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))

    pe = torch.zeros(max_seq_len, d_model)
    pe[:, 0::2] = torch.sin(position * inv_freq)
    # For odd d_model there is one fewer odd column than even columns.
    pe[:, 1::2] = torch.cos(position * inv_freq[: d_model // 2])

    # Buffer: moves with the module across devices but is not a parameter.
    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x):
    """Scale `x` by sqrt(d_model) and add the encodings of its first positions."""
    x = x * math.sqrt(self.d_model) # conflict possible, check this line

    seq_len = x.size(1)
    return x + self.pe[:, :seq_len]

In [None]:
def attention(q, k, v, d_k, mask=None, dropout=None):
  """Scaled dot-product attention.

  Args:
    q, k, v: query / key / value tensors whose last two dims are (seq, d_k).
    d_k: per-head dimension used for the 1/sqrt(d_k) scaling.
    mask: optional mask; a head axis is inserted at dim 1 and positions where
      the mask equals 0 are suppressed.
    dropout: optional callable applied to the attention weights.

  Returns:
    The attention-weighted combination of `v`.
  """
  logits = (q @ k.transpose(-2, -1)) / math.sqrt(d_k)

  if mask is not None:
    per_head_mask = mask.unsqueeze(1)
    # A large negative logit makes the softmax weight effectively zero.
    logits = logits.masked_fill(per_head_mask == 0, -1e9) # approximation

  weights = F.softmax(logits, dim=-1)
  weights = weights if dropout is None else dropout(weights)

  return weights @ v

class MultiHeadAttention(nn.Module):
  """Multi-head attention with learned query/key/value/output projections.

  Args:
    heads: number of attention heads.
    d_model: model width; must be divisible by `heads`.
    dropout: dropout probability applied to the attention weights.

  Raises:
    ValueError: if `d_model` is not a multiple of `heads` (the head split in
      forward would otherwise fail or silently mis-shape the tensors).
  """
  def __init__(self, heads, d_model, dropout = 0.1):
    super().__init__()
    if d_model % heads != 0:
      raise ValueError(
          'd_model (%d) must be divisible by heads (%d)' % (d_model, heads))
    self.d_model = d_model
    self.d_k = d_model // heads
    self.heads = heads

    self.query = nn.Linear(d_model, d_model)
    self.key = nn.Linear(d_model, d_model)
    self.value = nn.Linear(d_model, d_model)
    self.dropout = nn.Dropout(dropout)
    self.out = nn.Linear(d_model, d_model)

  def _split_heads(self, x, bs):
    """Reshape (bs, seq, d_model) into (bs, heads, seq, d_k)."""
    return x.view(bs, -1, self.heads, self.d_k).transpose(1, 2)

  def forward(self, q, k, v, mask=None):
    """Project the inputs, attend per head, then merge heads and project out."""
    bs = q.size(0)

    q = self._split_heads(self.query(q), bs)
    k = self._split_heads(self.key(k), bs)
    v = self._split_heads(self.value(v), bs)

    scores = attention(q, k, v, self.d_k, mask, self.dropout)

    # (bs, heads, seq, d_k) -> (bs, seq, heads * d_k)
    concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)

    return self.out(concat)

In [None]:
class FeedForward(nn.Module):
  """Position-wise two-layer MLP: Linear -> ReLU -> Dropout -> Linear.

  Args:
    d_model: input and output width.
    d_ff: hidden width.
    dropout: dropout probability applied after the ReLU.
  """
  def __init__(self, d_model, d_ff=2048, dropout=0.1):
    super().__init__()

    self.l1 = nn.Linear(d_model, d_ff)
    self.dropout = nn.Dropout(dropout)
    self.l2 = nn.Linear(d_ff, d_model)

  def forward(self, x):
    hidden = self.l1(x)
    hidden = F.relu(hidden)
    hidden = self.dropout(hidden)
    return self.l2(hidden)

In [None]:
class LayerNorm(nn.Module):
  """Normalise the last dimension, then apply a learned scale and shift.

  Uses `Tensor.std` for the spread and adds `eps` to the standard deviation
  (not to the variance) before dividing.

  Args:
    d_model: size of the normalised (last) dimension.
    eps: constant added to the standard deviation to avoid division by zero.
  """
  def __init__(self, d_model, eps=1e-6):
    super().__init__()

    self.size = d_model

    self.alpha = nn.Parameter(torch.ones(self.size))
    self.bias = nn.Parameter(torch.zeros(self.size))
    self.eps = eps

  def forward(self, x):
    centre = x.mean(dim=-1, keepdim=True)
    spread = x.std(dim=-1, keepdim=True)
    centred = x - centre
    return self.alpha * centred / (spread + self.eps) + self.bias

In [None]:
class EncoderLayer(nn.Module):
  """Pre-norm encoder block: self-attention then feed-forward, each residual.

  Args:
    d_model: model width.
    heads: number of attention heads.
    d_ff: hidden width of the feed-forward sub-layer.
    dropout: dropout probability used throughout the block.
  """
  def __init__(self, d_model, heads, d_ff = 2048, dropout=0.1):
    super().__init__()
    self.ln1 = LayerNorm(d_model)
    self.ln2 = LayerNorm(d_model)
    self.attn = MultiHeadAttention(heads, d_model, dropout)
    self.ff = FeedForward(d_model, d_ff, dropout)
    self.d1 = nn.Dropout(dropout)
    self.d2 = nn.Dropout(dropout)

  def forward(self, x, mask):
    # Sub-layer 1: normalise, self-attend, add back onto the residual stream.
    normed = self.ln1(x)
    attended = self.attn(normed, normed, normed, mask)
    x = x + self.d1(attended)

    # Sub-layer 2: normalise, feed-forward, add back.
    transformed = self.ff(self.ln2(x))
    return x + self.d2(transformed)

class DecoderLayer(nn.Module):
  """Pre-norm decoder block: masked self-attention, encoder attention, feed-forward.

  Args:
    d_model: model width.
    heads: number of attention heads.
    d_ff: hidden width of the feed-forward sub-layer.
    dropout: dropout probability used throughout the block.
  """
  def __init__(self, d_model, heads, d_ff = 2048, dropout=0.1):
    super().__init__()
    self.ln1 = LayerNorm(d_model)
    self.ln2 = LayerNorm(d_model)
    self.ln3 = LayerNorm(d_model)

    self.d1 = nn.Dropout(dropout)
    self.d2 = nn.Dropout(dropout)
    self.d3 = nn.Dropout(dropout)

    self.attn1 = MultiHeadAttention(heads, d_model, dropout)
    self.attn2 = MultiHeadAttention(heads, d_model, dropout)
    self.ff = FeedForward(d_model, d_ff, dropout)

  def forward(self, x, encoder_out, src_mask, tgt_mask):
    # Self-attention over the target prefix, restricted by tgt_mask.
    normed = self.ln1(x)
    self_attended = self.attn1(normed, normed, normed, tgt_mask)
    x = x + self.d1(self_attended)

    # Attention from the target states onto the encoder output.
    queries = self.ln2(x)
    cross_attended = self.attn2(queries, encoder_out, encoder_out, src_mask)
    x = x + self.d2(cross_attended)

    # Position-wise feed-forward.
    transformed = self.ff(self.ln3(x))
    return x + self.d3(transformed)

In [None]:
import copy

def clone(module, N):
  """Return an nn.ModuleList holding N independent deep copies of `module`."""
  copies = []
  for _ in range(N):
    copies.append(copy.deepcopy(module))
  return nn.ModuleList(copies)

class Encoder(nn.Module):
  """Encoder that feeds a BERT model's hidden states through N encoder layers.

  Args:
    cbert: model called as cbert(src, mask) whose result exposes
      `.last_hidden_state`.
    vocab_size: accepted for signature symmetry with Decoder; not used here.
    d_model: width of the BERT hidden states and of the encoder layers.
    N: number of stacked EncoderLayer blocks.
    heads: attention heads per layer.
    d_ff: feed-forward hidden width.
    dropout: dropout probability.
  """
  def __init__(self, cbert, vocab_size, d_model, N, heads, d_ff = 2048, dropout=0.1):
    super().__init__()
    self.N = N
    self.cbert = cbert
    self.pe = PositionalEncoder(d_model)
    self.layers = clone(EncoderLayer(d_model, heads, d_ff, dropout), self.N)
    self.ln = LayerNorm(d_model)

  def forward(self, src, mask):
    # Contextual embeddings from BERT, placed on the notebook-level `device`.
    bert_states = self.cbert(src, mask).last_hidden_state
    x = self.pe(bert_states.to(device))

    # self.layers holds exactly the N blocks created in __init__.
    for layer in self.layers:
      x = layer(x, mask)

    return self.ln(x)

class Decoder(nn.Module):
  """Decoder with its own token embedding followed by N decoder layers.

  Args:
    vocab_size: number of rows in the target embedding table.
    d_model: model width.
    N: number of stacked DecoderLayer blocks.
    heads: attention heads per layer.
    d_ff: feed-forward hidden width.
    dropout: dropout probability.
  """
  def __init__(self, vocab_size, d_model, N, heads, d_ff = 2048, dropout=0.1):
    super().__init__()
    self.N = N
    self.embed = nn.Embedding(vocab_size, d_model)
    self.pe = PositionalEncoder(d_model)
    self.layers = clone(DecoderLayer(d_model, heads, d_ff, dropout), self.N)
    self.ln = LayerNorm(d_model)

  def forward(self, tgt, encoder_out, src_mask, tgt_mask):
    # Embed the target ids and add position information.
    x = self.pe(self.embed(tgt))

    # self.layers holds exactly the N blocks created in __init__.
    for layer in self.layers:
      x = layer(x, encoder_out, src_mask, tgt_mask)

    return self.ln(x)

In [None]:
class Generator(nn.Module):
  """Project decoder states to vocabulary log-probabilities.

  Args:
    d_model: width of the incoming states.
    vocab_size: number of output classes.
  """
  def __init__(self, d_model, vocab_size):
    super().__init__()
    self.proj = nn.Linear(d_model, vocab_size)

  def forward(self, x):
    logits = self.proj(x)
    return F.log_softmax(logits, dim=-1)

In [None]:
# Full Model

In [None]:
class ClinicalTextTranslationModel(nn.Module):
    """ Model class wrapping encoder + intermediate layers + decoder

    Args:
        cbert: BERT-style model handed to the Encoder.
        src_vocab_size: forwarded to the Encoder.
        tgt_vocab_size: size of the decoder embedding and of the output layer.
        d_model: model width shared by encoder, decoder and generator.
        N: number of layers in both encoder and decoder.
        heads: attention heads per layer.
        d_ff: feed-forward hidden width (now actually forwarded to the
            encoder/decoder instead of being overridden by 2048).
        dropout: dropout probability (now actually forwarded instead of being
            overridden by 0.1).
        intermediate_layers: optional callable applied to the encoder output
            before it is handed to the decoder.

    `forward` returns the decoder states; apply `self.generator` to obtain
    vocabulary log-probabilities.
    """
    def __init__(self, cbert, src_vocab_size, tgt_vocab_size, d_model, N, heads, d_ff = 2048, dropout=0.1, intermediate_layers=None):
        super().__init__()

        self.encoder = Encoder(cbert, src_vocab_size, d_model, N, heads, d_ff=d_ff, dropout=dropout)
        self.intermediate_layers = intermediate_layers
        self.decoder = Decoder(tgt_vocab_size, d_model, N, heads, d_ff=d_ff, dropout=dropout)

        self.generator = Generator(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode `src` and decode `tgt` against it in one call."""
        memory = self.encode(src, src_mask)
        return self.decode(tgt, memory, src_mask, tgt_mask)

    def encode(self, src, src_mask):
        """Run the encoder and, when configured, the intermediate layers."""
        encoder_output = self.encoder(src, src_mask)

        if self.intermediate_layers is not None:
            # Previously referenced an undefined name here, which raised
            # NameError whenever intermediate layers were supplied.
            return self.intermediate_layers(encoder_output)

        return encoder_output

    def decode(self, tgt, intermediate_layer_output, src_mask,  tgt_mask):
        """Run the decoder on `tgt` against the (possibly transformed) encoder output."""
        return self.decoder(tgt, intermediate_layer_output, src_mask, tgt_mask)

In [None]:
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Helper: construct a model from hyperparameters and move it to `device`.

    Loads the pretrained "emilyalsentzer/Bio_ClinicalBERT" weights for the
    encoder front-end on every call.

    Args:
        src_vocab: source vocabulary size.
        tgt_vocab: target vocabulary size.
        N: number of encoder and decoder layers.
        d_model: model width.
        d_ff: feed-forward hidden width.
        h: number of attention heads.
        dropout: dropout probability.
    """
    pretrained_bert = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    translation_model = ClinicalTextTranslationModel(
        pretrained_bert, src_vocab, tgt_vocab, d_model, N, h, d_ff, dropout)

    # The reference implementation re-initialised every parameter with more
    # than one dimension using Glorot / fan_avg (xavier_uniform); that step is
    # deliberately left disabled here.
    return translation_model.to(device)

# Toggle: True builds a fresh model, False resumes from a checkpoint on Drive.
init = True
if init:
  # Source and target share the BERT tokenizer's vocabulary size.
  # d_model must equal the width of the BERT hidden states, because the Encoder
  # feeds them straight into its layers.
  # NOTE(review): 768 is presumably Bio_ClinicalBERT's hidden size — confirm.
  model = make_model(len(bert_tokenizer), len(bert_tokenizer), N=6, d_model=768, d_ff=2048, h=8, dropout=0.1)
else:
  # NOTE(review): torch.load on a whole-model file unpickles arbitrary objects;
  # only load checkpoints from a trusted source.
  model = torch.load("/content/drive/MyDrive/MedLane/model_6.pt").to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435778770.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# 12 epochs, Adam learning rate 5e-5, log every 100 steps.
ppls = train(model, num_epochs=12, learning_rate=0.00005, print_every=100)
print(ppls)

Epoch 0


  cpuset_checked))


Epoch Step: 0 Loss: 352.617157
Epoch Step: 0 Perplexity: 34472.451005
Epoch Step: 100 Loss: 206.141479
Epoch Step: 100 Perplexity: 10760.198910
Epoch Step: 200 Loss: 141.190628
Epoch Step: 200 Perplexity: 3639.367677
Epoch Step: 300 Loss: 131.298752
Epoch Step: 300 Perplexity: 2108.836234
Epoch Step: 400 Loss: 169.679840
Epoch Step: 400 Perplexity: 1532.777704
Epoch Step: 500 Loss: 222.068893
Epoch Step: 500 Perplexity: 1208.759905
Epoch Step: 600 Loss: 117.369911
Epoch Step: 600 Perplexity: 1009.975149
Epoch Step: 700 Loss: 140.370697
Epoch Step: 700 Perplexity: 864.868809
Epoch Step: 800 Loss: 218.467316
Epoch Step: 800 Perplexity: 759.408223
Epoch Step: 900 Loss: 235.337097
Epoch Step: 900 Perplexity: 683.279297
Epoch Step: 1000 Loss: 195.413208
Epoch Step: 1000 Perplexity: 619.613018
Epoch Step: 1100 Loss: 107.790413
Epoch Step: 1100 Perplexity: 566.099492
Epoch Step: 1200 Loss: 140.070526
Epoch Step: 1200 Perplexity: 521.314546
Epoch Step: 1300 Loss: 139.749374
Epoch Step: 1300 Pe

In [None]:
# torch.save(model, "/content/drive/MyDrive/MedLane/model_12.pt")

In [None]:
!pip -q install sacrebleu
import sacrebleu
from tqdm import tqdm

def compute_BLEU(model, data_loader, decoder, max_iters):
  """Decode up to `max_iters` batches and score each hypothesis with BLEU.

  Args:
    model: model handed to `decoder`.
    data_loader: yields (src_ids, src_lens, tgt_ids, tgt_lens) batches; only
      the first element of each batch is scored, so use batch_size=1.
    decoder: callable (model, src, src_mask, max_len, start_symbol) returning
      an iterable of id tensors, e.g. greedy_decode.
    max_iters: maximum number of batches to evaluate.

  Returns:
    list with one sentence-level BLEU score per evaluated batch.
  """
  bleu_scores = []

  # Iterate the loader that was passed in (the global test loader used to be
  # hard-wired here, silently ignoring the argument).
  for i, batch in enumerate(data_loader):
    if i >= max_iters: break
    srcs, _, tgts, _ = batch

    srcs_mask = srcs.unsqueeze(-2) != train_set.PAD

    out = decoder(model, srcs.to(device), srcs_mask.to(device),
                  train_set.max_tgt_seq_len, train_set.SOS)

    # Reference: drop the leading start token, then cut at the first end token.
    tgts = tgts[0,1:]
    tgts = tgts[:np.where(tgts == train_set.EOS)[0][0]]

    tgt_sent = [bert_tokenizer.convert_ids_to_tokens(token.item()) for token in tgts]
    out_sent = [bert_tokenizer.convert_ids_to_tokens(token.item()) for token in out]
    print('Target Tokens: ', tgt_sent)
    print('Out tokens: ', out_sent)

    # Merge word pieces back into plain strings before scoring.
    tgt_sent = bert_tokenizer.convert_tokens_to_string(tgt_sent)
    out_sent = bert_tokenizer.convert_tokens_to_string(out_sent)

    print('Target sent: ', tgt_sent)
    print('Out sent: ', out_sent)

    bleu_scores.append(sacrebleu.raw_corpus_bleu([out_sent], [[tgt_sent]], .01).score)

  return bleu_scores

# print('BLEU score: %f' % (np.mean(compute_BLEU(model, 
#                                            test_data_loader,
#                                             greedy_decode))))

[?25l[K     |██████                          | 10kB 32.2MB/s eta 0:00:01[K     |████████████                    | 20kB 38.4MB/s eta 0:00:01[K     |██████████████████              | 30kB 21.6MB/s eta 0:00:01[K     |████████████████████████        | 40kB 16.6MB/s eta 0:00:01[K     |██████████████████████████████  | 51kB 8.8MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 5.1MB/s 
[?25h

In [None]:
# model = torch.load("/content/drive/MyDrive/MedLane/model_5.pt", map_location=torch.device('cpu'))

In [None]:
"""
Potential Improvements/Experiments
 - Try BERT tokenizers, e.g. ClinicalBERT
 - Start with ClinicalBERT/BioBERT/BEHRT pretrained embeddings
 - Intermediate layers?
 - try varying N, adjust hyperparams
 - make a validation set from part of the training examples?
 - "Tunability" - Use only most common words/words with low enough reading scores
 - Front-load any UI that you can!
"""

'\nPotential Improvements/Experiments\n - Try BERT tokenizers e.g ClinicalBERT\n - start with ClincalBERT/BioBERT/BEHRT pretrained embeddings\n - Intermediate layers?\n - try varying N, adjust hyperparams\n - make a validation set from part of the training examples?\n - "Tunability" - Use only most common words/words with low enough reading scores\n - Front-load any UI that you can!\n'

In [None]:
# Rebinds the notebook-level `device` that the model and helper functions read.
device = 'cuda'
# 25000 is the max_iters cap: compute_BLEU stops after that many batches, or
# earlier if the loader it iterates runs out.
bleu_scores = compute_BLEU(model.to(device), test_data_loader, greedy_decode, 25000)

In [None]:
# transformer = torch.load('/content/drive/MyDrive/MIT/6.871/6.871 NLP Project/data/model_basic.pt', map_location=torch.device('cpu'))

In [None]:
np.mean(bleu_scores)