In [1]:
!pip install transformers



In [2]:
!pip install rouge



In [3]:
EVAL = False # set if running in eval only mode

In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
import numpy as np
import os
import sys
import tensorflow as tf
import torch
import datetime, time
import io
import re
from csv import reader
import matplotlib.pyplot as plt
import traceback
import random
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

import glob, os
import torch.nn as nn

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import BertModel, AdamW, BertConfig,BertTokenizer
from torch.nn.init import xavier_uniform_

import tensorflow as tf
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
from rouge import Rouge 


## Create result Directory

In [6]:
# Root of the project data on the mounted Google Drive, plus the three working
# directories used by this toy experiment.
BASE_DIR = "/content/drive/My Drive/summ_data/"
MODEL_INPUT_DIR = BASE_DIR+"ICSI_plus_NXT/toy_tensor/"
RESULT_DIR = BASE_DIR+"ICSI_plus_NXT/toy_result/"
TOY_DIR = BASE_DIR+"ICSI_plus_NXT/toy/"

# Make sure every working directory exists and starts out empty.
# WARNING: this deletes every file directly inside each of the three directories.
for d in (MODEL_INPUT_DIR, RESULT_DIR, TOY_DIR):
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(d, exist_ok=True)
    files = glob.glob(d+'*')
    for f in files:
        # os.remove() raises on a directory, so real sub-directories are left alone.
        if os.path.isdir(f) and not os.path.islink(f):
            continue
        os.remove(f)



## Encoder classes

In [7]:
class Bert(nn.Module):
    """Wrapper around the pretrained ``bert-base-cased`` encoder.

    ``forward`` returns only the last hidden state of the underlying model.
    """

    def __init__(self):
        super(Bert, self).__init__()
        self.model = BertModel.from_pretrained('bert-base-cased')

    def forward(self, x, segs, mask):
        """Encode a batch of token ids.

        Args:
            x: token ids.
            segs: per-token segment ids (the alternating 0/1 values produced by
                ``BertDataProcessor.tokenize``).
            mask: attention mask (1 for real tokens, 0 for padding).

        Returns:
            The first element of the model output, i.e. the last hidden state.
        """
        # The segment ids must go in as token_type_ids. Passing them as
        # position_ids (as this code previously did) replaces every token's
        # position with 0 or 1 and discards BERT's positional information.
        # The model returns a tuple: element 0 is the last hidden state,
        # element 1 is the pooler output.
        result = self.model(x, attention_mask=mask, token_type_ids=segs)
        top_vec = result[0]
        return top_vec


class Classifier(nn.Module):
    """Scores each sentence vector with one linear unit followed by a sigmoid."""

    def __init__(self, hidden_size):
        super().__init__()
        self.linear1 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, mask_cls):
        """Score sentence vectors.

        Args:
            x: 3-D tensor of sentence vectors whose last axis has ``hidden_size``
                entries.
            mask_cls: mask that is multiplied into the scores (0 zeroes a score).

        Returns:
            A tuple ``(sent_scores, x, logits)``: the masked sigmoid scores, the
            unchanged input, and the raw linear outputs with the trailing
            singleton axis removed.
        """
        logits = self.linear1(x).squeeze(-1)

        # Debug trace for the first sample of the batch: per-sentence sums of the
        # first 20 input vectors, and the first 10 logits.
        first_sample = x.clone().detach().cpu().numpy()[0, :, :]
        print("X (CLS) totals = ", np.sum(first_sample[:20], axis=1))
        print("Logit = ", logits[0, :10])

        keep = mask_cls.float()
        return self.sigmoid(logits) * keep, x, logits


class Summarizer(nn.Module):
    """BERT encoder followed by a per-sentence ``Classifier`` head.

    ``load_pretrained_bert`` and ``bert_config`` are accepted but not used by
    this implementation; ``args`` is only stored on the instance.
    """

    def __init__(self, args=None, num_hidden = 768, load_pretrained_bert = True, bert_config = None):
        super().__init__()
        self.args = args
        self.bert = Bert()
        # The head is always the linear Classifier (no selection via args.encoder).
        self.encoder = Classifier(num_hidden)
        # Xavier-initialise every head parameter that has more than one dimension.
        for weight in self.encoder.parameters():
            if weight.dim() <= 1:
                continue
            xavier_uniform_(weight)

    def forward(self, x, segs, clss, mask, mask_cls, sentence_range=None):
        """Score the sentences of each sample.

        Args:
            x: token ids passed to the BERT wrapper.
            segs: segment ids passed to the BERT wrapper.
            clss: per-sample indices used to pick sentence vectors out of the
                encoder output.
            mask: attention mask passed to the BERT wrapper.
            mask_cls: mask over the entries of ``clss``; masked vectors are zeroed.
            sentence_range: unused.

        Returns:
            ``(sent_scores, mask_cls, encoder_input, logits)``.
        """
        token_vecs = self.bert(x, segs, mask)
        # Pair every batch row with its own list of sentence-start indices.
        batch_rows = torch.arange(token_vecs.size(0)).unsqueeze(1)
        cls_vecs = token_vecs[batch_rows, clss] * mask_cls[:, :, None].float()
        sent_scores, enc_in, logits = self.encoder(cls_vecs, mask_cls)
        return sent_scores.squeeze(-1), mask_cls, enc_in, logits


## Training

In [8]:
# Ask TensorFlow for the name of the GPU device it can see (if any).
device_name = tf.test.gpu_device_name()

# Report the device only when the name is exactly '/device:GPU:0'.
print('Found GPU at: {}'.format(device_name)
      if device_name == '/device:GPU:0'
      else 'GPU device not found')


Found GPU at: /device:GPU:0


In [9]:
# Choose the PyTorch device: CUDA when it is available, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


## Toy train and validation datasets. Here the model tries to predict which sentences are upper-cased.

In [10]:
dat_files = ["""The steel photographs the nutty amusement. The miniature plant publicizes the swim. The voiceless interest publicizes the blood. The striped industry logs the attraction. The rate scans the time. The authority launchs the yellow development.The part counsels the cultured wine.The thing familiarizes the detail. Did the neighborly interaction really dance the perspective. The scientific location can't return the trash""",
             """She did her best to help him. Flesh-colored yoga pants were far worse than even he feared. He dreamed of eating green apples with worms. Most shark attacks occur about 10 feet from the beach since that's where the people are. He hated that he loved what she hated about hate. The Guinea fowl flies through the air with all the grace of a turtle.If eating three-egg omelets causes weight-gain, budgie eggs are a good substitute.""",
             """The zippy mind starts the market. The wary value creates the event. The tangy language quantifys the liquid. The tacit food relates the market. The point packages the somber smash. The scattered burst authorizes the step. The hearty double can't blink the tour. It was then the ripe expert met the accidental energy."""]

# Write each toy document to TOY_DIR as "M<index>.txt", one "sentence|label"
# line per non-empty sentence. Labels are drawn at random; a sentence labelled 1
# is written in upper case and a sentence labelled 0 in lower case, so the label
# is recoverable from the casing alone.
for fi, f in enumerate(dat_files):
  with open(TOY_DIR+"M"+str(fi)+".txt", "w") as out:
    # Sentences are split on "."; fragments keep their original leading spaces.
    dat = f.split(".")
    # One label per fragment (blank fragments included) to keep the draw count unchanged.
    labels = np.random.randint(2, size=len(dat))
    for sent, label in zip(dat, labels):
      if sent.strip() == "":
        continue
      cased = sent.lower() if label == 0 else sent.upper()
      out.write(cased + "|"+ str(label)+ "\n")


### Construct the input tokens as specified in the bertsum paper https://arxiv.org/pdf/1903.10318.pdf

In [11]:
# Vocabulary ids of the "[CLS]" and "[SEP]" tokens, looked up once from the
# module-level tokenizer. BertDataProcessor.tokenize compares token ids against
# these to find the [CLS] positions and the [SEP] boundaries.
cls_vid = tokenizer.vocab["[CLS]"]
sep_vid = tokenizer.vocab["[SEP]"]

class BertDataProcessor:
    """Convert "sentence|label" text files into padded BERT input tensors.

    Every ``*txt`` file in ``data_dir`` belongs to the meeting named by the part
    of its file name before the first ".". Sentences are packed into chunks of
    fewer than ``MAX_LEN`` tokens, each sentence wrapped as ``[CLS] ... [SEP]``,
    and the per-chunk lists are saved to ``out_dir`` as ``<key>_<split>.pt``.
    """

    # Length every per-chunk list is padded to.
    MAX_LEN = 512

    def __init__(self, data_dir, out_dir):
        """Collect the meeting names found in ``data_dir`` and create ``out_dir``."""
        self.data_dir = data_dir
        self.out_dir = out_dir
        self.inp_df = None
        self.meetings = set()

        for inp_file in sorted(glob.glob(self.data_dir+"/*txt")):
            f_name = os.path.basename(inp_file)
            self.meetings.add(f_name.split(".")[0])
        os.makedirs(self.out_dir, exist_ok=True)

    def split(self):
        '''
        train / validation split by meeting: the first two meetings (in sorted
        order) are used for training, the remainder for validation.
        '''
        # Sort before slicing: iteration order of a set of strings can differ
        # between interpreter runs, which made the split non-reproducible.
        meetings = sorted(self.meetings)
        self.train_list = meetings[:2]
        self.validation_list = meetings[2:]

    def _pad(self, values):
        """Return a copy of ``values`` right-padded with zeros to ``MAX_LEN``.

        Lists that are already ``MAX_LEN`` or longer are copied unchanged.
        """
        return list(values) + [0] * max(0, self.MAX_LEN - len(values))

    def tokenize(self, chunk, labels):
        """Build the padded model inputs for one chunk of tokens.

        Args:
            chunk: list of token strings, each sentence wrapped in
                "[CLS]" ... "[SEP]".
            labels: one integer label per sentence in ``chunk``. The list is not
                modified; a padded copy is returned.

        Returns:
            dict with keys "src" (token ids), "labels", "segs" (segment ids that
            alternate 0/1 per sentence), "clss" (positions of the [CLS] tokens),
            "attn" (1 for real tokens) and "mask_cls" (1 for real sentences),
            each padded with zeros to ``MAX_LEN``.
        """
        input_ids = tokenizer.convert_tokens_to_ids(chunk)
        attn_masks = [1] * len(input_ids)
        cls_ids = [i for i, t in enumerate(input_ids) if t == cls_vid]
        mask_cls = [1] * len(cls_ids)

        # Each sentence spans from just after the previous [SEP] up to and
        # including its own [SEP]; consecutive sentences get segment id 0, 1, 0, ...
        sep_positions = [-1] + [i for i, t in enumerate(input_ids) if t == sep_vid]
        segments_ids = []
        for n, (prev, cur) in enumerate(zip(sep_positions, sep_positions[1:])):
            segments_ids += (cur - prev) * [n % 2]

        input_ids = self._pad(input_ids)
        attn_masks = self._pad(attn_masks)
        cls_ids = self._pad(cls_ids)
        mask_cls = self._pad(mask_cls)
        labels = self._pad(labels)
        segments_ids = self._pad(segments_ids)

        print("input_ids", input_ids)
        print("attn_masks", attn_masks)
        print("segments_ids", segments_ids)
        print("cls_ids", cls_ids)
        print("labels", labels)
        print("mask_cls", mask_cls)
        b_data_dict = {"src": input_ids, "labels": labels, "segs": segments_ids,
                  'clss': cls_ids, "attn": attn_masks, "mask_cls":mask_cls}
        return b_data_dict

    def format_to_bert(self, args=None):
        """Tokenize every meeting and save one tensor file per key and split.

        ``split()`` must have been called first. ``args`` is unused. For each of
        the "train" and "validation" splits the files ``src``, ``labels``,
        ``segs``, ``clss``, ``attn`` and ``mask_cls`` are written to ``out_dir``
        as ``<key>_<split>.pt``.
        """
        for batch, f_names in (("train", self.train_list), ("validation",self.validation_list)):
            output = []
            for f_name in f_names:
                # process a meeting at a time
                pth = self.data_dir+"/"+f_name+"."+"*txt"
                for inp_file in sorted(glob.glob(pth)):
                    # Chunk state is per file: the last chunk is flushed at the end
                    # of each file, so it must not leak into the next file.
                    cur_chunk = []
                    cur_labels = []
                    with open(inp_file, "r") as mtg_f:
                        for line in mtg_f:
                            if not line.strip():
                                continue
                            # The label is whatever follows the LAST "|", so a "|"
                            # inside the sentence text does not break parsing.
                            sent, label = line.rsplit("|", 1)
                            s_chunk = ["[CLS]"] + tokenizer.tokenize(sent)[:510] + ["[SEP]"]

                            # Start a new chunk when this sentence would not leave
                            # the current one shorter than MAX_LEN.
                            if cur_chunk and len(s_chunk) + len(cur_chunk) >= self.MAX_LEN:
                                print("\ninput: ",cur_chunk)
                                output.append(self.tokenize(cur_chunk, cur_labels))
                                cur_chunk = []
                                cur_labels = []
                            cur_chunk = cur_chunk + s_chunk
                            cur_labels.append(int(label))
                    # flush the final, partially filled chunk of this file
                    if cur_chunk:
                        print("\ninput: ",cur_chunk)
                        output.append(self.tokenize(cur_chunk, cur_labels))

            out =  {"src": [], "labels": [], "segs": [],
                    'clss': [], "attn": [], "mask_cls":[]}
            for sample in output:
                for key, val in sample.items():
                    out[key].append(val)
            for k, v in out.items():
                torch.save(torch.LongTensor(v), self.out_dir+"/"+k+"_"+batch+".pt")

# Build the tensors for the toy corpus: collect the files in TOY_DIR, split the
# meetings into train/validation, and save the per-split tensors to MODEL_INPUT_DIR.
dp = BertDataProcessor(TOY_DIR, MODEL_INPUT_DIR)
dp.split()
dp.format_to_bert()



input:  ['[CLS]', 'the', 'steel', 'photographs', 'the', 'nut', '##ty', 'amusement', '[SEP]', '[CLS]', 'the', 'miniature', 'plant', 'public', '##izes', 'the', 'swim', '[SEP]', '[CLS]', 'the', 'voice', '##less', 'interest', 'public', '##izes', 'the', 'blood', '[SEP]', '[CLS]', 'the', 'striped', 'industry', 'logs', 'the', 'attraction', '[SEP]', '[CLS]', 'the', 'rate', 'scan', '##s', 'the', 'time', '[SEP]', '[CLS]', 'THE', 'AU', '##TH', '##OR', '##IT', '##Y', 'LA', '##UN', '##CH', '##S', 'THE', 'Y', '##EL', '##L', '##OW', 'DE', '##VE', '##L', '##OP', '##ME', '##NT', '[SEP]', '[CLS]', 'THE', 'PA', '##RT', 'CO', '##UN', '##SE', '##LS', 'THE', 'C', '##U', '##LT', '##UR', '##ED', 'W', '##IN', '##E', '[SEP]', '[CLS]', 'the', 'thing', 'familiar', '##izes', 'the', 'detail', '[SEP]', '[CLS]', 'D', '##ID', 'THE', 'NE', '##IG', '##H', '##BO', '##RL', '##Y', 'IN', '##TE', '##RA', '##CT', '##ION', 'R', '##EA', '##LL', '##Y', 'D', '##AN', '##CE', 'THE', 'P', '##ER', '##SP', '##EC', '##TI', '##VE', '[S

In [12]:
# Reload the six saved tensors of each split, keyed by name.
train_d, val_d = {}, {}

for d_set in ("src", "labels", "segs", 'clss', "attn", "mask_cls"):
    for split_name, store in (("train", train_d), ("validation", val_d)):
        store[d_set] = torch.load(MODEL_INPUT_DIR + d_set + "_" + split_name + ".pt")

# The tensor order here fixes the order in which every batch is unpacked later:
# src, labels, segs, clss, attn, mask_cls.
dataset_fields = ("src", "labels", "segs", "clss", "attn", "mask_cls")
train_dataset = TensorDataset(*[train_d[name] for name in dataset_fields])
val_dataset = TensorDataset(*[val_d[name] for name in dataset_fields])


In [13]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as the string form of a timedelta,
    e.g. "0:01:05" (h:mm:ss; a day count is prefixed for 24 hours or more).
    '''
    # Whole seconds only: round first, then let timedelta do the formatting.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [14]:
# Batch size used by both data loaders. For fine-tuning BERT on a specific task,
# the authors recommend a batch size of 16 or 32.
batch_size = 16

# Training samples are drawn in random order through a RandomSampler.
train_dataloader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              sampler=RandomSampler(train_dataset))


In [15]:
# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

model = Summarizer()
torch.cuda.empty_cache()
# Move the model to the device selected earlier instead of calling .cuda()
# unconditionally, so the CPU fallback chosen when no GPU is available works.
_ = model.to(device)



In [16]:
## set all layers to train
# model.parameters() already yields the parameters of model.bert, so a single
# pass over it marks every parameter as trainable.
for param in model.parameters():
    param.requires_grad_(True)

In [17]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
# I believe the 'W' stands for 'Weight Decay fix'.
# NOTE(review): the learning rate below is 10x the default quoted in the inline
# comment (5e-5) — confirm this is intentional for the toy run.
optimizer = AdamW(model.parameters(),
                  lr = 5e-4, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
#optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)

In [18]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4.
# This toy run uses 20.
# NOTE(review): confirm 20 is intended, and watch for over-fitting on the
# training data.
epochs = 20

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler: linear schedule over total_steps with
# no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)


This function writes out the labeled data as well as the result for evaluation using rouge

In [19]:
## setup loss function and other variables required for training

# Seeding is currently disabled, so the random number generators used during
# training are not fixed between runs. Uncomment to make runs repeatable.
#seed_val = 42

#random.seed(seed_val)
#np.random.seed(seed_val)
#torch.manual_seed(seed_val)

# List intended to collect per-epoch quantities such as training and validation
# loss and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()
# Binary cross-entropy with reduction='none': the loss is returned per element
# so that it can be masked before being summed.
loss_c = torch.nn.BCELoss(reduction='none').to(device)
# Alternative losses that were tried:
#loss_c = torch.nn.MSELoss(reduction='none').to(device)
#loss_c = torch.nn.CrossEntropyLoss(reduction='none').to(device)


## Training

In [20]:
# Start from an empty result directory (sub-directories are left untouched,
# because os.remove() raises on a directory).
files = glob.glob(RESULT_DIR+'*')
for f in files:
    if os.path.isdir(f) and not os.path.islink(f):
        continue
    os.remove(f)


# For each epoch...
for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # A batch holds six tensors, in the order the TensorDataset was built:
        # token ids, labels, segment ids, [CLS] positions, attention mask and
        # sentence mask. Copy each one to the selected device.
        src, labels, segs, clss, attn, mask_cls = (t.to(device) for t in batch)

        # Always clear any previously calculated gradients before performing a
        # backward pass; PyTorch accumulates gradients otherwise.
        model.zero_grad()

        probs, mask_cls, x, h = model( src, segs, clss, attn, mask_cls)
        print("TRAIN OUTPUTS:", probs.shape, probs[:,:20])
        print("TRAIN LABELS:",labels.shape,labels[:,:20])
        loss = loss_c(probs, labels.float())
        print("TRAIN LOSS",loss[:,:20])
        # Keep only the loss of real sentences, then sum to a scalar.
        loss = (loss * mask_cls.float()).sum()
        print("TRAIN LOSS TOTAL",loss)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        # NOTE: `loss` is already a scalar here, so loss.numel() is 1 and the
        # division leaves the value unchanged.
        (loss/loss.numel()).backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches (guarding against an
    # empty loader).
    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")
    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_loss = 0

    # Evaluate on the held-out validation batches. This loop previously iterated
    # train_dataloader, which reported training-set numbers as "validation".
    for step, batch in enumerate(validation_dataloader):
        # Same six-tensor batch layout as in training.
        src, labels, segs, clss, attn, mask_cls = (t.to(device) for t in batch)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            probs, mask_cls, x, h = model( src, segs, clss, attn, mask_cls)

            print("VALIDATION OUTPUTS:", probs.shape, probs[:,:20])
            print("VALIDATION LABELS:",labels.shape,labels[:,:20])

            # Same loss computation as in training: full batch, masked by the
            # sentence mask (not by the token-level attention mask).
            loss = loss_c(probs, labels.float())
            print("VALIDATION LOSS",loss[:,:20])
            loss = (loss * mask_cls.float()).sum()
            print("VALIDATION LOSS TOTAL",loss)
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / max(len(validation_dataloader), 1)
    validation_time = format_time(time.time() - t0)
    print("")
    print("  Average validation loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record this epoch's summary for later inspection.
    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Training Time': training_time,
        'Validation Time': validation_time,
    })

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))



Training...
X (CLS) totals =  [ -3.8937967  -7.733228   -3.8720956  -6.1603384  -3.7517133  -3.1392064
 -12.254707    0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.       ]
Logit =  tensor([-0.1639,  0.7479, -0.1173,  0.0648, -0.3375, -0.2434, -0.1251, -0.0013,
        -0.0013, -0.0013], device='cuda:0', grad_fn=<SliceBackward>)
TRAIN OUTPUTS: torch.Size([2, 512]) tensor([[0.4591, 0.6787, 0.4707, 0.5162, 0.4164, 0.4394, 0.4688, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.4428, 0.4988, 0.5365, 0.4800, 0.4196, 0.4312, 0.4643, 0.4871, 0.5507,
         0.4861, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000]], device='cuda:0', grad_fn=<SliceBackward>)
TRAIN LABELS: torch.Size([2, 512]) tensor([[0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0,