In [None]:
#!pip install transformers

In [None]:
#!pip install pyrouge

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
import pandas as pd
import numpy as np
import os
import sys
import tensorflow as tf
import torch
import datetime, time
import io
import re
from csv import reader
import matplotlib.pyplot as plt
import traceback
import random
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

import glob, os
import torch.nn as nn

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import BertModel, AdamW, BertConfig,BertTokenizer
from train import Summarizer

import tensorflow as tf
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
from pyrouge import Rouge155


In [68]:
BASE_DIR = "/home/rajivn/W266/W266-fall-2020-hwu-rnair/"
INPUT_DIR = BASE_DIR+"data/ICSI_plus_NXT/processing/"
DATA_DIR = BASE_DIR+"data/ICSI_plus_NXT/tensors/"
RESULT_DIR = BASE_DIR+"/data/ICSI_plus_NXT/result/"

if not os.path.exists(RESULT_DIR):
    os.makedirs(RESULT_DIR)

files = glob.glob(RESULT_DIR+'*')
for f in files:
    os.remove(f)


## Create model inputs

In [29]:
class BertDataProcessor:
    def __init__(self, data_dir, out_dir):
        self.data_dir = data_dir
        self.out_dir = out_dir
        self.inp_df = None
        self.meetings = set()

        for inp_file in sorted(glob.glob(self.data_dir+"/*txt")):
            f_name = os.path.basename(inp_file)
            self.meetings.add(f_name.split(".")[0])
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

    def split(self):
        '''
        train, validation and test split by meeting
        '''
        meetings = list(self.meetings)
#        self.train_list = meetings[:50]
#        self.validation_list = meetings[50:58]
#        self.test_list = meetings[58:]

        self.train_list = meetings[:10]
        self.validation_list = meetings[10:15]
        self.test_list = meetings[15:20]



    def format_to_bert(self, args=None):

        cls_vid = tokenizer.vocab["[CLS]"]
        sep_vid = tokenizer.vocab["[SEP]"]

        for batch, f_names in (("train", self.train_list), ("validation",self.test_list),
                 ("test", self.validation_list)):
            output = []
            for f_name in f_names:
                # process a meeting at a time
                pth = self.data_dir+"/"+f_name+"."+"*txt"
                cur_chunk = None
                cur_labels = []
                for inp_file in sorted(glob.glob(pth)):
                    with open(inp_file, "r") as mtg_f:
                        for line in mtg_f:
                            sent, label = line.split("|")
                            s_chunk = tokenizer.tokenize(sent) [:510]
                            s_chunk = ["[CLS]"] + s_chunk + ["[SEP]"]
                            if cur_chunk is None:
                                cur_chunk = s_chunk
                                cur_labels.append(int(label))

                            else:
                                # if new line fits in to remaining space, add it, else fill with spaces and add a new line
                                if len(s_chunk) + len(cur_chunk) < 512:
                                    cur_chunk += s_chunk
                                    cur_labels.append(int(label))
                                else:
                                    input_ids = tokenizer.convert_tokens_to_ids(cur_chunk)
                                    attn_masks = [1]*len(input_ids)
                                    cls_ids = [i for i, t in enumerate(input_ids) if t == cls_vid ]
                                    mask_cls = [1 for _ in range(len(cls_ids))]

                                    [attn_masks.append(0) for _ in range(len(attn_masks), 512)]
                                    [input_ids.append(0) for _ in range(len(input_ids), 512)]
                                    [cls_ids.append(0) for _ in range(len(cls_ids), 512)]
                                    [mask_cls.append(0) for _ in range(len(mask_cls), 512)]

                                    _segs = [-1] + [i for i, t in enumerate(input_ids) if t == sep_vid]
                                    segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
                                    segments_ids = []
                                    for i, s in enumerate(segs):
                                        if (i % 2 == 0):
                                            segments_ids += s * [0]
                                        else:
                                            segments_ids += s * [1]
                                    [cur_labels.append(0) for _ in range(len(cur_labels), 512)]
                                    [segments_ids.append(0) for _ in range(len(segments_ids), 512)]
                                    b_data_dict = {"src": input_ids, "labels": cur_labels, "segs": segments_ids, 
                                                'clss': cls_ids, "attn": attn_masks, "mask_cls":mask_cls}
                                    output.append(b_data_dict)
                                    cur_chunk = s_chunk
                                    cur_labels = [int(label)]
            out =  {"src": [], "labels": [], "segs": [], 
                                                'clss': [], "attn": [], "mask_cls":[]}
            for sample in output:
                for key, val in sample.items():
                    out[key].append(val)
            for k, v in out.items():
                out[k] = torch.LongTensor(v)
            for k, v in out.items():
                torch.save(v, self.out_dir+"/"+k+"_"+batch+".pt")

dp = BertDataProcessor(INPUT_DIR, 
                    DATA_DIR)
dp.split()
dp.format_to_bert()

## Encoder classes

In [7]:
class Bert(nn.Module):
    def __init__(self, temp_dir="/tmp/bert", load_pretrained_bert=True, bert_config=None):
        super(Bert, self).__init__()
        print(temp_dir)
        if(load_pretrained_bert):
            self.model = BertModel.from_pretrained('bert-base-uncased')
        else:
            self.model = BertModel(bert_config)

    def forward(self, x, segs, mask):
        # the below returns a tuple. First element in the tuple is last hidden state. Second element in tuple is pooler output
        result = self.model(x, attention_mask =mask, position_ids=segs)
#        top_vec = encoded_layers[-1]

        top_vec = result[0]
        return top_vec


class Classifier(nn.Module):
    def __init__(self, hidden_size):
        super(Classifier, self).__init__()
        self.linear1 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, mask_cls):
        h = self.linear1(x)
        h = h.squeeze(-1)
        sent_scores = self.sigmoid(h) * mask_cls.float()
        return sent_scores


class Summarizer(nn.Module):
    def __init__(self, args=None, num_hidden = 768, load_pretrained_bert = True, bert_config = None):
        super(Summarizer, self).__init__()
        self.args = args
        self.bert = Bert( load_pretrained_bert, bert_config=None)
#        if (args.encoder == 'classifier'):
        self.encoder = Classifier(num_hidden)

    def load_cp(self, pt):
        self.load_state_dict(pt['model'], strict=True)

    def forward(self, x, segs, clss, mask, mask_cls, sentence_range=None):
        top_vec = self.bert(x, segs, mask)
        sents_vec = top_vec[torch.arange(top_vec.size(0)).unsqueeze(1), clss]
        sents_vec = sents_vec * mask_cls[:, :, None].float()
        sent_scores = self.encoder(sents_vec, mask_cls).squeeze(-1)
        return sent_scores, mask_cls


## Training

In [8]:
# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    print('GPU device not found')


GPU device not found


In [9]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [10]:
train_d = dict()
val_d = dict()
test_d=dict()

for d_set in ("src", "labels", "segs", 'clss', "attn", "mask_cls"):
    train_d[d_set] = torch.load(DAT_DIR + d_set+"_"+"train.pt")
    val_d[d_set] = torch.load(DAT_DIR + d_set+"_"+"validation.pt")
    test_d[d_set] = torch.load(DAT_DIR + d_set+"_"+"test.pt")

train_dataset = TensorDataset(train_d["src"],train_d["labels"], train_d["segs"], 
                              train_d["clss"], train_d["attn"], train_d["mask_cls"])
val_dataset = TensorDataset(val_d["src"], val_d["labels"], val_d["segs"], 
                              val_d["clss"], val_d["attn"], val_d["mask_cls"])
test_dataset = TensorDataset(test_d["src"], test_d["labels"], test_d["segs"], 
                              test_d["clss"], test_d["attn"], test_d["mask_cls"])


In [11]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [12]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 4

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )


In [13]:
# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

model = Summarizer()
if torch.cuda.is_available(): 
    # tell pytorch to run on GPU
    model.cuda()

True


In [14]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))


for param in model.bert.parameters():
    param.requires_grad=False

# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.model.embeddings.word_embeddings.weight            (30522, 768)
bert.model.embeddings.position_embeddings.weight          (512, 768)
bert.model.embeddings.token_type_embeddings.weight          (2, 768)
bert.model.embeddings.LayerNorm.weight                        (768,)
bert.model.embeddings.LayerNorm.bias                          (768,)

==== First Transformer ====

bert.model.encoder.layer.0.attention.self.query.weight    (768, 768)
bert.model.encoder.layer.0.attention.self.query.bias          (768,)
bert.model.encoder.layer.0.attention.self.key.weight      (768, 768)
bert.model.encoder.layer.0.attention.self.key.bias            (768,)
bert.model.encoder.layer.0.attention.self.value.weight    (768, 768)
bert.model.encoder.layer.0.attention.self.value.bias          (768,)
bert.model.encoder.layer.0.attention.output.dense.weight   (768, 768)
bert.model.encoder.layer.0.attention.output.dense.bias        

In [15]:
## set all layers to train
for param in model.bert.parameters():
    param.requires_grad=True

In [16]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


In [17]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)


This function writes out the labeled data as well as the result for evaluation using rouge

In [73]:
def gen_outputs(batch_id, probs, labels, cls_ids, mask_cls,src):
    # extract sentences and labels
    print (probs)
    probs = np.where(probs>0.5, 1, 0)

    reference = []
    result = []
    for p, passage in enumerate(src):
#        print(probs[p,])
        lines = tokenizer.decode(passage)
        lines = lines.split("[SEP]")
        for i, sent in enumerate(lines):
            sent = sent.replace("[SEP]", "").replace("[CLS]", "").replace("[PAD]", "")
            with open(RESULT_DIR+"REF"+batch_id, "a") as ref, open(RESULT_DIR+"LAB"+batch_id, "a") as lab:
                if labels[p, i] == 1:
                    reference.append(sent)
                    lab.write(sent)
                if probs[p, i] == 1:
                    print("Found pred ", sent)
                    result.append(sent)
                    ref.write(sent)


In [19]:
## setup loss function and other variables required for training

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()
loss_c = torch.nn.BCELoss(reduction='none')


## Training

In [75]:
# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        
        src, labels, segs, clss, attn, mask_cls = batch
        src, labels, segs, clss, attn, mask_cls = src.to(device), labels.to(device), segs.to(device), clss.to(device), attn.to(device), mask_cls.to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

#        x, segs, clss, mask, mask_cls, sentence_range=None
        probs, mask_cls = model( src, segs, clss, attn, mask_cls)
        loss = loss_c(probs, labels.float())
        loss = (loss * attn.float()).sum()

        probs = probs.detach().cpu().numpy()
        labels = labels.to('cpu').numpy()


#        accuracy = gen_outputs("BATCH"+str(step), probs, labels, clss, mask_cls, src)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))



Training...


KeyboardInterrupt: 

In [21]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = BASE_DIR+'data/model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)


# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
torch.save(model.state_dict(), output_dir+"bertsum_classifier")

Saving model to /home/rajivn/W266/W266-fall-2020-hwu-rnair/data/model_save/


## Validation

In [74]:
# ========================================
#               Validation
# ========================================
# After the completion of each training epoch, measure our performance on
# our validation set.

print("")
print("Running Validation...")

t0 = time.time()

# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()

# Tracking variables 
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0

# Evaluate data for one epoch
step = -1
for batch in validation_dataloader:
    step += 1
    # Unpack this training batch from our dataloader. 
    #
    # As we unpack the batch, we'll also copy each tensor to the GPU using 
    # the `to` method.
    #
    # `batch` contains three pytorch tensors:
    #   [0]: input ids 
    #   [1]: attention masks
    #   [2]: labels 
    src, labels, segs, clss, attn, mask_cls = batch
    src, labels, segs, clss, attn, mask_cls = src.to(device), labels.to(device), segs.to(device), clss.to(device), attn.to(device), mask_cls.to(device)

    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():        
        probs, mask_cls = model( src, segs, clss, attn, mask_cls)
        loss = loss_c(probs, labels.float())
        loss = (loss * attn.float()).sum()


    # Accumulate the validation loss.
    total_eval_loss += loss.item()

    # Move logits and labels to CPU
    #print(type(logits), type(b_labels))

    # write results so that we can use rouge to compare
    gen_outputs("BATCH"+str(step), probs.numpy(), labels.numpy(), clss.numpy(), mask_cls.numpy(), src.numpy())


print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Running Validation...
[[0.02253773 0.02251139 0.02253773 ... 0.         0.         0.        ]
 [0.03220176 0.03225196 0.03220176 ... 0.         0.         0.        ]
 [0.03087844 0.03092079 0.03087844 ... 0.         0.         0.        ]
 [0.0286278  0.02863207 0.0286278  ... 0.         0.         0.        ]]
[[0.02105888 0.02106893 0.02105888 ... 0.         0.         0.        ]
 [0.0187543  0.01877958 0.0187543  ... 0.         0.         0.        ]
 [0.02080334 0.02079263 0.02080334 ... 0.         0.         0.        ]
 [0.05928656 0.05940514 0.05928656 ... 0.         0.         0.        ]]
[[0.04365123 0.04368804 0.04365123 ... 0.         0.         0.        ]
 [0.04256718 0.04263455 0.04256718 ... 0.         0.         0.        ]
 [0.02698812 0.02700876 0.02698812 ... 0.         0.         0.        ]
 [0.03332015 0.033367   0.03332015 ... 0.         0.         0.        ]]
[[0.03844148 0.03849587 0.03844148 ... 0.         0.         0.        ]
 [0.09006088 0.09032573 0

KeyboardInterrupt: 

In [46]:
len([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1])

78