In [1]:
# credit: training cells are restructured version of a colab document that fine-tunes bert on text classification.
# adopted from Chris McCormick  https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX?usp=sharing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# weights and biases
!pip install wandb -q
!wandb login

[K     |████████████████████████████████| 1.8MB 13.9MB/s 
[K     |████████████████████████████████| 102kB 13.4MB/s 
[K     |████████████████████████████████| 133kB 64.2MB/s 
[K     |████████████████████████████████| 163kB 43.1MB/s 
[K     |████████████████████████████████| 102kB 11.0MB/s 
[K     |████████████████████████████████| 71kB 8.2MB/s 
[?25h  Building wheel for watchdog (setup.py) ... [?25l[?25hdone
  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
!cd /content/drive/MyDrive/cs-7643-dl/project/data
DATA_DIR = '/content/drive/MyDrive/cs-7643-dl/project/sample'
MODEL_DIR = '/content/drive/MyDrive/cs-7643-dl/project/model'
RESULT_DIR = '/content/drive/MyDrive/cs-7643-dl/project/result'
ADAPTER_DATA_DIR = '/content/drive/MyDrive/cs-7643-dl/project/adapter/data'
ADAPTER_MODEL_DIR = '/content/drive/MyDrive/cs-7643-dl/project/adapter/model'
ADAPTER_RESULT_DIR = '/content/drive/MyDrive/cs-7643-dl/project/adapter/result'

In [4]:
# install adatpter transformers
!pip install -U git+https://github.com/Adapter-Hub/adapter-transformers.git
# data processing
!pip install datasets
# metrics dependencies
!pip install rouge_score

Collecting git+https://github.com/Adapter-Hub/adapter-transformers.git
  Cloning https://github.com/Adapter-Hub/adapter-transformers.git to /tmp/pip-req-build-wh2njpm6
  Running command git clone -q https://github.com/Adapter-Hub/adapter-transformers.git /tmp/pip-req-build-wh2njpm6
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 13.3MB/s 
[?25hCollecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 31.6MB/s 
Collecting sentencepiece==0.1.91
[

In [5]:
import time
import datetime
import os
import random

import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, dataset
import torch.nn as nn

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertModel
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import AdapterType

from datasets import Dataset, load_metric

import wandb

In [6]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [7]:
def load_data(dir, data_mode = "train", format=".pt", split=False):
  dataset = []
  for filename in os.listdir(dir):
    if data_mode in filename:
      dataset.extend(torch.load(os.path.join(dir, filename)))
  return dataset
      

In [8]:
def process_data(dataset, max_len=512):
  src = []
  src_sent_labels = []
  segs = []
  att_msk = []
  cls_ind = []
  max_labels = max(len(x['src_sent_labels']) for x in dataset)
  max_cls_ind = max(len(x['clss']) for x in dataset)
  for i in range(len(dataset)):
    dataset[i]['src'] += [0] * (max_len - len(dataset[i]['src']))
    dataset[i]['segs'] += [0] * (max_len - len(dataset[i]['segs']))
    if len(dataset[i]['src']) > max_len:
      dataset[i]['src'] = dataset[i]['src'][:max_len]
    if len(dataset[i]['segs']) > max_len:
      dataset[i]['segs'] = dataset[i]['segs'][:max_len]
    if max(dataset[i]['clss']) >= max_len: # truncate cls positions after 512th token
      dataset[i]['clss'] = [x for x in dataset[i]['clss'] if x < max_len]
      dataset[i]['src_sent_labels'] = dataset[i]['src_sent_labels'][:len(dataset[i]['clss'])] 
    dataset[i]['att_msk'] = [0 if x == 0 else 1 for x in dataset[i]['src']]
    dataset[i]['clss'] += [-1] * (max_cls_ind - len(dataset[i]['clss']))
    dataset[i]['src_sent_labels'] += [-1] * (max_labels - len(dataset[i]['src_sent_labels']))

  for ex in dataset:
    src.append(ex['src'])
    src_sent_labels.append(ex['src_sent_labels'])
    segs.append(ex['segs'])
    att_msk.append(ex['att_msk'])
    cls_ind.append(ex['clss'])
  src = torch.tensor(src)
  src_sent_labels = torch.tensor(src_sent_labels)
  segs = torch.tensor(segs)
  att_msk = torch.FloatTensor(att_msk)
  cls_ind = torch.tensor(cls_ind)
  dataset = TensorDataset(src, src_sent_labels, segs, att_msk, cls_ind)
  return dataset

In [9]:
def create_data_loader(dataset, batch_size, sampler_type="random"):
  if sampler_type == "random":
    dataloader = DataLoader(
                dataset,
                sampler = RandomSampler(dataset),
                batch_size = batch_size
            )
  else:
    dataloader = DataLoader(
                dataset,
                sampler = SequentialSampler(dataset),
                batch_size = batch_size
            )
  return dataloader

In [10]:
class Classifier(nn.Module):
  """
  idea of adding a simple classifier on top of bert for 
  extractive summarization came from PreSumm. The following snippet from
   https://github.com/nlpyang/PreSumm/blob/master/src/models/encoder.py
  """
  def __init__(self, input_size):
    super(Classifier, self).__init__()
    self.l1 = nn.Linear(input_size, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    out = None
    out = self.l1(x).squeeze(-1)
    out = self.sigmoid(out)
    return out

In [11]:
class ExtractiveSummarizer(nn.Module):
  def __init__(self, language_model, classifier, device, approach="fine-tuning"):
    super(ExtractiveSummarizer, self).__init__()
    self.lm = language_model
    self.cl = classifier
    self.device = device
    self.approach = approach
  def forward(self, batch_src, batch_segs, batch_att_msk):
    lm_out = self.lm(input_ids=batch_src, 
            token_type_ids=batch_segs,
            attention_mask=batch_att_msk)
    last_hidden_layer = lm_out[0]
    cls_indices = batch_src==101
    cls_indices = cls_indices.unsqueeze(-1).expand(-1, -1, 768)
    cls_embs = last_hidden_layer[cls_indices].reshape(-1, 768)
    out = self.cl(cls_embs)
    return out

In [12]:
def criterion(pred, target):
  loss = nn.BCELoss()
  return loss(pred, target)

In [13]:
def train(model, train_dataloader, device, criterion, optimizer, scheduler):
  """main loop is the modified version of https://mccormickml.com/2019/07/22/BERT-fine-tuning/
    which in turn was adopted from the `run_glue.py` script here:
    https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128"""
  model.train()
  total_train_loss = 0
  t0 = time.time()
  for step, batch in enumerate(train_dataloader):
    if step % 20 == 0:
      elapsed = format_time(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
    batch_src = batch[0].to(device)
    batch_src_sent_labels = batch[1].to(device)
    batch_segs = batch[2].to(device)
    batch_att_msk = batch[3].to(device)
    model.zero_grad()  
    out = model(batch_src, batch_segs, batch_att_msk)
    batch_src_sent_labels_flat = batch_src_sent_labels[batch_src_sent_labels!=-1]
    loss = criterion(out, batch_src_sent_labels_flat.float())
    total_train_loss += loss.item()
    wandb.log({"Training Loss": loss.item()})
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()
  return total_train_loss

In [14]:
def validate(model, validation_dataloader, device, criterion):
  """main loop from per https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=6J-FYdx6nFE_"""
  model.eval()
  total_val_loss = 0
  t0 = time.time()
  pred_ids_tensor = []
  for idx, batch in enumerate(validation_dataloader):
    batch_src = batch[0].to(device)
    batch_src_sent_labels = batch[1].to(device)
    batch_segs = batch[2].to(device)
    batch_att_msk = batch[3].to(device)
    with torch.no_grad():
      out = model(batch_src, batch_segs, batch_att_msk)
    batch_src_sent_labels_flat = batch_src_sent_labels[batch_src_sent_labels!=-1]
    loss = criterion(out, batch_src_sent_labels_flat.float())
    total_val_loss += loss.item()
    pred_ids = batch_src_sent_labels.detach().clone().float()
    pred_ids[pred_ids!=-1]=out
    pred_ids_tensor.append(pred_ids)
  pred_ids_tensor = torch.cat(pred_ids_tensor, dim=0)
  return total_val_loss, pred_ids_tensor

In [15]:
def convert_ids_to_summaries(dataset_raw, pred_ids, SUM_LEN):
  summaries = {'predictions': [], 'references': []}
  for i in range(len(dataset_raw)):
    # top_sent_inds = torch.topk(pred_ids[i], SUM_LEN).indices[torch.topk(pred_ids[i], SUM_LEN).values != -1]
    num_sents = len(dataset_raw[i]['src_txt'])
    if num_sents < SUM_LEN:
      print('the number of sentences in the following story is less \
      than SUM_LEN: ', SUM_LEN, '... getting all the sentences')
      print('story number ', i, '\n', dataset_raw[i]['src_txt'])
    top_sent_inds = torch.topk(pred_ids[i], min(SUM_LEN, num_sents)).indices[torch.topk(pred_ids[i], min(SUM_LEN, num_sents)).values != -1]
    top_sent_inds, ind = torch.sort(top_sent_inds)
    hypothesis = ' '.join([dataset_raw[i]['src_txt'][sent_id] for sent_id in top_sent_inds])
    summaries['predictions'].append(hypothesis)
    summaries['references'].append(dataset_raw[i]['tgt_txt'])
  return summaries

In [16]:
def calculate_rouge(summaries):
  scores = {}
  rouge = load_metric('rouge')
  results = rouge.compute(predictions=summaries['predictions'], references=summaries['references'])
  scores['rouge1-fmeasure-mid'] = results['rouge1'].mid.fmeasure
  scores['rouge2-fmeasure-mid'] = results['rouge2'].mid.fmeasure
  scores['rougeL-fmeasure-mid'] = results['rougeL'].mid.fmeasure
  scores['rougeLsum-fmeasure-mid'] = results['rougeLsum'].mid.fmeasure
  return scores


In [None]:
# training/validation

BATCH_SIZE = 16
EPOCHS = 10
MAX_LEN = 512


TRAINING_DIR = DATA_DIR
VALIDATION_DIR = DATA_DIR

GEN_SUM = False # whether to generate a summary
SUM_LEN = 5 # number of sentences in generated summary

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

train_dataset = load_data(TRAINING_DIR, "train", ".pt")
train_dataset = process_data(train_dataset[:500], MAX_LEN)
train_dataloader = create_data_loader(train_dataset, BATCH_SIZE, "random")

validation_dataset_raw = load_data(VALIDATION_DIR, "valid", ".pt")
validation_dataset = process_data(validation_dataset_raw[:100], MAX_LEN)
# must choose "sequential" to keep the order in line with validation_dataset_raw
validation_dataloader = create_data_loader(validation_dataset, BATCH_SIZE, "sequential")

bert = BertModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states = True
    )
bert = bert.to(device)
classifier = Classifier(768).to(device)
extractive_summarizer = ExtractiveSummarizer(bert, classifier, device, "fine-tuning")

optimizer = AdamW(extractive_summarizer.parameters(),
                  lr = 2e-5)

total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

wandb.init(project="finetuning-bert-for-summarization")
config = wandb.config
wandb.watch(extractive_summarizer, log="all")
stats = []
for epoch in range(EPOCHS):
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
  print('Training\n')
  t0 = time.time()
  total_train_loss = train(extractive_summarizer, train_dataloader, device, criterion, optimizer, scheduler)
  avg_train_loss = total_train_loss / len(train_dataloader) 
  training_time = format_time(time.time() - t0)
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(training_time))
  print("")
  print("Validation\n")
  t0 = time.time()
  total_val_loss, pred_ids = validate(extractive_summarizer, validation_dataloader, device, criterion)
  if GEN_SUM:
    convert_ids_to_summaries(validation_dataset_raw, pred_ids, SUM_LEN)
  avg_val_loss = total_val_loss / len(validation_dataloader) 
  validation_time = format_time(time.time() - t0)
  print("  Validation Loss: {0:.2f}".format(avg_val_loss))
  print("  Validation took: {:}".format(validation_time))
  stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )




In [45]:
# save stats
pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=stats)
df_stats = df_stats.set_index('epoch')
df_stats.to_csv(os.path.join(RESULT_DIR, 'fin-tuning-stats' + format(time.time())))

In [None]:
# https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_models_for_inference.html
# save state_dict
# PATH = os.path.join(MODEL_DIR, "state_dict_model.pt")
# torch.save(model_to_save.state_dict(), PATH)
# extractive_summarizer = ExtractiveSummarizer(bert, classifier, device, "fine-tuning")
# extractive_summarizer.load_state_dict(torch.load(PATH))
# extractive_summarizer.eval()

# save entire model
# Specify a path
PATH = os.path.join(MODEL_DIR, "entire_model.pt")

# Save
torch.save(extractive_summarizer, PATH)

# Load
extractive_summarizer = torch.load(PATH)
extractive_summarizer.eval()

In [None]:
# testing
BATCH_SIZE = 16
MAX_LEN = 512


TESTING_DIR = DATA_DIR

GEN_SUM = True # whether to generate a summary
SUM_LEN = 5 # number of sentences in generated summary

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

test_dataset_raw = load_data(TESTING_DIR, "test", ".pt")
test_dataset = process_data(test_dataset_raw[:100], MAX_LEN)
test_dataloader = create_data_loader(test_dataset, BATCH_SIZE, "sequential")

print("start inference")
total_test_loss, pred_ids = validate(extractive_summarizer, test_dataloader, device, criterion)
if GEN_SUM:
  print("generate summaries")
  summaries = convert_ids_to_summaries(test_dataset_raw[:100], pred_ids, SUM_LEN)
  print("calculate rouge")
  rouge_scores = calculate_rouge(summaries)
  rouge_scores = pd.DataFrame(rouge_scores.items())
  rouge_scores.to_csv(os.path.join(RESULT_DIR, 'fine-tuning-rouge-scores' + format(time.time())))

start inference
generate summaries
calculate rouge


In [17]:
def encode_batch(batch, MAX_LEN=512):
  """adapted from transformer-adapter's github https://github.com/Adapter-Hub/adapter-transformers/blob/master/notebooks/01_Adapter_Training.ipynb"""
  return tokenizer(
      batch["sentences"],
      max_length=MAX_LEN,
      truncation=True,
      padding="max_length"
  )

In [18]:
def preprocess_adapter_data(tokenizer, dataset_raw, encode_batch, device):
  sentences = []
  labels = []
  for i in range(len(dataset_raw)):
    if len(dataset_raw[i]['src_txt']) == len(dataset_raw[i]['src_sent_labels']): #exclude inconsistent data
      sentences.extend(dataset_raw[i]['src_txt'])
      labels.extend(dataset_raw[i]['src_sent_labels'])
  dataset = {'sentences': sentences, 'labels': labels}
  dataset = Dataset.from_dict(dataset)
  # Encode the input data
  dataset = dataset.map(encode_batch, batched=True)
  dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"], device=device)
  return dataset

In [19]:
def train_adapter(model, train_dataloader, device, criterion, optimizer, scheduler):
  """main loop is the modified version of https://mccormickml.com/2019/07/22/BERT-fine-tuning/
    which in turn was adopted from the `run_glue.py` script here:
    https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128"""
  model.train()
  total_train_loss = 0
  t0 = time.time()
  for step, batch in enumerate(train_dataloader):
    if step % 20 == 0:
      elapsed = format_time(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
    batch_src = batch['input_ids'].to(device)
    batch_labels = batch['labels'].to(device)
    batch_att_msk = batch['attention_mask'].to(device)
    model.zero_grad()  
    # out = model(batch_src, batch_att_msk)
    loss, logits = model(batch_src, 
                             token_type_ids=None, 
                             attention_mask=batch_att_msk, 
                             labels=batch_labels)
    # batch_src_sent_labels_flat = batch_src_sent_labels[batch_src_sent_labels!=-1]
    # loss = criterion(out, batch_src_sent_labels_flat.float())
    total_train_loss += loss.item()
    wandb.log({"Training Loss": loss.item()})
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()
  return total_train_loss

In [20]:
def validate_adapter(model, validation_dataloader, device):
  """main loop from per https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=6J-FYdx6nFE_"""
  model.eval()
  total_val_loss = 0
  t0 = time.time()
  preds_list = []
  for idx, batch in enumerate(validation_dataloader):
    if idx % 20 == 0:
      elapsed = format_time(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(idx, len(validation_dataloader), elapsed))
    batch_src = batch['input_ids'].to(device)
    batch_labels = batch['labels'].to(device)
    batch_att_msk = batch['attention_mask'].to(device)
    with torch.no_grad():
      loss, logits = model(batch_src, 
                            token_type_ids=None, 
                            attention_mask=batch_att_msk, 
                            labels=batch_labels)
    preds_batch = torch.argmax(logits, axis=1)
    preds_list.append(preds_batch)
    # batch_src_sent_labels_flat = batch_src_sent_labels[batch_src_sent_labels!=-1]
    # loss = criterion(out, batch_src_sent_labels_flat.float())
    total_val_loss += loss.item()
    #   pred_ids = batch_src_sent_labels.detach().clone().float()
    #   pred_ids[pred_ids!=-1]=out
    #   pred_ids_tensor.append(pred_ids)
  preds_tensor = torch.cat(preds_list, dim=0)
  return total_val_loss, preds_tensor, preds_list

In [None]:
# training/validation

BATCH_SIZE = 16
EPOCHS = 10
MAX_LEN = 512


TRAINING_DIR = ADAPTER_DATA_DIR
VALIDATION_DIR = ADAPTER_DATA_DIR

GEN_SUM = False # whether to generate a summary
SUM_LEN = 5 # number of sentences in generated summary

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

train_dataset_raw = load_data(TRAINING_DIR, "train", ".pt")
train_dataset = preprocess_adapter_data(tokenizer, train_dataset_raw[:500], encode_batch, device)
train_dataloader = create_data_loader(train_dataset, BATCH_SIZE, "sequential")

validation_dataset_raw = load_data(VALIDATION_DIR, "valid", ".pt")
validation_dataset = preprocess_adapter_data(tokenizer, validation_dataset_raw[:100], encode_batch, device)
validation_dataloader = create_data_loader(validation_dataset, BATCH_SIZE, "sequential")

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
model.add_adapter("cnn-dm-summarization", AdapterType.text_task)
model.train_adapter(["cnn-dm-summarization"])
model.set_active_adapters([["cnn-dm-summarization"]])
model.to(device)

optimizer = AdamW(model.parameters(),
                  lr = 1e-4)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

wandb.init(project="adapter-training-for-summarization")
config = wandb.config
wandb.watch(model, log="all")
stats = []

for epoch in range(EPOCHS):
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
  print('Training\n')
  t0 = time.time()
  total_train_loss = train_adapter(model, train_dataloader, device, criterion, optimizer, scheduler)
  avg_train_loss = total_train_loss / len(train_dataloader) 
  training_time = format_time(time.time() - t0)
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(training_time))
  print("")
  print("Validation\n")
  t0 = time.time()
  total_val_loss, preds_tensor, preds_list = validate_adapter(model, validation_dataloader, device)
  avg_val_loss = total_val_loss / len(validation_dataloader) 
  validation_time = format_time(time.time() - t0)
  print("  Validation Loss: {0:.2f}".format(avg_val_loss))
  print("  Validation took: {:}".format(validation_time))
  stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )



In [23]:
# save stats
pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=stats)
df_stats = df_stats.set_index('epoch')
df_stats.to_csv(os.path.join(ADAPTER_RESULT_DIR, 'adapter-training-stats' + format(time.time())))

In [None]:
model.save_adapter(ADAPTER_MODEL_DIR, "cnn-dm-summarization")


In [21]:
def create_summaries_from_preds(dataset_raw, preds, SUM_LEN):
  summaries = {'predictions': [], 'references': []}
  last_ind = 0
  for i in range(len(dataset_raw)):
    num_sents = len(dataset_raw[i]['src_txt'])
    num_labels = len(dataset_raw[i]['src_sent_labels'])
    if num_sents == num_labels: #exclude inconsistent data
      next_ind = last_ind + num_sents
      # print(i, num_sents, SUM_LEN, last_ind, next_ind)
      top_sent_inds = torch.topk(preds_tensor[last_ind:next_ind], min(SUM_LEN, num_sents)).indices[torch.topk(preds_tensor[last_ind:next_ind], min(SUM_LEN, num_sents)).values != -1]
      top_sent_inds, ind = torch.sort(top_sent_inds)
      hypothesis = ' '.join([dataset_raw[i]['src_txt'][sent_id] for sent_id 
                            in top_sent_inds])
      summaries['predictions'].append(hypothesis)
      summaries['references'].append(dataset_raw[i]['tgt_txt'])
      last_ind = next_ind
  return summaries

In [None]:
# testing
BATCH_SIZE = 16
MAX_LEN = 512


TESTING_DIR = ADAPTER_DATA_DIR

GEN_SUM = True # whether to generate a summary
SUM_LEN = 5 # number of sentences in generated summary

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

test_dataset_raw = load_data(TESTING_DIR, "test", ".pt")
test_dataset = preprocess_adapter_data(tokenizer, test_dataset_raw[:100], encode_batch, device)
test_dataloader = create_data_loader(test_dataset, BATCH_SIZE, "sequential")

print("start inference")
total_test_loss, preds_tensor, preds_list = validate_adapter(model, test_dataloader, device)
if GEN_SUM:
  print("generate summaries")
  summaries = create_summaries_from_preds(test_dataset_raw[:100], preds_tensor, SUM_LEN)
  print("calculate rouge")
  rouge_scores = calculate_rouge(summaries)
  rouge_scores = pd.DataFrame(rouge_scores.items())
  rouge_scores.to_csv(os.path.join(ADAPTER_RESULT_DIR, 'adapter-training-rouge-scores' + format(time.time())))

In [None]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

896066

In [None]:
sum(p.numel() for p in extractive_summarizer.parameters() if p.requires_grad)

109483009

In [88]:
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
def plot_learning_curve(csv_path, output_path, approach):
  df_stats = pd.read_csv(csv_path)
  plt.style.use('seaborn')
  sns.set(font_scale=1.75)
  plt.rcParams["figure.figsize"] = (12,6)
  plt.plot(df_stats['epoch'], df_stats['Training Loss'], 'b-o', label="Training")
  plt.plot(df_stats['epoch'], df_stats['Valid. Loss'], 'g-o', label="Validation")
  plt.title(approach+ " - Training & Validation Loss")
  plt.xlabel("Epoch")
  plt.ylabel("Loss")
  plt.legend()
  plt.xticks(df_stats['epoch'])
  plt.savefig(output_path)
  plt.close()
  return

In [None]:
csv_path = '/content/drive/MyDrive/cs-7643-dl/project/result/fin-tuning-stats1607385330.2799566'
approach = 'fine-tuning'
output_path = os.path.join(RESULT_DIR, 'fine-tuning-learning-curve.png')
plot_learning_curve(csv_path, output_path, approach)

In [None]:
csv_path = '/content/drive/MyDrive/cs-7643-dl/project/adapter/result/adapter-training-stats1607382571.3011503'
approach = 'adapter-training'
output_path = os.path.join(ADAPTER_RESULT_DIR, 'adapter-based-learning-curve.png')
plot_learning_curve(csv_path, output_path, approach)

## Experiments


1.   Tackling Overfitting
2.   The role of summary length


In [22]:
# Experiment 1.
BATCH_SIZE = 16
EPOCHS = 5
MAX_LEN = 512


TRAINING_DIR = ADAPTER_DATA_DIR
VALIDATION_DIR = ADAPTER_DATA_DIR

GEN_SUM = False # whether to generate a summary
SUM_LEN = 5 # number of sentences in generated summary

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

train_dataset_raw = load_data(TRAINING_DIR, "train", ".pt")
train_dataset = preprocess_adapter_data(tokenizer, train_dataset_raw[:80], encode_batch, device)
train_dataloader = create_data_loader(train_dataset, BATCH_SIZE, "sequential")

validation_dataset_raw = load_data(VALIDATION_DIR, "valid", ".pt")
validation_dataset = preprocess_adapter_data(tokenizer, validation_dataset_raw[:20], encode_batch, device)
validation_dataloader = create_data_loader(validation_dataset, BATCH_SIZE, "sequential")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:

dropout_values = [.1, .25, .4, .55, .7, .85]

for dropout in dropout_values:

  model = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased",
      num_labels = 2,
      output_attentions = False,
      output_hidden_states = False,
      hidden_dropout_prob=dropout
  )
  model.add_adapter("cnn-dm-summarization", AdapterType.text_task)
  model.train_adapter(["cnn-dm-summarization"])
  model.set_active_adapters([["cnn-dm-summarization"]])
  model.to(device)

  optimizer = AdamW(model.parameters(),
                    lr = 1e-4)
  total_steps = len(train_dataloader) * EPOCHS
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = 0,
                                              num_training_steps = total_steps)

  wandb.init(project="adapter-training-for-summarization")
  config = wandb.config
  wandb.watch(model, log="all")
  stats = []

  for epoch in range(EPOCHS):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
    print('Training\n')
    t0 = time.time()
    total_train_loss = train_adapter(model, train_dataloader, device, criterion, optimizer, scheduler)
    avg_train_loss = total_train_loss / len(train_dataloader) 
    training_time = format_time(time.time() - t0)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    print("")
    print("Validation\n")
    t0 = time.time()
    total_val_loss, preds_tensor, preds_list = validate_adapter(model, validation_dataloader, device)
    avg_val_loss = total_val_loss / len(validation_dataloader) 
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    stats.append(
          {
              'dropout': dropout,
              'epoch': epoch + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Training Time': training_time,
              'Validation Time': validation_time
          }
      )
  df_stats = pd.DataFrame(data=stats)
  df_stats = df_stats.set_index('epoch')
  df_stats.to_csv(os.path.join(ADAPTER_RESULT_DIR, 'experiments-dropout-' + str(dropout) +'--' + format(time.time())))


In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/cs-7643-dl/project/adapter/result/experiments-dropout-0.1--1607441369.307022')
df25 = pd.read_csv('/content/drive/MyDrive/cs-7643-dl/project/adapter/result/experiments-dropout-0.25--1607442085.7993891')
df40 = pd.read_csv('/content/drive/MyDrive/cs-7643-dl/project/adapter/result/experiments-dropout-0.4--1607442802.136754')
df55 = pd.read_csv('/content/drive/MyDrive/cs-7643-dl/project/adapter/result/experiments-dropout-0.55--1607443519.0885258')
df70 = pd.read_csv('/content/drive/MyDrive/cs-7643-dl/project/adapter/result/experiments-dropout-0.7--1607444235.2241807')
df85 = pd.read_csv('/content/drive/MyDrive/cs-7643-dl/project/adapter/result/experiments-dropout-0.85--1607444951.6637008')

df_list = [df1, df25, df40, df55, df70, df85]
plt.style.use('seaborn')

plt.rcParams["figure.figsize"] = (12,6)
for df in df_list:
  # plt.plot(df['epoch'], df['Training Loss'], label = "Train Loss dr: " + str(df['dropout'].max()))
  plt.plot(df['epoch'], df['Valid. Loss'], label = "Valid. Loss dr: " + str(df['dropout'].max()))
# plt.plot(df_dropouts['epoch'][:5], df_dropouts['Training Loss'][df_dropouts['dropout']==0.25], df_dropouts['Valid. Loss'][df_dropouts['dropout']==0.25], 'b-o', label="dropout = 0.25")
plt.title("Dropout Rate Effectiveness- Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks(df['epoch'])
plt.savefig('/content/drive/MyDrive/cs-7643-dl/project/adapter/result/experiments-dropout-val-losses.png')
plt.close()

plt.rcParams["figure.figsize"] = (12,6)
for df in df_list:
  # plt.plot(df['epoch'], df['Training Loss'], label = "Train Loss dr: " + str(df['dropout'].max()))
  plt.plot(df['epoch'], df['Training Loss'], label = "Train Loss dr: " + str(df['dropout'].max()))
# plt.plot(df_dropouts['epoch'][:5], df_dropouts['Training Loss'][df_dropouts['dropout']==0.25], df_dropouts['Valid. Loss'][df_dropouts['dropout']==0.25], 'b-o', label="dropout = 0.25")
plt.title("Dropout Rate Effectiveness- Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks(df['epoch'])
plt.savefig('/content/drive/MyDrive/cs-7643-dl/project/adapter/result/experiments-dropout-train-losses.png')
plt.close()

In [None]:
# experiment 2.
from transformers import AdapterConfig
reduction_factor_list = [2, 12, 64]


for reduction_factor in reduction_factor_list:

  model = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased",
      num_labels = 2,
      output_attentions = False,
      output_hidden_states = False
  )
  config = AdapterConfig.load("pfeiffer", reduction_factor=reduction_factor)
  model.add_adapter("cnn-dm-summarization", AdapterType.text_task, config=config)
  model.train_adapter(["cnn-dm-summarization"])
  model.set_active_adapters([["cnn-dm-summarization"]])
  model.to(device)

  optimizer = AdamW(model.parameters(),
                    lr = 1e-4)
  total_steps = len(train_dataloader) * EPOCHS
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = 0,
                                              num_training_steps = total_steps)

  wandb.init(project="adapter-training-for-summarization")
  config = wandb.config
  wandb.watch(model, log="all")
  stats = []

  for epoch in range(EPOCHS):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
    print('Training\n')
    t0 = time.time()
    total_train_loss = train_adapter(model, train_dataloader, device, criterion, optimizer, scheduler)
    avg_train_loss = total_train_loss / len(train_dataloader) 
    training_time = format_time(time.time() - t0)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    print("")
    print("Validation\n")
    t0 = time.time()
    total_val_loss, preds_tensor, preds_list = validate_adapter(model, validation_dataloader, device)
    avg_val_loss = total_val_loss / len(validation_dataloader) 
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    stats.append(
          {
              'reduction_factor': reduction_factor,
              'epoch': epoch + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Training Time': training_time,
              'Validation Time': validation_time
          }
      )
  df_stats = pd.DataFrame(data=stats)
  df_stats = df_stats.set_index('epoch')
  df_stats.to_csv(os.path.join(ADAPTER_RESULT_DIR, 'experiments-reduction_factor-' + str(reduction_factor) +'--' + format(time.time())))


In [None]:
# experiment 3.
from transformers import AdapterConfig
ln_list = [True, False]


for ln in ln_list:

  model = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased",
      num_labels = 2,
      output_attentions = False,
      output_hidden_states = False
  )
  config = AdapterConfig.load("pfeiffer", adapter_residual_before_ln= ln, ln_before= ln, ln_after= ln)
  model.add_adapter("cnn-dm-summarization", AdapterType.text_task, config=config)
  model.train_adapter(["cnn-dm-summarization"])
  model.set_active_adapters([["cnn-dm-summarization"]])
  model.to(device)

  optimizer = AdamW(model.parameters(),
                    lr = 1e-4)
  total_steps = len(train_dataloader) * EPOCHS
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = 0,
                                              num_training_steps = total_steps)

  wandb.init(project="adapter-training-for-summarization")
  config = wandb.config
  wandb.watch(model, log="all")
  stats = []

  for epoch in range(EPOCHS):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
    print('Training\n')
    t0 = time.time()
    total_train_loss = train_adapter(model, train_dataloader, device, criterion, optimizer, scheduler)
    avg_train_loss = total_train_loss / len(train_dataloader) 
    training_time = format_time(time.time() - t0)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    print("")
    print("Validation\n")
    t0 = time.time()
    total_val_loss, preds_tensor, preds_list = validate_adapter(model, validation_dataloader, device)
    avg_val_loss = total_val_loss / len(validation_dataloader) 
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    stats.append(
          {
              'ln': ln,
              'epoch': epoch + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Training Time': training_time,
              'Validation Time': validation_time
          }
      )
  df_stats = pd.DataFrame(data=stats)
  df_stats = df_stats.set_index('epoch')
  df_stats.to_csv(os.path.join(ADAPTER_RESULT_DIR, 'experiments-ln-' + str(ln) +'--' + format(time.time())))

In [30]:
# experiment 4.

model = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased",
      num_labels = 2,
      output_attentions = False,
      output_hidden_states = False
  )

adapter_name = model.load_adapter(ADAPTER_MODEL_DIR)
model.set_active_adapters(adapter_name)
model.to(device)

BATCH_SIZE = 16
MAX_LEN = 512


TESTING_DIR = ADAPTER_DATA_DIR

GEN_SUM = True # whether to generate a summary
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

test_dataset_raw = load_data(TESTING_DIR, "test", ".pt")
test_dataset = preprocess_adapter_data(tokenizer, test_dataset_raw[:100], encode_batch, device)
test_dataloader = create_data_loader(test_dataset, BATCH_SIZE, "sequential")

Overwriting existing adapter 'cnn-dm-summarization'.


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [104]:

for sum_len in range(1, 11):
  print("start inference")
  total_test_loss, preds_tensor, preds_list = validate_adapter(model, test_dataloader, device)
  if GEN_SUM:
    print("generate summaries")
    summaries = create_summaries_from_preds(test_dataset_raw[:100], preds_tensor, sum_len)
    print("calculate rouge")
    rouge_scores = calculate_rouge(summaries)
    rouge_scores['Summary Length'] = sum_len
    rouge_scores = pd.DataFrame.from_dict([rouge_scores])
    rouge_scores.to_csv(os.path.join(ADAPTER_RESULT_DIR, 'sum_len_' + str(sum_len) + '-rouge-scores' + format(time.time())))

start inference
  Batch     0  of    212.    Elapsed: 0:00:00.
  Batch    20  of    212.    Elapsed: 0:00:04.
  Batch    40  of    212.    Elapsed: 0:00:08.
  Batch    60  of    212.    Elapsed: 0:00:11.
  Batch    80  of    212.    Elapsed: 0:00:15.
  Batch   100  of    212.    Elapsed: 0:00:19.
  Batch   120  of    212.    Elapsed: 0:00:23.
  Batch   140  of    212.    Elapsed: 0:00:26.
  Batch   160  of    212.    Elapsed: 0:00:30.
  Batch   180  of    212.    Elapsed: 0:00:34.
  Batch   200  of    212.    Elapsed: 0:00:38.
generate summaries
calculate rouge
start inference
  Batch     0  of    212.    Elapsed: 0:00:00.
  Batch    20  of    212.    Elapsed: 0:00:04.
  Batch    40  of    212.    Elapsed: 0:00:08.
  Batch    60  of    212.    Elapsed: 0:00:11.
  Batch    80  of    212.    Elapsed: 0:00:15.
  Batch   100  of    212.    Elapsed: 0:00:19.
  Batch   120  of    212.    Elapsed: 0:00:23.
  Batch   140  of    212.    Elapsed: 0:00:26.
  Batch   160  of    212.    Elapsed: 0:

In [105]:
df_list = []
for i in os.listdir(ADAPTER_RESULT_DIR):
  path = os.path.join(ADAPTER_RESULT_DIR, i)
  if 'sum_len' in str(path):
    df_temp = pd.read_csv(path)
    df_list.append(df_temp)
df = pd.concat(df_list)

plt.style.use('seaborn')
plt.plot(df['Summary Length'], df['rouge1-fmeasure-mid'], label = "ROUGE-1")
plt.plot(df['Summary Length'], df['rouge2-fmeasure-mid'], label = "ROUGE-2")
plt.plot(df['Summary Length'], df['rougeL-fmeasure-mid'], label = "ROUGE-L")
plt.title("ROUGE F-Measure values for various summary lengths")
plt.xlabel("Number of Sentences")
plt.ylabel("ROUGE F-Measure")
plt.legend()
plt.xticks(df['Summary Length'])
plt.savefig(ADAPTER_RESULT_DIR+'experiments-sum_len.png')
plt.close()

In [98]:
path=0

In [47]:
scores = {}
scores['rouge1-fmeasure-mid'] = .8
scores['rouge2-fmeasure-mid'] = .7
scores['rougeL-fmeasure-mid'] = .7
scores['rougeLsum-fmeasure-mid'] = .3

In [72]:
for i in os.listdir(ADAPTER_RESULT_DIR):
  print(str(i).startswith('s'))

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True


In [None]:
# import numpy as np
# from transformers import TrainingArguments, Trainer, EvalPrediction
# from transformers import AdapterType
# from transformers import BertModelWithHeads

# model = BertModelWithHeads.from_pretrained(
#     "bert-base-uncased",
#     num_labels = 2,
#     output_attentions = False,
#     output_hidden_states = False,
# )

# # Tell pytorch to run this model on the GPU.
# model.cuda()
# # Add a new adapter
# model.add_adapter("cnndm_summarization", AdapterType.text_task)
# # Add a matching classification head
# model.add_classification_head("cnndm_summarization", num_labels=2)
# # Activate the adapter
# model.train_adapter("cnndm_summarization")
# model.to(device)


# training_args = TrainingArguments(
#     learning_rate=1e-4,
#     num_train_epochs=6,
#     per_device_train_batch_size=BATCH_SIZE,
#     per_device_eval_batch_size=BATCH_SIZE,
#     logging_steps=200,
#     output_dir=ADAPTER_DIR,
#     overwrite_output_dir=True,
# )

# def compute_accuracy(p: EvalPrediction):
#   preds = np.argmax(p.predictions, axis=1)
#   return {"acc": (preds == p.label_ids).mean()}

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=test_dataset,
#     eval_dataset=test_dataset,
#     compute_metrics=compute_accuracy
# )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# def process_data_for_adapters(dataset, max_len=512):
#   sentences = []
#   labels = []
#   input_ids = []
#   attention_masks = []
#   for i in range(len(dataset)):
#     if len(validation_dataset_raw[i]['src_txt']) == len(validation_dataset_raw[i]['src_sent_labels']): #exclude inconsistent data
#       sentences.extend(validation_dataset_raw[i]['src_txt'])
#       labels.extend(validation_dataset_raw[i]['src_sent_labels'])


#   for sent in sentences:
#     encoded_dict = tokenizer.encode_plus(
#                         sent,
#                         add_special_tokens = True,
#                         max_length = MAX_LEN,
#                         pad_to_max_length = True,
#                         return_attention_mask = True,
#                         return_tensors = 'pt',
#                    )
#     input_ids.append(encoded_dict['input_ids'])
#     attention_masks.append(encoded_dict['attention_mask'])
#   input_ids = torch.cat(input_ids, dim=0)
#   attention_masks = torch.cat(attention_masks, dim=0)

  # use batch instead
  # encoded_dict = tokenizer.batch_encode_plus(
  #                       sentences,
  #                       add_special_tokens = True,
  #                       max_length = MAX_LEN,
  #                       pad_to_max_length = True,
  #                       return_attention_mask = True,
  #                       return_tensors = 'pt',
  #                  )
  # labels = torch.tensor(labels)
  # dataset = TensorDataset(encoded_dict['input_ids'], encoded_dict['attention_mask'], labels)
  # dataset = TensorDataset(input_ids, attention_masks, labels)
  # return dataset, encoded_dict, labels

In [None]:
# # import adapter transformer
# from transformers import BertModelWithHeads, BertTokenizer

# # train an adapter

# # 1. instantiate a tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModelWithHeads.from_pretrained('bert-base-uncased')


In [None]:
# 2. add adapters
# from transformers import AdapterType
# model.add_adapter('weird_name', AdapterType.text_task)

In [None]:
# 3. add a head
# model.add_classification_head('weird_name')

In [None]:
# model.train_adapter('weird_name')

In [None]:
# the rest is in https://colab.research.google.com/drive/1QR2Vy4mJFUi5r3HaQVROY3dQ9QMTJqhR?usp=sharing#scrollTo=2iYLQO5Evvqy
# use https://github.com/huggingface/datasets/tree/master/datasets/cnn_dailymail


In [None]:
# now let's implement bertsum


In [None]:
# import what you need from it
from datasets import list_datasets, load_dataset, list_metrics, load_metric

In [None]:
# load datasets
dataset = load_dataset('cnn_dailymail', '3.0.0')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3528.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1610.0, style=ProgressStyle(description…


Downloading and preparing dataset cnn_dailymail/3.0.0 (download: 558.32 MiB, generated: 1.28 GiB, post-processed: Unknown size, total: 1.82 GiB) to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602...


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Downloading', max=1.0, style=ProgressSt…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Downloading', max=1.0, style=ProgressSt…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=572061.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=12259516.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=660943.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602. Subsequent calls will reuse this data.


In [None]:
# split
dataset_train = dataset['train']
dataset_val = dataset['validation']
dataset_test = dataset['test']

In [None]:
# try out an example
dataset_train.select(range(5))[0]

{'article': 'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but bec

In [None]:





# from transformers import pipeline
# from datasets import list_datasets, load_dataset, list_metrics, load_metric
# summarizer = pipeline("summarization")
# article = '(CNN)  -- Actress Natasha Richardson was hospitalized after she fell on a ski slope at a Quebec resort, a resort spokeswoman said in a statement Tuesday. Actress Natasha Richardson was transferred Tuesday to an undisclosed location in the United States. Richardson was taken to a hospital near Station Mont Tremblant before she was transferred to Hopital du Sacre-Coeur in Montreal following her fall on Monday, according to the statement. However, she was transferred Tuesday to an undisclosed location in the United States, according to Michelle Simard, spokeswoman for Hopital du Sacre-Coeur. Simard said she had no further details. Richardson fell on a beginners\' trail Monday during a ski lesson at Station Mont Tremblant, said the statement from the resort, located about 80 miles northwest of Montreal. She was not wearing a helmet, the resort said. At the time, Richardson was accompanied by a veteran female ski instructor, who called the ski patrol, the statement said. The ski patrol members examined her and found no visible sign of injury, according to the statement. "As standard protocol, the ski patrol insisted that Ms. Richardson be transported to the base of the hill in a rescue toboggan," the resort statement said. Once at the base of the hill, staffers advised Richardson to seek additional medical attention, but she declined. Accompanied by the instructor, Richardson went to her hotel, where she was again advised to see a doctor, the resort said. As a precautionary measure, the instructor stayed with her, the statement said. The statement offered no details on Richardson\'s condition or injuries, but said resort staffers and police were providing support to Richardson\'s family and friends. Richardson, 45, has appeared in many television, film and stage roles, including the movies "Nell" and "The Parent Trap." She won a Tony award in 1998 for her performance as Sally Bowles in "Cabaret." She is married to actor Liam Neeson and is the daughter of actress Vanessa Redgrave. The Montreal Gazette reported that Richardson\'s two sons with Neeson were skiing with her at the time of her fall, and that Neeson flew to Montreal from a Toronto film set to be with her at the hospital.'
# golden_summary = 'Actress Natasha Richardson fell on a beginners\' trail in Quebec, Canada .\nActress had no "visible signs of injury," resort spokeswoman said .\nAmbulance was called after Richardson was "not feeling good" an hour after fall .'
# output_summary = summarizer(article, max_length=130, min_length=30, do_sample=False)

In [None]:
# rouge = load_metric('rouge')
# rouge.compute(predictions=[output_summary[0]['summary_text']], references=[golden_summary])

In [None]:
# checks out! now let's begin training 🤗

In [None]:
# from transformers import BertTokenizer

In [None]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# tokens = tokenizer.tokenize('[PAD] rubbing a bank is hard but when there is a river and near the bank of the river')


In [None]:
# from transformers import BertModel

# bert = BertModel.from_pretrained(
#     "bert-base-uncased",
#     output_hidden_states = True
#     )
# bert = bert.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
# max_labels = max(len(x['src_sent_labels']) for x in dataset)
# max_cls_ind = max(len(x['clss']) for x in dataset)
# for i in range(len(dataset)):
#   dataset[i]['src'] += [0] * (MAX_LEN - len(dataset[i]['src']))
#   dataset[i]['segs'] += [0] * (MAX_LEN - len(dataset[i]['segs']))
#   if len(dataset[i]['src']) > MAX_LEN:
#     dataset[i]['src'] = dataset[i]['src'][:MAX_LEN]
#   if len(dataset[i]['segs']) > MAX_LEN:
#     dataset[i]['segs'] = dataset[i]['segs'][:MAX_LEN]
#   if max(dataset[i]['clss']) >= MAX_LEN: # truncate cls positions after 512th token
#     dataset[i]['clss'] = [x for x in dataset[i]['clss'] if x < MAX_LEN]
#     dataset[i]['src_sent_labels'] = dataset[i]['src_sent_labels'][:len(dataset[i]['clss'])] 
#   dataset[i]['att_msk'] = [0 if x == 0 else 1 for x in dataset[i]['src']]
#   dataset[i]['clss'] += [-1] * (max_cls_ind - len(dataset[i]['clss']))
#   dataset[i]['src_sent_labels'] += [-1] * (max_labels - len(dataset[i]['src_sent_labels']))

In [None]:
# src = []
# src_sent_labels = []
# segs = []
# att_msk = []
# cls_ind = []
# for ex in dataset:
#   src.append(ex['src'])
#   src_sent_labels.append(ex['src_sent_labels'])
#   segs.append(ex['segs'])
#   att_msk.append(ex['att_msk'])
#   cls_ind.append(ex['clss'])
# src = torch.tensor(src)
# src_sent_labels = torch.tensor(src_sent_labels)
# segs = torch.tensor(segs)
# att_msk = torch.FloatTensor(att_msk)
# cls_ind = torch.tensor(cls_ind)

In [None]:
# from torch.utils.data import TensorDataset
# dataset = TensorDataset(src, src_sent_labels, segs, att_msk, cls_ind)

In [None]:
# for step, batch in enumerate(train_dataloader):
#   print(step)
#   batch_src = batch[0].to(device)
#   batch_src_sent_labels = batch[1].to(device)
#   batch_segs = batch[2].to(device)
#   batch_att_msk = batch[3].to(device)
#   model.zero_grad()  
#   output = model(input_ids = batch_src, 
#             token_type_ids=batch_segs,
#             attention_mask=batch_att_msk)
#   last_hidden_layer = output[0]
#   cls_indices = batch_src==101
#   cls_indices = cls_indices.unsqueeze(-1).expand(-1, -1, 768)
#   cls_embs = last_hidden_layer[cls_indices].reshape(-1, 768)
#   out = classifier(cls_embs)
#   batch_src_sent_labels_flat = batch_src_sent_labels[batch_src_sent_labels!=-1]
#   loss = criterion(out, batch_src_sent_labels_flat.float())
#   print(loss)
#   loss.backward()
#   # added clipping source
#   torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#   optimizer.step()
#   scheduler.step()
#   if step==0:
#     break