In [None]:
# Mount Google Drive at /content/drive; the data, outputs and result files
# used throughout this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the project's roberta folder on Drive.
%cd /content/drive/MyDrive/NLP_Project/roberta

/content/drive/MyDrive/NLP_Project/roberta


In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install rouge_score
!pip install dill==0.3.5.1

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import datasets
import transformers
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizerFast
from transformers import EncoderDecoderModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import TrainingArguments
from dataclasses import dataclass, field
from typing import Optional

import json
import torch

In [None]:
# Input files on Drive, read line by line as JSON further down.
# Note that the notebook's "test" set is read from the validation file.
training_data_path = '/content/drive/MyDrive/NLP_Project/train_changed.json'
test_data_path = '/content/drive/MyDrive/NLP_Project/validation_changed.json'

In [None]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # and skip blank lines (e.g. a trailing newline) that json.loads rejects.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


train_data = load_jsonl(training_data_path)
test_data = load_jsonl(test_data_path)

In [None]:
# Number of records loaded from the training file.
len(train_data)

3197

In [None]:
# Fields removed from both frames before they are turned into datasets.
drop_columns = [
    'postText',
    'postPlatform',
    'targetTitle',
    'targetDescription',
    'targetKeywords',
    'provenance',
    'spoilerPositions',
    'tags',
]

train_df = pd.DataFrame(train_data).drop(columns=drop_columns)
test_df = pd.DataFrame(test_data).drop(columns=drop_columns)

# Displayed as the cell output.
train_df

Unnamed: 0,targetParagraphs,spoiler
0,2070 is shaping up to be a great year for Moth...,2070
1,"Despite common belief, money isn't the key to ...",intellectual stimulation
2,"It’s common wisdom. Near gospel really, and no...",Purpose connects us to something bigger and in...
3,"Boiling rice may seem simple, but there is a v...",in a rice cooker
4,One of the biggest surprise announcements at A...,"Apple says that if AirPods are lost or stolen,..."
...,...,...
3192,"A long time ago in a galaxy far, far away...Wa...",it hasn’t necessarily taken the wind out of Yo...
3193,"The Kansas City, Kansas Police Department are ...","It read, ""Thanks for keeping us safe."""
3194,"Obama looks decades younger already, but what ...",1. Anti-wrinkle creams will erase the fine lin...
3195,What the HELL?!??? 1. Unless you’re somewhere ...,@beyoncefan666


In [None]:
# Rows 0..2999 of the training frame are used for training and the remaining
# rows for validation; the test dataset comes from the separate test frame.
train_data = Dataset.from_pandas(train_df.iloc[:3000])
val_data = Dataset.from_pandas(train_df.iloc[3000:])
test_data = Dataset.from_pandas(test_df)

In [None]:
# Load the fast tokenizer of the roberta-base checkpoint and make its
# bos/eos tokens aliases of the cls/sep tokens, so the ids read later via
# tokenizer.bos_token_id / tokenizer.eos_token_id refer to those tokens.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

In [None]:
# Size settings shared by the preprocessing and training cells.
# NOTE(review): target spoilers are padded/truncated to decoder_max_length
# (8 tokens) while the model config later sets a generation max_length of
# 40 - confirm that this mismatch is intended.
batch_size=128  # batch size for Dataset.map and for per-device train/eval batches
encoder_max_length=40  # token length the input paragraphs are padded/truncated to
decoder_max_length=8  # token length the target spoilers are padded/truncated to

In [None]:
def process_data_to_model_inputs(batch):
  """Tokenize a batch of examples into encoder/decoder training features.

  Args:
    batch: Mapping with "targetParagraphs" (input texts) and "spoiler"
      (target texts), each a list of strings.

  Returns:
    The same ``batch`` with these entries added (lists of lists of ints):
    "input_ids" / "attention_mask" for the encoder (length
    ``encoder_max_length``), and "decoder_input_ids" /
    "decoder_attention_mask" / "labels" for the decoder (length
    ``decoder_max_length``). Padding positions in "labels" are set to
    -100 so that the loss ignores them.
  """
  inputs = tokenizer(batch["targetParagraphs"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["spoiler"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask

  # Teacher forcing: the decoder input at position i must be the target token
  # at position i - 1. The encoder-decoder wrapper compares the logits at
  # position i directly with labels[i], so feeding the unshifted targets
  # would let the decoder see the very token it has to predict. Shift the
  # targets one step to the right and put the start token in front.
  start_id = tokenizer.bos_token_id
  batch["decoder_input_ids"] = [[start_id] + list(ids[:-1]) for ids in outputs.input_ids]
  # Shift the mask the same way; the prepended start token is always attended.
  batch["decoder_attention_mask"] = [[1] + list(mask[:-1]) for mask in outputs.attention_mask]

  # Labels stay unshifted; padding is replaced by -100.
  batch["labels"] = [
    [-100 if token == tokenizer.pad_token_id else token for token in ids]
    for ids in outputs.input_ids
  ]

  return batch

In [None]:

def _to_model_features(dataset):
    """Tokenize ``dataset`` and expose the model columns as torch tensors.

    The raw text columns are removed by the map call; the returned dataset
    is formatted so that the five model columns come back as torch tensors.
    """
    features = dataset.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["targetParagraphs", "spoiler"]
    )
    features.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )
    return features


# Same preprocessing for the training and the validation split.
train_data = _to_model_features(train_data)
val_data = _to_model_features(val_data)

  0%|          | 0/24 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Build an encoder-decoder model that uses the roberta-base checkpoint for
# both the encoder and the decoder; tie_encoder_decoder=True asks the
# library to tie the weights of the two halves.
roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base", tie_encoder_decoder=True)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForCausal

In [None]:
# Special-token ids the encoder-decoder wrapper uses for training and
# generation, taken from the tokenizer.
roberta_shared.config.decoder_start_token_id = tokenizer.bos_token_id
roberta_shared.config.eos_token_id = tokenizer.eos_token_id
# Set the padding id explicitly as well, so generation pads finished
# sequences with the tokenizer's pad token and the wrapper can derive
# decoder inputs from labels when none are passed.
roberta_shared.config.pad_token_id = tokenizer.pad_token_id


# Generation defaults (beam search).
roberta_shared.config.max_length = 40
roberta_shared.config.early_stopping = True
roberta_shared.config.no_repeat_ngram_size = 3
roberta_shared.config.length_penalty = 2.0
roberta_shared.config.num_beams = 4
# Mirror the encoder's vocabulary size on the wrapper config.
roberta_shared.config.vocab_size = roberta_shared.config.encoder.vocab_size

In [None]:
# NOTE(review): datasets.load_metric is presumably deprecated in recent
# datasets releases in favour of the separate evaluate package - confirm
# against the installed version.
rouge = datasets.load_metric("rouge")


def compute_metrics(pred):
    """Compute ROUGE-2 precision/recall/F-measure for generated predictions.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored
            positions). ``label_ids`` is modified in place: every -100 is
            overwritten with the tokenizer's pad token id.

    Returns:
        Dict with "rouge2_precision", "rouge2_recall" and "rouge2_fmeasure",
        each taken from the ``mid`` ROUGE-2 score and rounded to 4 decimals.
    """
    decoded_preds = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)

    # -100 is not a real token id; swap it for padding so decoding works.
    references = pred.label_ids
    references[references == -100] = tokenizer.pad_token_id
    decoded_refs = tokenizer.batch_decode(references, skip_special_tokens=True)

    scores = rouge.compute(
        predictions=decoded_preds,
        references=decoded_refs,
        rouge_types=["rouge2"],
    )
    mid_score = scores["rouge2"].mid

    return {
        f"rouge2_{part}": round(getattr(mid_score, part), 4)
        for part in ("precision", "recall", "fmeasure")
    }

In [None]:
# Directory on Drive passed to the training arguments as output_dir; the
# final model is later saved to its "last" subfolder.
output_dir = "/content/drive/MyDrive/NLP_Project/roberta/output"

In [None]:
# Training configuration for the seq2seq trainer.
# NOTE(review): no evaluation strategy is passed here and the captured
# training log only reports the training loss, so eval_steps and
# compute_metrics presumably never take effect during training - confirm
# and enable periodic evaluation if validation scores are wanted.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    do_train=True,
    do_eval=True,
    logging_steps=2,
    save_steps=16,
    eval_steps=500,
    # The run lasts 240 optimizer steps in total (24 per epoch for 3000
    # examples at batch size 128, 10 epochs). The previous warm-up of 500
    # steps was longer than the whole run, so the learning rate never left
    # the warm-up phase; warm up for one epoch instead.
    warmup_steps=24,
    overwrite_output_dir=True,
    save_total_limit=1,
    fp16=True,
    num_train_epochs=10,
)

trainer = Seq2SeqTrainer(
    model=roberta_shared,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)

In [None]:
# Run fine-tuning with the trainer built from the training arguments.
trainer.train()



Step,Training Loss
2,11.671
4,11.3906
6,11.5026
8,11.5039
10,11.5636
12,11.585
14,11.281
16,11.3796
18,11.1901
20,11.029




TrainOutput(global_step=240, training_loss=7.209439329306284, metrics={'train_runtime': 391.1186, 'train_samples_per_second': 76.703, 'train_steps_per_second': 0.614, 'total_flos': 825521997600000.0, 'train_loss': 7.209439329306284, 'epoch': 10.0})

In [None]:
# Persist the final model to the "last" subfolder of the output directory.
trainer.save_model(f"{output_dir}/last")

In [None]:
# Put the model into evaluation mode before generating predictions.
roberta_shared.eval()

EncoderDecoderModel(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm):

In [None]:
# Rebinds batch_size (128 for preprocessing/training) to 1 for the
# test-set generation below.
# NOTE(review): because the name is shared, re-running an earlier cell
# after this one would pick up the value 1.
batch_size = 1

In [None]:
def generate_summary(batch):
    """Generate a prediction for every paragraph in ``batch``.

    Args:
        batch: Mapping with a "targetParagraphs" entry holding the input
            texts (a list of strings).

    Returns:
        The same ``batch`` with a "pred" entry added: the decoded model
        generations, one string per input text.
    """
    # Free cached GPU memory between batches.
    torch.cuda.empty_cache()

    # Follow the model's own device instead of assuming "cuda", so the
    # inputs always land where the weights are.
    device = roberta_shared.device

    inputs = tokenizer(
        batch["targetParagraphs"],
        padding="max_length",
        truncation=True,
        # Tokenize exactly as during training (encoder_max_length is 40).
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = roberta_shared.generate(input_ids, attention_mask=attention_mask)
    batch["pred"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch

In [None]:
# Generate a prediction for every test example; the input paragraphs are
# dropped from the result, the reference spoilers are kept.
results = test_data.map(
    generate_summary,
    batched=True,
    batch_size=batch_size,
    remove_columns=["targetParagraphs"],
)
pred_str, label_str = results["pred"], results["spoiler"]



  0%|          | 0/800 [00:00<?, ?ba/s]

In [None]:
# Print the index, the prediction and the reference spoiler, one per line.
for i, pred in enumerate(pred_str):
  print(i, f"pred: {pred}", f"spoiler: {label_str[i]}", sep="\n")

0
pred: Kin million million millions of the
spoiler: some of the plot elements are so disturbing that they are making him feel sick
1
pred: KKin million million million Trump million of of a a
spoiler: "intentionally" could transform a court case against Phoenix-area Sheriff Joe Arpaio from civil charges to a criminal prosecution
2
pred: t million million million to a the
spoiler: 20%
3
pred: Lin million million million Trump million the a a a
spoiler: Alan Rickman & Rupert Grint CBGB
4
pred: T million million million of the
spoiler: a man who swallowed a 64GB microSD card and then pooped it into a strainer
5
pred: Kin million million million to the the the
spoiler: Sprite
6
pred: BBCCttt million millions the
spoiler: Smoky Paprika-Baked Garbanzo Beans
7
pred: KLLtt millions of a
spoiler: McGonagall was appointed as Dumbledore’s assistant in 1956, not as his replacement.
8
pred: L million million millions of of a a a
spoiler: All the scenes are actually in the movie
9
pred: Aerer milli

In [None]:
# Rebind results to an empty dict; until here the name referred to the
# dataset returned by test_data.map.
results = {}

In [None]:
# Store every prediction with its reference spoiler, keyed by example index.
for i, pred in enumerate(pred_str):
  results[i] = {"roberta": pred, "spoiler": label_str[i]}

In [None]:
# Display the collected prediction/spoiler pairs as the cell output.
results

{0: {'roberta': 'Kin million million millions of the',
  'spoiler': 'some of the plot elements are so disturbing that they are making him feel sick'},
 1: {'roberta': 'KKin million million million Trump million of of a a',
  'spoiler': '"intentionally" could transform a court case against Phoenix-area Sheriff Joe Arpaio from civil charges to a criminal prosecution'},
 2: {'roberta': 't million million million to a the', 'spoiler': '20%'},
 3: {'roberta': 'Lin million million million Trump million the a a a',
  'spoiler': 'Alan Rickman & Rupert Grint CBGB'},
 4: {'roberta': 'T million million million of the',
  'spoiler': 'a man who swallowed a 64GB microSD card and then pooped it into a strainer'},
 5: {'roberta': 'Kin million million million to the the the',
  'spoiler': 'Sprite'},
 6: {'roberta': 'BBCCttt million millions the',
  'spoiler': 'Smoky Paprika-Baked Garbanzo Beans'},
 7: {'roberta': 'KLLtt millions of a',
  'spoiler': 'McGonagall was appointed as Dumbledore’s assistant in

In [None]:
# Destination on Drive for the prediction/spoiler pairs.
file_path = "/content/drive/MyDrive/NLP_Project/roberta_10_eval_1b.json"

# Serialize the results dict with 4-space indentation and write it out.
with open(file_path, "w") as out_file:
    out_file.write(json.dumps(results, indent=4))