In [1]:
import json

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import trange

import torch
from torch import nn
# from torch.utils.data.dataset import Dataset

from datasets import Dataset
from transformers import LongformerTokenizer, LongformerForMultipleChoice
from transformers import (
    EvalPrediction,
    Trainer,
    TrainingArguments,
    default_data_collator,
    TrainingArguments
)

from fluence.pooling import MeanPooling

In [2]:
with open("edit_data.json", "r") as read_file:
    data = json.load(read_file)

In [3]:
df = pd.DataFrame(data)

In [4]:
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

In [12]:
for i in range(845):
    if len(tokenizer.encode(data[i]['body']))>4095:
        print(i)

360
445
724


In [5]:
for i in [360, 445, 724]:
    out = tokenizer.encode(df["body"][i])[:700]
    df.loc[i, "body"] = tokenizer.decode(out)
    print(len(df["body"][i]))

Token indices sequence length is longer than the specified maximum sequence length for this model (6452 > 4096). Running this sequence through the model will result in indexing errors


3407
3338
3763


In [6]:
def preprocess_function(examples):
    prompt = str(examples['body'])
    choice0, choice1, choice2 = str(examples['thesis']), str(examples['anti-thesis']), str(examples['third-option'])
    encoding = tokenizer([prompt, prompt, prompt], [choice0, choice1, choice2], return_tensors='pt', padding='max_length')
    encoding["label"] = 0
    return encoding

In [23]:
dataset = Dataset.from_pandas(df, split=[10, 20, 30])
dataset = dataset.map(preprocess_function)

HBox(children=(FloatProgress(value=0.0, max=845.0), HTML(value='')))




In [25]:
valid_index = list(range(0, 10))

In [30]:
df.iloc[valid_index]

Unnamed: 0,headline,abstract,body,thesis,anti-thesis,third-option
0,Juneteenth Is for Everyone; Op-Ed Contributor,The holiday marking the final end of slavery s...,SOME two months after Gen. Robert E. Lee surre...,The holiday marking the final end of slavery ...,Juneteenth should be an African-American only ...,Some two months after Gen. Robert E. Lee surre...
1,No More Money for the Police,Redirect it to emergency response programs tha...,Redirect it to emergency response programs tha...,Redirect money for the police to other emergen...,Keep the current budget for the police,"On Monday, a worker at a store in Minneapolis ..."
2,Our Next Crisis Will Be Caring for Survivors o...,Many among the most ill may emerge with debili...,Many among the most ill may emerge with debili...,The next crisis will be caring for survivors o...,There will be no new crisis after Covid-19,One of the shortcomings of the nation’s public...
3,An Open Letter to My Fellow White Christians,"Our sins are grievous, but we are not yet beyo...",NASHVILLE — Since long before it was a country...,White Christians’ sins are grievous but they ...,White Christians are sinless and have already ...,"Since long before it was a country, our countr..."
4,"Prosecutors, Please Stand Up to the Police",Minor offenses should not lead to deadly encou...,Minor offenses should not lead to deadly encou...,Minor offenses should not lead to deadly encou...,Police officers should be lightly punished and...,Watching the suffocation of George Floyd in Mi...
5,Britain’s Ethnic Minorities Are Being Left for...,"The government could have predicted, and perha...","The government could have predicted, and perha...","The British government could have predicted, a...",The deaths of ethnic minorities were unpredict...,"In early April, Maruthalingam Thiyakumar, a 58..."
6,"Mayor de Blasio, Don’t Make New York’s Budget ...",Borrowing $7 billion without smart spending cu...,Borrowing $7 billion without smart spending cu...,The city of New York shouldn’t borrow more mon...,New York should immediately borrow more money ...,As the coronavirus crisis wreaks havoc on the ...
7,Anger Benefits Some Americans Much More Than O...,Angry white agitators are labeled good people ...,Angry white agitators are labeled good people ...,Anger benefits white people more than black pe...,White people and black people are both able to...,The protests against police brutality gripping...
8,Black Lives Matter Is Winning,Activists set out to show that police brutalit...,Activists set out to show that police brutalit...,Black Lives Matter is proving it’s point on po...,The Black Lives Matter movement is losing mome...,How did this happen?
9,Black Lives Matter Is Democracy in Action; Opi...,"A decentralized movement can be effective, eve...",CHICAGO — Why has this generation of black act...,Black Lives Matter is the best example of demo...,Democracy is best represented through voting,Why has this generation of black activists fai...


In [24]:
dataset.

{'abstract': ['Congress is still owed answers about President Trump’s dismissal of inspectors general.'],
 'anti-thesis': ['Trump’s dismissal of inspector Generals is justified'],
 'attention_mask': [[[1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    

In [80]:
train_dataset, eval_dataset = dataset["train"], dataset["test"]

In [72]:
model = LongformerForMultipleChoice.from_pretrained('allenai/longformer-base-4096', return_dict=True)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForMultipleChoice: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing LongformerForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForMultipleChoice were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a dow

In [77]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
    if data_args.task_name is not None:
        result = metric.compute(predictions=preds, references=p.label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result
    elif is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

In [84]:
training_args = TrainingArguments(output_dir='/home/nlp/experiments/edit')

In [85]:
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
)

In [86]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=47.0, style=ProgressStyle(description_wid…

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  sep_token_indices = (input_ids == sep_token_id).nonzero()


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/transformers/modeling_longformer.py", line 1782, in forward
    outputs = self.longformer(
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/transformers/modeling_longformer.py", line 1265, in forward
    encoder_outputs = self.encoder(
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/transformers/modeling_longformer.py", line 903, in forward
    layer_outputs = layer_module(
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/transformers/modeling_longformer.py", line 849, in forward
    self_attn_outputs = self.attention(
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/transformers/modeling_longformer.py", line 793, in forward
    self_outputs = self.self(
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/transformers/modeling_longformer.py", line 267, in forward
    attn_scores = self._sliding_chunks_query_key_matmul(
  File "/home/nlp/.local/lib/python3.8/site-packages/transformers/modeling_longformer.py", line 491, in _sliding_chunks_query_key_matmul
    chunked_attention_scores = torch.einsum("bcxd,bcyd->bcxy", (chunked_query, chunked_key))  # multiply
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/functional.py", line 325, in einsum
    return einsum(equation, *operands)
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/functional.py", line 327, in einsum
    return _VF.einsum(equation, operands)
RuntimeError: CUDA out of memory. Tried to allocate 4.22 GiB (GPU 0; 47.46 GiB total capacity; 40.17 GiB already allocated; 4.15 GiB free; 42.36 GiB reserved in total by PyTorch)


In [9]:
 outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels)

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  sep_token_indices = (input_ids == sep_token_id).nonzero()


In [11]:
# tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-summarize-news")
# model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-summarize-news").cuda()

# tokenizer = AutoTokenizer.from_pretrained("google/pegasus-multi_news")
# model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-multi_news").cuda()
# stokenizer = AutoTokenizer.from_pretrained("sentence-transformers/roberta-large-nli-stsb-mean-tokens")
# smodel = AutoModel.from_pretrained("sentence-transformers/roberta-large-nli-stsb-mean-tokens").cuda()

# def summarize(text, max_length=8192):
#     input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True).cuda()
#     generated_ids = model.generate(input_ids=input_ids, num_beams=2, max_length=max_length,  repetition_penalty=2.5, length_penalty=1.0, early_stopping=True)
#     preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
#     return preds[0]

# cos = nn.CosineSimilarity(dim=0, eps=1e-6)

# correct_preds = 0
# for i in trange(len(data)):
#     print(i)
#     input_data = summarize(data[i]['body'])
#     print(len(data[i]['body']))
#     sentences = [input_data, data[i]['thesis'], data[i]['anti-thesis'], data[i]['third-option']]
#     encoded_input = stokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
#     for k, v in encoded_input.items():
#         encoded_input[k] = v.cuda()
#     with torch.no_grad():
#         model_output = smodel(**encoded_input)
#     sentence_embeddings = MeanPooling(model_output[0], encoded_input['attention_mask'])
#     sim_array = [cos(sentence_embeddings[0], sentence_embeddings[1]),
#                 cos(sentence_embeddings[0], sentence_embeddings[2]),
#                 cos(sentence_embeddings[0], sentence_embeddings[3])]
#     pred_idx = sim_array.index(max(sim_array))
#     if pred_idx == 0:
#         correct_preds += 1