In [None]:
# List the contents of the current working directory.
!ls

In [None]:
!pip install datasets transformers rouge_score nltk

In [None]:
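# # Define the training data (earlier version of parse_relations, kept for reference; superseded by the definition in the next cell)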
# def parse_relations(rel_pth: str):
#     rel = []
#     stn = []
#     idx_submission = -1
#     idx_sentence = -1
#     with open(rel_pth, 'r') as f:
#         while True:
#             line = f.readline()
#             if not line: break
#             if line[0] == 'S':
#                 if int(line.split()[1]) > idx_submission:
#                     idx_submission += 1
#                     idx_sentence = -1
#                     stn.append([])
#                     rel.append([])
#                 if int(line.split()[2]) > idx_sentence:
#                     idx_sentence += 1
#                     rel[-1].append([])
#                 stn[-1].append(line.split('\t')[3][:-1])
#                 continue
#             rel[-1][-1].append(line.strip().lstrip("R\t").replace('\t', ' ')+'.')
#     rel_out, stn_out = [], []
#     for i in range(len(rel)):
#         if len(rel[i]) == 0: continue
#         for j in rel[i]:
#             rel_out.append(' '.join(j))
#         for j in stn[i]:
#             stn_out.append(j)
#     # for i in rel:
#     #     for j in i:
#     #         rel_out.append(' '.join(j))
#     # for i in stn:
#     #     for j in i:
#     #         stn_out.append(j)
#     return rel_out, stn_out

In [3]:
# Define the training data
def parse_relations(rel_pth: str):
    """Parse a relation file into aligned relation strings and sentences.

    The file is read line by line as tab-separated records:

    * a line starting with ``S`` is a sentence record whose fourth
      tab-separated field is the sentence text;
    * any other non-blank line is a relation of the most recent sentence,
      ``R<TAB>subject<TAB>predicate[<TAB>object]``.

    Every sentence record opens exactly one (sentence, relations) pair, so the
    two returned lists always have the same length. The submission/sentence
    index columns of the sentence record are not needed for that and are
    ignored.

    Args:
        rel_pth: Path of the relation file (read as UTF-8).

    Returns:
        A tuple ``(rel_out, stn_out)`` of two equally long lists.
        ``stn_out[i]`` is the text of the i-th sentence in file order and
        ``rel_out[i]`` is the concatenation of its relations, each rendered as
        ``<subject>..<predicate>..<object>..`` (``''`` when it has none).

    Raises:
        ValueError: If a sentence record has no text field, a relation appears
            before any sentence, or a relation has fewer than two fields.
    """
    rel_groups = []  # one list of rendered relations per sentence
    stn_out = []
    with open(rel_pth, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            # Drop only the line terminator; slicing with [:-1] would eat the
            # last character of a final line that has no trailing newline.
            line = raw_line.rstrip('\n')
            if not line.strip():
                continue  # blank lines carry no record

            if line.startswith('S'):
                fields = line.split('\t')
                if len(fields) < 4:
                    raise ValueError(
                        'line %d: sentence record has no text field' % line_no
                    )
                stn_out.append(fields[3])
                rel_groups.append([])
                continue

            if not rel_groups:
                raise ValueError(
                    'line %d: relation found before any sentence' % line_no
                )
            record = line.strip()
            # Remove the literal "R<TAB>" marker once. str.lstrip("R\t") would
            # strip a *set* of characters and also eat a subject's leading "R".
            if record.startswith('R\t'):
                record = record[2:]
            rel_line = record.split('\t')
            if len(rel_line) < 2:
                raise ValueError(
                    'line %d: relation needs at least a subject and a predicate'
                    % line_no
                )
            if len(rel_line) == 3:
                rel_groups[-1].append(
                    '<subject>%s<predicate>%s<object>%s'
                    % (rel_line[0], rel_line[1], rel_line[2])
                )
            else:
                # NOTE(review): records with more than three fields keep only
                # subject and predicate, exactly as before; confirm whether the
                # extra fields should be folded into the object instead.
                rel_groups[-1].append(
                    '<subject>%s<predicate>%s' % (rel_line[0], rel_line[1])
                )

    rel_out = [''.join(group) for group in rel_groups]
    return rel_out, stn_out

In [4]:
import random

def concatenated_dataset(rel, stn, min_choice=1, max_choice=5, len_dataset=None, seed=None):
    ''' Build a dataset from parsed relations and sentences.

        Each example concatenates a random selection of sentences (joined with
        a space) and the relation strings of the same sentences (joined without
        a separator), in the same order.

        Args:
            rel: Relation strings, aligned index-by-index with ``stn``.
            stn: Sentence strings.
            min_choice: Smallest number of sentences drawn per example.
            max_choice: Largest number of sentences drawn per example. It is
                capped at ``len(stn)`` so that small inputs do not make the
                sampling fail.
            len_dataset: Number of examples to build. If None, it equals the
                number of input sentences.
            seed: Optional seed for a private random generator. If None, the
                module-level ``random`` state is used, as before.

        Returns:
            A ``Dataset`` with the columns ``relations`` and ``sentence``.

        Raises:
            ValueError: If ``rel`` and ``stn`` differ in length, if
                ``min_choice > max_choice``, or if examples are requested from
                an empty input.
    '''
    if len(rel) != len(stn):
        raise ValueError(
            'rel and stn must be aligned, got %d relations for %d sentences'
            % (len(rel), len(stn))
        )
    if min_choice > max_choice:
        raise ValueError('min_choice must not exceed max_choice')

    if len_dataset is None: len_dataset = len(stn)
    if len_dataset > 0 and len(stn) == 0:
        raise ValueError('cannot sample examples from an empty input')

    # A private generator keeps seeded calls independent of the global state.
    rng = random if seed is None else random.Random(seed)

    # random.sample raises if asked for more items than the population holds.
    upper = min(max_choice, len(stn))
    lower = min(min_choice, upper)

    rel_out, stn_out = [], []
    for _ in range(len_dataset):
        n_choice = rng.randint(lower, upper)
        idxs = rng.sample(range(len(stn)), n_choice)

        rel_out.append(''.join(rel[j] for j in idxs))
        stn_out.append(' '.join(stn[j] for j in idxs))

    df = pd.DataFrame({'relations': rel_out, 'sentence': stn_out})
    out = Dataset.from_pandas(df)
    return out

In [5]:
# import pyarrow as pa
# import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset, DatasetDict

# Seed the sampler so the randomly concatenated splits (in particular the
# validation and test sets) are the same on every Restart & Run All.
SEED = 42
random.seed(SEED)

# Parse each split and build one randomly concatenated dataset per split.
train_rel, train_stn = parse_relations("reddit_train_article_relation.txt")
train_dataset = concatenated_dataset(train_rel, train_stn)

validation_rel, validation_stn = parse_relations("reddit_validation_article_relation.txt")
validation_dataset = concatenated_dataset(validation_rel, validation_stn)

test_rel, test_stn = parse_relations("reddit_test_article_relation.txt")
test_dataset = concatenated_dataset(test_rel, test_stn)

reddit_dataset = DatasetDict({'train': train_dataset, 'validation': validation_dataset, 'test': test_dataset})
reddit_dataset.reset_format()
reddit_dataset

DatasetDict({
    train: Dataset({
        features: ['relations', 'sentence'],
        num_rows: 6902
    })
    validation: Dataset({
        features: ['relations', 'sentence'],
        num_rows: 389
    })
    test: Dataset({
        features: ['relations', 'sentence'],
        num_rows: 628
    })
})

In [16]:
# Show one training example: its concatenated relation string and target sentences.
reddit_dataset['train'][2]

{'relations': '<subject>Me and my best friend since Kindergarten<predicate>are<object>on a flight back from Maui<subject>He<predicate>had met<object>me for literally about ten minutes<subject>He<predicate>had decided<object>that he was in love with me and that he would write love songs in my honour from that<subject>I<predicate>raised<object>my finger a bee',
 'sentence': 'Called it right away. Me and my best friend since Kindergarten are on a flight back from Maui. He had met me for literally about ten minutes, and from that had decided that he was in love with me and that he would write love songs in my honour? As soon as I raised my finger a bee landed on the very tip for my finger.'}

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import torch

model_checkpoint = 'facebook/bart-base'
# Use the GPU when one is visible and fall back to the CPU otherwise, instead
# of failing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Register the relation markers with the tokenizer, then grow the embedding
# matrix so that it matches the enlarged vocabulary.
tokenizer.add_tokens(['<subject>', '<predicate>', '<object>'])
model.resize_token_embeddings(len(tokenizer))

Embedding(50268, 768)

In [6]:
# Token limits applied when truncating the relation input and the sentence target.
max_input_length = 512
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    The ``relations`` column becomes the model input (truncated to
    ``max_input_length``); the token ids of the ``sentence`` column
    (truncated to ``max_target_length``) are stored under ``labels``.
    """
    encoded = tokenizer(
        examples["relations"],
        truncation=True,
        max_length=max_input_length,
    )
    targets = tokenizer(
        examples["sentence"],
        truncation=True,
        max_length=max_target_length,
    )
    encoded["labels"] = targets["input_ids"]
    return encoded

In [8]:
# Tokenize every split; the raw text columns are dropped in the same map call
# so that only the tokenizer outputs and the labels remain.
reddit_tokenized = reddit_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=reddit_dataset["train"].column_names,
)
reddit_tokenized

                                                                 

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6902
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 389
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 628
    })
})

In [13]:
from datasets import load_metric
import nltk
from nltk.tokenize import sent_tokenize

# ROUGE metric object; compute_metrics below calls its .compute().
rouge_score = load_metric("rouge")

# Fetch the "punkt" NLTK resource (presumably required by sent_tokenize — confirm).
nltk.download("punkt")

def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one sentence per line."""
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)

def evaluate_baseline(dataset, metric):
    """Score a lead-3 baseline with ``metric``.

    The first three sentences of every ``relations`` entry are used as the
    prediction and compared against the ``sentence`` column.
    """
    predictions = list(map(three_sentence_summary, dataset["relations"]))
    references = dataset["sentence"]
    return metric.compute(predictions=predictions, references=references)

  rouge_score = load_metric("rouge")
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
from transformers import Seq2SeqTrainingArguments

batch_size = 32
num_train_epochs = 2
# Show the training loss with every epoch: one log entry per
# (training examples // batch size) steps. Clamp to at least 1 so that a
# training split smaller than one batch does not produce logging_steps=0.
logging_steps = max(1, len(reddit_tokenized["train"]) // batch_size)
# Last path component of the checkpoint name, used to name the output directory.
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-reddit",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Generate text during evaluation so that compute_metrics can score it.
    predict_with_generate=True,
    logging_steps=logging_steps,
)

In [10]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (in percent) for generated sentences.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays as handed
            over by the trainer. ``predictions`` may also be a tuple whose
            first element holds the generated ids.

    Returns:
        Dict mapping each ROUGE variant reported by the metric to its mid
        F-measure, multiplied by 100 and rounded to four decimals.
    """
    predictions, labels = eval_pred
    # Some configurations return a tuple; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding marker, not a vocabulary id: replace it in both arrays
    # before decoding, otherwise the tokenizer cannot convert the ids.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode generated and reference sentences into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the mid F-measure of every ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [11]:
from transformers import DataCollatorForSeq2Seq

# Batch collator handed to the trainers below, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [21]:
from transformers import Seq2SeqTrainer

import pandas as pd
from datasets import Dataset, DatasetDict

# Draw fresh random concatenations for all three splits, tokenize them, and
# fine-tune the current `model` object with a new trainer.
train_dataset = concatenated_dataset(train_rel, train_stn)
validation_dataset = concatenated_dataset(validation_rel, validation_stn)
test_dataset = concatenated_dataset(test_rel, test_stn)
reddit_dataset = DatasetDict(
    {'train': train_dataset, 'validation': validation_dataset, 'test': test_dataset}
)
reddit_dataset.reset_format()
# Tokenize and drop the raw text columns in one pass.
reddit_tokenized = reddit_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=reddit_dataset["train"].column_names,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=reddit_tokenized["train"],
    eval_dataset=reddit_tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

                                                                  

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.2842,1.191589,49.0765,43.9321,47.2502,48.0435
2,0.9394,1.181983,49.1529,43.8639,47.1566,48.063


TrainOutput(global_step=432, training_loss=1.1102550347094182, metrics={'train_runtime': 110.8696, 'train_samples_per_second': 124.507, 'train_steps_per_second': 3.896, 'total_flos': 1414793416458240.0, 'train_loss': 1.1102550347094182, 'epoch': 2.0})

In [22]:
from transformers import Seq2SeqTrainer

import pandas as pd
from datasets import Dataset, DatasetDict

# Re-sample only the training split; the validation and test datasets built
# earlier are reused (their re-sampling is commented out below).
train_dataset = concatenated_dataset(train_rel, train_stn)
# validation_dataset = concatenated_dataset(validation_rel, validation_stn)
# test_dataset = concatenated_dataset(test_rel, test_stn)
reddit_dataset = DatasetDict(
    {'train': train_dataset, 'validation': validation_dataset, 'test': test_dataset}
)
reddit_dataset.reset_format()
# Tokenize and drop the raw text columns in one pass.
reddit_tokenized = reddit_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=reddit_dataset["train"].column_names,
)

# A new trainer is created around the same `model` object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=reddit_tokenized["train"],
    eval_dataset=reddit_tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

                                                                  

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.8774,1.213181,49.2015,44.1999,47.533,48.2816
2,0.7001,1.245168,49.2011,43.8045,47.3017,48.0739


TrainOutput(global_step=432, training_loss=0.788277209908874, metrics={'train_runtime': 110.9278, 'train_samples_per_second': 124.441, 'train_steps_per_second': 3.894, 'total_flos': 1405558052720640.0, 'train_loss': 0.788277209908874, 'epoch': 2.0})

In [23]:
from transformers import Seq2SeqTrainer

import pandas as pd
from datasets import Dataset, DatasetDict

# Re-sample only the training split; the validation and test datasets built
# earlier are reused (their re-sampling is commented out below).
train_dataset = concatenated_dataset(train_rel, train_stn)
# validation_dataset = concatenated_dataset(validation_rel, validation_stn)
# test_dataset = concatenated_dataset(test_rel, test_stn)
reddit_dataset = DatasetDict(
    {'train': train_dataset, 'validation': validation_dataset, 'test': test_dataset}
)
reddit_dataset.reset_format()
# Tokenize and drop the raw text columns in one pass.
reddit_tokenized = reddit_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=reddit_dataset["train"].column_names,
)

# A new trainer is created around the same `model` object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=reddit_tokenized["train"],
    eval_dataset=reddit_tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

                                                                  

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.5043,1.364647,48.9471,43.6423,47.2629,47.9605
2,0.4593,1.368645,48.9255,43.6105,47.3089,47.9771


TrainOutput(global_step=432, training_loss=0.48198555796234693, metrics={'train_runtime': 110.9412, 'train_samples_per_second': 124.426, 'train_steps_per_second': 3.894, 'total_flos': 1405558052720640.0, 'train_loss': 0.48198555796234693, 'epoch': 2.0})

In [24]:
from transformers import Seq2SeqTrainer

import pandas as pd
from datasets import Dataset, DatasetDict

# Re-sample only the training split; the validation and test datasets built
# earlier are reused (their re-sampling is commented out below).
train_dataset = concatenated_dataset(train_rel, train_stn)
# validation_dataset = concatenated_dataset(validation_rel, validation_stn)
# test_dataset = concatenated_dataset(test_rel, test_stn)
reddit_dataset = DatasetDict(
    {'train': train_dataset, 'validation': validation_dataset, 'test': test_dataset}
)
reddit_dataset.reset_format()
# Tokenize and drop the raw text columns in one pass.
reddit_tokenized = reddit_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=reddit_dataset["train"].column_names,
)

# A new trainer is created around the same `model` object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=reddit_tokenized["train"],
    eval_dataset=reddit_tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

                                                                  

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.2736,1.552189,48.6122,43.3625,47.0333,47.7245
2,0.3079,1.498774,48.8147,43.4876,47.2392,47.8464


TrainOutput(global_step=432, training_loss=0.29144847944930746, metrics={'train_runtime': 110.8872, 'train_samples_per_second': 124.487, 'train_steps_per_second': 3.896, 'total_flos': 1405558052720640.0, 'train_loss': 0.29144847944930746, 'epoch': 2.0})

In [25]:
from transformers import Seq2SeqTrainer

import pandas as pd
from datasets import Dataset, DatasetDict

# Re-sample only the training split; the validation and test datasets built
# earlier are reused (their re-sampling is commented out below).
train_dataset = concatenated_dataset(train_rel, train_stn)
# validation_dataset = concatenated_dataset(validation_rel, validation_stn)
# test_dataset = concatenated_dataset(test_rel, test_stn)
reddit_dataset = DatasetDict(
    {'train': train_dataset, 'validation': validation_dataset, 'test': test_dataset}
)
reddit_dataset.reset_format()
# Tokenize and drop the raw text columns in one pass.
reddit_tokenized = reddit_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=reddit_dataset["train"].column_names,
)

# A new trainer is created around the same `model` object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=reddit_tokenized["train"],
    eval_dataset=reddit_tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

                                                                  

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.1576,1.684059,48.9105,43.4264,47.2467,47.9253
2,0.2154,1.613396,48.4115,43.0082,46.804,47.477


TrainOutput(global_step=432, training_loss=0.18744217742372443, metrics={'train_runtime': 111.2664, 'train_samples_per_second': 124.063, 'train_steps_per_second': 3.883, 'total_flos': 1405558052720640.0, 'train_loss': 0.18744217742372443, 'epoch': 2.0})

In [26]:
import time
# Save the trainer's model into a directory whose name embeds the current local time.
trainer.save_model(f"bart-base-finetuned-reddit-{time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime())}")

In [28]:
import shutil

# Zip the saved model directory.
# NOTE(review): the directory name is hard-coded; it has to match the
# timestamped directory written by trainer.save_model, so it must be updated
# by hand after every new save.
shutil.make_archive('bart-base-finetuned-reddit-2023-05-18_22-14-51', 'zip', 'bart-base-finetuned-reddit-2023-05-18_22-14-51')

# Earlier shell-based alternatives, kept for reference:
# !zip -r bart-base-finetuned-reddit-2023-05-14_16-37-39.zip bart-base-finetuned-reddit-2023-05-14_16-37-39
# !zip -r bart-base-finetuned-reddit.zip bart-base-finetuned-reddit

'/root/bart-base-finetuned-reddit-2023-05-18_22-14-51.zip'

In [29]:
# Evaluate on the eval_dataset given to the most recent trainer (the validation split).
trainer.evaluate()

{'eval_loss': 1.6133956909179688,
 'eval_rouge1': 48.4115,
 'eval_rouge2': 43.0082,
 'eval_rougeL': 46.804,
 'eval_rougeLsum': 47.477,
 'eval_runtime': 7.7923,
 'eval_samples_per_second': 49.921,
 'eval_steps_per_second': 1.668,
 'epoch': 2.0}

In [12]:
from transformers import pipeline

# Run the pipeline on whatever device the model already lives on instead of a
# hard-coded 'cuda:0', which fails on CPU-only hosts.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=model.device)

In [13]:
def print_summary(dataset, idx, summarizer, split="test"):
    """Print the relations, the reference sentence and the generated text of one example.

    Args:
        dataset: Mapping from split name to an indexable collection of
            examples with the keys ``relations`` and ``sentence``.
        idx: Index of the example inside the chosen split.
        summarizer: Callable that takes the relation string and returns a list
            whose first item is a dict with the key ``summary_text``.
        split: Name of the split to read from; defaults to ``"test"``.

    Returns:
        None. Everything is written to standard output. When the relation
        string is empty or whitespace-only, a notice is printed and the
        summarizer is not called.
    """
    print(f"\n>>> {idx}")
    example = dataset[split][idx]
    relations = example["relations"]
    sentence = example["sentence"]
    # Nothing to generate from when there is no relation text at all.
    if not relations.split():
        print("\n>>> There's no contents.")
        return
    result = summarizer(relations)[0]["summary_text"]
    print(f"\n>>> Relations: {relations}")
    print(f"\n>>> Sentence: {sentence}")
    print(f"\n>>> Result: {result}")

In [14]:
# Print relations, reference and generated text for the first 20 test examples.
for example_idx in range(20):
    print_summary(reddit_dataset, example_idx, summarizer)


>>> 0


Your max_length is set to 128, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 128, but your input_length is only 66. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)



>>> Relations: <subject>we<predicate>had<object>a laugh about it<subject>we<predicate>shook<object>hands<subject>he<predicate>'d like<object>to see me before he goes, I walk up still giggling like an idiot and he says to me "White Power my ni**<subject>I<predicate>walk giggling<object>still<subject>A little boy of about 3<predicate>left<object>his mother<subject>A little boy of about 3<predicate>was running<object>into the crowded food court<subject>it<predicate>was<object>Black Friday in my low-income area<subject>A guy who was stalking me and my girlfriend<predicate>found<object>2 of my throwaways<subject>A guy<predicate>was stalking<object>me and my girlfriend<subject>he<predicate>just went<object>usually

>>> Sentence: He finishes his business, gets a new battery and tells the co worker he'd like to see me before he goes, I walk up still giggling like an idiot and he says to me "White Power my ni**a" And we shook hands and had a laugh about it. A little boy of about 3 left his mot

Your max_length is set to 128, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)



>>> Relations: <subject>the same parent company<predicate>guaranteeing<object>no one<subject>nearly every energy distributer in the UK<predicate>has<object>an energy generator owned by the same parent company thereby guaranteeing no one ever questions these companies profit margins<subject>an energy generator owned by the same parent company<predicate>questions<object>these companies profit margins ever

>>> Sentence: nearly every energy distributer in the UK has an energy generator owned by the same parent company thereby guaranteeing no one ever questions these companies profit margins. Me a grown fucking man......

>>> Result: NONONOONONNO DON'T TOUCH IT Nearly every energy distributer in the UK has an energy generator owned by the same parent company thereby guaranteeing no one ever questions these companies profit margins.

>>> 3


Your max_length is set to 128, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)



>>> Relations: <subject>people<predicate>were<object>really hungry After an hour<subject>some people<predicate>started to leave because they were expecting to be fed and didn't want to stay to leave<subject>they<predicate>were expecting to did n't want<object>to stay<subject>they<predicate>to be fed<subject>I<predicate>thought<object>it was done<subject>She<predicate>gets<object>out of the car<subject>She<predicate>grabs<object>the phone<subject>She<predicate>barks<object>an angry "thanks<subject>She<predicate>gets back<subject>I<predicate>was<object>so disgusted

>>> Sentence: After an hour, people were really hungry and some people started to leave because they were expecting to be fed and didn't want to stay. where are you from, what do you do, where are you going. Thats how I thought it was done. She gets out of the car, grabs the phone, barks an angry "thanks" and gets back in the car and drives off. I was so disgusted.

>>> Result: After an hour, people were really hungry and so

Your max_length is set to 128, but your input_length is only 78. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)



>>> Relations: <subject>They<predicate>had<object>four cars<subject>I<predicate>broke<object>all their windows<subject>I<predicate>stole<object>all their radios<subject>the same parent company<predicate>guaranteeing<object>no one<subject>nearly every energy distributer in the UK<predicate>has<object>an energy generator owned by the same parent company thereby guaranteeing no one ever questions these companies profit margins<subject>an energy generator owned by the same parent company<predicate>questions<object>these companies profit margins ever<subject>She<predicate>just kept<object>telling me, very sternly 'You need to leave, You need to leave<subject>She<predicate>just kept telling<object>me very sternly 'You need to leave, You need to leave<subject>This guy<predicate>is<object>incompetent with women<subject>This guy<predicate>has<object>a complex where he thinks they owe him attention<subject>he<predicate>thinks<object>they owe him attention a complex<subject>me<predicate>go<objec

Your max_length is set to 128, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_length is set to 128, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)



>>> Relations: <subject>she<predicate>tunneled out<object>from our back yard One night<subject>I<predicate>to run<object>after her<subject>she<predicate>got<object>trashed<subject>she<predicate>threw up<object>hopefully on her dress, but not sure<subject>I<predicate>heard<object>that she was inconsolable the entire night, got trashed and threw up (hopefully on her dress, but not sure

>>> Sentence: One night she tunneled out from our back yard and I had to run after her. I heard through the grapevine that she was inconsolable the entire night, got trashed and threw up (hopefully on her dress, but not sure).

>>> Result: One night shetunneled out from our back yard and I had to run after her. I heard that she was inconsolable the entire night, got trashed and threw up (hopefully on her dress, but not sure).

>>> 7

>>> Relations: <subject>Shocking<predicate>is n't<object>it<subject>He<predicate>continued<object>to grab me and try to convince me<subject>He<predicate>continued to try to 

Your max_length is set to 128, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)
Your max_length is set to 128, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)



>>> Relations: <subject>I<predicate>go<object>outside<subject>I<predicate>walk<object>down

>>> Sentence: I go outside and walk down my drive way to the corner store.

>>> Result: I go outside and walk down to the shed.

>>> 9

>>> Relations: <subject>I<predicate>said<object>okay<subject>My go-to<predicate>was making<object>big pizza orders

>>> Sentence: I said okay even though I didn't want him coming in.. My go-to was making big pizza orders and then never going to pick them up.

>>> Result: I said okay. My go-to was making big pizza orders.

>>> 10


Your max_length is set to 128, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 128, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)



>>> Relations: <subject>I<predicate>laughed<subject>I<predicate>said<object>thanks<subject>I<predicate>brushed<object>her hand aside<subject>I<predicate>to shift

>>> Sentence: I laughed and brushed her hand aside by claiming I had to shift, and said thanks

>>> Result: " I laughed and said thanks. I rushed her hand aside and had to shift.

>>> 11

>>> Relations: <subject>She<predicate>sees<object>lots of wildlife<subject>She<predicate>relishes<object>the calm away from other people

>>> Sentence: She sees lots of wildlife and relishes the calm away from other people. take your gas/electric bill divide it by 10 and you'd still be being ripped off a little.

>>> Result: She sees lots of wildlife, and returns the calm away from other people.

>>> 12


Your max_length is set to 128, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)



>>> Relations: <subject>he<predicate>had<object>about 7 employees By that point

>>> Sentence: By that point he had about 7 employees.

>>> Result: " By that point he had about 7 employees.

>>> 13


Your max_length is set to 128, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 128, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)



>>> Relations: <subject>a security guard<predicate>showed up<subject>He<predicate>proceeds<object>to start muttering and go through the hole in the fence, into the cemetery then<subject>He<predicate>proceeds to start muttering muttering<subject>He<predicate>proceeds to go<object>through the hole in the fence<subject>I<predicate>put<object>a $10 bill in the jukebox<subject>I<predicate>played<object>NSync's "I Want It That Way<subject>an older black gentleman<predicate>to discuss<object>his warranty<subject>it<predicate>said<object>I love you" in a pre recorded voice When you squeezed the bear

>>> Sentence: Anyway, a security guard showed up. He then proceeds to start muttering and go through the hole in the fence, into the cemetery. I put a $10 bill in the jukebox and played NSync's "I Want It That Way" 40 times in a row. Cue an older black gentleman walks in wanting to discuss his warranty and that his battery is bad. When you squeezed the bear it said "I love you" in a pre recorded 

Your max_length is set to 128, but your input_length is only 63. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)



>>> Relations: <subject>a nurse<predicate>came<object>in to our room when I was exhausted and high on life all at once<subject>It<predicate>was<object>black metal, completely spherical, with an indentation around the center, and teeny tiny golf ball-like indentations<subject>a long winding main road<predicate>progressively takes<object>you down to the water<subject>you<predicate>need to drive<object>down into the valley near the harbour down a long winding main road

>>> Sentence: Okay so even though this isn't supernatural or "creepy" or anything it was fucking scary! Shortly after my wife had our first baby (when I was exhausted and high on life all at once), a nurse came in to our room to do a hearing test on the baby. It was black metal, completely spherical, with an indentation around the center, and teeny tiny golf ball-like indentations. To get back to my place you need to drive down into the valley near the harbour down a long winding main road that progressively takes you dow

Your max_length is set to 128, but your input_length is only 114. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)



>>> Relations: <subject>I<predicate>'ve seen<object>a couple of weird lights occasionally<subject>I<predicate>have<object>no recollection of being abducted<subject>I<predicate>forgot<object>mine<subject>my professor<predicate>came up<object>to me<subject>my professor<predicate>gave<object>me his calculator to use Sitting there hopeless and staring at my thermodynamics exam

>>> Sentence: I've seen a couple of weird lights occasionally, and I have no recollection of being abducted, Sitting there hopeless and staring at my thermodynamics exam, my professor came up to me and gave me his calculator to use since I forgot mine.

>>> Result: I've occasionally seen a couple of weird lights, but I have no recollection of being abducted. I forgot mine. Sitting there hopeless and staring at my thermodynamics exam, my professor came up to me and gave me his calculator to use.

>>> 17


Your max_length is set to 128, but your input_length is only 109. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)



>>> Relations: <subject>i<predicate>'m looking<object>around<subject>the fuck<predicate>is going on<subject>blood<predicate>covering<object>a lot of her clothing<subject>we<predicate>approach<object>the roundabout<subject>there<predicate>is a man in<object>medical looking attire<subject>she<predicate>said<subject>the sugar<predicate>was<object>in the plastic baggie<subject>She<predicate>told<object>her we used the stuff of the counter<subject>I<predicate>was<object>open-mouthed, amazed<subject>i<predicate>'m<object>jewish<subject>She<predicate>starts laughing<object>at us<subject>She<predicate>says<object>that was laundry detergent

>>> Sentence: As i'm looking around as to what the fuck is going on, we approach the roundabout and there is a man in medical looking attire next to a woman in what looked like white pyjamas, with blood covering a lot of her clothing. She told her we used the stuff of the counter like she said - the sugar that was in the plastic baggie. I was open-mouthed,

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import torch

# model_checkpoint = 'facebook/bart-base'
# Load the fine-tuned checkpoint from its local directory.
model_checkpoint = 'bart-base-finetuned-reddit-2023-05-18_22-14-51'
# Use the GPU when one is visible and fall back to the CPU otherwise, instead
# of failing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Make sure the relation markers are known to the tokenizer and that the
# embedding matrix matches the vocabulary size.
# NOTE(review): presumably a no-op if the saved tokenizer already contains them — confirm.
tokenizer.add_tokens(['<subject>', '<predicate>', '<object>'])
model.resize_token_embeddings(len(tokenizer))

Embedding(50268, 768, padding_idx=1)

In [17]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import getpass
import os

import torch

# model_checkpoint = 'facebook/bart-base'
# Load the fine-tuned checkpoint from its local directory.
model_checkpoint = 'bart-base-finetuned-reddit-2023-05-18_22-14-51'
# Use the GPU when one is visible and fall back to the CPU otherwise, instead
# of failing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.add_tokens(['<subject>', '<predicate>', '<object>'])
model.resize_token_embeddings(len(tokenizer))

# Create the repository & upload the model
REPO_NAME = 'sjyyj/sjyyj' # ex) 'my-bert-fine-tuned'
# SECURITY: never store an access token in the notebook. Read it from the
# HF_TOKEN environment variable, or prompt for it without echoing.
# A token that was hard-coded here earlier must be treated as leaked and
# revoked at <https://huggingface.co/settings/token>.
AUTH_TOKEN = os.environ.get('HF_TOKEN') or getpass.getpass('Hugging Face access token: ')

## Upload to Huggingface Hub
model.push_to_hub(
    REPO_NAME,
    use_temp_dir=True,
    use_auth_token=AUTH_TOKEN
)
tokenizer.push_to_hub(
    REPO_NAME,
    use_temp_dir=True,
    use_auth_token=AUTH_TOKEN
)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]
pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s][A
pytorch_model.bin:   0%|          | 8.19k/558M [00:00<11:54:06, 13.0kB/s][A
pytorch_model.bin:   0%|          | 90.1k/558M [00:00<1:07:26, 138kB/s]  [A
pytorch_model.bin:   0%|          | 156k/558M [00:01<46:43, 199kB/s]   [A
pytorch_model.bin:   0%|          | 254k/558M [00:01<31:45, 293kB/s][A
pytorch_model.bin:   0%|          | 639k/558M [00:01<11:33, 803kB/s][A
pytorch_model.bin:   0%|          | 1.49M/558M [00:01<04:58, 1.86MB/s][A
pytorch_model.bin:   0%|          | 2.20M/558M [00:01<03:47, 2.44MB/s][A
pytorch_model.bin:   1%|          | 4.66M/558M [00:02<02:17, 4.03MB/s][A
pytorch_model.bin:   1%|          | 6.14M/558M [00:02<01:58, 4.64MB/s][A
pytorch_model.bin:   1%|▏         | 7.77M/558M [00:02<01:52, 4.91MB/s][A
pytorch_model.bin:   2%|▏         | 9.18M/558M [00:03<01:46, 5.18MB/s][A
pytorch_model.bin:   2%|▏         | 10.6M/558M [00:03<01:41,

CommitInfo(commit_url='https://huggingface.co/sjyyj/sjyyj/commit/02ed4ad988376b649cbae99fb3c3c856ab7681bc', commit_message='Upload tokenizer', commit_description='', oid='02ed4ad988376b649cbae99fb3c3c856ab7681bc', pr_url=None, pr_revision=None, pr_num=None)