In [None]:
from transformers import AutoModelForSeq2SeqLM, pipeline, AutoTokenizer
from tqdm.autonotebook import tqdm
import re
import pandas as pd
# Register tqdm's pandas integration so Series/DataFrame objects gain the
# `progress_apply` method used by later cells.
tqdm.pandas()

# Remove the column-width limit so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

In [None]:
# Summarization pipeline built on the facebook/bart-large-cnn checkpoint
# (the same checkpoint name is used for both the model and its tokenizer).
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn",
)

In [None]:
# Standalone tokenizer for the same checkpoint as the pipeline;
# truncate_text_and_tokenize uses it to encode and decode text.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

In [None]:
def truncate_text_and_tokenize(text, max_tokens=1024, margin=3):
    """Tokenize ``text``, cap it to the model's input budget, and decode it back.

    The text is encoded with the module-level ``tokenizer`` and truncated to at
    most ``max_tokens`` ids. The limit is passed explicitly so truncation does
    not depend on the tokenizer's configured ``model_max_length``. When the
    budget is fully used, the last ``margin`` ids are dropped as well, leaving
    headroom because the decoded text is tokenized again downstream and the
    decode/encode round trip is not guaranteed to reproduce the same ids.

    Args:
        text: Raw input string.
        max_tokens: Maximum number of token ids (special tokens included) to
            keep when encoding. Defaults to 1024.
        margin: Number of trailing ids to drop when the encoded text reaches
            ``max_tokens``. Defaults to 3.

    Returns:
        A tuple ``(token_ids, decoded_text)``: ``token_ids`` is the 1-D tensor
        of kept ids (still including any special tokens the tokenizer added),
        and ``decoded_text`` is the plain text decoded from those ids with
        special tokens removed.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_tokens,
    ).input_ids
    token_ids = encoded[0]
    num_tokens = encoded.size()[1]

    if num_tokens >= max_tokens:
        # Budget exhausted: trim a few more ids to leave room for re-tokenization.
        token_ids = token_ids[:num_tokens - margin]

    # Special tokens are stripped here: the decoded string is fed to a pipeline
    # that adds its own, so literal "<s>"/"</s>" markers in the text would be
    # parsed as special tokens a second time.
    decoded_text = tokenizer.decode(
        token_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return (token_ids, decoded_text)

In [None]:
# Load the input CSV; later cells read its 'argument' column.
df = pd.read_csv("../data/inappropriate_arguments_sample_100_argsme.csv")

In [None]:
# Show the number of non-null values in each column.
df.count()

In [None]:
# Word counter used to size the summaries relative to each argument.
def get_word_count(text):
    """Return the number of words in ``text``.

    A word is a maximal run of regex word characters (letters, digits and
    underscore), so punctuation and whitespace act as separators.
    """
    return sum(1 for _ in re.finditer(r'\w+', text))

In [None]:
# Count the words of every argument (with a progress bar) and keep the
# result as a new 'word_count' column.
word_counts = df['argument'].progress_apply(get_word_count)
df['word_count'] = word_counts

In [None]:
def summarize_argument(text, word_count, min_ratio=0.25, max_ratio=0.35):
    """Summarize ``text`` with length bounds derived from its word count.

    The text is first passed through ``truncate_text_and_tokenize``; the
    decoded, truncated text is what the module-level ``summarizer`` receives.
    The requested summary length is ``min_ratio`` to ``max_ratio`` of
    ``word_count`` (25% and 35% by default), clamped so it never exceeds the
    number of tokens in the truncated input and so that
    ``min_length < max_length`` always holds.

    Args:
        text: The argument text to summarize.
        word_count: Word count of the full (untruncated) ``text``.
        min_ratio: Fraction of ``word_count`` used as the minimum summary
            length. Defaults to 0.25.
        max_ratio: Fraction of ``word_count`` used as the maximum summary
            length. Defaults to 0.35.

    Returns:
        The ``summary_text`` of the first result returned by the pipeline.
    """
    token_ids, truncated_text = truncate_text_and_tokenize(text)

    # word_count describes the full text, but the model only sees the truncated
    # input. Cap the bounds by the truncated token count so a very long
    # argument cannot request a summary longer than its own input.
    # NOTE(review): this should also keep generation within the model's
    # position limit - confirm against the BART configuration.
    input_token_count = max(len(token_ids), 2)
    max_length = min(max(int(word_count * max_ratio), 2), input_token_count)
    min_length = max(0, min(int(word_count * min_ratio), max_length - 1))

    summarization = summarizer(
        truncated_text,
        min_length=min_length,
        max_length=max_length,
        # Safety net in case re-tokenizing the decoded text yields more ids
        # than the tokenizer's configured limit.
        truncation=True,
        clean_up_tokenization_spaces=True,
    )
    return summarization[0]['summary_text']

In [None]:
def apply_summarization(row):
    """Add a BART summary of the row's argument under the ``bart_gist`` key.

    Reads ``row['argument']`` and ``row['word_count']`` (coerced to ``int``),
    summarizes the argument via ``summarize_argument``, stores the result in
    ``row['bart_gist']`` and returns the same row object.
    """
    row['bart_gist'] = summarize_argument(row['argument'], int(row['word_count']))
    return row

In [None]:
# Apply the summarization row by row (axis=1) with a tqdm progress bar;
# every row triggers one call to the summarizer via apply_summarization.
s_df = df.progress_apply(apply_summarization, axis=1)

In [None]:
# Inspect the columns of the summarized frame (apply_summarization adds 'bart_gist').
s_df.columns

In [None]:
# Write the summarized frame to disk. No `index` argument is given, so the
# pandas default applies and the row index is written as the first column.
s_df.to_csv("../data/results-by-corpus/argsme/summarization/bart.csv")