In [None]:
%pip install datasets transformers rouge-score nltk torch pandas numpy

In [None]:
import transformers
from datasets import load_dataset, load_metric

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [None]:
# Optional (Google Colab only): uncomment the lines below to mount Google Drive
# and change into the project folder that holds the CSV files.
# from google.colab import drive
# drive.mount('/content/drive')

# %cd drive/MyDrive/CS\ 224N\ Project
# List the working directory; the aita_*_set.csv files read below are loaded
# with relative paths, so they must be visible here.
%ls # verify that you are in the right directory

In [None]:
# Read the three pre-split CSV files (train / validation / test) from the
# current working directory, in that order.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('aita_train_set.csv', 'aita_valid_set.csv', 'aita_test_set.csv')
)

In [None]:
# Keep only the two columns used downstream: 'text' (model input) and
# 'comments' (generation target).
kept_columns = ['text', 'comments']
train_data = train_data[kept_columns]
valid_data = valid_data[kept_columns]
test_data = test_data[kept_columns]

In [None]:
#import pandas as pd
#import datasets
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Hugging Face Dataset.
tds, vds, tstds = (
    Dataset.from_pandas(frame)
    for frame in (train_data, valid_data, test_data)
)

# Bundle the splits under the keys the rest of the notebook looks up.
aita_datasets = DatasetDict({
    'train': tds,
    'validation': vds,
    'test': tstds,
})

print(aita_datasets)

In [None]:
import nltk
nltk.download('punkt')
import string
from transformers import AutoTokenizer

In [None]:
# Pretrained checkpoint that is fine-tuned below, and the custom end-of-sequence
# marker that preprocess_data appends to every target string.
model_checkpoint = "t5-small"
custom_eos = '[EOS]'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Register custom_eos as the tokenizer's EOS token; model_init() resizes the
# model's embeddings to len(tokenizer) to account for the tokenizer's size.
# NOTE(review): this swaps the tokenizer's EOS token only; presumably the model's
# own eos_token_id is left as loaded — confirm generation still stops as intended.
tokenizer.add_special_tokens({'eos_token':custom_eos})

In [None]:
# Task prefix prepended to every model input string.
prefix = "summarize: "

# Maximum token lengths passed to the tokenizer; longer sequences are truncated.
max_input_length = 512
max_target_length = 50

def clean_text(text):
  """Split `text` into sentence fragments and drop title-like lines.

  The text is sentence-tokenized with NLTK, each sentence is further split on
  newlines, and only non-empty fragments whose last character is ASCII
  punctuation are kept.

  Args:
    text: The raw text. Non-string values (e.g. None or NaN, as produced by a
      missing CSV cell) and blank strings are treated as empty text.

  Returns:
    The kept fragments joined with newlines; "" when nothing is kept.
  """
  # Guard against missing values so one empty row cannot abort a whole map().
  if not isinstance(text, str):
    return ""
  stripped = text.strip()
  if not stripped:
    return ""

  sentences = nltk.sent_tokenize(stripped)
  fragments = [part for sent in sentences for part in sent.split("\n")]
  # A fragment that does not end in punctuation is assumed to be a title/heading.
  kept = [part for part in fragments
          if part and part[-1] in string.punctuation]
  return "\n".join(kept)

def preprocess_data(examples):
  """Tokenize a batch of examples for seq2seq training.

  Args:
    examples: A batch mapping with a "text" list (model inputs) and a
      "comments" list (generation targets), as passed by Dataset.map with
      batched=True.

  Returns:
    The tokenizer output for the prefixed, cleaned inputs, with an added
    "labels" entry holding the token ids of the targets.
  """
  texts_cleaned = [clean_text(text) for text in examples["text"]]
  inputs = [prefix + text for text in texts_cleaned]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

  # Append the custom EOS marker to every target before tokenizing.
  outputs = [label + custom_eos for label in examples["comments"]]
  # `text_target=` tokenizes in target mode; it replaces the deprecated
  # `with tokenizer.as_target_tokenizer():` context manager.
  labels = tokenizer(text_target=outputs, max_length=max_target_length,
                     truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
# Run preprocess_data over every split in batches; the cell ends on the variable
# so the resulting DatasetDict is displayed.
tokenized_datasets = aita_datasets.map(preprocess_data, batched=True)
tokenized_datasets

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
batch_size = 8
model_name = "t5-small-rationale-generation"
model_dir = model_name  # output directory handed to the training arguments

# Evaluate and log every 100 steps; checkpoint every 200 steps, keeping at most 3.
schedule_options = dict(
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
)

optimisation_options = dict(
    learning_rate=3e-4,  # raised from the original 4e-5
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=False,  # full precision: accuracy matters more here than training speed
)

# Generate text during evaluation so compute_metrics can score it, and reload
# the checkpoint with the best "rouge1" when training ends.
selection_options = dict(
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard",
)

args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    **schedule_options,
    **optimisation_options,
    **selection_options,
)

In [None]:
# Collator handed to the trainer to assemble tokenized examples into batches.
# compute_metrics expects -100 as the padding value in the labels it receives.
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
import numpy as np

# ROUGE scorer used by compute_metrics below.
metric = load_metric("rouge")

def compute_metrics(eval_pred):
    """Score generated sequences against the references with ROUGE.

    Args:
        eval_pred: A (predictions, labels) pair of token-id arrays. The
            predictions may also arrive wrapped in a tuple, in which case the
            first element is used.

    Returns:
        A dict with the mid F-measure of each ROUGE variant scaled to 0-100,
        plus "gen_len" (mean count of non-padding prediction tokens), all
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding marker, not a real token id, so it cannot be decoded.
    # It can show up in gathered predictions as well as in labels; map it to
    # the pad token in both before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                     for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics (padding excluded, including any
    # -100 entries that were mapped to the pad token above).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                       for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
%pip install pytorch

In [None]:
# Install tensorboardX.
# NOTE(review): presumably here to support report_to="tensorboard" in the
# training arguments — confirm it is still required.
%pip install tensorboardX

In [None]:
%pip install tensorflow
%pip install datetime

In [None]:
# Factory that returns a fresh copy of the pretrained model to be fine-tuned
def model_init():
    """Load the pretrained checkpoint and configure it for this notebook.

    Takes no arguments on purpose: it is passed to Seq2SeqTrainer as
    `model_init`, which calls it to create the model.

    Returns:
        The model loaded from `model_checkpoint`, with generation settings
        adjusted and its token embeddings resized to len(tokenizer).
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    # repetition_penalty: 1.0 means no penalty, values above 1.0 discourage
    # tokens that were already generated, values below 1.0 encourage them.
    # The previous 0.4 therefore rewarded repetition; 1.2 is a mild penalty
    # that can be tuned further.
    model.generation_config.repetition_penalty = 1.2
    # Never emit the same 4-gram twice within one generated sequence.
    model.generation_config.no_repeat_ngram_size = 4
    print(model.generation_config)
    # The tokenizer was modified after loading, so match the embedding size to it.
    model.resize_token_embeddings(len(tokenizer))
    return model

# Assemble the trainer. A model factory (model_init) is passed instead of a
# model instance; training uses the "train" split and evaluation uses the
# "validation" split, scored by compute_metrics.
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import tensorflow as tf
import datetime, os
# Start TensorBoard before training to monitor it in progress
%load_ext tensorboard
# If the extension is already loaded in this kernel, use the reload form instead:
#%reload_ext tensorboard
# {model_dir} is interpolated from the Python variable of that name; the logs
# are read from its "runs" subdirectory.
%tensorboard --logdir '{model_dir}'/runs

In [None]:
# Run fine-tuning with the trainer configured above; the return value is shown
# as the cell output.
trainer.train()

In [None]:
# Save the trainer's current model.
# NOTE(review): presumably written to the output directory given in `args` — confirm.
trainer.save_model()

In [None]:
# List the working directory to check what was written by training/saving.
%ls

In [None]:
# Reload a fine-tuned tokenizer and model from one specific saved checkpoint.
# NOTE(review): the step number is hard-coded and only the most recent
# checkpoints are kept during training — confirm this directory exists.
max_input_length = 512

model_name = "t5-small-rationale-generation/checkpoint-8200"
model_dir = model_name

# To compare against the base model instead, load both from 't5-small'.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

In [None]:
# Pick one held-out post to run through the model. The model was trained with
# the 'text' column as input and 'comments' as the target, so the input for
# inference must come from 'text' (not from the reference comment).
testing = test_data.iloc[400]['text']
testing

In [None]:
text = testing

# Build the input the same way preprocess_data did for training: clean the text
# first, then prepend the task prefix.
inputs = [prefix + clean_text(text)]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
# Beam search combined with sampling: the output can differ between runs.
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
print(decoded_output)
# Session State and Callbacks in Streamlit