In [7]:
import pandas as pd
import pyarrow as pa
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
import evaluate
import numpy as np

In [8]:
# Load the full table of policies and their reference summaries.
df = pd.read_excel("./Final_Data.xlsx")

# Draw the 80% training sample while it still carries the ORIGINAL row labels,
# then take the complement by dropping exactly those labels. Resetting the
# index before the drop (as was done previously) would make the drop remove
# rows 0..len(train)-1 instead of the sampled rows, leaking training rows into
# the held-out split.
train_rows = df.sample(frac=0.8, random_state=42)
test_rows = df.drop(train_rows.index)

# Sanity checks: the two splits are disjoint and together cover every row.
assert train_rows.index.intersection(test_rows.index).empty
assert len(train_rows) + len(test_rows) == len(df)

# Only now renumber the rows, then wrap each split as a `datasets.Dataset`.
df_train = Dataset(pa.Table.from_pandas(train_rows.reset_index(drop=True)))
df_test = Dataset(pa.Table.from_pandas(test_rows.reset_index(drop=True)))


In [None]:
# Task prefix prepended to every input text before tokenization.
prefix = "summarize: "

# Pretrained checkpoint that supplies both the tokenizer and the initial model weights.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def preprocess_function(examples):
    """Tokenize one batch of rows for sequence-to-sequence training.

    Args:
        examples: Mapping with a "Policy" list (source texts) and a "Summary"
            list (target texts), as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed policies (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    inputs = [prefix + doc for doc in examples["Policy"]]
    # No padding here: padding inside `map` would pad every row to the longest
    # row of the whole map batch and store that padding in the dataset. The
    # data collator handed to the trainer pads each training batch instead.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=examples["Summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenize both splits; batched=True passes preprocess_function a batch of rows per call.
tokenized_train = df_train.map(preprocess_function, batched=True)
tokenized_test = df_test.map(preprocess_function, batched=True)


# Collator handed to the trainer to assemble batches.
# NOTE(review): `model` receives the checkpoint name (a str) here, not a loaded
# model object; confirm this is what DataCollatorForSeq2Seq expects.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# ROUGE metric object used by compute_metrics.
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is taken as the
            token ids.

    Returns:
        dict mapping each metric name returned by ``rouge.compute`` plus
        ``"gen_len"`` (mean count of non-pad tokens per prediction) to its
        value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the "ignore" filler value. It can show up in generated
    # predictions as well as in labels, and it is not a valid token id, so it
    # must be swapped for the pad id before decoding either array.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Count on the cleaned predictions so -100 filler is not counted as output.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Start fine-tuning from the pretrained checkpoint's weights.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Rebuild the collator around the loaded model object. The collator built
# earlier in this cell was given the checkpoint name (a str) as `model`; the
# argument is meant to be the model instance so the collator can use it when
# assembling batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# NOTE(review): `evaluation_strategy` has been renamed in newer transformers
# releases; confirm the keyword against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./Summary_Model",
    evaluation_strategy="epoch",   # evaluate on the held-out split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,            # keep at most three checkpoints in output_dir
    num_train_epochs=100,
    predict_with_generate=True,    # compute_metrics receives generated token ids
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune, then write the final weights to the directory the test cell loads from.
trainer.train()
model.save_pretrained("./Summary_Model_V1/")

## Test: generate summaries with the fine-tuned model

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Task prefix placed in front of every text passed to the model.
prefix = "summarize: "

# The tokenizer is loaded from the base checkpoint name ...
tokenizer_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# ... while the model weights are loaded from the local fine-tuned directory.
model_checkpoint = "./Summary_Model_V1/"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

def generate_summary(text, max_input_length=1024, max_summary_length=128, num_beams=4):
    """Generate a summary of ``text`` with the loaded model and tokenizer.

    Args:
        text: The text to summarize; ``prefix`` is prepended before encoding.
        max_input_length: Token limit applied (with truncation) when encoding
            the prefixed input. Defaults to 1024.
        max_summary_length: ``max_length`` passed to ``model.generate``.
            Defaults to 128.
        num_beams: Beam count passed to ``model.generate``. Defaults to 4.

    Returns:
        The first generated sequence, decoded with special tokens removed.

    Raises:
        TypeError: If ``text`` is not a string (for example a NaN from an
            empty spreadsheet cell).
    """
    if not isinstance(text, str):
        # Fail with a clear message instead of the opaque str-concatenation error.
        raise TypeError(f"text must be a str, got {type(text).__name__}")

    inputs = prefix + text
    input_ids = tokenizer.encode(
        inputs, return_tensors="pt", max_length=max_input_length, truncation=True
    )
    output = model.generate(
        input_ids=input_ids,
        max_length=max_summary_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Sample policy text used as a quick manual check of generate_summary.
example_policy = """We use Information we collect to provide a personalised experience to you, including ads, along with the other purposes that we explain in detail below.
For some of these purposes, we use information  Acorss our products and Across our devices. The information that we use for these purposes is automatically processed by our systems. But in some cases, we also use  manual review to access and review your information.
To use less information that's connected to individual users, in some cases we de-identify or aggregate information or anonymise it so that it no longer identifies you. We use this information in the same ways we use your information as described in this section."""

# Summarize the sample text and print the generated summary.
summary = generate_summary(example_policy)
print(summary)

This privacy policy statement is positive because it tells you how the company uses your information to provide a personalised experience to you, including ads, and other purposes that they explain in detail below. The company uses your information to provide a personalised experience to you, including ads, along with other purposes that they explain in detail below. The company also uses manual review to access and review your information, and to use less information that's connected to individual users, in some cases we de-identify or aggregate information or anonymise it so that


In [5]:
# Run generate_summary on every row's `Policy` text and store the results in a new column.
# NOTE(review): `df` is not defined in the visible cells of this session (In [2] onward);
# presumably it was loaded in a cell that is not shown. Confirm this survives Restart & Run All.
df['Generated_Summary'] = df['Policy'].apply(generate_summary)

In [6]:
# Write the frame to an Excel workbook in the working directory.
df.to_excel("test_df.xlsx")