# Privacy Policy Summarizer

##1. Connecting GPU

In [2]:
import torch

# Checking GPU availability
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## 2. Install and Import Libraries

In [3]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.2/224.2 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m93.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.0 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval
import json



## 3. Connecting Google Drive

In [5]:
# run this code when running the code on Google Colab
from google.colab import drive
drive.mount('/content/drive')
import sys
sys.path.insert(0,'/content/drive/MyDrive/Applied_ML_Project/')

Mounted at /content/drive


## 4. Input

In [None]:
# policy_doc_path = '/content/drive/MyDrive/Applied_ML_Project/data/test.json'

# # Opening JSON file
# f = open(policy_doc_path)
  
# # returns JSON object as a dictionary
# data = json.load(f)

# # Closing file
# f.close()

# # Storing the content in a list
# policy_content = data['text'].split(".")

In [6]:
data = pd.read_pickle("/content/drive/MyDrive/Applied_ML_Project/data_preprocessing/summary_data1.pkl")
data.dropna(how='any', inplace = True)
new_data = data[data['original_text'].apply(lambda x: len(x.split(' ')) <= 1024)]
len(new_data)

train_data, val_test_data = train_test_split(new_data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=123)

In [31]:
val_test_data.iloc[1997,0]
policy_content = "if you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform. this includes ensuring your application or use of platform meets our facebook platform policies and our advertising guidelines. your access to and use of data you receive from facebook will be limited as follows you will only request data you need to operate your application. you will have a privacy policy that tells users what user data you are going to use and how you will use display share or transfer that data and you will include your privacy policy url in the developer application. you will not use display share or transfer a user s data in a manner inconsistent with your privacy policy. you will delete all data you receive from us concerning a user if the user asks you to do so and will provide a mechanism for users to make such a request. you will not include data you receive from us concerning a user in any advertising creative. you will not directly or indirectly transfer any data you receive from us to or use such data in connection with any ad network ad exchange data broker or other advertising related toolset even if a user consents to that transfer or use. you will not sell user data. if you are acquired by or merge with a third party you can continue to use user data within your application but you cannot transfer user data outside of your application. we can require you to delete user data if you use it in a way that we determine is inconsistent with users expectations. we can limit your access to data. you will comply with all other restrictions contained in our facebook platform policies. you will not give us information that you independently collect from a user or a user s content without that user s consent. you will make it easy for users to remove or disconnect from your application. you will make it easy for users to contact you. we can also share your email address with users and others claiming that you have infringed or otherwise violated their rights. you will provide customer support for your application. you will not show third party ads or web search boxes on www facebook com. we give you all rights necessary to use the code apis data and tools you receive from us. you will not sell transfer or sublicense our code apis or tools to anyone. you will not misrepresent your relationship with facebook to others. you may use the logos we make available to developers or issue a press release or other public statement so long as you follow our facebook platform policies. we can issue a press release describing our relationship with you. you will comply with all applicable laws. in particular you will if applicable have a policy for removing infringing content and terminating repeat infringers that complies with the digital millennium copyright act comply with the video privacy protection act vppa and obtain any opt in consent necessary from users so that user data subject to the vppa may be shared on facebook. you represent that any disclosure to us will not be incidental to the ordinary course of your business. we do not guarantee that platform will always be free. you give us all rights necessary to enable your application to work with facebook including the right to incorporate content and information you provide to us into streams timelines and user action stories. you give us the right to link to or frame your application and place content including ads around your application. we can analyze your application content and data for any purpose including commercial such as for targeting the delivery of advertisements and indexing content for search to ensure your application is safe for users we can audit it. we can create applications that offer similar features and services to or otherwise compete with your application."
policy_content  = policy_content.split(".")
policy_content

['if you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform',
 ' this includes ensuring your application or use of platform meets our facebook platform policies and our advertising guidelines',
 ' your access to and use of data you receive from facebook will be limited as follows you will only request data you need to operate your application',
 ' you will have a privacy policy that tells users what user data you are going to use and how you will use display share or transfer that data and you will include your privacy policy url in the developer application',
 ' you will not use display share or transfer a user s data in a manner inconsistent with your privacy policy',
 ' you will delete all data you receive from us concerning a user if the user asks you to do so and will provide a mechanism for users to make such a request',
 ' you will 

In [32]:
# function to clean data
from bs4 import BeautifulSoup # package used for web scrapping to remove the HTML tags from the text ( not needed here)
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup # package used for web scrapping to remove the HTML tags from the text ( not needed here)
import re # A package dealing with regular experession to remove punctuation and numbers
from nltk.stem import PorterStemmer
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet


# initializing stemming algorithm
ps = PorterStemmer() # porter stemmer
ss = SnowballStemmer('english') # snowball stemmer

# initializing lemmatizing algorithm
wnl = WordNetLemmatizer() # lemmatization
def text_clean_preprocess(raw_text : str):
    """
    This function will clean the data and perform certain preprocessing step of stemming/lemmatizing the words in 
    the tweet. Finally the function will remove stopwords and will only consider with words with a character length
    ranging from 2 to 10
    """
    vowel = ['a','e','i','o','u']
    text = BeautifulSoup(raw_text) # using the BeautifulSoup library to scrape the HTML tags from the text
    text = re.sub("[^a-zA-Z0-9]"," ",text.get_text()) # removing all the punctions except the expressions ":)" and ":("
    text = re.sub("\s+"," ",text)
    text = text.lower() # converting all words to lowercase
    text = text.strip() # striping leading and trailing white spaces
    # words = text.split()
    # meaningful_words = [wnl.lemmatize(w) for w in words if w not in stopwords.words("english")and w[:4] != "http"]
    return text 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
clean_policy_text = []
for sent in policy_content:
  cleaned_text = text_clean_preprocess(sent)
  clean_policy_text.append(cleaned_text)

##4. Classification Model Selection

In [34]:
class GoEmotionClassifier(nn.Module):
    def __init__(self, n_classes, do_prob, bert_model):  #n_train_steps,
        super(GoEmotionClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(do_prob)
        self.out = nn.Linear(768, n_classes)
        # self.n_train_steps = n_train_steps
        self.step_scheduler_after = "batch"

    def forward(self, ids, mask):
        output_1 = self.bert(ids, attention_mask=mask)["pooler_output"]
        output_2 = self.dropout(output_1)
        output = self.out(output_2)
        return output

def ret_model(n_train_steps, do_prob):
  model = GoEmotionClassifier(n_train_steps, n_labels, do_prob, bert_model=bert_model)
  return model

In [35]:
# for saved_key, model_key in zip(model_state_dict.keys(), model.state_dict().keys()):
#     print(saved_key, '  |  ', model_key)

In [36]:
bert_model = AutoModel.from_pretrained("roberta-base")
model = GoEmotionClassifier(n_classes = 10,do_prob = 0.4,bert_model = bert_model)
# print("This is our Model parameters:",model)
model_path = '/content/drive/MyDrive/Applied_ML_Project/Classification Final Models/Roberta_F1_0.75.pt'
model_state_dict = torch.load(model_path, map_location=torch.device(device))
# print(set(model_state_dict.keys()) == set(model.state_dict().keys()))

new_state_dict = {}
for key, value in model_state_dict.items():
    new_key = key.replace("module.", "")
    new_state_dict[new_key] = value

model.load_state_dict(new_state_dict)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/pytorch_

<All keys matched successfully>

In [37]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaModel, SqueezeBertTokenizer

def model_selection(model_path : str, pretrained_model : str, device):
  '''
  Selecting a model from list of fine-tuned models
  '''
  bertmodel = AutoModel.from_pretrained(pretrained_model)
  if pretrained_model == "roberta-base":
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    drop_prob = 0.4
  elif pretrained_model == "mukund/privbert":
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    drop_prob = 0.4
  elif pretrained_model == "squeezebert/squeezebert-uncased":
    tokenizer =  SqueezeBertTokenizer.from_pretrained("squeezebert/squeezebert-uncased", do_lower_case=True)
    drop_prob = 0.3


  model = GoEmotionClassifier(n_classes = 10,do_prob = drop_prob,bert_model = bertmodel)

  model_state_dict = torch.load(model_path, map_location=torch.device(device))
  new_state_dict = {}
  for key, value in model_state_dict.items():
      new_key = key.replace("module.", "")
      new_state_dict[new_key] = value

  model.load_state_dict(new_state_dict)
  model.to(device)
  

  
  return tokenizer, model


In [38]:
# Load the saved model on GPU
pt_file = 'Bert_F1_0.77.pt' #PrivBERT_F1_0.77 Bert_F1_0.77
model_path = '/content/drive/MyDrive/Applied_ML_Project/Classification Final Models/' + pt_file
tokenizer, model = model_selection(model_path, 'squeezebert/squeezebert-uncased', device)
# mukund/privbert squeezebert/squeezebert-uncased

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--squeezebert--squeezebert-uncased/snapshots/7978b0c163f11850ec35d5cd541828159313ac41/config.json
Model config SqueezeBertConfig {
  "_name_or_path": "squeezebert/squeezebert-uncased",
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_groups": 4,
  "intermediate_size": 3072,
  "k_groups": 4,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "squeezebert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_groups": 4,
  "pad_token_id": 0,
  "post_attention_groups": 1,
  "q_groups": 4,
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "v_groups": 4,
  "vocab_size": 30528
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--squeezebert--squeezebert-uncased/snapshots/7978b0c163f

##6. Classification Prediction

In [39]:
class PolicyDataset:
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts

        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]

        inputs = self.tokenizer.__call__(text,
                                        None,
                                        add_special_tokens=True,
                                        max_length=self.max_len,
                                        padding="max_length",
                                        truncation=True,
                                        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
        }

In [40]:
# Create sentence and label lists
max_length = 0
for ele in clean_policy_text:
  sent_len = len(ele.split(" "))
  if sent_len > max_length:
    max_length = sent_len

In [41]:
def build_dataset(policy_content, tokenizer_max_len):
    test_dataset = PolicyDataset(policy_content, tokenizer, tokenizer_max_len)
    return test_dataset

def build_dataloader(test_dataset, batch_size):
    test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

    return test_data_loader

In [42]:
clean_policy_text

['if you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform',
 'this includes ensuring your application or use of platform meets our facebook platform policies and our advertising guidelines',
 'your access to and use of data you receive from facebook will be limited as follows you will only request data you need to operate your application',
 'you will have a privacy policy that tells users what user data you are going to use and how you will use display share or transfer that data and you will include your privacy policy url in the developer application',
 'you will not use display share or transfer a user s data in a manner inconsistent with your privacy policy',
 'you will delete all data you receive from us concerning a user if the user asks you to do so and will provide a mechanism for users to make such a request',
 'you will not in

In [43]:
test_dataset = build_dataset(clean_policy_text, max_length)
test_data_loader = build_dataloader(test_dataset, batch_size = 32)
test_data_loader

<torch.utils.data.dataloader.DataLoader at 0x7febaaf23610>

In [44]:
# Prediction on test set
from torch.nn.functional import softmax
print('Prediction Classes of sentences in a privacy policy')
# Tracking variables 
predictions = []
fin_outputs = []
predictions = []
# Predict 
# Telling the model not to compute or store gradients, saving memory and 
# speeding up prediction
with torch.no_grad():
  for bi, d in tqdm(enumerate(test_data_loader), total=len(test_data_loader)):

    # Adding token ids and masks to GPU/CPU
    ids = d["ids"]
    mask = d["mask"]
    ids = ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
  
  
    outputs = model(ids, mask).to(device)
    fin_outputs.extend(torch.sigmoid(outputs))
    preds = torch.stack(fin_outputs)
    preds = preds.cpu().detach().numpy()
    preds1 = np.argmax(preds, axis=1)
    predictions+=preds1.tolist()

Prediction Classes of sentences in a privacy policy


100%|██████████| 2/2 [00:00<00:00,  5.82it/s]


In [45]:
mapping = {
    0:"Category_Data Retention",
    1:"Category_Data Security",
    2:"Category_Do Not Track",
    3:"Category_First Party Collection/Use",
    4:"Category_International and Specific Audiences",
    5:"Category_Other",
    6:"Category_Policy Change",
    7:"Category_Third Party Sharing/Collection",
    8:"Category_User Access, Edit and Deletion",
    9:"Category_User Choice/Control",
}

n_labels = len(mapping)

In [46]:
class_dict = {}
for pred, pol in zip(predictions, clean_policy_text):
  if mapping[pred] not in class_dict.keys():
    class_dict[mapping[pred]] = [pol]
  else:
    class_dict[mapping[pred]].append(pol)

In [47]:
class_dict

{'Category_Third Party Sharing/Collection': ['if you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform',
  'you will not sell transfer or sublicense our code apis or tools to anyone',
  'you will comply with all applicable laws',
  'in particular you will if applicable have a policy for removing infringing content and terminating repeat infringers that complies with the digital millennium copyright act comply with the video privacy protection act vppa and obtain any opt in consent necessary from users so that user data subject to the vppa may be shared on facebook',
  ''],
 'Category_Other': ['this includes ensuring your application or use of platform meets our facebook platform policies and our advertising guidelines',
  'your access to and use of data you receive from facebook will be limited as follows you will only request data you ne

In [48]:
for key, value in class_dict.items():
  print("{}:".format(key))
  print("--------------------------------------------------")
  for item in value:
    print("  {}".format(item))

Category_Third Party Sharing/Collection:
--------------------------------------------------
  if you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform
  you will not sell transfer or sublicense our code apis or tools to anyone
  you will comply with all applicable laws
  in particular you will if applicable have a policy for removing infringing content and terminating repeat infringers that complies with the digital millennium copyright act comply with the video privacy protection act vppa and obtain any opt in consent necessary from users so that user data subject to the vppa may be shared on facebook
  
Category_Other:
--------------------------------------------------
  this includes ensuring your application or use of platform meets our facebook platform policies and our advertising guidelines
  your access to and use of data you rece

## Summarization Model Selection

In [71]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def model_selection_summarization(model_path : str, pretrained_model : str, device):
  '''
  Selecting a model from list of fine-tuned models
  '''

  if model_path.split(".")[-1] == 'pt':
    # Load the tokenizer and create an instance of the model
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model)

    # Load the checkpoint into the model
    model = torch.load(model_path, map_location=torch.device(device))
    # model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model, state_dict=state_dict)
    model.to(device)
  else:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

  return tokenizer, model

In [50]:
# Load the saved model on GPU
#tokenizer, model = model_selection_summarization('star-nox/pegasus-cnn_dailymail-finetuned-policy', 'google/pegasus-cnn_dailymail', device)

In [75]:
# Load the saved model on GPU
tokenizer, model = model_selection_summarization('star-nox/t5-small-finetuned-policy', 't5-small', device)


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--star-nox--t5-small-finetuned-policy/snapshots/fbccda632330f560c5b1b392d64956c9fa7ecc5e/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--star-nox--t5-small-finetuned-policy/snapshots/fbccda632330f560c5b1b392d64956c9fa7ecc5e/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--star-nox--t5-small-finetuned-policy/snapshots/fbccda632330f560c5b1b392d64956c9fa7ecc5e/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--star-nox--t5-small-finetuned-policy/snapshots/fbccda632330f560c5b1b392d64956c9fa7ecc5e/tokenizer_config.json


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--star-nox--t5-small-finetuned-policy/snapshots/fbccda632330f560c5b1b392d64956c9fa7ecc5e/config.json
Model config T5Config {
  "_name_or_path": "star-nox/t5-small-finetuned-policy",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--star-nox--t5-small-finetuned-policy/snapshots/fbccda632330f560c5b1b392d64956c9fa7ecc5e/pytorch_model.bin
Generate config GenerationConfig {
  "_from_model_config": true,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.28.1"
}

All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at star-nox/t5-small-finetuned-policy.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--star-nox--t5-small-finetuned-policy/snapshots/fbccda632330f560c5b1b392d64956c9fa7ecc5e/generation_config.json
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.28.1"
}



In [72]:
# Load the saved model on GPU
tokenizer,model = model_selection_summarization('/content/drive/MyDrive/Applied_ML_Project/Summarization Final Models/bart_base_finetuned_model (1).pt', 't5-small', device)

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/ad26363d1dadacd02b8d1b627db00a2db488fcf7/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/ad26363d1dadacd02b8d1b627db00a2db488fcf7/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/ad26363d1dadacd02b8d1b627db00a2db488fcf7/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/ad26363d1dadacd02b8d1b627db00a2db488fcf7/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 

## Summary Generation

In [76]:
complete_summary = {}
for practice in class_dict:
 # Tokenize the input text and convert to PyTorch tensors
  tokenized_input = tokenizer(class_dict[practice], padding=True, return_tensors='pt').to(model.device)

  # Set the decoding parameters
  num_beams = 4
  max_length = 128

  # Generate the summary using beam search decoding
  summary_ids = model.generate(
      input_ids=tokenized_input['input_ids'], 
      attention_mask=tokenized_input['attention_mask'],
      num_beams=num_beams, 
      max_length=max_length, 
      early_stopping=True
    )

  # Decode the summary tokens back into text
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

  # Print the generated summary
  print("Generated summary:", summary)

  complete_summary[practice] = summary


Generated summary: If you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform.
Generated summary: This includes ensuring your application or use of platform meets our facebook platform policies and advertising guidelines.
Generated summary: display share or transfer a user s data in a manner inconsistent with your privacy policy.
Generated summary: If you are acquired by or merge with a third party you can continue to use user data within your application but you cannot transfer user data outside of your application.
Generated summary: You give us all rights necessary to enable your application to work with facebook including the right to incorporate content and information you provide to us into streams timelines and user action stories.


In [77]:
complete_summary

{'Category_Third Party Sharing/Collection': 'If you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform.',
 'Category_Other': 'This includes ensuring your application or use of platform meets our facebook platform policies and advertising guidelines.',
 'Category_First Party Collection/Use': 'display share or transfer a user s data in a manner inconsistent with your privacy policy.',
 'Category_User Choice/Control': 'If you are acquired by or merge with a third party you can continue to use user data within your application but you cannot transfer user data outside of your application.',
 'Category_User Access, Edit and Deletion': 'You give us all rights necessary to enable your application to work with facebook including the right to incorporate content and information you provide to us into streams timelines and user action stories.'}

In [78]:
tokenized_input = tokenizer(policy_content, padding=True, return_tensors='pt').to(model.device)

# Set the decoding parameters
num_beams = 4
max_length = 128

# Generate the summary using beam search decoding
summary_ids = model.generate(
      input_ids=tokenized_input['input_ids'], 
      attention_mask=tokenized_input['attention_mask'],
      num_beams=num_beams, 
      max_length=max_length, 
      early_stopping=True
    )

# Decode the summary tokens back into text
direct_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the generated summary
print("Generated summary:", direct_summary)

Generated summary: If you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform.


In [79]:
for key, value in complete_summary.items():
  print("{}:".format(key))
  print("--------------------------------------------------")
  print("  {}".format(value))
  print("--------------------------------------------------")

Category_Third Party Sharing/Collection:
--------------------------------------------------
  If you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform.
--------------------------------------------------
Category_Other:
--------------------------------------------------
  This includes ensuring your application or use of platform meets our facebook platform policies and advertising guidelines.
--------------------------------------------------
Category_First Party Collection/Use:
--------------------------------------------------
  display share or transfer a user s data in a manner inconsistent with your privacy policy.
--------------------------------------------------
Category_User Choice/Control:
--------------------------------------------------
  If you are acquired by or merge with a third party you can continue to use user data wit

In [80]:
for sentence in policy_content:
  print(sentence)

if you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform
 this includes ensuring your application or use of platform meets our facebook platform policies and our advertising guidelines
 your access to and use of data you receive from facebook will be limited as follows you will only request data you need to operate your application
 you will have a privacy policy that tells users what user data you are going to use and how you will use display share or transfer that data and you will include your privacy policy url in the developer application
 you will not use display share or transfer a user s data in a manner inconsistent with your privacy policy
 you will delete all data you receive from us concerning a user if the user asks you to do so and will provide a mechanism for users to make such a request
 you will not include data you recei

In [None]:
complete_summary

In [None]:
direct_summary

In [60]:
# Set decoding parameters
num_beams = 4
max_length = 256
# Generate summary using beam search decoding

outputs = model.generate(
    input_ids=torch.unsqueeze(tokenized_test_data[500]['input_ids'].to(device),0),
    max_length=256,
    num_beams=num_beams,
    early_stopping=True,
    no_repeat_ngram_size=2,
    num_return_sequences=1
)

# Decode summary tokens back into text
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print summary
print("Generated summary:", summary)

NameError: ignored

In [None]:
test_data.iloc[500,0]