<a href="https://colab.research.google.com/github/sayarghoshroy/NLP_course_at_ISB/blob/main/5_paraphrasing_playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Using a Pre-Trained Transformer Encoder-Decoder based Paraphrasing Model

In [None]:
%%capture

# Getting necessary libraries
!pip install -U transformers
!pip install sentencepiece

import os.path
from os import path
import json
import torch
import nltk
import sentencepiece
from tqdm import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, BertTokenizer

In [None]:
# Tokenizer data needed by nltk.sent_tokenize / nltk.word_tokenize.
# Older NLTK releases read the 'punkt' package, while newer releases look up
# 'punkt_tab' instead, so both are fetched to keep a fresh runtime working.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Tokenizer checkpoint: the standard Pegasus model
tokenizer_model_name = 'google/pegasus-large'

# Paraphrasing checkpoint: a pre-trained community model
# Model Card: https://huggingface.co/tuner007/pegasus_paraphrase
paraphrasing_model_name = 'tuner007/pegasus_paraphrase'

# Run on the GPU when torch can see one, otherwise fall back to the CPU
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(tokenizer_model_name)

# Load the weights first, then move the model onto the selected device
model = PegasusForConditionalGeneration.from_pretrained(paraphrasing_model_name)
model = model.to(torch_device)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [None]:
# Number of paraphrase candidates produced per input; used as the default value of
# `num_return_sequences` by the paraphrasing functions defined in this cell
global_return_sequences = 5

def get_unit_paraphrase(input_text, num_return_sequences = global_return_sequences, num_beams = 10, max_len = 60):
  """Generate paraphrase candidates for a single piece of text.

  Args:
    input_text: Text to paraphrase; it is tokenized as a batch of one.
    num_return_sequences: Number of candidates requested from `model.generate`.
    num_beams: Beam width passed to `model.generate`.
    max_len: Token limit used both to truncate the tokenized input and as the
      `max_length` of the generated output. Defaults to 60.

  Returns:
    The list of strings produced by `tokenizer.batch_decode`, with special
    tokens stripped.
  """
  # Tokenize on the CPU, then move the tensors to the device the model lives on
  batch = tokenizer([input_text],
                    truncation = True,
                    padding = 'longest',
                    max_length = max_len,
                    return_tensors = 'pt').to(torch_device)

  # NOTE(review): `temperature` is a sampling parameter; since `do_sample` is not
  # enabled here it presumably has no effect on the beam search - confirm before tuning it
  translated = model.generate(**batch,
                              max_length = max_len,
                              num_beams = num_beams,
                              num_return_sequences = num_return_sequences,
                              temperature = 1.5)

  targets = tokenizer.batch_decode(translated,
                                  skip_special_tokens = True)

  return targets

def get_paraphrase(input_text, num_return_sequences = global_return_sequences, num_beams = 10):
  """Paraphrase a multi-sentence text one sentence at a time.

  The text is split into sentences with `nltk.sent_tokenize`. Each sentence is
  paraphrased independently, and the i-th candidate of every sentence is appended
  to the i-th output string.

  Args:
    input_text: Text to paraphrase.
    num_return_sequences: Number of paraphrase candidates to build.
    num_beams: Beam width forwarded to `get_unit_paraphrase`.

  Returns:
    A list of `num_return_sequences` strings. Each paraphrased sentence is
    followed by a single space, so non-empty results end with a trailing space.
    Sentences longer than 52 word tokens, and sentences whose paraphrasing
    raised an error, are left out of every candidate.
  """
  import warnings

  preprocess_len = 52
  # One accumulator per requested candidate (previously hard-coded to five slots)
  complete_paraphrases = [''] * num_return_sequences

  for sentence in nltk.sent_tokenize(input_text):
    # Skip sentences with more word tokens than the pre-processing limit
    if len(nltk.word_tokenize(sentence)) > preprocess_len:
      continue

    try:
      sentence_paraphrases = get_unit_paraphrase(sentence,
                                                 num_return_sequences = num_return_sequences,
                                                 num_beams = num_beams)
    except Exception as error:
      # Best effort: drop the failing sentence but say so, and let
      # KeyboardInterrupt / SystemExit propagate instead of being swallowed
      warnings.warn('Skipping sentence that could not be paraphrased (' + repr(error) + '): ' + repr(sentence))
      continue

    for index, unit in enumerate(sentence_paraphrases):
      complete_paraphrases[index] += unit + ' '

  return complete_paraphrases

In [None]:
# Viewing Sample Paraphrases

# Sample inputs for the demo below
examples = [
  'Last month Moscow claimed control of Mariupol after a weeks-long siege, but hundreds of Ukrainian soldiers remained holed up in underground tunnels beneath the huge Azovstal industrial zone, blocked by Russian troops.',
  'Asked by reporters on Tuesday whether the Ukrainian soldiers will be treated as war criminals or prisoners of war, President Vladimir Putin’s spokesman did not give an answer. ',
]

for example in examples:
  print(f'Source: {example}')
  response = get_paraphrase(example)
  # Show only the first candidate, followed by a blank separator line
  print(f'Primary Paraphrase: {response[0]}', end = '\n\n')

Source: Last month Moscow claimed control of Mariupol after a weeks-long siege, but hundreds of Ukrainian soldiers remained holed up in underground tunnels beneath the huge Azovstal industrial zone, blocked by Russian troops.
Primary Paraphrase: After a weeks-long siege, Moscow claimed control of Mariupol, but hundreds of Ukrainian soldiers remained trapped in underground tunnels. 

Source: Asked by reporters on Tuesday whether the Ukrainian soldiers will be treated as war criminals or prisoners of war, President Vladimir Putin’s spokesman did not give an answer. 
Primary Paraphrase: President Putin's spokesman didn't give an answer when asked if the Ukrainian soldiers will be treated as war criminals or prisoners of war. 



In [None]:
# That's it