In [None]:
# Install necessary packages
!pip install sentence_transformers pylev

# Import required libraries
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
import random
import numpy as np
import warnings
import logging
import pylev

# Suppress warnings and set logging level.
# Only errors from the `transformers` logger are shown.
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
# This blanket filter ignores every warning category, so it also covers the
# UserWarning-only filter on the previous line.
warnings.filterwarnings("ignore")

# Set seed for reproducibility: the same seed is applied to Python's `random`,
# NumPy, TensorFlow and PyTorch (CPU and current CUDA device).
seed_value = 40
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)
# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load models and tokenizer.
# The Pegasus paraphrase model is moved to the GPU when one is available.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'tuner007/pegasus_paraphrase'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
# Two sentence-embedding models used to score paraphrase similarity.
# NOTE(review): these are not explicitly moved to `torch_device` here;
# presumably SentenceTransformer selects its own device — confirm.
minilm_l12_model = SentenceTransformer('paraphrase-MiniLM-L12-v2')
mpnet_model = SentenceTransformer('paraphrase-mpnet-base-v2')




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Collecting pylev
  Downloading pylev-1.4.0-py2.py3-none-any.whl.metadata (2.3 kB)
Downloading pylev-1.4.0-py2.py3-none-any.whl (6.1 kB)
Installing collected packages: pylev
Successfully installed pylev-1.4.0


In [None]:
import spacy
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import pylev
import torch

# Shared spaCy pipeline used by detect_key_words() and count_pos() for
# part-of-speech tagging; loaded once at module level.
nlp = spacy.load("en_core_web_sm")

def get_response(input_text, num_return_sequences, max_length=60):
    """Generate paraphrase candidates for a single sentence.

    Uses the module-level ``tokenizer`` and ``model`` on ``torch_device``.
    The beam width is set equal to ``num_return_sequences``.

    Args:
        input_text: Sentence to paraphrase.
        num_return_sequences: Number of candidates to return (also used as
            the number of beams).
        max_length: Maximum token length, applied both when truncating the
            input and when generating the output.

    Returns:
        A list of decoded candidate strings with special tokens removed.
    """
    beam_count = num_return_sequences

    # Inference only: no gradients are needed.
    with torch.no_grad():
        encoded = tokenizer(
            [input_text],
            truncation=True,
            padding='longest',
            max_length=max_length,
            return_tensors="pt",
        ).to(torch_device)
        generated_ids = model.generate(
            **encoded,
            max_length=max_length,
            num_beams=beam_count,
            num_return_sequences=beam_count,
        )
        candidates = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Drop the tensor references before asking PyTorch to release cached
    # CUDA memory.
    del encoded, generated_ids
    torch.cuda.empty_cache()
    return candidates

def compute_similarity_scores(sentence1, sentence2):
    """Score how similar two sentences are under both embedding models.

    Each sentence pair is encoded with the module-level ``minilm_l12_model``
    and ``mpnet_model``; the cosine similarity of the two embeddings is
    computed per model.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        A dict with keys ``'minilm_l12'`` and ``'mpnet'`` mapping to the
        cosine similarity obtained from the respective model.
    """
    pair = [sentence1, sentence2]
    encoders = (('minilm_l12', minilm_l12_model), ('mpnet', mpnet_model))

    # Encode with every model first, then compare, as two separate passes.
    embeddings = {label: encoder.encode(pair) for label, encoder in encoders}
    scores = {
        label: cosine_similarity([vectors[0]], [vectors[1]])[0][0]
        for label, vectors in embeddings.items()
    }

    del embeddings
    torch.cuda.empty_cache()
    return scores

def detect_key_words(sentence):
    """Return the sorted, de-duplicated key words of a sentence.

    A key word is the text of any token that the module-level spaCy
    pipeline ``nlp`` tags as a proper noun, noun or numeral, excluding a
    fixed set of time-related terms.

    The time-term check is case-insensitive, so a capitalised occurrence
    such as a sentence-initial "Today" is excluded just like "today".

    Args:
        sentence: Text to analyse.

    Returns:
        A sorted list of unique key-word strings (original casing kept).
    """
    time_related_terms = {"today", "tomorrow", "yesterday", "now", "later", "soon", "week", "month", "year", "hour", "minute", "second"}
    key_pos_tags = {"PROPN", "NOUN", "NUM"}

    doc = nlp(sentence)
    key_words = {
        token.text
        for token in doc
        # Lower-case before comparing: the exclusion set is all lower-case.
        if token.pos_ in key_pos_tags and token.text.lower() not in time_related_terms
    }

    del doc
    # NOTE(review): kept from the original; likely unnecessary if the spaCy
    # pipeline runs on CPU — confirm before removing.
    torch.cuda.empty_cache()

    return sorted(key_words)

def count_pos(sentence, pos_tags):
    """Count the tokens of ``sentence`` whose coarse POS tag is in ``pos_tags``.

    Args:
        sentence: Text to analyse with the module-level spaCy pipeline ``nlp``.
        pos_tags: Container of POS tag strings to count (e.g. ``{"NUM"}``).

    Returns:
        The number of matching tokens.
    """
    parsed = nlp(sentence)
    matching_tokens = [tok for tok in parsed if tok.pos_ in pos_tags]
    total = len(matching_tokens)

    del parsed
    torch.cuda.empty_cache()
    return total

def get_distance(src_txt, paraphrased_txt):
    """Return the word-level Levenshtein distance between two texts.

    Both texts are lower-cased and split on whitespace; the edit distance is
    then computed over the resulting word lists rather than over characters.

    Args:
        src_txt: Original text.
        paraphrased_txt: Candidate paraphrase.

    Returns:
        The number of word insertions, deletions and substitutions needed to
        turn one word list into the other.
    """
    source_words = src_txt.lower().split()
    candidate_words = paraphrased_txt.lower().split()
    word_edits = pylev.levenshtein(source_words, candidate_words)

    del source_words, candidate_words
    torch.cuda.empty_cache()

    return word_edits

def filter_paraphrases(paraphrases, sentence, original_keywords, original_nouns, original_num, orginal_pron):
    """Keep only the paraphrase candidates that pass every quality check.

    A candidate (with a trailing '.' appended when missing) is accepted when
    all of the following hold, checked from cheapest to most expensive:

    * its word-level edit distance to ``sentence`` is greater than 2;
    * it differs from ``sentence`` and has not been accepted already;
    * it contains the same number of numeral tokens as the original;
    * every original key word appears among its key words;
    * both embedding models score its similarity to ``sentence`` above 0.95.

    Args:
        paraphrases: Iterable of candidate strings.
        sentence: The original sentence.
        original_keywords: Key words of the original sentence.
        original_nouns: Unused; kept so existing callers keep working.
        original_num: Numeral-token count of the original sentence.
        orginal_pron: Unused; kept so existing callers keep working.

    Returns:
        A list of accepted paraphrases, in the order they were encountered.
    """
    filtered_paraphrases = []
    for paraphrase in paraphrases:
        # An empty candidate carries no content and would have crashed the
        # original `paraphrase[-1]` lookup.
        if not paraphrase:
            continue
        if not paraphrase.endswith('.'):
            paraphrase = paraphrase + '.'

        # Too close to the source to count as a paraphrase.
        if get_distance(sentence, paraphrase) <= 2:
            continue
        if paraphrase == sentence or paraphrase in filtered_paraphrases:
            continue

        # Numerals must be preserved. Noun and pronoun counts were computed
        # here before but never used, so those spaCy passes are skipped.
        if count_pos(paraphrase, {"NUM"}) != original_num:
            continue

        paraphrase_keywords = set(detect_key_words(paraphrase))
        if not all(word in paraphrase_keywords for word in original_keywords):
            continue

        # Most expensive check last: both models must agree on similarity.
        similarity_scores = compute_similarity_scores(sentence, paraphrase)
        if similarity_scores['minilm_l12'] > 0.95 and similarity_scores['mpnet'] > 0.95:
            filtered_paraphrases.append(paraphrase)

    torch.cuda.empty_cache()
    return filtered_paraphrases

def generate_paraphrases(sentence, original_keywords, original_nouns, original_num, orginal_pron, num_return_sequences=400):
    """Generate paraphrases of ``sentence`` and keep those that pass filtering.

    Candidates come from ``get_response`` (max length 60) and are then
    screened by ``filter_paraphrases``.

    Args:
        sentence: The original sentence.
        original_keywords: Key words of the original sentence.
        original_nouns: Noun count of the original; forwarded to the filter.
        original_num: Numeral count of the original; forwarded to the filter.
        orginal_pron: Pronoun count of the original; forwarded to the filter.
        num_return_sequences: Number of candidates to generate. Previously
            this argument was overwritten with 400 inside the function, so
            the caller's value was ignored; it is now honoured, and the
            default is 400 so calls that omit it behave as before.

    Returns:
        A list of accepted paraphrase strings.
    """
    paraphrases = get_response(sentence, num_return_sequences=num_return_sequences, max_length=60)
    paraphrases = filter_paraphrases(paraphrases, sentence, original_keywords, original_nouns, original_num, orginal_pron)
    torch.cuda.empty_cache()

    return paraphrases

# Input sentences to paraphrase.
sentences = [
    "An operating system manages the hardware and software resources of a computer.",
    "Linux is known for its open-source nature and strong security features.",
    "Windows provides a user-friendly interface but is more susceptible to malware.",
    "macOS is optimized for Apple's hardware, offering seamless integration and performance.",
    "The kernel is the core component of an operating system, managing tasks like memory and processes."
]




# One summary row per input sentence.
results = []
for sentence in sentences:
    print('Original sentence:', sentence)
    print("&" * 100)
    # Properties of the original sentence that the filter compares against.
    original_keywords = detect_key_words(sentence)
    print("Keywords:", original_keywords)
    original_nouns = count_pos(sentence, {"NOUN", "PROPN"})
    original_num = count_pos(sentence, {"NUM"})
    orginal_pron = count_pos(sentence, {"PRON"})
    paraphrases = generate_paraphrases(sentence, original_keywords, original_nouns, original_num, orginal_pron)
    print("TOTAL:", len(paraphrases))
    # Only the number of accepted paraphrases is recorded; the paraphrase
    # texts themselves are not stored.
    results.append({"original_sentence": sentence, "total_paraphrases": len(paraphrases)})
    # Drop per-sentence objects and release cached CUDA memory before the
    # next iteration.
    del original_keywords
    del original_nouns
    del original_num
    del orginal_pron
    del paraphrases
    torch.cuda.empty_cache()

# Write the per-sentence counts to an Excel workbook.
results_df = pd.DataFrame(results)

output_file_path = 'paraphrases_output.xlsx'
results_df.to_excel(output_file_path, index=False)


Original sentence: An operating system manages the hardware and software resources of a computer.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Keywords: ['computer', 'hardware', 'operating', 'resources', 'software', 'system']
TOTAL: 11
Original sentence: Linux is known for its open-source nature and strong security features.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Keywords: ['Linux', 'features', 'nature', 'security', 'source']
TOTAL: 101
Original sentence: Windows provides a user-friendly interface but is more susceptible to malware.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Keywords: ['Windows', 'interface', 'malware', 'user']
TOTAL: 7
Original sentence: macOS is optimized for Apple's hardware, offering seamless integration and performance.
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&