In [1]:
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the pre-trained checkpoint to use. The tokenizer and model cells
# that follow both load from this same identifier so they stay in sync.
model_name = 'sentence-transformers/paraphrase-MiniLM-L6-v2'


In [3]:
# Tokenizer for `model_name`; get_embeddings calls it to turn sentences into
# padded/truncated model inputs.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [4]:
# Encoder for `model_name`; get_embeddings pools its `last_hidden_state`
# into one vector per sentence.
model = AutoModel.from_pretrained(model_name)

In [29]:

def get_embeddings(sentences):
    """Embed sentences with attention-mask-aware mean pooling.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        sentences: A string or a list of strings to embed.

    Returns:
        A tensor with one pooled embedding row per input sentence. It is
        computed under ``torch.no_grad()``, so it carries no autograd graph
        (calling ``.detach()`` on it remains valid).
    """
    # Tokenize and encode the sentences
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Inference only: do not record operations for backpropagation.
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state

    # `padding=True` pads shorter sentences up to the longest one in the batch.
    # A plain mean over dim=1 would average those padded positions in as well,
    # making a sentence's embedding depend on its batch neighbours. Weight each
    # position by the attention mask so only real tokens contribute.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for a row with no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts
 
def remove_similar_lines(text, similarity_threshold = 0.5):
    """Drop sentences that are semantically similar to an earlier sentence.

    The text is split on '.', each fragment is stripped of surrounding
    whitespace, and blank fragments (such as the one after a final period)
    are discarded. The remaining sentences are embedded with
    ``get_embeddings`` and compared pairwise by cosine similarity. Scanning
    in order, every kept sentence removes all later sentences whose
    similarity to it exceeds ``similarity_threshold``.

    Args:
        text: Input text whose sentences are separated by '.'.
        similarity_threshold: Cosine similarity above which a later sentence
            is treated as a duplicate of an earlier kept one.

    Returns:
        The retained sentences, in original order, joined by newlines.
        Returns an empty string when ``text`` contains no non-blank sentence.
    """
    # Strip fragments and drop blank ones so that empty strings are neither
    # embedded nor emitted as empty output lines.
    lines = [fragment.strip() for fragment in text.split('.')]
    lines = [line for line in lines if line]

    # With zero or one sentence there is nothing to compare; skip the model.
    if len(lines) < 2:
        return '\n'.join(lines)

    embeddings = get_embeddings(lines)
    similarities = cosine_similarity(embeddings.detach().numpy())

    # Identify and remove similar lines
    to_remove = set()
    for i in range(len(lines)):
        # A sentence that was already discarded must not discard others.
        if i in to_remove:
            continue
        for j in range(i + 1, len(lines)):
            if similarities[i, j] > similarity_threshold:
                to_remove.add(j)

    # Retain only unique lines
    unique_lines = [line for i, line in enumerate(lines) if i not in to_remove]
    return '\n'.join(unique_lines)


In [31]:

# Example usage: run remove_similar_lines on a sample paragraph and print the
# result.
#
# The commented-out snippets below are disabled alternative ways of filling
# `text` (from a plain-text file, or from the pages of a PDF via pymupdf).
# Only the hard-coded sample string further down is actually used.

# # Example usage
# with open('./first_chapter.pdf.txt', 'r') as f:
#     text = f.read()


# import pymupdf # imports the pymupdf library
# text = ""
# doc = pymupdf.open("first_chapter.pdf") # open a document
# for page in doc: # iterate the document pages
#   text += page.get_text() # get plain text encoded as UTF-8

# Sample input: a deliberately repetitive paragraph.
text = "The new smartphone is a game-changer in the world of technology. It's a game-changer in the world of technology, with its advanced features and sleek design. The smartphone is a game-changer, with its ability to take high-quality photos and videos. It's a game-changer, with its advanced camera system and long-lasting battery life. The smartphone is a game-changer, with its fast processor and ample storage space. It's a game-changer, with its sleek design and user-friendly interface. The smartphone is a game-changer, with its ability to stay connected to the internet and access a wide range of apps. It's a game-changer, with its advanced security features and ability to keep personal data safe. The smartphone is a game-changer, with its ability to make and receive calls, send texts, and access the internet. It's a game-changer, with its advanced features and sleek design. The smartphone is a game-changer, with its ability to take high-quality photos and videos. It's a game-changer, with its advanced camera system and long-lasting battery life."
print('Original Text')
print('------------------------------')
# NOTE: this prints the number of '.'-separated fragments, not the text itself.
# Because `text` ends with a period, split(".") also yields one trailing empty
# fragment, so the count is one higher than the number of sentences.
print(len(text.split(".")))
 
# Uses the default similarity_threshold of remove_similar_lines.
shortened_text = remove_similar_lines(text)
print('Shortened Text')
print('------------------------------')
print(shortened_text)

Original Text
------------------------------
13
Shortened Text
------------------------------
The new smartphone is a game-changer in the world of technology
