<a href="https://colab.research.google.com/github/premkrishn/bert-hands-on-nlp/blob/main/pdf_query_similar_pdf_similar_content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import requests

# URL of the PDF file
url = "https://am.jpmorgan.com/content/dam/jpm-am-aem/americas/us/en/literature/fact-sheet/us-equity/FS-LCG-A.PDF"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Content of the PDF file
    pdf_content = response.content

    # Save the PDF content to a file
    with open("FS-LCG-A.pdf", "wb") as pdf_file:
        pdf_file.write(pdf_content)
    print("PDF file downloaded successfully.")
else:
    print("Failed to download the PDF file.")


PDF file downloaded successfully.


In [36]:
import fitz  # PyMuPDF
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Read PDF file and extract text
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

# Tokenize text using BERT tokenizer
def tokenize_text(text):
    tokenized_input = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    return tokenized_input

# Get BERT embeddings for tokenized text
def get_bert_embeddings(tokenized_input):
    with torch.no_grad():
        outputs = model(**tokenized_input)
        embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token embeddings
    return embeddings

# Calculate cosine similarity between query and text embeddings
def calculate_similarity(query_embedding, text_embedding):
    return cosine_similarity(query_embedding.reshape(1, -1), text_embedding.reshape(1, -1))[0][0]

# Find sections similar to a given query
def find_similar_sections(pdf_path, query):
    text = extract_text_from_pdf(pdf_path)
    tokenized_input = tokenize_text(text)
    text_embeddings = get_bert_embeddings(tokenized_input)

    # Tokenize and embed query
    query_tokens = tokenizer(query, return_tensors="pt", max_length=512, truncation=True)
    query_embeddings = get_bert_embeddings(query_tokens)

    similarity_scores = []
    for i in range(text_embeddings.shape[0]):
        similarity_score = calculate_similarity(query_embeddings, text_embeddings[i])
        similarity_scores.append((i, similarity_score))

    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    return similarity_scores

def find_similar_sections_in_pdf(pdf_path, query):
    # Find similar sections
    similar_sections = find_similar_sections(pdf_path, query)
    tokenized_input = tokenize_text(extract_text_from_pdf(pdf_path))  # Tokenize the entire text

    # Print top similar sections with content
    num_sections = min(3, len(similar_sections))
    result = []
    for i in range(num_sections):
        section_index, similarity_score = similar_sections[i]
        section_content = ""

        # Get the start and end index of the section
        start_index = tokenized_input['input_ids'][section_index].tolist().index(101)  # Start of text token
        end_index = tokenized_input['input_ids'][section_index].tolist().index(102)    # End of text token

        # Extract the text of the section
        section_text = tokenizer.decode(tokenized_input['input_ids'][section_index][start_index+1:end_index])  # Exclude [CLS] token at start

        # Split the text into words and print 5 words per line
        words = section_text.split()
        section_text_lines = [words[i:i+5] for i in range(0, len(words), 5)]
        section_text_formatted = "\n".join([" ".join(line) for line in section_text_lines])

        result.append((section_index, similarity_score, section_text_formatted))

    return result


In [38]:
# Example usage
pdf_path = "FS-LCG-A.pdf"
query = "russsell index performace"
similar_sections = find_similar_sections_in_pdf(pdf_path, query)

# Print similar sections
for section_index, similarity_score, section_text in similar_sections:
    print(f"Section {section_index}: Similarity Score: {similarity_score}")
    print("Section Content:")
    print("-" * 30)
    print(section_text)
    print("-" * 30)
    print()

Section 0: Similarity Score: 0.691104531288147
Section Content:
------------------------------
fact sheet | march 31,
2024 jpmorgan large cap growth
fund r6 shares : jlgmx
r5 shares : jlgrx r4
shares : jlgqx r3 shares
: jlgpx r2 shares :
jlgzx i shares : seegx
c shares : olgcx a
shares : olgax designed to
provide long - term capital
appreciation primarily through a diversified
portfolio of high - growth
u. s. equity securities. approach
• typically invests in a
diversified portfolio of large cap
companies with above - average
growth prospects • invests primarily
in large, well established companies
• looks for companies with
attractive fundamentals, potential to exceed
market expectations and positive price
momentum expertise portfolio manager (
s ) and years of
experience larry lee, 31 years
holly morris, 20 years giri
devulapally, 32 years robert maloney,
24 years joseph wilson, 19
years fund information class launch
february 22, 1994 share class
number 3148 cusip 4812c0506 fund
asse

In [2]:
ls

[0m[01;34mpdf_files[0m/  [01;34msample_data[0m/  webpage_text.txt


In [5]:
rm -r *

In [22]:
pip install fitz frontend pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.4
