In [7]:
!pip install PyPDF



In [5]:
from flask import Flask, render_template

import PyPDF2
import os


In [24]:
PDF_FOLDERs = "../pdfs/exponential_Functions.pdf"
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

def search_pdfs(keyword):
    """
    Searches for a keyword in all PDFs in the specified folder.
    """
    matching_pdfs = []
    for filename in os.listdir(PDF_FOLDER):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(PDF_FOLDER, filename)
            text = extract_text_from_pdf(pdf_path)
            if keyword.lower() in text.lower():
                matching_pdfs.append(filename)
    return matching_pdfs

In [25]:
text = extract_text_from_pdf(PDF_FOLDERs)

In [27]:

PDF_FOLDER = "../pdfs"
search_pdfs("Radical")

['Radical Equations.pdf']

In [17]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(text):
    return model.encode(text)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def semantic_search(keyword, pdf_texts):
    keyword_embedding = generate_embeddings(keyword)
    pdf_embeddings = [generate_embeddings(text) for text in pdf_texts]
    similarities = cosine_similarity([keyword_embedding], pdf_embeddings)
    return similarities.argsort()[0][::-1]  # Return sorted indices


In [19]:
import openai

def refine_with_gpt(keyword, pdf_text):
    prompt = f"Search for '{keyword}' in the text and explain its relevance:\n\n{pdf_text}"
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message['content']


In [21]:
semantic_search("Exponential", text)

array([ 949,  931,  922, ..., 4963, 4668, 3307])

In [22]:
generate_embeddings(text)

array([-6.15558885e-02, -3.27429213e-02, -8.14785063e-02,  2.20018942e-02,
        2.16897670e-03, -7.06751645e-02, -4.91959192e-02,  4.50673141e-02,
        5.91372512e-02,  6.03560358e-02,  9.69159156e-02, -3.40183489e-02,
        1.34279542e-02,  4.39969152e-02,  7.23729730e-02,  5.68746701e-02,
       -4.31586802e-02, -7.17484578e-02, -3.35697867e-02,  5.19162640e-02,
        9.17470157e-02, -4.97853756e-02,  5.94947999e-03, -1.51990373e-02,
        1.07879937e-02, -1.11024909e-01, -1.41220475e-02, -6.34936392e-02,
        2.57932767e-02, -8.92783105e-02,  2.05309931e-02, -2.43783295e-02,
       -3.44554661e-03,  2.79479511e-02,  7.15474971e-03,  3.31692360e-02,
        4.52648662e-02,  3.54050696e-02,  2.44668801e-03, -1.39959976e-02,
       -8.64054039e-02,  4.50172834e-02,  1.53103909e-02, -6.10411982e-04,
        6.62536025e-02, -4.29078750e-02, -9.63842962e-03,  2.30673142e-02,
       -6.08262271e-02,  1.45137990e-02, -8.16086307e-03,  1.38271246e-02,
       -4.10450771e-02,  

In [23]:
refine_with_gpt("Exponential", text)

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [28]:
# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

def generate_embeddings(text):
    """
    Generates embeddings for a given text using SentenceTransformer.
    """
    return model.encode(text)

def load_pdf_embeddings():
    """
    Loads and generates embeddings for all PDFs in the folder.
    Returns a dictionary with filenames and embeddings.
    """
    pdf_embeddings = {}
    for filename in os.listdir(PDF_FOLDER):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(PDF_FOLDER, filename)
            text = extract_text_from_pdf(pdf_path)
            pdf_embeddings[filename] = generate_embeddings(text)
    return pdf_embeddings

def search_pdfs(query, pdf_embeddings):
    """
    Searches for a query in all PDF embeddings and ranks results by similarity.
    """
    query_embedding = generate_embeddings(query)
    results = []
    for filename, embedding in pdf_embeddings.items():
        similarity = cosine_similarity([query_embedding], [embedding])[0][0]
        results.append((filename, similarity))
    # Sort results by similarity in descending order
    results = sorted(results, key=lambda x: x[1], reverse=True)
    return results

In [30]:
pdf_embeddings = load_pdf_embeddings()
    

# Search PDFs
results = search_pdfs("Exponential", pdf_embeddings)

In [33]:
results[0][0]

'exponential_Functions.pdf'