In [None]:
import pandas as pd
import numpy as np
import os
import ast
import pdb
from openai import OpenAI
import PyPDF2
from pathlib import Path
import pytesseract
from pdf2image import convert_from_path


In [2]:
# Location of the Tesseract OCR executable used by pytesseract.
# The TESSERACT_CMD environment variable overrides the path so the notebook is
# not tied to a single machine; the default Windows install location remains
# the fallback when the variable is not set.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)


In [None]:
# Read the OpenAI API key from the environment instead of embedding it (or an
# unexpanded template placeholder, which is not valid Python) in the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )
client = OpenAI(api_key=openai_api_key)

In [102]:
# Ask for the question that the one-off retrieval cells below embed and answer.
question = input("How can I help you today?")

In [22]:
# Directory holding the course PDFs, plus two parallel accumulators: the text
# extracted from each file and the matching file name.
folder = "./Documentation"
documentation, filenames = [], []

def extract_text_from_pdf(pdf_path, min_chars=50):
    """
    Extract the text of a PDF page by page, using OCR only for pages that need it.

    Every page is first read with PyPDF2. If the stripped text of a page is not
    longer than ``min_chars`` characters, that single page is rendered to an
    image with pdf2image and read with Tesseract instead.

    Parameters
    ----------
    pdf_path : str or pathlib.Path
        Location of the PDF file.
    min_chars : int, optional
        A page whose stripped text is at most this long is treated as having
        no usable text layer and is sent to OCR. Defaults to 50.

    Returns
    -------
    list of str
        The page texts in page order. If any error occurs (unreadable file,
        OCR failure, ...) the error is printed and the string
        "Text extraction failed" is returned instead of a list.
    """
    all_text = []
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page_number, page in enumerate(reader.pages, start=1):
                text = page.extract_text()
                if text and len(text.strip()) > min_chars:
                    all_text.append(text)
                    continue

                # OCR this page only. Converting the whole document here would
                # append every page's OCR text again for each text-less page.
                # pdf2image numbers pages from 1, hence enumerate(start=1).
                page_images = convert_from_path(
                    pdf_path, first_page=page_number, last_page=page_number
                )
                for page_image in page_images:
                    all_text.append(pytesseract.image_to_string(page_image))
    except Exception as e:
        # Best effort: report the problem and hand back a sentinel so that the
        # loading loop can carry on with the remaining files.
        print(f"Error processing {pdf_path}: {e}")
        return "Text extraction failed"

    return all_text

# Load every matching PDF in a stable, sorted order. Path.glob yields entries in
# an arbitrary, platform-dependent order, and a later cell selects rows by
# position, so an unsorted listing would make the results irreproducible.
for file in sorted(Path(folder).glob("*-1*.pdf")):
    documentation.append(extract_text_from_pdf(file))
    filenames.append(file.name)


In [69]:
# One row per PDF: 'text' holds whatever extract_text_from_pdf returned for the
# file and 'filename' holds the matching file name.
dataf = pd.DataFrame(dict(text=documentation, filename=filenames))

In [None]:
# Give every extracted page its own row: explode expands each file's list of
# page texts and repeats the filename for every page.
data_spl = dataf.explode('text').reset_index(drop=True)

# Trim surrounding whitespace, then drop the rows that are missing or blank.
# A missing value would fail in get_embeddings (it calls .replace on the text)
# and a blank page has nothing to embed.
data_spl['text'] = data_spl['text'].str.strip()
data_spl = data_spl[data_spl['text'].fillna('').ne('')].reset_index(drop=True)


In [None]:
# Working subset: the rows at positions 50-99 of the exploded pages.
# NOTE(review): presumably chosen to limit the number of embedding requests - confirm.
data_used = pd.DataFrame(data_spl.iloc[50:100])

In [96]:
def get_embeddings(support_doc, model = "text-embedding-3-small"):
    """
    Return the embedding vector for a piece of text.

    Line breaks in ``support_doc`` are replaced by spaces, the result is sent
    to the embeddings endpoint of the module-level ``client`` using ``model``,
    and the embedding of the first item in the response is returned.
    """
    flattened = " ".join(support_doc.split("\n"))
    result = client.embeddings.create(input=flattened, model=model)
    return result.data[0].embedding
    

In [99]:
# Embed every page in the working set; get_embeddings issues one API request
# per row, so this cell costs as many requests as data_used has rows.
data_used['embeddings'] = data_used['text'].apply(get_embeddings)

In [104]:
# Embed the question with the same default model as the pages so the two can
# be compared; fn reads this module-level variable.
question_embedding = get_embeddings(question)

In [107]:
def fn(doc_embeddings, query_embedding=None):
    """
    Score a document embedding against a question embedding.

    The score is the dot product of the two vectors, so a larger value means a
    closer match (the notebook sorts on it in descending order).

    Parameters
    ----------
    doc_embeddings : sequence of float
        Embedding of one document row.
    query_embedding : sequence of float, optional
        Embedding to compare against. When omitted, the module-level
        ``question_embedding`` is used, looked up at call time.

    Returns
    -------
    The dot product of the two vectors.
    """
    if query_embedding is None:
        # Backward-compatible default: fall back to the notebook-level variable.
        query_embedding = question_embedding
    return np.dot(doc_embeddings, query_embedding)

# Despite the column name, this stores the dot-product score computed by fn:
# a higher value means the page is closer to the question.
data_used['distance'] = data_used["embeddings"].apply(fn)

In [110]:
# Rank the pages from highest to lowest score and keep the five best as context.
data_used.sort_values(by='distance', ascending=False, inplace=True)
context = data_used.head(5)

In [114]:
# Build the prompt context from the text of the retrieved rows. Interpolating
# the DataFrame itself would insert its printed representation (index, the
# embeddings column, elided text) rather than the passages.
context_text = "\n\n".join(context['text'])

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role" : "system", "content" : "You are a financial data analysis expert who are answering a third year mathematics student. Answer with supporting explanations."},
             {"role" : "assistant", "content" : f"Use this information from the Financial Data Analysis class as context to answer the user question: {context_text}. Please, stick to this context when answering the question."},
             {"role" : "user", "content" : question}]
)

In [115]:
# Text of the first choice in the chat completion response.
final_response = response.choices[0].message.content

In [145]:
def query():
    """
    Run an interactive question-answering loop over the embedded pages.

    For every question typed by the user, the function embeds the question,
    scores each row of the module-level ``data_used`` against it (dot product
    with the row's embedding), takes the text of the five best rows as context
    and asks the chat model for an answer, which is printed. The loop ends when
    the user types 'Quit' (case-insensitive, surrounding whitespace ignored).
    Blank input is skipped.

    Side effects: overwrites ``data_used['distance']`` and re-sorts
    ``data_used`` in place on every question.
    """
    while True:
        question = input("How Can I help you? Type 'Quit' to exit the query.")
        cleaned = question.strip()

        if cleaned.lower() == "quit":
            break
        if not cleaned:
            # Nothing to embed or answer; ask again.
            continue

        question_embedding = get_embeddings(question)

        # Score against THIS question's embedding. The module-level helper fn
        # reads the global question_embedding, which this function never
        # updates (the assignment above is local), so using it here would rank
        # every question against the one embedded earlier in the notebook.
        data_used['distance'] = data_used["embeddings"].apply(
            lambda doc_embedding: np.dot(doc_embedding, question_embedding)
        )
        data_used.sort_values('distance', ascending=False, inplace=True)
        top_text = data_used.iloc[0:5]['text']
        context = "\n\n".join(top_text)

        response = client.chat.completions.create(
            model = "gpt-3.5-turbo",
            messages=[{"role" : "system", "content" : "You are a financial data analysis expert who are answering a third year mathematics student. Answer with supporting explanations."},
                      {"role" : "assistant", "content" : f"Use this information from the Financial Data Analysis class as context to answer the user question: {context}. Please, stick to this context when answering the question."},
                      {"role" : "user", "content" : question}]
        )

        final_response = response.choices[0].message.content
        print(question + '\n\n' + final_response + '\n')
        print('-' * 100 + '\n')

In [None]:
# Start the interactive session; it keeps prompting until the user types 'Quit'.
query()