In [77]:
import string, nltk, os, openai, spacy, glob
import numpy as np
from dotenv import load_dotenv
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from information_retrieval_service import InformationRetrievalService
from sqlalchemy import text, create_engine
from IPython.display import clear_output

In [78]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [79]:
class IntentService:
    def __init__(self):
        load_dotenv()
        self.api_key = os.getenv("OPENAI_API_KEY")
        self.spell_checker = Speller()
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.nlp = spacy.load('en_core_web_sm')
        self.information_retrieval_service = InformationRetrievalService()

    def preprocess_question(self, question):
        """Preprocesses the input question by performing lowercasing, removing punctuation,
        tokenizing, removing stopwords, stemming, lemmatizing, and spelling correction."""
        question = question.lower().translate(str.maketrans('', '', string.punctuation))
        tokens = [self.stemmer.stem(word) for word in word_tokenize(question) if word not in self.stop_words]
        lemmas = [self.nlp(word)[0].lemma_ for word in tokens]
        revised_tokens = [self.spell_checker(token.text) if "not_" not in token.dep_ else "not_" + token.text for token in self.nlp(' '.join(lemmas))]
        return ' '.join(revised_tokens).strip()

    def detect_malicious_intent(self, question):
        """Uses OpenAI's moderation model to detect malicious intent in a question."""
        try:
            response = openai.moderations.create(
                model="text-moderation-latest",
                input=question,
            )
            is_flagged = response.results[0].flagged
            return is_flagged, "This question has been flagged for malicious content and cannot be processed." if is_flagged else "No malicious intent detected."
        except Exception as e:
            return None, f"Error in moderation: {str(e).split('. ')[0]}."

    def check_relatedness_to_pdf_content(self, question):
        """Checks if the question is related to the PDF content by querying a database."""
        question_vectorized = self.information_retrieval_service.question_to_embeddings(question)

        try:
            # Directly use the question_vectorized array for querying
            with self.information_retrieval_service.engine.connect() as conn:
                # Adjust your query to use an array parameter for vector comparison
                result = conn.execute(text("""
                    SELECT id, text, embedding <-> :question_vector AS distance
                    FROM pdf_holder
                    ORDER BY distance ASC
                    LIMIT 1;
                """), {'question_vector': question_vectorized}).fetchone()

                if result:
                    closest_id, _, distance = result
                    print(f"Closest match ID: {closest_id}, Distance: {distance}")
                    threshold = 0.5  # Adjust threshold as needed
                    if distance < threshold:
                        print("Question is related to the PDF content.")
                        return True
                    else:
                        print("Question is not related to the PDF content.")
                        return False
        except Exception as e:
            print(f"Error searching the database: {e}")
            return False


In [80]:
class InformationRetrievalService:
    def __init__(self):
        # Get database password from environment variable
        self.engine = create_engine(f'postgresql://postgres:{os.getenv("POSTGRES_PASSWORD")}@localhost:5432/pdf_db')

    def question_to_embeddings(self, question):
        """Converts a question to embeddings."""
        try:
            response = openai.embeddings.create(input=question, model="text-embedding-3-large")
            embedded_query = response.data[0].embedding
            # Ensure the embedding matches the expected dimensionality of 3072
            if len(embedded_query) != 3072:
                raise ValueError("The dimensionality of the question embedding does not match the expected 3072 dimensions.")
            else:
                question_vectorized = np.array(embedded_query, dtype=np.float64).tolist()
                return question_vectorized
        except Exception as e:
            print(f"Error embedding the question: {e}")
            return [] # Return an empty list if no data is found in the response

    def query_database(self, query):
        """Executes a given query on the database."""
        with self.engine.connect() as connection:
            result = connection.execute(text(query)).fetchone()
            return result if result else None

In [81]:
def intent_orchestrator(service):
    """Orchestrates the process of checking if a question is related to any PDF content."""
    DIRECTORY_PATH = '/Users/sternsemasuka/Desktop/ML/Project/Talk-to-your-PDF/pdf_folder/'
    pdf_file_path = glob.glob(os.path.join(DIRECTORY_PATH, '*.pdf'))
    if not pdf_file_path:
        print("No PDF files found in the specified directory.")
        return None

    while True:
        clear_output(wait=True)
        question = input("Enter your question or type 'exit' to quit: ").strip()
        if question.lower() == 'exit':
            print("Exiting...")
            return None

        is_flagged, message = service.detect_malicious_intent(question)
        print(message)
        if is_flagged or is_flagged is None:  # Continue loop if flagged or an error occurred
            continue

        related, message = service.check_relatedness_to_pdf_content(question)
        print(message)
        if related:
            return question, pdf_file_path
        else:
            print("Please try a different question...")

In [82]:
def process_user_question():
    """Main function to start the question processing workflow."""
    service = IntentService()
    result = intent_orchestrator(service)
    if result:
        question, pdf_file_path = result
        print(f"Question: '{question}' is processed and related to the PDF at '{pdf_file_path}'.")
    else:
        print("No question was processed successfully.")

if __name__ == "__main__":
    openai.api_key = os.getenv("OPENAI_API_KEY")  # Ensure OpenAI API key is set globally for the session
    process_user_question()

No malicious intent detected.
Error searching the database: (psycopg2.errors.UndefinedFunction) operator does not exist: vector <-> numeric[]
LINE 2:                     SELECT id, text, embedding <-> ARRAY[ -0...
                                                       ^
HINT:  No operator matches the given name and argument types. You might need to add explicit type casts.

[SQL: 
                    SELECT id, text, embedding <-> %(question_vector)s AS distance
                    FROM pdf_holder
                    ORDER BY distance ASC
                    LIMIT 1;
                ]
[parameters: {'question_vector': [-0.0012315254425629973, -0.026523638516664505, -0.0388735830783844, -0.008712361566722393, 0.00020783541549462825, 0.035741496831178665, -0.029992725 ... (68623 characters truncated) ... 1190165132284, 0.0038333397824317217, -0.022717555984854698, 0.007433755323290825, 0.023550137877464294, -0.00288181914947927, -0.0008338195621035993]}]
(Background on this error at: http

TypeError: cannot unpack non-iterable bool object