In [22]:
import string, nltk, os, openai, spacy, glob, import_ipynb
from dotenv import load_dotenv
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from information_retrieval_service import InformationRetrievalService
from sqlalchemy import text

In [23]:
# Fetch the NLTK resources used by this notebook; the recorded cell output shows
# the packages are reported as "already up-to-date" when present locally.
nltk.download('punkt')  # tokenizer models; presumably what word_tokenize needs — TODO confirm
nltk.download('stopwords')  # corpus read via stopwords.words('english') in IntentService
nltk.download('averaged_perceptron_tagger')  # NOTE(review): no use of this tagger is visible in this notebook

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [24]:
class IntentService:
    def __init__(self):
        # Load environment variables from .env file
        load_dotenv()
        openai.api_key = os.getenv("OPENAI_API_KEY")
        self.spell_checker = Speller()
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.nlp = spacy.load('en_core_web_sm')
        # initialization of the self from InformationRetrievalService class
        self.information_retrieval_service = InformationRetrievalService()

    def preprocess_question(self, question):
        # Lowercase
        question = question.lower()
        # Remove punctuation and special characters
        question = question.translate(str.maketrans('', '', string.punctuation))
        # Tokenization
        tokens = word_tokenize(question)
        # Stop words removal
        tokens = [word for word in tokens if word not in self.stop_words]
        # Stemming and Lemmatization
        tokens = [self.stemmer.stem(word) for word in tokens]
        tokens = [self.nlp(word)[0].lemma_ for word in tokens]
        # Negation tracking, POS tagging, NER, and spelling correction
        revised_tokens = []
        doc = self.nlp(' '.join(tokens))
        for token in doc:
            # Correct spelling
            corrected_word = self.spell_checker(token.text)
            # Handle negation
            if "not_" in token.dep_:
                corrected_word = "not_" + corrected_word
            # POS tagging and NER are included in the Spacy pipeline
            revised_tokens.append(corrected_word)
        # Rejoin tokens and trim spaces
        question = ' '.join(revised_tokens).strip()
        print("preliminary preprocessing completed...")
        return question
    
    def detect_malicious_intent(self, question):
        is_flagged = None
        try:
            # Calling the openai moderation model 
            response = openai.moderations.create(
                model="text-moderation-latest",
                input=question,
            )
            # checking if it is flagged or not
            is_flagged = response.results[0].flagged
            
            if is_flagged:
                print("This question has been flagged for malicious content and cannot be processed.")
            else:
                print("No malicious intent detected...")
        except Exception as e:
            print(f"Error in moderation: {e} Unable to determine intent due to an error.")
        return is_flagged

    def check_relatedness_to_pdf_content(self, question):
        # class the InformationRetrievalService's question_to_embeddings method
        question_vectorized = self.information_retrieval_service.question_to_embeddings(question)
        
        try:
            # Convert question_embedding to PostgreSQL array format, ensuring it matches the expected dimension
            question_vector = '{' + ','.join(map(str, question_vectorized)) + '}'
            
            with self.information_retrieval_service.engine.connect() as conn:
                result = conn.execute(text("""
                    SELECT id, text, embedding <-> :question_vector AS distance
                    FROM item
                    ORDER BY distance ASC
                    LIMIT 1;
                """), {'question_vector': question_vector}).fetchone()
                
                if result:
                    closest_id, _, distance = result
                    print(f"Closest match ID: {closest_id}, Distance: {distance}")
                    print("%" * 10)
                    # Adjust threshold based on your use case
                    threshold = 0.5  # Example threshold
                    if distance < threshold:
                        print("Question is related to the PDF content.")
                        return True
                    else:
                        print("Question is not related to the PDF content.")
                        return False
        except Exception as e:
            print(f"Error searching the database: {e}")
            return False
    
    def check_item_status(self,is_question_related, is_flagged):
        match (is_question_related, is_flagged):
            case (True, False):
                print("The question is related to the PDF and not flagged. We can proceed...")
                return True

            case (False, True):
                print("The question is not related to the PDF and has been flagged, therefore cannot proceed. Please try a different question...")
                return False
            
            case (True, True):
                print("The question is related to the PDF however has been flagged, thus cannot proceed. Please try a different question...")
                return False
            
            case (False, False):
                print("The question is not related to the PDF nor flagged. Please try a different question...")
                return False

In [25]:
def intent_orchestrator(input_question, service, directory_path=None):
    """Preprocess, moderate and relevance-check a question.

    Args:
        input_question: The raw question text.
        service: Object providing ``preprocess_question``,
            ``detect_malicious_intent``, ``check_relatedness_to_pdf_content``
            and ``check_item_status``.
        directory_path: Folder searched for ``*.pdf`` files. Defaults to the
            location that was previously hard-coded in this function.

    Returns:
        A ``(can_proceed, preprocessed_question, pdf_file_path)`` tuple.
        ``(False, None, None)`` is returned when no PDF is found, when the
        user types 'exit' after a flagged question, or when the moderation
        verdict could not be determined.
    """
    if directory_path is None:
        # Original location, kept as the default for backward compatibility.
        directory_path = '/Users/sternsemasuka/Desktop/ML/Project/Talk-to-your-PDF/pdf_folder/'

    # Sorted so the selected file does not depend on directory listing order.
    pdf_file_paths = sorted(glob.glob(os.path.join(directory_path, '*.pdf')))
    if not pdf_file_paths:
        print("No PDF files found in the specified directory.")
        return False, None, None

    pdf_file_path = pdf_file_paths[0]

    while True:  # Keep asking for input while the question is flagged
        preprocessed_question = service.preprocess_question(input_question)
        is_flagged = service.detect_malicious_intent(preprocessed_question)

        if is_flagged is None:
            # Moderation failed; do not treat an unknown verdict as "safe".
            print("Could not verify the question's intent, therefore cannot proceed. Please try again later...")
            return False, None, None

        if not is_flagged:
            break  # Question passed moderation

        print("Please input another question as the previous one was flagged for malicious content.")
        input_question = input("Enter your question or type 'exit' to quit: ")
        if input_question.lower() == 'exit':
            print("Exiting... Thank you for using the system.")
            return False, None, None

    # Relatedness is only checked for questions that passed moderation.
    is_question_related = service.check_relatedness_to_pdf_content(preprocessed_question)
    # bool() guards against a service returning None for unmatched inputs.
    can_proceed = bool(service.check_item_status(is_question_related, is_flagged))
    return can_proceed, preprocessed_question, pdf_file_path

In [26]:
def question_pipeline(service):
    """Prompt for questions until one is accepted or the user quits.

    Args:
        service: Passed through unchanged to ``intent_orchestrator``.

    Returns:
        ``None`` when the user types 'exit'; otherwise the tuple
        ``(preprocessed_question, pdf_file_path, question)`` for the first
        question the orchestrator allows to proceed.
    """
    while True:
        user_question = input("Enter your question or type 'exit' to quit: ")

        # Leaving the loop on request signals "nothing to process" via None.
        if user_question.lower() == 'exit':
            print("Exiting... Thank you for using the system.")
            return None

        outcome = intent_orchestrator(user_question, service)
        can_proceed, preprocessed_question, pdf_file_path = outcome

        if not can_proceed:
            print("Please try a different question...")
            continue

        print("Question processed successfully...")
        return preprocessed_question, pdf_file_path, user_question

In [27]:
def process_user_question():
    """Build an ``IntentService`` and run the interactive question pipeline.

    Prints whether a question made it through the pipeline; returns nothing.
    """
    intent_service = IntentService()
    outcome = question_pipeline(intent_service)

    if not outcome:
        print("No question was preprocessed or processed successfully.")
        return

    # Unpacked as in the pipeline's return order; the values are not used further here.
    preprocessed_question, pdf_file_path, question = outcome
    print("Question preprocessed and ready for further actions.")


In [28]:
# Start the interactive flow: prompts for questions until one is accepted or the user types 'exit'.
process_user_question()

preliminary preprocessing completed...
No malicious intent detected...
Error searching the database: (psycopg2.errors.UndefinedTable) relation "item" does not exist
LINE 3:                     FROM item
                                 ^

[SQL: 
                    SELECT id, text, embedding <-> %(question_vector)s AS distance
                    FROM item
                    ORDER BY distance ASC
                    LIMIT 1;
                ]
[parameters: {'question_vector': '{-0.02843516133725643,-0.01720310188829899,-0.0014136313693597913,-0.03317435458302498,-0.027117768302559853,-0.03534719720482826,-0.0015697509516030 ... (65499 characters truncated) ... .006835046224296093,-0.01557774655520916,-0.016732605174183846,0.0074124750681221485,0.023559095337986946,-0.004330716095864773,0.007284157909452915}'}]
(Background on this error at: https://sqlalche.me/e/20/f405)
The question is not related to the PDF nor flagged. Please try a different question...
Please try a different questio