In [8]:
import string, nltk, os, PyPDF2, openai, spacy, glob, sys
from dotenv import load_dotenv
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Fetch the NLTK data packages used by this notebook. Packages that are
# already installed are reported as up-to-date rather than re-downloaded.
nltk.download('punkt')  # tokenizer data required by word_tokenize
nltk.download('stopwords')  # word lists read via stopwords.words('english')
# NOTE(review): no NLTK POS tagging is visible in this section — confirm this package is still needed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
class IntentService:
    def __init__(self):
        # Load environment variables from .env file
        load_dotenv()
        openai.api_key = os.getenv("OPENAI_API_KEY")
        self.spell_checker = Speller()
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.nlp = spacy.load('en_core_web_sm')
        

    def preprocess_question(self, question):
        # Lowercase
        question = question.lower()
        # Remove punctuation and special characters
        question = question.translate(str.maketrans('', '', string.punctuation))
        # Tokenization
        tokens = word_tokenize(question)
        # Stop words removal
        tokens = [word for word in tokens if word not in self.stop_words]
        # Stemming and Lemmatization
        tokens = [self.stemmer.stem(word) for word in tokens]
        tokens = [self.nlp(word)[0].lemma_ for word in tokens]
        # Negation tracking, POS tagging, NER, and spelling correction
        revised_tokens = []
        doc = self.nlp(' '.join(tokens))
        for token in doc:
            # Correct spelling
            corrected_word = self.spell_checker(token.text)
            # Handle negation
            if "not_" in token.dep_:
                corrected_word = "not_" + corrected_word
            # POS tagging and NER are included in the Spacy pipeline
            revised_tokens.append(corrected_word)
        # Rejoin tokens and trim spaces
        question = ' '.join(revised_tokens).strip()
        print("preliminary preprocessing completed...")
        return question
    
    def detect_malicious_intent(self, question):
        is_flagged = None
        try:
            # Calling the openai moderation model 
            response = openai.moderations.create(
                input=question,
                model="text-moderation-latest"
                )
            # checking if it is flagged or not
            is_flagged = response.results[0].flagged
            
            if is_flagged:
                print("This question has been flagged for malicious content and cannot be processed.")
            else:
                print("No malicious intent detected...")
        except Exception as e:
            print(f"Error in moderation: {e} Unable to determine intent due to an error.")
        return is_flagged

    def extract_text_from_pdf(self, pdf_path):
        # Save the original stderr stream to hide the PyPDF2 warning
        original_stderr = sys.stderr
        
        try:
            # Redirect stderr to null
            sys.stderr = open(os.devnull, 'w')
            
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfFileReader(file)
                text = ''
                for page in range(reader.numPages):
                    text += reader.getPage(page).extractText()
        
        finally:
            # Restore the original stderr stream
            sys.stderr.close()
            sys.stderr = original_stderr
        print("Text extracted from the PDF...")
        return text
    
    def calculate_similarity(self, question, pdf_text):
        is_similar = None
        try:
            # vectorization using Tfid
            vectorizer = TfidfVectorizer()
            tfidf = vectorizer.fit_transform([question, pdf_text])
            cosine_similarity_num = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]*10
            if cosine_similarity_num > 0.36:
                is_similar = True
                return is_similar
            else:
                is_similar = False
                return is_similar
        except Exception as e:
            print(f"Error in similarity check: {e} Unable to determine the similarity due to an error.")
        print("cosine similarity calculated...")
        return is_similar
    
    def check_item_status(self,is_question_related, is_flagged):
        match (is_question_related, is_flagged):
            case (True, False):
                print("The question is related to the PDF and not flagged. We can proceed...")
                return True

            case (False, True):
                print("The question is not related to the PDF and has been flagged, therefore cannot proceed. Please try a different question...")
                return False
            
            case (True, True):
                print("The question is related to the PDF however has been flagged, thus cannot proceed. Please try a different question...")
                return False
            
            case (False, False):
                print("The question is not related to the PDF nor flagged. Please try a different question...")
                return False

In [11]:
def question_pipeline(service):
    """Prompt for questions until one passes screening or the user exits.

    Args:
        service: Object passed through to ``intent_orchestrator``.

    Returns:
        The preprocessed question once ``intent_orchestrator`` reports
        success, or ``None`` when the user types ``exit``.
    """
    while True:
        question = input("Enter your question or type 'exit' to quit: ")
        # Compare on the trimmed text so that ' exit ' still quits.
        command = question.strip()
        if command.lower() == 'exit':
            print("Exiting... Thank you for using the system.")
            return None  # Return None to indicate the user chose to exit
        if not command:
            # Nothing to screen: skip the pipeline (and its API call) entirely.
            print("No question entered. Please type a question or 'exit'.")
            continue
        result, preprocessed_question = intent_orchestrator(question, service)
        if result:
            print("Question processed successfully...")
            return preprocessed_question  # Return the preprocessed question
        print("Please try a different question...")

In [1]:
def intent_orchestrator(input_question, service, directory_path=None):
    """Run one question through preprocessing, moderation and relatedness checks.

    Args:
        input_question: The raw question typed by the user.
        service: Object providing ``preprocess_question``,
            ``detect_malicious_intent``, ``extract_text_from_pdf``,
            ``calculate_similarity`` and ``check_item_status``.
        directory_path: Folder to search for ``*.pdf`` files. Defaults to the
            folder previously hard-coded in this function.

    Returns:
        A ``(can_proceed, preprocessed_question)`` tuple, or ``(False, None)``
        when the folder contains no PDF files.
    """
    DIRECTORY_PATH = '/Users/sternsemasuka/Desktop/ML/Project/Talk-to-your-PDF/pdf_folder/' 
    search_dir = DIRECTORY_PATH if directory_path is None else directory_path
    # glob returns matches in arbitrary order; sort so the chosen PDF is stable.
    pdf_file_paths = sorted(glob.glob(os.path.join(search_dir, '*.pdf')))
    if not pdf_file_paths:
        print("No PDF files found in the specified directory.")
        return False, None
    pdf_file_path = pdf_file_paths[0]
    preprocessed_question = service.preprocess_question(input_question)
    is_flagged = service.detect_malicious_intent(preprocessed_question)
    pdf_text_extracted = service.extract_text_from_pdf(pdf_file_path)
    is_question_related = service.calculate_similarity(preprocessed_question, pdf_text_extracted)
    can_proceed = service.check_item_status(is_question_related, is_flagged)
    return can_proceed, preprocessed_question  # Return both the result and the preprocessed question

In [13]:
def process_user_question():
    """Build an IntentService and run the interactive question loop once.

    Returns:
        The preprocessed question produced by ``question_pipeline``, or
        ``None`` (after printing a notice) when the pipeline yields nothing.
    """
    outcome = question_pipeline(IntentService())
    if not outcome:
        print("No question was preprocessed.")
        return None
    return outcome