In [227]:
import string, nltk, os, openai, PyPDF2, glob, spacy, warnings
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [228]:
# Fetch the NLTK resources this notebook relies on, in a fixed order.
NLTK_RESOURCES = ('punkt', 'stopwords', 'averaged_perceptron_tagger')
nltk_download_status = [nltk.download(resource) for resource in NLTK_RESOURCES]
# Show the status of the final download as the cell's output value.
nltk_download_status[-1]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sternsemasuka/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [229]:
class IntentService:
    """Screen a user question before it is answered against a PDF.

    The service bundles four steps: text preprocessing, a moderation check
    through the OpenAI moderation endpoint, text extraction from a PDF and a
    TF-IDF / cosine-similarity relevance check between question and PDF.
    """

    # Cosine-similarity score above which a question counts as related to the PDF.
    SIMILARITY_THRESHOLD = 0.36

    def __init__(self):
        """Load the NLP helpers and read the OpenAI API key from the environment."""
        self.spell_checker = Speller()
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.nlp = spacy.load('en_core_web_sm')
        # The key is set on the openai module itself, not stored on this instance.
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def preprocess_question(self, question):
        """Normalise a question into a space-separated string of processed tokens.

        Steps, in order: lowercase, strip ASCII punctuation, tokenize, drop
        English stop words, stem, lemmatize, spell-correct, and prefix tokens
        that spaCy labels as negation modifiers with ``not_``.

        Args:
            question: The raw question text.

        Returns:
            The processed tokens joined by single spaces.
        """
        # Lowercase
        question = question.lower()
        # Remove punctuation and special characters
        question = question.translate(str.maketrans('', '', string.punctuation))
        # Tokenization
        tokens = word_tokenize(question)
        # Stop words removal
        tokens = [word for word in tokens if word not in self.stop_words]
        # Stemming
        tokens = [self.stemmer.stem(word) for word in tokens]
        # Lemmatization: nlp.pipe batches the per-token documents instead of
        # invoking the full pipeline once per token. Only the first spaCy token
        # of each document is used; an empty document keeps the stemmed word.
        tokens = [
            doc[0].lemma_ if len(doc) else word
            for word, doc in zip(tokens, self.nlp.pipe(tokens))
        ]
        # Negation tracking and spelling correction
        revised_tokens = []
        for token in self.nlp(' '.join(tokens)):
            # Correct spelling
            corrected_word = self.spell_checker(token.text)
            # spaCy marks negation modifiers with the dependency label "neg";
            # the previous substring test for "not_" could never match a label.
            # NOTE(review): negators that are NLTK stop words (e.g. "not") are
            # presumably removed above, so only surviving negators get marked
            # -- confirm this is the intended behaviour.
            if token.dep_ == "neg":
                corrected_word = "not_" + corrected_word
            # POS tagging and NER are included in the Spacy pipeline
            revised_tokens.append(corrected_word)
        # Rejoin tokens and trim spaces
        question = ' '.join(revised_tokens).strip()
        print("preprocessing done...")
        return question

    def detect_malicious_intent(self, question):
        """Ask the OpenAI moderation model whether the question is flagged.

        Args:
            question: The text to send to the moderation endpoint.

        Returns:
            The ``flagged`` value of the first moderation result, or ``None``
            when the request raised an exception (the error is printed).
        """
        is_flagged = None
        try:
            # Calling the openai moderation model
            response = openai.moderations.create(
                input=question,
                model="text-moderation-latest"
            )
            # Checking if it is flagged or not
            is_flagged = response.results[0].flagged
            if is_flagged:
                print("This question has been flagged for malicious content and cannot be processed.")
            else:
                print("No malicious intent detected.")
        except Exception as e:
            # Best effort: report the failure and return None to the caller.
            print(f"Error in moderation: {e} Unable to determine intent due to an error.")
        return is_flagged

    def extract_text_from_pdf(self, pdf_path):
        """Return the concatenated text of every page of a PDF file.

        Uses the ``PdfReader`` / ``pages`` / ``extract_text`` API; the older
        ``PdfFileReader`` / ``numPages`` / ``getPage`` / ``extractText`` names
        were removed in PyPDF2 3.x.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The text of all pages joined without a separator.
        """
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Extraction must happen while the file is still open; a page that
            # yields no text contributes an empty string.
            return ''.join(page.extract_text() or '' for page in reader.pages)

    def calculate_similarity(self, question, pdf_text, threshold=None):
        """Decide whether a question is related to the PDF text.

        Both texts are vectorized together with TF-IDF and compared by cosine
        similarity.

        Args:
            question: The question text.
            pdf_text: The text extracted from the PDF.
            threshold: Score the similarity must exceed to count as related;
                defaults to ``SIMILARITY_THRESHOLD``.

        Returns:
            ``True`` if the score is above the threshold, ``False`` otherwise,
            or ``None`` when the computation raised an exception (the error is
            printed).
        """
        if threshold is None:
            threshold = self.SIMILARITY_THRESHOLD
        is_similar = None
        try:
            # Vectorization using TF-IDF
            vectorizer = TfidfVectorizer()
            tfidf = vectorizer.fit_transform([question, pdf_text])
            cosine_similarity_num = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
            print("Cosine similarity: ", cosine_similarity_num)
            # Coerce to a plain bool so callers never receive a NumPy boolean.
            is_similar = bool(cosine_similarity_num > threshold)
            if is_similar:
                print("The question is related to the pdf")
            else:
                print("The question is not related to the pdf")
        except Exception as e:
            print(f"Error in similarity check: {e} Unable to determine the similarity due to an error.")
        return is_similar

    def check_item_status(self, is_question_related, is_flagged):
        """Summarise the relevance and moderation results as one sentence.

        Args:
            is_question_related: Result of the similarity check, or ``None``
                if that check failed.
            is_flagged: Result of the moderation check, or ``None`` if that
                check failed.

        Returns:
            A human-readable status string. When either input is ``None`` a
            message saying the status could not be determined is returned
            instead of silently returning ``None``.
        """
        # Either check returns None on error; report that explicitly.
        if is_question_related is None or is_flagged is None:
            return "Unable to determine the item status because a check did not complete."
        status_messages = {
            (True, False): "The item is question-related and not flagged.",
            (False, True): "The item is not question-related but flagged.",
            (True, True): "The item is question-related and flagged.",
            (False, False): "The item is neither question-related nor flagged.",
        }
        return status_messages[(bool(is_question_related), bool(is_flagged))]

In [230]:
# Directory that holds the PDF to query. Set the PDF_DIRECTORY environment
# variable to override the default location below.
DIRECTORY_PATH = os.getenv(
    'PDF_DIRECTORY',
    '/Users/sternsemasuka/Desktop/ML/Project/Talk-to-your-PDF/',
)
# glob returns matches in arbitrary order, so sort them to make the chosen
# file the same on every run.
pdf_candidates = sorted(glob.glob(os.path.join(DIRECTORY_PATH, '*.pdf')))
if not pdf_candidates:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise FileNotFoundError(f"No PDF file found in {DIRECTORY_PATH!r}")
# Use the first PDF (in sorted order) found in the directory
pdf_file_path = pdf_candidates[0]


In [231]:
# End-to-end run: preprocess the question, moderate it, extract the PDF text,
# score question/PDF similarity and summarise both results.
intent_preprocessor = IntentService()
# Interactive prompt: the cell blocks until a question is typed in.
sample_question = input("Enter your question here: ")
preprocessed_question = intent_preprocessor.preprocess_question(sample_question)
print(preprocessed_question)
# Moderation is run on the preprocessed text.
is_flagged = intent_preprocessor.detect_malicious_intent(preprocessed_question)
pdf_text_extracted = intent_preprocessor.extract_text_from_pdf(pdf_file_path)
# NOTE(review): similarity is computed on the raw question, while moderation
# above receives the preprocessed one -- confirm this asymmetry is intended.
is_question_related = intent_preprocessor.calculate_similarity(sample_question, pdf_text_extracted)
final_result = intent_preprocessor.check_item_status(is_question_related, is_flagged)
# Bare expression so the notebook displays the summary string as cell output.
final_result

preprocessing done...
strategy plan inter digit technology various sector economic respect relay child porn


Xref table not zero-indexed. ID numbers for objects will be corrected.


This question has been flagged for malicious content and cannot be processed.
Cosine similarity:  0.354678346543796
The question is not related to the pdf


'The item is not question-related but flagged.'