# PDF SEARCHER  ~by Saubhagya Pandey

## Note: This searcher may give absurd results if the PDF you provide is not well-encoded

### Import Libraries

In [1]:
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

### Function to extract all the text from pdf

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text of all pages, concatenated in page order with no
        separator. A page that yields no text contributes nothing.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction stays inside the ``with`` block so the file is still
        # open while the pages are being read.
        # ``or ""`` keeps the join safe if a page yields no text at all.
        return "".join(page.extract_text() or "" for page in reader.pages)

### Function for Pre-processing text

In [3]:
def preprocess_text(text):
    """Normalize text for TF-IDF matching.

    The text is lowercased and tokenized; punctuation-only tokens and English
    stop words are dropped; the remaining tokens are Porter-stemmed and
    re-joined with single spaces.

    Args:
        text: Raw text (a sentence chunk or a user query).

    Returns:
        The processed text as a single space-separated string.
    """
    punctuation_chars = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stemmed_tokens = []
    for raw_token in word_tokenize(text.lower()):
        token = raw_token.strip()
        # Drop empty tokens and tokens made up solely of punctuation.
        # A per-character check is used because ``token in string.punctuation``
        # is a substring test and lets tokens such as "..." or "--" through.
        if all(char in punctuation_chars for char in token):
            continue
        if token in stop_words:
            continue
        stemmed_tokens.append(stemmer.stem(token))

    return ' '.join(stemmed_tokens)

### Function to create text chunks (list of sentences)

In [4]:
def split_text_into_chunks(text):
    """Split text into sentence chunks.

    Args:
        text: The full text to split.

    Returns:
        A list of sentences with surrounding whitespace removed; sentences
        that are empty after trimming are left out.
    """
    chunks = []
    for sentence in sent_tokenize(text):
        trimmed = sentence.strip()
        if trimmed:
            chunks.append(trimmed)
    return chunks

### Tf-Idf Vectorization of text chunks

In [5]:
def vectorize_text(text_chunks):
    """Fit a TF-IDF vectorizer on the chunks and vectorize them.

    Args:
        text_chunks: List of strings, one per chunk.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted ``TfidfVectorizer`` and
        the TF-IDF matrix it produced for ``text_chunks``.
    """
    vectorizer = TfidfVectorizer()
    # Learn the vocabulary and build the chunk matrix in a single step.
    matrix = vectorizer.fit_transform(text_chunks)
    return vectorizer, matrix

### Function for Searching the Query (semantic search)

In [6]:
def semantic_search(query_vector, tfidf_matrix, top_n):
    """Rank chunks by cosine similarity to the query and return the best ones.

    Args:
        query_vector: TF-IDF vector of the query (a single row).
        tfidf_matrix: TF-IDF matrix of the chunks, one row per chunk.
        top_n: Maximum number of chunk indices to return. Zero or a negative
            value yields an empty result.

    Returns:
        An array of row indices into ``tfidf_matrix``, ordered from most to
        least similar, holding at most ``top_n`` entries.
    """
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # Reverse the ascending argsort first, then take a prefix. Slicing with
    # ``[-top_n:]`` would return *every* index when top_n is 0, because
    # ``[-0:]`` is the whole array.
    ranked_indices = cosine_similarities.argsort()[::-1]
    return ranked_indices[:max(top_n, 0)]

### Final Function to return the output text (gathering search results)

In [7]:
def answer_query(query, text_chunks, tfidf_vectorizer, tfidf_matrix, top_n=5, ans_len=50):
    """Return the text passages that best match a query.

    The query is preprocessed and vectorized with the fitted vectorizer, the
    most similar chunks are located, and each match is extended with the
    chunks that follow it until it is long enough.

    Args:
        query: The user's search query.
        text_chunks: The original (unprocessed) chunks, in the same order as
            the rows of ``tfidf_matrix``.
        tfidf_vectorizer: Vectorizer already fitted on the processed chunks.
        tfidf_matrix: TF-IDF matrix of the processed chunks.
        top_n: Number of matches to return.
        ans_len: Minimum number of words per answer. An answer can be shorter
            only when the end of ``text_chunks`` is reached first.

    Returns:
        A list of answer strings, best match first.
    """
    processed_query = preprocess_text(query)
    query_vector = tfidf_vectorizer.transform([processed_query])
    most_similar_indices = semantic_search(query_vector, tfidf_matrix, top_n)

    answers = []
    for index in most_similar_indices:
        words = []
        position = index
        # Start at the matched chunk and keep appending the following chunks
        # until the word quota is met or the text runs out.
        while len(words) < ans_len and position < len(text_chunks):
            words.extend(text_chunks[position].split())
            position += 1
        answers.append(' '.join(words))
    return answers

### INPUTS: Path of PDF, Query & Variables to decide the volume of Search Results

PATH OF PDF

In [8]:
# Relative path of the PDF to search; passed to extract_text_from_pdf in the main cell.
pdf_path = "data/sample4.pdf"

QUERY TO BE SEARCHED FOR

In [9]:
# Free-text search query; answer_query preprocesses it before matching it against the PDF chunks.
query = 'aim of author for writing this book'

NUMBER OF SEARCH RESULTS USER WANTS

In [10]:
# Passed to answer_query as top_n: how many matching passages to return.
num_results = 10

MINIMUM NUMBER OF WORDS EACH SEARCH RESULT SHOULD HAVE ACCORDING TO USER

In [11]:
# Passed to answer_query as ans_len: each result is extended with following sentences until it has at least this many words.
len_per_result = 50

### MAIN CODE

In [12]:
# Pipeline: PDF -> raw text -> sentence chunks -> normalized chunks -> TF-IDF -> ranked answers

# 1. Pull the raw text out of the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# 2. Break the text into sentence-level chunks
text_chunks = split_text_into_chunks(pdf_text)

# 3. Normalize every chunk (kept parallel to text_chunks)
processed_chunks = list(map(preprocess_text, text_chunks))

# 4. Fit the TF-IDF model on the normalized chunks
tfidf_vectorizer, tfidf_matrix = vectorize_text(processed_chunks)

# 5. Run the query and keep the results for display
answers = answer_query(
    query,
    text_chunks,
    tfidf_vectorizer,
    tfidf_matrix,
    top_n=num_results,
    ans_len=len_per_result,
)

In [13]:
# Show each matched passage under a numbered heading (numbering starts at 1)
for position, answer in enumerate(answers, start=1):
    heading = f"## MATCHED TEXT FOUND : [{position}]"
    print(heading, '\n\n', answer, '\n\n')

## MATCHED TEXT FOUND : [1] 

 While I am writing this book, I am all alone. I don’t have any friends. Not even a single soul who I can hang out with. Not because I am patheti c or a loner . But because I never stayed in any school for longer than two years, I never had the kind of best friend we see in movies or books. 


## MATCHED TEXT FOUND : [2] 

 Books have it all. Read books and feed your brain . In your free time, or what you call ‘alone time’, take out a book and read at least 5 pages. Allow yourself to drown in the pool of words. Read books on the topics that pique your interest. Read books written by your industry expert and learn what they have learned after spending decades working. 


## MATCHED TEXT FOUND : [3] 

 When the idea of writing this book came to my mind, I decided that I wouldn’ t write a book that speaks data but rather a book that speaks from one heart to another heart. That’s what you need, right? You don’t care how many people in the world are lonely . 

