In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
from docx import Document
import PyPDF2
from odf import text, teletype
from odf.opendocument import load
from striprtf.striprtf import rtf_to_text
from collections import defaultdict
import numpy as np

In [93]:
def read_file_content(file_path):
    """Read a document and return its text content as a string.

    The format is chosen from the file extension (compared case-insensitively).
    Supported: DOCX, PDF, ODT, RTF and the plain-text formats .txt, .csv, .json.

    Parameters:
    - file_path: Path to the file to read.

    Returns:
    The extracted text on success. Failures do not raise; a message string is
    returned instead:
    - "The specified file does not exist." when the path does not exist,
    - "The file is not readable." when the path exists but cannot be read,
    - "Unsupported file format." for any other extension,
    - "An error occurred: <details>" for any other exception.
    """
    try:
        # Existence must be checked before readability: os.access() is also False
        # for a missing path, which would misreport a missing file as unreadable.
        if not os.path.exists(file_path):
            return "The specified file does not exist."
        if not os.access(file_path, os.R_OK):
            return "The file is not readable."

        # Normalise the extension once instead of lower-casing it in every branch.
        extension = os.path.splitext(file_path)[1].lower()

        if extension == '.docx':
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)

        if extension == '.pdf':
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                # One newline-terminated entry per page. `or ''` is defensive: if a
                # page yields no text (None), it contributes an empty line rather
                # than failing the whole document.
                return ''.join(
                    f"{page.extract_text() or ''}\n" for page in reader.pages
                )

        if extension == '.odt':
            odt_doc = load(file_path)
            return teletype.extractText(odt_doc.text)

        if extension == '.rtf':
            with open(file_path, 'r') as rtf_file:
                return rtf_to_text(rtf_file.read())

        if extension in ('.txt', '.csv', '.json'):  # Add more plaintext formats as needed
            with open(file_path, 'r') as text_file:
                return text_file.read()

        return "Unsupported file format."

    except FileNotFoundError:
        # Still reachable if the file disappears between the check and the read.
        return "The specified file does not exist."
    except Exception as e:
        return f"An error occurred: {e}"

In [94]:
def format_and_chunk_text(text, chunk_size):
    """
    Normalises whitespace in the given text and splits it into word-aligned chunks.

    Runs of whitespace (spaces, tabs, newlines) are collapsed to single spaces and
    words are packed greedily into chunks. Words are never split.

    Parameters:
    - text: The input string to format and chunk.
    - chunk_size: The maximum size of each chunk, in characters.

    Returns:
    A list of string chunks, each at most `chunk_size` characters long. The one
    exception is a single word longer than `chunk_size`, which is kept whole as its
    own chunk. Empty or whitespace-only text yields an empty list.
    """
    chunks = []
    current_words = []
    current_length = 0  # Length of ' '.join(current_words)

    # str.split() with no arguments already discards all extra whitespace.
    for word in text.split():
        # A separating space is only needed when the chunk already holds a word.
        separator = 1 if current_words else 0
        projected_length = current_length + separator + len(word)

        # Only flush a non-empty chunk; this avoids emitting an empty string when
        # the very first word is already longer than chunk_size.
        if current_words and projected_length > chunk_size:
            chunks.append(' '.join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length = projected_length

    # Don't forget the last, partially filled chunk
    if current_words:
        chunks.append(' '.join(current_words))

    return chunks

In [95]:
def chunk_documents_in_folder(folder_path, chunk_size):
    """
    Reads all documents in a specified folder, chunks them, and stores the chunks.

    Only regular files directly inside the folder are considered; sub-folders are
    skipped. Files that could not be read (unsupported format, unreadable, or any
    read error) are left out of the result rather than being indexed with the
    error message as their content.

    Parameters:
    - folder_path: Path to the folder containing the documents.
    - chunk_size: Size of each chunk.

    Returns:
    A dictionary with document names as keys and lists of their chunks as values,
    in alphabetical order of file name.
    """
    # read_file_content reports failures by returning one of these messages in
    # place of the file's content, so they have to be recognised by value.
    failure_messages = (
        "The file is not readable.",
        "Unsupported file format.",
        "The specified file does not exist.",
    )
    failure_prefix = "An error occurred: "

    documents_chunks = {}
    # os.listdir returns names in arbitrary order; sorting keeps the result (and
    # any tie-breaking done on it later) reproducible between runs.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue

        content = read_file_content(file_path)
        if not isinstance(content, str):
            continue
        if content in failure_messages or content.startswith(failure_prefix):
            continue  # Reading failed; do not index the error text as a document

        documents_chunks[file_name] = format_and_chunk_text(content, chunk_size)
    return documents_chunks

In [96]:
def find_most_relevant_document(documents_chunks, query):
    """
    Uses cosine similarity over TF-IDF vectors to find the most relevant document
    for a given query.

    Each document's chunks are joined back into one text. The documents and the
    query are vectorized together, and the query vector is compared against every
    document vector.

    Parameters:
    - documents_chunks: A dictionary with document names as keys and lists of chunks as values.
    - query: The search query.

    Returns:
    The name of the most relevant document. When several documents share the
    highest score (including the case where no document matches the query and all
    scores are zero), the first of them in dictionary order is returned.

    Raises:
    - ValueError: If `documents_chunks` is empty.
    """
    # Fail early with a clear message instead of an opaque error from deep inside
    # scikit-learn when there is nothing to compare the query against.
    if not documents_chunks:
        raise ValueError("documents_chunks is empty; there are no documents to search.")

    doc_names = list(documents_chunks)

    # One text per document, with the query appended as the final row
    corpus = [' '.join(chunks) for chunks in documents_chunks.values()]
    corpus.append(query)

    tfidf_matrix = TfidfVectorizer().fit_transform(corpus)

    # Last row is the query; every row before it is a document
    similarities = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])

    # similarities has a single row, so the flat argmax is the document index;
    # int() turns the numpy integer into a plain Python index.
    best_index = int(np.argmax(similarities))

    return doc_names[best_index]

In [97]:
# Folder holding the documents to index. Set the SEMANTIC_SEARCH_DOCS_DIR
# environment variable to point the notebook at another folder; when it is unset
# or empty, the original sample-docs location is used.
folder_path = (
    os.environ.get('SEMANTIC_SEARCH_DOCS_DIR')
    or '/Users/shrutikmk/Documents/compsci/Prakya/semantic-search/sample-docs'
)
# Maximum chunk size, in characters, used when splitting documents.
chunk_size = 1024

In [98]:
# Build the index: a dict mapping each file name in folder_path to the list of
# text chunks produced for it with the configured chunk_size.
documents_chunks = chunk_documents_in_folder(folder_path, chunk_size)

In [99]:
# Example query; most_relevant_document receives the name of the document whose
# TF-IDF vector has the highest cosine similarity to it.
query = "Knowledge is Power"
most_relevant_document = find_most_relevant_document(documents_chunks, query)


In [100]:
# Show the file name of the best-matching document for the query.
print(f"The most relevant document is: {most_relevant_document}")

The most relevant document is: uChicago S1D3.txt
