In [None]:
# # Document loading, retrieval methods and text splitting
# !pip install -qU langchain langchain_community

# # Local vector store via Chroma
# !pip install -qU langchain_chroma

# # Local inference and embeddings via Ollama
# !pip install -qU langchain_ollama

# # Web Loader
# !pip install -qU beautifulsoup4

# # Pull the model first
# !ollama pull nomic-embed-text

# !pip install -qU pypdf

In [3]:
#Imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

In [4]:
from typing import Union
import docx 
import docx2txt 
import PyPDF2
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    Docx2txtLoader,
    CSVLoader
)

from typing import Optional, Union, Dict, Set
from pathlib import Path
from langchain.document_loaders import (
    TextLoader,
    PyPDFLoader,
    Docx2txtLoader,
    CSVLoader,
    UnstructuredMarkdownLoader,
    JSONLoader
)

class UnsupportedFileTypeError(Exception):
    """Custom exception for unsupported file types."""
    pass

def get_loader_class(
    file_path: Union[str, Path],
    strict_mode: bool = False
) -> Optional[Union[TextLoader, PyPDFLoader, Docx2txtLoader, CSVLoader, UnstructuredMarkdownLoader, JSONLoader]]:
    """
    Determine the appropriate document loader based on file extension.
    
    Args:
        file_path (Union[str, Path]): Path to the file that needs to be loaded
        strict_mode (bool): If True, raises an exception for unsupported file types
                          If False, returns None for unsupported types
    
    Returns:
        Optional[Union[...]]: Appropriate loader class for the file type, or None if unsupported
        
    Raises:
        UnsupportedFileTypeError: If strict_mode is True and file type is unsupported
        ValueError: If file_path is empty or invalid
    """
    if not file_path:
        raise ValueError("File path cannot be empty")
    
    # Convert to Path object for more robust path handling
    path = Path(file_path)
    
    # Get lowercase extension without the dot
    try:
        extension = path.suffix.lower().lstrip('.')
    except Exception as e:
        raise ValueError(f"Invalid file path format: {str(e)}")
    
    # Define skip extensions
    SKIP_EXTENSIONS: Set[str] = {
        'zip', 'exe', 'bin', 'ppt', 'pptx', 
        'jpg', 'jpeg', 'png', 'gif', 'bmp',  # Skip binary image formats
        'mp3', 'mp4', 'wav', 'avi',          # Skip media files
        'db', 'sqlite', 'sqlite3'            # Skip database files
    }
    
    # Define loader mapping
    LOADER_MAP: Dict[str, type] = {
        'pdf': PyPDFLoader,
        'txt': TextLoader,
        'text': TextLoader,
        'doc': TextLoader,         # Basic text handling for .doc
        'docx': Docx2txtLoader,
        'csv': CSVLoader,
        'md': UnstructuredMarkdownLoader,
        'json': JSONLoader
    }
    
    # Check if file type should be skipped
    if extension in SKIP_EXTENSIONS:
        if strict_mode:
            raise UnsupportedFileTypeError(
                f"File type '.{extension}' is not supported for loading"
            )
        return None
    
    # Get appropriate loader or default to TextLoader
    loader_class = LOADER_MAP.get(extension)
    
    if loader_class is None and strict_mode:
        raise UnsupportedFileTypeError(
            f"No specific loader found for '.{extension}' files"
        )
    
    return loader_class or TextLoader  # Default to TextLoader if no specific loader found

def get_file_metadata(file_path: Union[str, Path]) -> dict:
    """
    Get metadata about the file to be loaded.
    
    Args:
        file_path (Union[str, Path]): Path to the file
        
    Returns:
        dict: Dictionary containing file metadata
    """
    path = Path(file_path)
    return {
        'filename': path.name,
        'extension': path.suffix.lower().lstrip('.'),
        'size_bytes': path.stat().st_size if path.exists() else None,
        'absolute_path': str(path.absolute()),
        'exists': path.exists(),
        'is_file': path.is_file() if path.exists() else None
    }

def load_documents(base_path: str) -> list:
    """Load documents from the specified path with appropriate loaders."""
    documents = []
    text_loader_kwargs = {'encoding': 'utf-8'}
    
    if not os.path.exists(base_path):
        print(f"Directory not found: {base_path}")
        return documents
        
    for root, _, files in os.walk(base_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                # Get appropriate loader
                loader_class = get_loader_class(file_path)
                
                # Skip if no loader is available for this file type
                if loader_class is None:
                    print(f"⚠ Skipping unsupported file: {file_path}")
                    continue
                
                # Initialize loader with appropriate arguments
                if loader_class == TextLoader:
                    loader = loader_class(file_path, **text_loader_kwargs)
                elif loader_class == CSVLoader:
                    loader = loader_class(file_path, encoding='utf-8')
                else:
                    loader = loader_class(file_path)
                
                # Load document
                docs = loader.load()
                
                # Add metadata
                doc_type = os.path.basename(root)
                for doc in docs:
                    doc.metadata.update({
                        "doc_type": doc_type,
                        "source": file_path,
                        "file_type": file_path.split('.')[-1],
                        "file_name": file
                    })
                    documents.append(doc)
                
                print(f"✓ Successfully loaded: {file_path}")
                
            except Exception as e:
                print(f"✗ Error loading {file_path}: {str(e)}")
                continue
    
    return documents

# Usage
base_path = "../WILP_ASSIGNMENT_WORK"
documents = load_documents(base_path)
print(f"\nTotal documents loaded: {len(documents)}")

# Print document statistics
if documents:
    file_types = {}
    for doc in documents:
        file_type = doc.metadata.get('file_type', 'unknown')
        file_types[file_type] = file_types.get(file_type, 0) + 1
    
    print("\nDocument breakdown by file type:")
    for file_type, count in file_types.items():
        print(f"- {file_type}: {count} documents")

✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\2020HT80616.pdf
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\2020HT80616_RCC_Lab_Assignment_1.docx
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\2020HT80616_RCC_Lab_Assignment_1.pdf
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\2020HT80616_RCC_Lab_Assignment_2.docx
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\2020HT80616_RCC_Lab_Assignment_2.pdf
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\Abstract_2020ht80616.pdf
⚠ Skipping unsupported file: ../WILP_ASSIGNMENT_WORK\cpu_risc-master.zip
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\FINAL SEM REPORT_2020HT80616.docx
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\MID SEM REPORT_2020HT80616.docx
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\MID SEM REPORT_2020HT80616.pdf
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\SampleProjectReport.pdf
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\VLSI_Architecture_Assignment_2_ALU.docx
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\VLSI_Archit

Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)
Ignoring wrong pointing object 84 0 (offset 0)
Ignoring wrong pointing object 94 0 (offset 0)
Ignoring wrong pointing object 100 0 (offset 0)
Ignoring wrong pointing object 220 0 (offset 0)
Ignoring wrong pointing object 225 0 (offset 0)


✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\WILP Final Dissertation\From Mac\bitsfinalreportdocs\2020HT80616FR.pdf
✓ Successfully loaded: ../WILP_ASSIGNMENT_WORK\WILP Final Dissertation\From Mac\bitsfinalreportdocs\2020HT80616ppt.pdf

Total documents loaded: 288

Document breakdown by file type:
- pdf: 251 documents
- docx: 7 documents
- v: 28 documents
- lock: 2 documents


In [5]:
# Read in documents using LangChain's loaders
# Take everything in all the sub-folders of our knowledgebase

# folders = glob.glob("Manuals/*")

# def add_metadata(doc, doc_type):
#     doc.metadata["doc_type"] = doc_type
#     return doc

# documents = []
# for folder in folders:
#     doc_type = os.path.basename(folder)
#     loader = DirectoryLoader(folder, glob="**/*.pdf", loader_cls=PyPDFLoader)
#     folder_docs = loader.load()
#     documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")

Total number of chunks: 393
Document types found: {'.metadata', 'bitsfinalreportdocs', 'cpu_risc-master', 'uart_led_lab', 'WILP_ASSIGNMENT_WORK', 'WILP Final Dissertation'}


In [6]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLLite
DB_NAME = "vector_db"

embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Delete if already exists

if os.path.exists(DB_NAME):
    Chroma(persist_directory=DB_NAME, embedding_function=embeddings).delete_collection()

# Create vectorstore

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_NAME)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 393 documents


In [7]:
#run a quick test - should return a list of documents = 4
question = "Who is Samar Singh?"
docs = vectorstore.similarity_search(question)
len(docs)

4

In [8]:
docs[0]

Document(id='b5e55bdf-17d3-49e8-ab1c-fdb2aebd2b97', metadata={'doc_type': 'WILP_ASSIGNMENT_WORK', 'file_name': '2020HT80616_RCC_Lab_Assignment_1.pdf', 'file_type': 'pdf', 'page': 2, 'source': '../WILP_ASSIGNMENT_WORK\\2020HT80616_RCC_Lab_Assignment_1.pdf'}, page_content='Name : Samar Singh Billawaria \n                                                                                                                  BITS ID : 2020HT80616 \n \n3. N-BIT COUNTER CODE : \n \n \n \n4. CUSTOM DFF CODE :')

In [9]:
# create a new Chat with Ollama
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
MODEL = "llama3.2:latest"
llm = ChatOllama(temperature=0.7, model=MODEL)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


In [10]:
# Let's try a simple question

query = "Who is Samar ?"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

Samar Singh Billawaria is a student, with a BITS ID (Birla Institute of Technology and Science) of 2020HT80616.


In [15]:
# set up a new conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# putting it together: set up the conversation chain with the  LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [16]:
# Wrapping that in a function

def chat(question, history):
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

## Now we will bring this up in Gradio using the Chat interface -

A quick and easy way to prototype a chat with an LLM

In [None]:
# And in Gradio:

view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)