In [12]:
%pip install langchain_community langchain_text_splitters langchain_openai langchain_chroma gradio python-dotenv pypdf

Note: you may need to restart the kernel to use updated packages.


## Requirements
#### Vector Database = langchain_chroma
#### Embeddings Model Provider= SentenceTransformerEmbeddings
#### Embedding Model Name = model_name="all-mpnet-base-v2"

In [13]:
# from google.colab import drive

# # Mount Google Drive
# drive.mount('/content/drive')

# # Define the Google Drive path where your documents are stored
# DATA_PATH = '/content/drive/MyDrive/local_rag/data'

In [3]:
# The DATA_PATH and CHROMA_PATH variables are defined in the cell below

DATA_PATH = r"data"
CHROMA_PATH = r"chroma_db"

In [15]:
import os

# Get a list of all files in the specified folder
file_list = os.listdir(DATA_PATH)

# Filter the list to only include files with .pdf or .docx extensions
pdf_files = [f for f in file_list if f.endswith('.pdf')]
docx_files = [f for f in file_list if f.endswith('.docx')]

# Example: Print the list of PDF and DOCX files in the folder
print("PDF Files:", pdf_files)
print("DOCX Files:", docx_files)

PDF Files: ['M20.1_Introduction_to_Natural_Language_Processing(NLP).pdf', 'M3.2__Programming_with_Functions___Part_2.pdf', 'M13.1__Linear_Classification.pdf', 'M19.2_Image_Classification_With_Neural Networks.pdf', 'M9.2_ Project 1 , Week 1.pdf', 'M15.3__AI_Laws_and_Regulations.pdf', 'M6.1__Sourcing_Data_for_AI_Projects.pdf', 'M18.1_Neural_Networks.pdf', 'M1.2_ The Impact of Machine Learning.pdf', 'M14.1__Model_Validation_and_Imbalanced_Data.pdf', 'M13.3__Model_Selection_and_Multiclass_Classification.pdf', 'M11.2__Unsupervised_Learning_in_Practice.pdf', 'M8.1__Introduction_to_Time_Series.pdf', 'M3.3__Python_Classes_and_OOP.pdf', 'M7.3_Visualization_and_Statistics.pdf', 'M15.2__Legal_and_Ethical_Issues_in_AI.pdf', 'M2.2_Programming Decisions.pdf', 'M6.2__Accessing__APIs_Securely.pdf', 'M1.3_ Overview of Machine Learning Tools.pdf', 'M2.1_ Programming Basics.pdf', 'M13.2__Nonlinear_Classification.pdf', 'M11.1__Introduction_to_Machine_Learning.pdf', 'M20.2_Advanced_NLP_Techniques_TextExtra

In [1]:
# import the .env file
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Import necessary modules
import os
import time
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

# Initialize the embeddings model with the specified model name
embeddings_model = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# Initialize the vector store with the specified collection name, embedding function, and persistence directory
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings_model,
    persist_directory=CHROMA_PATH,
)


  vector_store = Chroma(


In [19]:
def indexed_files(vector_store):
    stored_files = set(
        [
            meta["source"].split("/")[-1]
            for meta in vector_store._collection.get(include=["metadatas"])["metadatas"]
            if meta and "source" in meta
        ]
    )

    print ("stored_files:",stored_files)
    return stored_files

indexed_files(vector_store)

stored_files: set()


set()

In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from uuid import uuid4


def process_files(vector_store, embeddings_model):
    """
    Processes files in DATA_PATH, adding new files and removing deleted files.
    """
    current_files = set(os.listdir(DATA_PATH))
    updates = False
    print ("current_files:",current_files)

    stored_files = indexed_files(vector_store)


    # Find new files
    new_files = current_files - stored_files
    for filename in new_files:
        updates = True
        if filename.endswith(".pdf"):
            filepath = os.path.join(DATA_PATH, filename)
            loader = PyPDFLoader(filepath)
            raw_documents = loader.load()

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=300,
                chunk_overlap=100,
                length_function=len,
                is_separator_regex=False,
            )
            chunks = text_splitter.split_documents(raw_documents)
            print ("Document Split Complete",filename)

            # Index original document chunks
            uuids = [str(uuid4()) for _ in range(len(chunks))]
            vector_store.add_documents(documents=chunks, ids=uuids)

            print(f"Added file: {filename}")

    # Find deleted files
    deleted_files = stored_files - current_files
    for filename in deleted_files:
        updates = True
        if filename.endswith(".pdf"):
            filepath = os.path.join(DATA_PATH, filename)
            # Find document IDs associated with the deleted file
            results = vector_store._collection.get(
                where={"source": filepath}, include=["ids"]
            )
            if "ids" in results:
                ids_to_delete = results["ids"]
                vector_store._collection.delete(ids=ids_to_delete)
                print(f"Deleted file: {filename}")
    
    print("-" * 80)
    if updates:
        print("Added files: ", new_files)
        print("Removed files: ", deleted_files)

    else:    
        print("No changes detected.")

def run_periodic_check(interval_seconds = 60):
    """
    Runs the periodic file check in a notebook environment.
    """
    
    print(f"Checked for file changes....")

    process_files(vector_store, embeddings_model)
    # while True:
    #     process_files(vector_store, embeddings_model)
    #     time.sleep(interval_seconds)
    #     print(f"Checked for file changes. Next check in {interval_seconds} seconds...")



In [21]:
# To start the periodic check in your notebook, call this function:
run_periodic_check()

# To stop the check, you'll need to interrupt the kernel.

Checked for file changes....
current_files: {'M9.1_ Project 1, Week 1.pdf', 'M4.1__Introduction_to_Pandas.pdf', 'M18.1_Neural_Networks.pdf', 'M7.3_Visualization_and_Statistics.pdf', 'M20.1_Introduction_to_Natural_Language_Processing(NLP).pdf', 'M20.2_Advanced_NLP_Techniques_TextExtraction_and_Classification.pdf', 'M6.2__Accessing__APIs_Securely.pdf', 'M3.1__Programming_with_Functions___Part_1.pdf', 'M13.3__Model_Selection_and_Multiclass_Classification.pdf', 'M9.2_ Project 1 , Week 1.pdf', 'M8.3__Time_Series_Forecasting_with_Prophet.pdf', '.DS_Store', 'M2.1_ Programming Basics.pdf', 'M1.2_ The Impact of Machine Learning.pdf', 'M4.3__Transforming_Data_with_Pandas.pdf', 'M6.1__Sourcing_Data_for_AI_Projects.pdf', 'M5.1__Combining_DataFrames_with_Pandas.pdf', 'M12.3__Machine_Learning_Pipeline_and_Mini-Project.pdf', 'M3.2__Programming_with_Functions___Part_2.pdf', 'M14.1__Model_Validation_and_Imbalanced_Data.pdf', 'M18.2_Neural_Networks_for_Classification.pdf', 'M6.3__SDKs_and__Mini_In-Class

could not convert string to float: b'0.00-29555193' : FloatObject (b'0.00-29555193') invalid; use 0.0 instead
could not convert string to float: b'0.00-22356359' : FloatObject (b'0.00-22356359') invalid; use 0.0 instead


Added file: M8.3__Time_Series_Forecasting_with_Prophet.pdf
Document Split Complete M2.1_ Programming Basics.pdf
Added file: M2.1_ Programming Basics.pdf
Document Split Complete M1.2_ The Impact of Machine Learning.pdf


could not convert string to float: b'0.000-29396326' : FloatObject (b'0.000-29396326') invalid; use 0.0 instead


Added file: M1.2_ The Impact of Machine Learning.pdf
Document Split Complete M4.3__Transforming_Data_with_Pandas.pdf


could not convert string to float: b'0.0000-52493438' : FloatObject (b'0.0000-52493438') invalid; use 0.0 instead
could not convert string to float: b'0.0000-52493438' : FloatObject (b'0.0000-52493438') invalid; use 0.0 instead
could not convert string to float: b'0.00-50' : FloatObject (b'0.00-50') invalid; use 0.0 instead
could not convert string to float: b'0.00-50' : FloatObject (b'0.00-50') invalid; use 0.0 instead
could not convert string to float: b'0.0000-13123359' : FloatObject (b'0.0000-13123359') invalid; use 0.0 instead


Added file: M4.3__Transforming_Data_with_Pandas.pdf
Document Split Complete M6.1__Sourcing_Data_for_AI_Projects.pdf
Added file: M6.1__Sourcing_Data_for_AI_Projects.pdf
Document Split Complete M5.1__Combining_DataFrames_with_Pandas.pdf
Added file: M5.1__Combining_DataFrames_with_Pandas.pdf
Document Split Complete M12.3__Machine_Learning_Pipeline_and_Mini-Project.pdf
Added file: M12.3__Machine_Learning_Pipeline_and_Mini-Project.pdf
Document Split Complete M3.2__Programming_with_Functions___Part_2.pdf
Added file: M3.2__Programming_with_Functions___Part_2.pdf
Document Split Complete M14.1__Model_Validation_and_Imbalanced_Data.pdf
Added file: M14.1__Model_Validation_and_Imbalanced_Data.pdf
Document Split Complete M18.2_Neural_Networks_for_Classification.pdf
Added file: M18.2_Neural_Networks_for_Classification.pdf
Document Split Complete M6.3__SDKs_and__Mini_In-Class_Project.pdf
Added file: M6.3__SDKs_and__Mini_In-Class_Project.pdf
Document Split Complete M12.1__Introduction_to_Supervised_Le

could not convert string to float: b'0.00-25406823' : FloatObject (b'0.00-25406823') invalid; use 0.0 instead


Added file: M15.2__Legal_and_Ethical_Issues_in_AI.pdf
Document Split Complete M2.3__Complex_Programming_Decisions.pdf
Added file: M2.3__Complex_Programming_Decisions.pdf
Document Split Complete M1.1__Overview_of_AI_and_Machine_Learning.pdf
Added file: M1.1__Overview_of_AI_and_Machine_Learning.pdf
Document Split Complete M5.3__Advanced_Data_Reshaping_with_Pandas.pdf
Added file: M5.3__Advanced_Data_Reshaping_with_Pandas.pdf
Document Split Complete M14.2__Advanced_Preprocessing_Techniques.pdf
Added file: M14.2__Advanced_Preprocessing_Techniques.pdf
Document Split Complete M19.2_Image_Classification_With_Neural Networks.pdf
Added file: M19.2_Image_Classification_With_Neural Networks.pdf
Document Split Complete M12.2__Advanced_Regression_Techniques.pdf
Added file: M12.2__Advanced_Regression_Techniques.pdf


could not convert string to float: b'0.00-60' : FloatObject (b'0.00-60') invalid; use 0.0 instead
could not convert string to float: b'0.00-60' : FloatObject (b'0.00-60') invalid; use 0.0 instead
could not convert string to float: b'0.00-60' : FloatObject (b'0.00-60') invalid; use 0.0 instead
could not convert string to float: b'0.00-60' : FloatObject (b'0.00-60') invalid; use 0.0 instead


Document Split Complete M8.1__Introduction_to_Time_Series.pdf
Added file: M8.1__Introduction_to_Time_Series.pdf
Document Split Complete M13.2__Nonlinear_Classification.pdf
Added file: M13.2__Nonlinear_Classification.pdf
Document Split Complete M15.1__AI_Ethics_An_Introduction.pdf
Added file: M15.1__AI_Ethics_An_Introduction.pdf
Document Split Complete M14.3__Tuning_Models_and_Sampling_Data.pdf
Added file: M14.3__Tuning_Models_and_Sampling_Data.pdf
Document Split Complete M9.3_ Project 1, Week 1.pdf
Added file: M9.3_ Project 1, Week 1.pdf
Document Split Complete M3.3__Python_Classes_and_OOP.pdf
Added file: M3.3__Python_Classes_and_OOP.pdf
Document Split Complete M7.2__Visualizing_DataFrames_using_Pandas.pdf
Added file: M7.2__Visualizing_DataFrames_using_Pandas.pdf
Document Split Complete M16.1__Project_2__Week_1.pdf
Added file: M16.1__Project_2__Week_1.pdf
Document Split Complete M13.1__Linear_Classification.pdf
Added file: M13.1__Linear_Classification.pdf
Document Split Complete M11.2_

# Testing the knowledge base

In [23]:
num_results = 5

retriever = vector_store.as_retriever(search_kwargs={'k': num_results})

docs = retriever.invoke("What is a neural network?")
for i, doc in enumerate(docs, 1):
    print(f"Result {i}:")
    print(doc)
    print("-" * 80)
#print((docs[0].metadata["source"], docs[0].metadata["page_label"]))

Result 1:
page_content='Neural Network is an advanced form 
of machine learning that contains 
multiple layers of nodes which 
perform individual computations.' metadata={'creationdate': '', 'creator': 'Google', 'page': 8, 'page_label': '9', 'producer': 'PyPDF', 'source': 'data/M18.1_Neural_Networks.pdf', 'title': 'M18.1: Neural Networks', 'total_pages': 71}
--------------------------------------------------------------------------------
Result 2:
page_content='Neural networks are a powerful 
machine learning technique 
modeled after neurons in the brain.
Industry leaders such as 
Google, Facebook, Twitter, and 
Amazon use neural networks 
for analyzing complex datasets.
What is a neural network?' metadata={'creationdate': '', 'creator': 'Google', 'page': 6, 'page_label': '7', 'producer': 'PyPDF', 'source': 'data/M18.1_Neural_Networks.pdf', 'title': 'M18.1: Neural Networks', 'total_pages': 71}
--------------------------------------------------------------------------------
Result 3:
pa