In [1]:
# Adapted from Medium article by Rubentak dated 24 October 2023: 
# https://medium.com/@rubentak/talk-to-your-files-in-a-local-rag-application-using-mistral-7b-langchain-and-chroma-db-no-2b4ba77358e0
# Requires Ollama (see ollama.ai)

# Import libraries
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings import OllamaEmbeddings
from langchain.prompts import PromptTemplate
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, PDFMinerLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.chains import RetrievalQA

In [25]:
# Embedding model served by the local Ollama instance
# (alternative: model="mistral")
embeddings_open = OllamaEmbeddings(model="Llama2")
# Hosted alternative: OpenAI embeddings
#embedding = OpenAIEmbeddings()

# LLM served by Ollama; the callback manager is given a stdout streaming handler
llm_open = Ollama(
    model='Llama2',  # alternative: model="mistral"
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

In [3]:
# DirectoryLoader and PDFMinerLoader are already imported in the notebook's
# import cell, so they are not re-imported here.

# Build a loader for the PDFs in 'Source Documents/' that match the glob;
# each matching file is handed to PDFMinerLoader.
loader = DirectoryLoader('Source Documents/', glob="./*.pdf", loader_cls=PDFMinerLoader)

# Alternative: load a single PDF directly
# loader = PyPDFLoader('.../data/PDFs/How_to_build_your_carreer_in_AI.pdf')
# loader = PyPDFLoader('MADS Student Handbook.pdf')

# Alternative: point the loader at another location
# loader = DirectoryLoader('/your/path/to/file.txt')

# Read the documents into memory
doc = loader.load()
#doc = loader.load_and_split()

# Number of documents loaded (displayed as the cell output)
len(doc)

4

In [35]:
#doc

In [5]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000 with chunk_overlap=100 between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
# Alternative splitter with the same settings:
#text_splitter = CharacterTextSplitter (chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(doc)

# Number of chunks produced (displayed as the cell output)
len(texts)

121

In [36]:
#texts[32]

In [7]:
# Directory where Chroma stores the collection on disk.
# Other directories that can be swapped in:
#   PDFs from directory
# persist_directory = 'PDFs_How_to_build_your_carreer_in_AI'
#   Langchain documentation
# persist_directory = 'vdb_langchain_doc_small'
#   Your documents
# persist_directory = 'your_new_database'
persist_directory = 'chromadb_test'

# Embed the chunks and store them in the 'MADS' collection.
# The distance metric is declared here, at creation time, so that it agrees
# with the {"hnsw:space": "cosine"} requested when the collection is reloaded
# from disk; Chroma takes the metric from the collection metadata when the
# collection is first created.
# NOTE(review): if persist_directory already holds a 'MADS' collection from an
# earlier run, this call presumably adds the chunks to it again -- confirm, and
# clear the directory before re-running if duplicates are not wanted.
vectordb = Chroma.from_documents(
    documents=texts,
    collection_name='MADS',
    # Choose the embedding you want to use
    embedding=embeddings_open,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

In [8]:
# Save to disk: flush the collection to persist_directory
vectordb.persist()
# Drop the in-memory handle; the next cell rebuilds it from the persisted files
vectordb = None

In [9]:
# Re-open the persisted 'MADS' collection from persist_directory,
# using the same embedding function that was used to build it.
vectordb = Chroma(
    collection_name='MADS',
    embedding_function=embeddings_open,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

In [32]:
# Create a retriever over the vector store with the default search settings.
# Optional tuning, passed as keyword arguments to as_retriever():
#   search_type="mmr", search_kwargs={'k': 5, 'fetch_k': 20}
#   search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.8}
retriever = vectordb.as_retriever()

In [13]:
# Run the retriever on its own with a sample question
docs = retriever.get_relevant_documents("Who is the course manager for SIADS 699?")
# Number of documents returned for the question (displayed as the cell output)
len(docs)

5

In [19]:
def process_llm_response(llm_response):
    """Print the source of every document attached to a QA chain response.

    The answer text itself is not printed here (that line is left commented
    out); only a "Sources:" header followed by one line per source document
    is written to stdout.

    Args:
        llm_response: Mapping returned by the QA chain. Its
            "source_documents" entry, when present, is an iterable of
            objects exposing a ``metadata`` dict.

    Returns:
        None.
    """
    #print(llm_response['result'])
    print('\n\nSources:')
    # A response without "source_documents" is treated as having no sources
    # instead of raising KeyError.
    for source in llm_response.get("source_documents") or []:
        # Not every document is guaranteed to carry a 'source' entry;
        # fall back to a placeholder rather than failing mid-listing.
        metadata = getattr(source, "metadata", None) or {}
        print(metadata.get('source', '<unknown source>'))

In [26]:
# Question-answering chain that combines the retriever with the local LLM.
# return_source_documents=True keeps the retrieved documents in the response
# under "source_documents", which process_llm_response reads.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_open,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

In [37]:
# Question
query = "Tell me what I need to turn in for the capstone project"
# Run the retrieval QA chain on the question; the response is kept so its
# source documents can be listed afterwards
llm_response = qa_chain(query)
# Print the "Sources:" listing for the documents attached to the response
process_llm_response(llm_response)



[1m> Entering new RetrievalQA chain...[0m
Based on the provided context, it appears that you are asking about the requirements for the Capstone project in the SIADS 699 course. Specifically, you want to know what you need to turn in for the project.

From the course syllabus, it is mentioned that the Capstone project is a group data science project that students will independently design and execute. The intention of the project is to create a resume-worthy portfolio piece in a format that is ready to share and present with the broader community of data science professionals.

To answer your question, you will need to propose and build an end-to-end data science project in your domain of interest. The project will be supervised by instructors with regular peer review, and students are expected to demonstrate mastery of data science concepts and methods from their MADS training.

The course syllabus also mentions that students are required to uphold the Coursera Honor Code, which in