In [6]:
# Adapted from Medium article by Rubentak dated 24 October 2023:
# https://medium.com/@rubentak/talk-to-your-files-in-a-local-rag-application-using-mistral-7b-langchain-and-chroma-db-no-2b4ba77358e0
# Requires Ollama (see ollama.ai)

from langchain.vectorstores import Chroma
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import  MarkdownTextSplitter
from langchain.chains import RetrievalQA

In [2]:
# Ollama embeddings
embeddings_open = OllamaEmbeddings(model="Llama2")
# (model="mistral")

# Download Ollama executable

# model="mistral",
llm_open = Ollama(
    model="Llama2", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)

In [3]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

loader = DirectoryLoader("./parsed_syllabi", glob="*.md", loader_cls=TextLoader)

doc = loader.load()

len(doc)

30

In [4]:
doc[0]

Document(page_content='# Course Overview and Prerequisites\n\nThis course will introduce and in some cases, review mathematical concepts relevant to future work in applied data science. It will cover important concepts in basic and linear algebra, matrix and vector algebra, eigenvectors and eigenvalues, optimization techniques, Bayes rule, and maximum likelihood.\n\nThere are no course prerequisites.\n\n# Instructor and Teaching Staff\n\nInstructor: Alex McLeod (mcleodal@umich.edu)\n\nTeaching Staff: Ben Merrill (benme@umich.edu), Nhan Le (nhanle@umich.edu)\n\n# Course Schedule\n\n- This course begins on **Monday, September 27, 2021** and ends on **Sunday, October 24, 2021** .\n- Weekly assignments will be due on **Mondays at 11:59 pm (Ann Arbor, Michigan time-Eastern Standard Time - EST, UTC -5) except for week 4, which will be due on the last day of class, Sunday, October 24 at 11:59pm (Ann Arbor, Michigan time-Eastern Standard Time - EST, UTC -5)**\n\n  **Schedule of Weekly Office H

In [7]:
text_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(doc)

len(texts)

485

In [8]:
texts[32]

Document(page_content='Online access to these textbooks are provided through the University of Michigan Library. You may be asked to sign in with your UMich uniquename and password to access these materials.\n\n### **Technology Requirements (unique to this course)**\n\n_None_\n\n### **Accessibility**\n\n[Screen reader configuration for Jupyter Notebook Content](https://docs.google.com/document/d/1ct4BShNTYVU2J_oYeuTTsODSAFlEhtODXMlfc4-t5PM/edit?usp=sharing)\n\n### **Learning Outcomes**', metadata={'source': 'parsed_syllabi/2021-12_532.md'})

In [10]:
persist_directory = "chromadb_test"

In [9]:
from huggingface_hub import snapshot_download

# Running hugging face hub in a cell and downloading cache
snapshot_download(repo_id="sentence-transformers/all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm
README.md: 100%|██████████| 10.7k/10.7k [00:00<00:00, 23.4MB/s]s]
Fetching 16 files: 100%|██████████| 16/16 [00:00<00:00, 57.33it/s]


'/Users/Pat/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/44eb4044493a3c34bc6d7faae1a71ec76665ebc6'

In [11]:
vectordb = Chroma.from_documents(
    documents=texts,
    collection_name="MADS",
    # Chose the embedding you want to use
    embedding=embeddings_open,
    persist_directory=persist_directory,
)

In [12]:
# Save to disk
vectordb.persist()
vectordb = None

In [13]:
# Reload from disk
vectordb = Chroma('MADS', persist_directory = persist_directory,
                  embedding_function = embeddings_open,
                  collection_metadata={"hnsw:space": "cosine"})

In [14]:
# Create retriever
retriever = vectordb.as_retriever(
    # search_type="mmr",
    # search_kwargs={'k': 5, 'fetch_k': 20}
    # search_type="similarity_score_threshold",
    # search_kwargs={'score_threshold': 0.8}
)

In [15]:
docs = retriever.get_relevant_documents("Who is the course manager for SIADS 699?")
len(docs)

4

In [16]:
docs

[Document(page_content='1. American Civil Liberties Union. (2004). "Scary Pizza." (video.) 01:42. Available online: <https://www.youtube.com/watch?v=33CIVjvYyEk>\n2. Raji, Deborah. (December 10, 2020). " [How our data encodes systematic racism](https://www.technologyreview.com/2020/12/10/1013617/racism-data-science-artificial-intelligence-ai-opinion/) ." MIT Technology Review.\n3. Wallach, Hanna. (December 14, 2014). [Big Data, Machine Learning, and the Social Sciences: Fairness, Accountability, and Transparency.](https://medium.com/@hannawallach/big-data-machine-learning-and-the-social-sciences-927a8e20460d) Medium.com.', metadata={'source': 'parsed_syllabi/2022-03_503.md'}),
 Document(page_content='- Correctly apply and interpret results from clustering methods in scikit-learn, including k-means, agglomerative clustering, hierarchical clustering, and DBSCAN.\n- Understand the use of topic modeling (Latent Dirichlet Allocation and Non-Negative Matrix Factorization forms) and best prac

In [17]:
def process_llm_response(llm_response):
    # print(llm_response['result'])
    print("\n\nSources:")
    for source in llm_response["source_documents"]:
        print(source.metadata["source"])

In [18]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_open,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

In [24]:
# Question
query = "Tell me what I need to turn in for the capstone project"
# query = "Which course can I take to learn about degree centrality?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



[1m> Entering new RetrievalQA chain...[0m


ValueError: Error raised by inference API HTTP code: 404, {"error":"model 'Llama2' not found, try pulling it first"}