In [4]:
# Adapted from Medium article by Rubentak dated 24 October 2023:
# https://medium.com/@rubentak/talk-to-your-files-in-a-local-rag-application-using-mistral-7b-langchain-and-chroma-db-no-2b4ba77358e0
# Requires Ollama (see ollama.ai)

from langchain.vectorstores import Chroma
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

In [5]:
# Ollama embeddings: this object is handed to the Chroma vector-store cells below
# as their embedding function.
embeddings_open = OllamaEmbeddings(model="Llama2")
# Alternative embedding model: (model="mistral")

# Prerequisite: download the Ollama executable (see ollama.ai) before running this cell.

# Alternative LLM: model="mistral",
# The StreamingStdOutCallbackHandler is attached so generated text is written to
# stdout as the model produces it.
llm_open = Ollama(
    model="Llama2", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)

In [6]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

# Load every *.txt file in ./parsed_syllabi, reading each one with TextLoader.
loader = DirectoryLoader("./parsed_syllabi", glob="*.txt", loader_cls=TextLoader)

# `doc` is the list of loaded documents (one entry per matched file, presumably — confirm).
doc = loader.load()

# Sanity check: number of documents loaded.
len(doc)

30

In [12]:
# Inspect the first loaded document to confirm the syllabus text was read correctly.
doc[0]

Document(page_content="Course Syllabus for SIADS 515: Efficient Data Processing Course Overview and Prerequisites This course will introduce students to the basics of the linux command-line interface, debugging concepts, basic algorithmic principles such as memoization, recursion, caching, and generators, as well as efficiency and code profiling.\n\nThere are no prerequisites. Instructor and Course Assistants Instructor: Chris Teplovs, Ph.D. Lecturer IV in Information and Research Investigator, School of Information\n\nCourse Assistant: Kris Steinhoff, Intermittent Lecturer in Information, School of Information and Staff Data Engineer, Toyota Research Institute Course Communication Expectations If you have questions about course content (e.g. lecture videos, quizzes, or assignments), please use the class Slack channel to discuss with classmates and the instructional team. Instructor and course assistant response time to Slack messages will be within 24 hours. Personal communication tha

In [6]:
# Split the loaded syllabi into overlapping chunks for embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (characters by default — TODO confirm for the installed LangChain version).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(doc)

# Sanity check: number of chunks produced.
len(texts)

313

In [13]:
# Spot-check one chunk (index 32 is an arbitrary sample) to see what a split looks like.
texts[32]

Document(page_content='working from notes you took while participating in a study session. Largely duplicate copies of the same assignment will receive an equal division of the total point score from the one piece of work. You may incorporate selected excerpts from publications by other authors, but they must be clearly marked as quotations and must be attributed. If you build on the ideas of prior authors, you must cite their work. You may obtain copy editing assistance, and you may discuss your ideas with others, but all substantive writing and ideas must be your own, or be explicitly attributed to another. See the Rackham Graduate policy on Academic and Professional Integrity for the definition of plagiarism, and associated consequences. Letter Grades, Course Grades, and Late Submission Policy If you are late submitting an assignment, the following late policy will typically apply: 15% reduction if assignment is turned in one day late, 30% reduction if two days late, 45% reduction i

In [9]:
# Directory (relative to the notebook's working directory) that the Chroma cells
# below receive as `persist_directory` when building and reloading the vector store.
persist_directory = "chromadb_test"

In [17]:
from huggingface_hub import snapshot_download

# Download the sentence-transformers/all-MiniLM-L6-v2 repository into the local
# Hugging Face cache; the call returns the local snapshot path shown in the output.
# NOTE(review): no other visible cell references this model (embeddings come from
# Ollama) — confirm whether this download is still needed.
snapshot_download(repo_id="sentence-transformers/all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm
config.json: 100%|██████████| 612/612 [00:00<00:00, 1.29MB/s]

data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 18.4MB/s]
.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 138kB/s]


Fetching 16 files:   6%|▋         | 1/16 [00:00<00:03,  3.79it/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 37.2kB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 27.6kB/s]
README.md: 100%|██████████| 10.7k/10.7k [00:00<00:00, 1.10MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 213kB/s]

sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 241kB/s]

special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 200kB/s]

tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 197kB/s]

[A

[A[A


train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 32.5MB/s]



[A[A[A



[A[A[A[A
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 4.2

'/Users/Pat/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/1a310852cf8e58d22c5ebff537711d504ad4ad66'

In [None]:
# Embed the chunked syllabi and store them in the "MADS" collection under
# persist_directory.
vectordb = Chroma.from_documents(
    documents=texts,
    collection_name="MADS",
    # Choose the embedding you want to use
    embedding=embeddings_open,
    persist_directory=persist_directory,
    # Chroma fixes a collection's distance metric when the collection is created.
    # The reload cell asks for cosine distance, so request it here as well;
    # otherwise the index is built with Chroma's default metric and the reload-time
    # setting does not apply to it.
    # NOTE(review): a "MADS" collection that already exists in persist_directory
    # keeps the metric it was created with — delete it before re-running if needed.
    collection_metadata={"hnsw:space": "cosine"},
)

In [22]:
# Save to disk
# NOTE(review): newer Chroma releases may persist automatically, making this call
# redundant — confirm against the installed version.
vectordb.persist()
# Drop the in-memory handle so the next cell has to reload the store from disk.
vectordb = None

In [10]:
# Reopen the persisted "MADS" collection from persist_directory, using the same
# Ollama embedding function for queries and requesting cosine distance.
vectordb = Chroma(
    collection_name="MADS",
    persist_directory=persist_directory,
    embedding_function=embeddings_open,
    collection_metadata={"hnsw:space": "cosine"},
)

In [11]:
# Create retriever
# Called with no arguments, so the retriever uses the vector store's default search
# settings. The commented-out keyword arguments below are alternatives that were
# tried: MMR search, or similarity search with a score threshold.
retriever = vectordb.as_retriever(
    # search_type="mmr",
    # search_kwargs={'k': 5, 'fetch_k': 20}
    # search_type="similarity_score_threshold",
    # search_kwargs={'score_threshold': 0.8}
)

In [12]:
# Smoke-test the retriever on its own (no LLM involved) with a sample question.
docs = retriever.get_relevant_documents("Who is the course manager for SIADS 699?")
# Number of chunks returned for the query.
len(docs)

4

In [13]:
# Display the retrieved chunks and their `source` metadata to judge retrieval quality.
docs

[Document(page_content='Services for Students with Disabilities.\nAccessibility\nIf you have accessibility issues with the material in this class, please reach out to the instructional team.\nStudent Mental Health\nRefer to the University’s Resources for Stress and Mental Health website for a listing of resources for students.\nStudent Services\nRefer to the Introduction to UMSI Student Life section of the UMSI Student Handbook (access to the Student Orientation course\nrequired).\nTechnology Tips\n● Working Offline\n○ If you have an issue with ongoing access to the coursera platform, and have docker running on your local\ncomputer, please reach out to the instructional team for help getting setup offline.', metadata={'source': 'parsed_syllabi/2023-03_643.txt'}),
 Document(page_content='For questions regarding course content, refer to the Communications Expectations section above. \n \nWeekly Readings or Textbook Information \n \n \n \n \nDaniel M Romero \nAssociate Professor  \nSchool

In [14]:
def process_llm_response(llm_response):
    """Print the source path of every document the chain used for its answer.

    Args:
        llm_response: Mapping returned by the QA chain. It must contain a
            "source_documents" entry whose items expose a ``metadata`` mapping
            with a "source" key.

    The answer text itself (``llm_response['result']``) is not printed by this
    function; only a "Sources:" header followed by one source per line.
    """
    print("\n\nSources:")
    # Lazily map each document to its source path so paths are printed one at a
    # time, in retrieval order.
    source_paths = (
        document.metadata["source"] for document in llm_response["source_documents"]
    )
    for path in source_paths:
        print(path)

In [15]:
# Build the question-answering chain: the retriever supplies context chunks and
# llm_open generates the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_open,
    # "stuff": retrieved chunks are placed together into a single prompt
    # (per LangChain docs — confirm for the installed version).
    chain_type="stuff",
    retriever=retriever,
    # Required by process_llm_response, which reads "source_documents" from the result.
    return_source_documents=True,
    verbose=True,
)

In [16]:
# Question
# query = "Tell me what I need to turn in for the capstone project"
query = "Which course can I take to learn about item sets?"
# Run the chain through invoke() with an explicit input mapping. Calling the chain
# object directly is deprecated in LangChain and emits a deprecation warning.
# The returned mapping still carries "result" and "source_documents".
llm_response = qa_chain.invoke({"query": query})
# The answer text is streamed to stdout by the LLM's callback handler; this call
# only lists the source files behind it.
process_llm_response(llm_response)

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m
Based on the context provided, it seems that the course you are referring to is likely "Complex Systems" offered by the University of Michigan through Coursera. The course description mentions the topic of item sets, which are a central concept in the field of complex systems.

To answer your question, I would recommend taking the "Complex Systems" course offered by Professor Daniel M Romero and colleagues at the University of Michigan. This course provides an introduction to the field of complex systems, including the concepts of item sets and their role in understanding complex systems. The course also covers a range of topics such as networks, crowds, and markets, which are relevant to understanding the behavior of item sets in complex systems.

Alternatively, if you are interested in learning more about item sets specifically, you could consider taking courses on data mining or machine learning, which often cover techniques for discover