In [1]:
!pip install langchain llama-cpp-python chromadb==0.4.14 sentence_transformers

Collecting langchain
  Downloading langchain-0.1.13-py3-none-any.whl (810 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/810.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/810.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m810.5/810.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-cpp-python
  Downloading llama_cpp_python-0.2.57.tar.gz (36.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.9/36.9 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting chromadb==0.4.14
  Downloading chromadb-0.4.14-py3-none-any.whl (448 kB)
[2K     [90m━━━━

In [2]:
from langchain_community.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader, TextLoader, JSONLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA, ConversationalRetrievalChain, RetrievalQAWithSourcesChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import LlamaCpp
import pickle

In [11]:
from google.colab import drive
import os


drive.mount('/content/drive', force_remount=True)
# os.chdir('/content/drive/MyDrive/Capstone')
!ls ./drive/MyDrive/Capstone

Mounted at /content/drive
ls: cannot access './drive/MyDrive/Capstone': No such file or directory


In [12]:
persist_directory = "/content/drive/MyDrive/Capstone/embeddings"

with open(f"{persist_directory}/embeddings.pickle", 'rb') as handle:
    embeddings = pickle.load(handle)


In [13]:
vectordb = Chroma(
    "embeddings",
    embedding_function=embeddings,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

In [14]:
retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 20},
)

In [16]:
llm_open = LlamaCpp(
    model_path="/content/drive/MyDrive/Capstone/models/llama-2-7b.Q4_K_M.gguf", # https://huggingface.co/TheBloke/Llama-2-7B-GGUF
    n_ctx=4096,  # 4096 for Llama, 32*1024 for Mistral
    n_gpu_layers=50,
    temperature=0.15,
    top_p=1,
    top_k=40,
    repeat_penalty=1.1,
    max_tokens=1024,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    stream=True,
)

                stream was transferred to model_kwargs.
                Please confirm that stream is what you intended.
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /content/drive/MyDrive/Capstone/models/llama-2-7b.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:    

In [17]:
def process_llm_response(llm_response):
    print("\n\nSources:")
    for i, source in enumerate(llm_response["source_documents"]):
        m = source.metadata
        print(f"{i + 1}. {m['course_title']} ({m['course_number']}): {m['heading']}")

In [18]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_open,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

In [None]:
import pandas as pd

class ResponseFormatter:
    def __init__(self):
        self.responses = {
            "question": [],
            "contexts": [],
            "answer": [],
            "ground_truth": [],
        }

    def add_response(self, llm_response, ground_truth):
        self.responses["question"].append(llm_response["query"])
        self.responses["contexts"].append(
            [doc.page_content for doc in llm_response["source_documents"]]
        )
        self.responses["answer"].append(llm_response["result"])
        self.responses["ground_truth"].append(ground_truth)

    def get_responses(self):
        return self.responses
    
    def get_dataframe(self):
        pd.DataFrame(self.responses[["question", "answer", "ground_truth"]])

In [15]:
query = "Where can I find the MADS Incomplete Request Form?"

vectordb.similarity_search_with_score(query)

[(Document(page_content='Context from MADS Student Handbook, Grade of Incomplete (I): If a MADS student needs to drop a course, effort should be made to do so within the first week (or seven days) from when the course starts, before the MADS drop/add deadline. However, when extenuating circumstances prevent completion of coursework by the end of the MADS session, and the student participated in the course past the standard MADS drop/add deadline, a grade of incomplete (“I”) may be assigned.\n\nTo request an “I” grade for a course that has not been awarded a letter grade or course withdrawal (W), students must complete the [MADS Incomplete Request Form](https://docs.google.com/forms/d/e/1FAIpQLSccjy5cRpEN5eljekRqiTJg_CK1ihFvvMa3y-BPj-82zXHH4g/viewform?usp=sf_link) as soon as possible, and no later than the last day of the MADS session (11:59 PM Eastern Time). After the request is submitted, an academic advisor will partner with the student to create a plan for resolution of the incomple

In [19]:
# query = "Tell me what I need to turn in for the capstone project."

llm_response = qa_chain(query)
process_llm_response(llm_response)

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m
 [MADS Incomplete Request Form](https://docs.google.com/forms/d/e/1FAIpQLSccjy5cRpEN5eljekRqiTJg_CK1ihFvvMa3y-BPj-82zXHH4g/viewform?usp=sf_link)



llama_print_timings:        load time =    3534.89 ms
llama_print_timings:      sample time =      44.81 ms /    80 runs   (    0.56 ms per token,  1785.44 tokens per second)
llama_print_timings: prompt eval time =  782292.66 ms /  1668 tokens (  469.00 ms per token,     2.13 tokens per second)
llama_print_timings:        eval time =   58518.98 ms /    79 runs   (  740.75 ms per token,     1.35 tokens per second)
llama_print_timings:       total time =  842020.27 ms /  1747 tokens



[1m> Finished chain.[0m


Sources:
1.  (): Grade of Incomplete (I)
2.  (): Waitlists
3.  (): Petition for Modification or Waiver of Policy
4. Communicating Data Science Results (SIADS 523): Letter Grades, Course Grades, And Late Submission Policy
5.  (): Eligibility


In [None]:
process_llm_response( llm_response)



Sources:
1. Capstone (SIADS 699): Course Syllabus For SIADS 699
2. Milestone I (SIADS 593): 14.0 Student Mental Health And Well-Being
3. Capstone (SIADS 699): Instructor And Course Assistants
4. Milestone II (SIADS 696): Project Component
5. Capstone (SIADS 699): Grading


In [None]:
capstone_requirements_llm_response = llm_response

capstone_requirements_ground_truth = """
Final Project Submission including:
    A report that tells the story of your project, which can be formatted as a blog, a scientific manuscript, or something else.
    A GitHub repository full of the code required to reproduce your analysis and figures, documented with an informative README.
    One of the following high-level overviews of your work designed to be shared:
    A 3-5 minute video from your team about what you made.
    A poster appropriate for a data science conference, such as the MIDAS Data Science Symposium.
Weekly Mini-Deliverables: There will be weekly mini-deliverables throughout the course, as outlined in the grading section of the syllabus.
"""

In [None]:
formatter = ResponseFormatter()

In [None]:
formatter.add_response(capstone_requirements_llm_response, capstone_requirements_ground_truth)

with open(f"./evaluation_set_rag_formatter.pickle", 'wb') as handle:
    pickle.dump(formatter, handle)

with open(f"./evaluation_set_rag_responses.pickle", 'wb') as handle:
    pickle.dump(formatter.get_responses(), handle)

In [None]:
prompts = [
    {
        "question": "Which class involves time series analysis?",
        "ground_truth": "Data Mining II, SIADS 632",
    },
    {
        "question": "Who teaches the SQL and Databases class?",
        "ground_truth": "The primary instructor for SQL and Databases is Graham Hukill. Additional course assistants include, Derek Bruckner, Emily Schemanske, Jungseo Lee, and Toby Kemp.",
    },
    {
        "question": "What are the prerequisites for Data Science for Social Good?",
        "ground_truth": "SIADS 630, 631, and 694",
    },
    {
        "question": "When are the office hours for the Math Methods course?",
        "ground_truth": "Office hours for Math Methods are held at the following times: Alex McLeod: Monday at 11:45 am EST, Saurabh Budholiya: Friday at 9:00 am EST, Alexis Castellano: Thursdays at 7:00 pm EST",
    },
    {
        "question": "Are there any weekly readings for Milestone II?",
        "ground_truth": "There is introductory material during the first week of the course, but generally speaking, no weekly readings in this course.",
    },
    {
        "question": "What are the outcomes of Qualitative Inquiry?",
        "ground_truth": "Upon successful completion of this course, students will be able to:1.  Collect, represent, and analyze qualitative data about a quantitative data set, by…2.  Conducting semi-structured interviews;3.  Processing interview notes into discrete pieces of qualitative data; and 4. Analyzing qualitative data using affinity walls.4.  Develop a narrative about qualitative findings that support later quantitative analysis.5.  Communicate qualitative findings in written form.",
    },
]

In [5]:
import pandas as pd

df = pd.read_csv("./evaluation_set.csv")

In [8]:
for _, row in df.iterrows():
    print(row["question"])

0 Which class involves time series analysis?
1 Who teaches the SQL and Databases class?
2 What are the prerequisites for Data Science for Social Good?
3 When are the office hours for the Math Methods course?
4 Are there any weekly readings for Milestone II?
5 What are the outcomes of Qualitative Inquiry?
6 What textbook is required for SIADS 505?
7 What textbook is required for Data Manipulation?
8 Which week of unsupervised learning covers DBSCAN?
9 How many credits are required to complete the MADS program?
10 How long do students have to complete the MADS program start to finish?


In [None]:
for prompt in prompts:
  llm_response = qa_chain(prompt["question"])
  process_llm_response(llm_response)

  formatter.add_response(llm_response, prompt["ground_truth"])

  # Save after each response
  with open(f"./evaluation_set_rag_formatter.pickle", 'wb') as handle:
      pickle.dump(formatter, handle)

  with open(f"./evaluation_set_rag_responses.pickle", 'wb') as handle:
      pickle.dump(formatter.get_responses(), handle)



[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit


 Data Mining II - SIADS 632


llama_print_timings:        load time =    3589.85 ms
llama_print_timings:      sample time =       6.98 ms /    13 runs   (    0.54 ms per token,  1862.20 tokens per second)
llama_print_timings: prompt eval time =  376111.82 ms /   842 tokens (  446.69 ms per token,     2.24 tokens per second)
llama_print_timings:        eval time =    7726.96 ms /    12 runs   (  643.91 ms per token,     1.55 tokens per second)
llama_print_timings:       total time =  384185.60 ms /   854 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


Sources:
1. Health Analytics (SIADS 681): No Required Textbooks
2. Data Mining II (SIADS 632): Weekly Office Hours Via Zoom (Ann Arbor, Michigan Time):
3. Big Data: Scalable Data Processing (SIADS 516): Instructor And Course Assistants
4. Math Methods I (SIADS 502): Course Syllabus Mads 502: Math Methods For Data Science Course Overview
5. Communicating Data Science Results (SIADS 523): Course Schedule


[1m> Entering new RetrievalQA chain...[0m
 The instructor for this class is Graham Hukill (gshukill@umich.edu). The course assistants are Derek Bruckner (dbrucknr@umich.edu), Emily Schemanske (landise@umich.edu), Jungseo Lee (jungseo@umich.edu), Toby Kemp (tobyk@umich.edu).



llama_print_timings:        load time =    3589.85 ms
llama_print_timings:      sample time =      47.83 ms /    88 runs   (    0.54 ms per token,  1839.73 tokens per second)
llama_print_timings: prompt eval time =  333765.50 ms /   744 tokens (  448.61 ms per token,     2.23 tokens per second)
llama_print_timings:        eval time =   58223.68 ms /    87 runs   (  669.24 ms per token,     1.49 tokens per second)
llama_print_timings:       total time =  392760.85 ms /   831 tokens
Llama.generate: prefix-match hit



[1m> Finished chain.[0m


Sources:
1. SQL and Databases (SIADS 511): Course Syllabus Course Overview And Prerequisites
2. SQL and Databases (SIADS 511): Course Outcomes
3. SQL and Databases (SIADS 511): Textbooks
4. SQL and Databases (SIADS 511): Instructor And Course Assistants
5. Learning Analytics and Educational Data Science (SIADS 680): SIADS 680: Learning Analytics Course Syllabus Course Overview And Prerequisites


[1m> Entering new RetrievalQA chain...[0m
