In [1]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import (
    RetrievalQA,
    ConversationalRetrievalChain,
    RetrievalQAWithSourcesChain,
)
from langchain.memory import ConversationBufferMemory
from langchain_openai.llms import OpenAI
from langchain.retrievers import EnsembleRetriever
from langchain_core.prompts import PromptTemplate

import pandas as pd
from ragatouille import RAGPretrainedModel
from semantic_router import Route, RouteLayer
from semantic_router.encoders import HuggingFaceEncoder

import gradio as gr
from json2html import Json2Html

In [2]:
# Setting some helper variables

retrieval_strategy = "colbert"
model = "mistral-7b-instruct-v0.2"

rootdir = ".."
persist_directory = "./../embeddings"
index_root = rootdir + "/../colbert_index/"
colbert_path = rootdir + "/../colbertv2.0/"
index_path = rootdir + "/../colbert_index/colbert/indexes/documents/"
transcript_path = rootdir + "/../colbert_index/colbert/indexes/transcripts/"

In [3]:
# Start Colbert models for documents and transcripts
RAG1 = RAGPretrainedModel.from_index(index_path = index_path)
RAG2 = RAGPretrainedModel.from_index(index_path = transcript_path)

# Get metadata for both models and convert to DataFrame
df_rag1 = pd.read_json(index_path+'docid_metadata_map.json').T.reset_index()
df_rag2 = pd.read_json(transcript_path+'docid_metadata_map.json').T.reset_index()

# Helper to identify relevant documents for retrievers
def filter_pids(df, search_term):
    return list(df['index'][df.course_number==search_term])

[Apr 08, 18:30:22] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




In [4]:
courses = {
    "501": "Being a Data Scientist",
    "502": "Math Methods I",
    "503": "Data Science Ethics",
    "505": "Data Manipulation",
    "511": "SQL and Databases",
    "515": "Efficient Data Processing",
    "516": "Big Data: Scalable Data Processing",
    "521": "Visual Exploration of Data",
    "522": "Information Visualization I",
    "523": "Communicating Data Science Results",
    "524": "Presenting Uncertainty",
    "532": "Data Mining I",
    "542": "Supervised Learning",
    "543": "Unsupervised Learning",
    "571": "Business SQL",  # No syllabus for this one :(
    "593": "Milestone I",
    "601": "Qualitative Inquiry for Data Scientists",
    "602": "Math Methods II",
    "611": "Database Architecture & Technology",
    "622": "Information Visualization II",
    "630": "Causal Inference",
    "631": "Experiment Design and Analysis",
    "632": "Data Mining II",
    "642": "Deep Learning I",
    "643": "Machine Learning Pipelines",
    "644": "Reinforcement Learning Algorithms",
    "652": "Network Analysis",
    "655": "Applied Natural Language Processing",
    "673": "Cloud Computing",
    "680": "Learning Analytics and Educational Data Science",
    "681": "Health Analytics",
    "682": "Social Media Analytics",
    "685": "Search and Recommender Systems",
    "687": "Introduction to Sports Analytics",
    "688": "Data Science for Social Good",
    "696": "Milestone II",
    "699": "Capstone",
}

In [5]:
# Build semantic routes for each class
routes = []
for num, name in zip(courses.keys(), courses.values()):
    route_name = "SIADS " + str(num)
    route_utterances = [route_name.lower(), route_name.lower().replace(" ",""),
                        name.lower(), name.lower() + " class", name.lower() + " course",
                       "who teaches " + route_name.lower(), "who teaches " + name.lower()]
    routes.append(Route(name=route_name, utterances=route_utterances))

# Select local encoder and build route layer
encoder = HuggingFaceEncoder(str="/Users/arnewman/.cache/huggingface/hub/models--sentence-transformers--UAE-Large-V1/", device="mps")
rl = RouteLayer(encoder=encoder, routes=routes)    

[32m2024-04-08 18:30:27 INFO semantic_router.utils.logger local[0m


In [6]:
# Use evaluation questions to test accuracy
# From: https://github.com/aurelio-labs/semantic-router/blob/main/docs/06-threshold-optimization.ipynb

test_data = [
    ("Which class involves time series analysis?", "SIADS 632"),
    ("Who teaches the SQL and Databases class?", "SIADS 511"),
    ("What are the prerequisites for Data Science for Social Good?", "SIADS 688"),
    ("When are the office hours for the Math Methods course?", "SIADS 502"),
    ("Are there any weekly readings for Milestone II?", "SIADS 699"),
    ("What are the outcomes of Qualitative Inquiry?", "SIADS 601"),
    ("What textbook is required for SIADS 505?", "SIADS 505"),
    ("What textbook is required for Data Manipulation?", "SIADS 505"),
    ("Which week of unsupervised learning covers DBSCAN?", "SIADS 543"),
    ("How many credits are required to complete the MADS program?", None),
    ("How long do students have to complete the MADS program start to finish?", None),
    ("How many points is the comprehensive oral exam worth in SIADS 593?", "SIADS 593"),
    ("What is the penalty for late submission in SIADS 630?", "SIADS 630"),
    ("How do I get accommodations for a class?", None),
    ("What is a backpack?", None),
    ("When is the latest I can drop a course?", None),
    ("How do I get an override to take a class?", None),
    ("How do I take a leave of absence from the MADS program?", None),
    ("What are the prerequisites for Search and Recommender Systems?", "SIADS 685")
]

# unpack the test data
X, y = zip(*test_data)

# evaluate using the default thresholds
accuracy = rl.evaluate(X=X, y=y)
print(f"Original Accuracy: {accuracy*100:.2f}%")

# Check defaults
# route_thresholds = rl.get_thresholds()
# print("Default route thresholds:", route_thresholds)

# Call the fit method
rl.fit(X=X, y=y)

# Updated thresholds
# route_thresholds = rl.get_thresholds()
# print("Updated route thresholds:", route_thresholds)

# evaluate using the new thresholds
accuracy = rl.evaluate(X=X, y=y)
print(f"Revised Accuracy: {accuracy*100:.2f}%")

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Original Accuracy: 78.95%


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Revised Accuracy: 84.21%


In [7]:
# Load model
llm_open = OpenAI(openai_api_base = "http://localhost:7999/v1",
                  model = "mistral-7b-instruct-v0.2",
                  openai_api_key = "hello",
                  temperature = 0.1,
                  top_p = 1,
                  max_tokens = 1024,
                  callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))

memory = ConversationBufferMemory(memory_key = "chat_history", return_messages=True,
                                  output_key = "result")


# Set prompt template

template = '''
Use only the following pieces of context to answer the question at the end. 
Keep your answers concise and do not provide additional explanations or interpretations. 
If the answer cannot be deduced from the context, just say that you don't know the answer, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:
'''

In [24]:
# Select relevant documents based on query and create retrievers
# If relevant documents cannot be identified, build retrievers on all documents
# Separate retrievers from the documents index and the transcripts index

#query = "Who teaches SIADS 511?"
#query = "Does he teach any other courses?"
query = "Who is Graham Hukill?"
r = rl(query)
print(r.name if r.name else "No match")

if r.name:
    doc_list = filter_pids(df_rag1, r.name)
    if len(doc_list) > 0:
        retriever1 = RAG1.as_langchain_retriever(doc_ids=doc_list)        
    trans_list = filter_pids(df_rag2, r.name)
    if len(trans_list) > 0:
        retriever2 = RAG2.as_langchain_retriever(doc_ids=trans_list)
else:
    retriever1 = RAG1.as_langchain_retriever() # Could set k if desired
    retriever2 = RAG2.as_langchain_retriever()

retriever = EnsembleRetriever(retrievers=[retriever1, retriever2], weights=[0.5, 0.5])


No match


In [25]:
# Define processing chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_open,
    chain_type="stuff",
    memory=memory,
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=template,
            input_variables=["context", "question"])},
)

In [26]:
# Function to display results
def process_llm_response(llm_response):
    print(f"\nQuestion: {llm_response['query']}")
    print(f"\nAnswer: {llm_response['result']}")
    print("\n\nSources:")
    for i, source in enumerate(llm_response["source_documents"]):
        m = source.metadata
        try:
            print(f"{i + 1}. {m['course_title']} ({m['course_number']}): {m['heading']}")
        except:
            print(f"{i + 1}. {m['course_title']} ({m['course_number']}): {m['source']}")

In [27]:
llm_response = qa_chain(query)
process_llm_response(llm_response)



[1m> Entering new RetrievalQA chain...[0m





[1m> Finished chain.[0m

Question: Who is Graham Hukill?

Answer: Graham Hukill is an Instructor for Database Architecture & Technology (SIADS 611) and SQL and Databases (SIADS 511) courses at the University of Michigan. He is also listed as an intermittent lecturer in the School of Information.


Sources:
1. Experiment Design and Analysis (SIADS 631): 01_stats-review-and-statistical-power-part-1.en.txt
2. Database Architecture & Technology (SIADS 611): Instructor And Course Assistants
3. Visual Exploration of Data (SIADS 521): 06_bonus-interview-with-damian-avila.en.txt
4. Data Science for Social Good (SIADS 688): 01_social-identity-for-social-good-theory-part-1.en.txt
5. SQL and Databases (SIADS 511): Instructor And Course Assistants
6. SQL and Databases (SIADS 511): Office Hours
7. Machine Learning Pipelines (SIADS 643): 03_interview-with-kevin-hartman.en.txt
8. Database Architecture & Technology (SIADS 611): Course Schedule
9. SQL and Databases (SIADS 511): 09_demonstration-gene

In [28]:
llm_response['source_documents']

[Document(page_content='Hi everyone.', metadata={'source': '01_stats-review-and-statistical-power-part-1.en.txt', 'course_number': 'SIADS 631', 'course_title': 'Experiment Design and Analysis', 'start_index': 0}),
 Document(page_content='Database Architecture & Technology (SIADS 611), Instructor And Course Assistants: - Instructor: Graham Hukill ( gshukill@umich.edu  ) -- Intermittent Lecturer in Information, School of Information\n- Course Team:\n  - Derek Bruckner ( dbrucknr@umich.edu  -- Software Programmer/Analyst Senior and Adjunct Lecturer in Information, School of Information\n  - Toby Kemp ( tobyk@umich.edu  ) -- Intermittent Lecturer in Information, School of Information', metadata={'source': '611_2023-11.md', 'heading': 'Instructor And Course Assistants', 'section': '2', 'course_number': 'SIADS 611', 'course_title': 'Database Architecture & Technology', 'course_date': 'November 2023', 'document': 'https://www.si.umich.edu/sites/default/files/611%20_0.pdf'}),
 Document(page_co