In [1]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import (
    RetrievalQA,
    ConversationalRetrievalChain,
    RetrievalQAWithSourcesChain,
)
from langchain.memory import ConversationBufferMemory
from langchain_openai.llms import OpenAI
from langchain.retrievers import EnsembleRetriever
from langchain_core.prompts import PromptTemplate

import pandas as pd
from ragatouille import RAGPretrainedModel
from semantic_router import Route, RouteLayer
from semantic_router.encoders import HuggingFaceEncoder



In [2]:
# Notebook-wide settings: retrieval/LLM identifiers and on-disk locations.
retrieval_strategy = "colbert"
model = "mistral-7b-instruct-v0.2"

# All locations are plain relative path strings; the index paths keep their
# trailing slash because later cells append file names by concatenation.
rootdir = ".."
persist_directory = "./../embeddings"
index_root = f"{rootdir}/../colbert_index/"
colbert_path = f"{rootdir}/../colbertv2.0/"
index_path = f"{index_root}colbert/indexes/documents/"
transcript_path = f"{index_root}colbert/indexes/transcripts/"

In [3]:
# Load the two prebuilt ColBERT indexes: RAG1 from the documents index,
# RAG2 from the transcripts index.
RAG1 = RAGPretrainedModel.from_index(index_path=index_path)
RAG2 = RAGPretrainedModel.from_index(index_path=transcript_path)

# Read each index's docid_metadata_map.json, transpose it, and move the
# resulting row labels into an `index` column (presumably the document ids,
# one row per document -- confirm against the JSON layout).
df_rag1 = pd.read_json(f"{index_path}docid_metadata_map.json").transpose().reset_index()
df_rag2 = pd.read_json(f"{transcript_path}docid_metadata_map.json").transpose().reset_index()

# Helper to identify relevant documents for retrievers
def filter_pids(df, search_term):
    """Return the `index` values of the rows whose course_number equals search_term.

    Args:
        df: DataFrame with at least an `index` column and a `course_number` column.
        search_term: Value compared for equality against `course_number`.

    Returns:
        A list of the matching `index` values, in row order; empty when
        nothing matches.
    """
    matches = df["course_number"] == search_term
    return list(df.loc[matches, "index"])

[Apr 07, 13:03:30] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




In [4]:
# Course catalogue: maps a course number (as a string) to the course title.
# The route-building cell iterates over this dict to create one semantic
# route per course.
courses = {
    "501": "Being a Data Scientist",
    "502": "Math Methods I",
    "503": "Data Science Ethics",
    "505": "Data Manipulation",
    "511": "SQL and Databases",
    "515": "Efficient Data Processing",
    "516": "Big Data: Scalable Data Processing",
    "521": "Visual Exploration of Data",
    "522": "Information Visualization I",
    "523": "Communicating Data Science Results",
    "524": "Presenting Uncertainty",
    "532": "Data Mining I",
    "542": "Supervised Learning",
    "543": "Unsupervised Learning",
    "571": "Business SQL",  # No syllabus for this one :(
    "593": "Milestone I",
    "601": "Qualitative Inquiry for Data Scientists",
    "602": "Math Methods II",
    "611": "Database Architecture & Technology",
    "622": "Information Visualization II",
    "630": "Causal Inference",
    "631": "Experiment Design and Analysis",
    "632": "Data Mining II",
    "642": "Deep Learning I",
    "643": "Machine Learning Pipelines",
    "644": "Reinforcement Learning Algorithms",
    "652": "Network Analysis",
    "655": "Applied Natural Language Processing",
    "673": "Cloud Computing",
    "680": "Learning Analytics and Educational Data Science",
    "681": "Health Analytics",
    "682": "Social Media Analytics",
    "685": "Search and Recommender Systems",
    "687": "Introduction to Sports Analytics",
    "688": "Data Science for Social Good",
    "696": "Milestone II",
    "699": "Capstone",
}

In [5]:
# Build semantic routes for each class.
# Every course gets a route named "SIADS <number>" whose utterances are the
# lower-cased route name (with and without the space) and the lower-cased
# course title, bare and suffixed with " class" / " course".
routes = []
for num, name in courses.items():
    route_name = f"SIADS {num}"
    route_key = route_name.lower()
    title = name.lower()
    route_utterances = [
        route_key,
        route_key.replace(" ", ""),
        title,
        f"{title} class",
        f"{title} course",
    ]
    routes.append(Route(name=route_name, utterances=route_utterances))

# Select local encoder and build route layer.
# NOTE(review): the model location is passed as `str=`; verify that
# HuggingFaceEncoder actually reads this keyword (its model field may be
# `name`) -- if the keyword is ignored, the library's default model is used
# instead of the one named here. The path is also an absolute, user-specific
# cache directory; TODO move it into configuration.
encoder = HuggingFaceEncoder(
    str="/Users/arnewman/.cache/huggingface/hub/models--sentence-transformers--UAE-Large-V1/",
    device="mps",
)
rl = RouteLayer(encoder=encoder, routes=routes)

[32m2024-04-07 13:03:31 INFO semantic_router.utils.logger local[0m


In [6]:
# Select relevant documents based on query and create retrievers.
# If relevant documents cannot be identified, build retrievers on all documents.
# Separate retrievers are built from the documents index and the transcripts index.

query = "What are the week 3 assignments in the network analysis class?"
r = rl(query)
print(r.name if r.name else "No match")


def _course_retriever(rag, metadata_df, course_name, k=5):
    """Build a LangChain retriever for one index, restricted to a course when possible.

    Args:
        rag: Loaded index object exposing `as_langchain_retriever`.
        metadata_df: Metadata frame for that index, as accepted by `filter_pids`.
        course_name: Route name matched for the query, or a falsy value when
            no route matched.
        k: Number of results the retriever is asked to return.

    Returns:
        A retriever limited to the ids `filter_pids` finds for `course_name`,
        or an unfiltered retriever over the whole index when no route matched
        or the course has no entries in this index.
    """
    doc_ids = filter_pids(metadata_df, course_name) if course_name else []
    if doc_ids:
        return rag.as_langchain_retriever(k=k, doc_ids=doc_ids)
    # Previously a matched route with no entries in an index left the
    # retriever variable unassigned (NameError on a fresh kernel, or a stale
    # retriever from an earlier run). Fall back to the full index instead.
    return rag.as_langchain_retriever(k=k)


retriever1 = _course_retriever(RAG1, df_rag1, r.name)
retriever2 = _course_retriever(RAG2, df_rag2, r.name)

# Combine both sources with equal weight.
retriever = EnsembleRetriever(retrievers=[retriever1, retriever2], weights=[0.5, 0.5])


SIADS 652


In [8]:
# Load model: completion client pointed at an OpenAI-compatible endpoint on localhost.
llm_open = OpenAI(
    openai_api_base="http://localhost:7999/v1",
    model=model,  # reuse the identifier from the config cell instead of repeating the literal
    openai_api_key="hello",  # presumably a placeholder for the local server -- confirm
    temperature=0.1,
    top_p=1,
    max_tokens=1024,
    # `callbacks` replaces the deprecated `callback_manager` argument;
    # the handler writes tokens to stdout when the LLM emits them.
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Set prompt template.
# The text below is sent to the model verbatim, with {context} and {question}
# filled in by the chain.
template = '''
Use only the following pieces of context to answer the question at the end. 
Keep your answers concise and do not provide additional explanations or interpretations. 
If the answer cannot be deduced from the context, just say that you don't know the answer, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:
'''

qa_prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

# Define processing chain: "stuff" all retrieved documents into one prompt
# and return the source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_open,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
    chain_type_kwargs={"prompt": qa_prompt},
)

In [9]:
# Function to display results
def process_llm_response(llm_response):
    """Print the question, the answer and a numbered list of source documents.

    Args:
        llm_response: Mapping with the keys `query`, `result` and
            `source_documents`. Each source document must expose a
            `metadata` mapping containing `course_title` and `course_number`,
            plus either `heading` or `source`.

    Raises:
        KeyError: If a required key (`query`, `result`, `source_documents`,
            `course_title`, `course_number`) is missing.
    """
    print(f"\nQuestion: {llm_response['query']}")
    print(f"\nAnswer: {llm_response['result']}")
    print("\n\nSources:")
    for position, source in enumerate(llm_response["source_documents"], start=1):
        m = source.metadata
        # Prefer the heading; fall back to the source name. Explicit key checks
        # replace the former bare `except:`, which hid unrelated errors and
        # still crashed when neither key was present.
        if "heading" in m:
            label = m["heading"]
        else:
            label = m.get("source", "unknown source")
        print(f"{position}. {m['course_title']} ({m['course_number']}): {label}")

In [10]:
# Run the chain on the query and display the answer with its sources.
# `invoke` replaces calling the chain object directly, which is deprecated;
# the returned mapping still carries `query`, `result` and `source_documents`.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response)

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m
Loading searcher for index documents for the first time... This may take a few seconds
[Apr 07, 13:06:17] #> Loading codec...
[Apr 07, 13:06:17] #> Loading IVF...
[Apr 07, 13:06:17] Loading segmented_lookup_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Apr 07, 13:06:18] #> Loading doclens...


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 843.92it/s]

[Apr 07, 13:06:18] #> Loading codes and residuals...



100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 44.63it/s]

[Apr 07, 13:06:18] Loading filter_pids_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...





[Apr 07, 13:06:18] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What are the week 3 assignments in the network analysis class?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  2024,  1996,  2733,  1017, 14799,  1999,  1996,
         2897,  4106,  2465,  1029,   102,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

Loading searcher for index transcripts for the first time... This may take a few seconds




[Apr 07, 13:06:18] #> Loading codec...
[Apr 07, 13:06:18] #> Loading IVF...
[Apr 07, 13:06:18] #> Loading doclens...


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1259.55it/s]

[Apr 07, 13:06:18] #> Loading codes and residuals...



100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  6.74it/s]


Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What are the week 3 assignments in the network analysis class?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  2024,  1996,  2733,  1017, 14799,  1999,  1996,
         2897,  4106,  2465,  1029,   102,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])


[1m> Finished chain.[0m

Question: What are the week 3 assignments in the network analysis class?

Answer: The week 3 assignments in the network analysis class include programming and reflection questions aimed at applying network analysis concepts covered in lectures and readings using NetworkX tutorials. The assignments will be partly auto-graded and partly manually gra