In [1]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4-1106-preview")

In [2]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader, DirectoryLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [12]:
import textwrap
from IPython.display import Markdown

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [3]:
# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://stevens.smartcatalogiq.com/en/2023-2024/academic-catalog/department-of-computer-science/graduate-programs/master-of-science-in-machine-learning",),
    )

# loader = DirectoryLoader("data/stevens/")

docs = loader.load()

In [4]:
docs

[Document(page_content='\n\n\n\r\n\tStevens Institute of Technology\xa0-\xa0Master of Science in Machine Learning\r\n\n\n\n\n\n\n\n\n\n\n\nSkip to main content\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCatalog Search\n\n\n\n\nSearch Options\n\n\nEntire Catalog\nPrograms\nCourses\n\nSearch\nhttp://stevens.smartcatalogiq.com\n4d3ddf27-e408-400c-b836-d1281514fc2f\n\n68aab440-d6be-464a-b585-3d808753e390\nprogram\n\n\n\n\n/Institutions/Stevens-Institution-of-Technology/json/2023-2024/Academic-Catalog-local.json/Institutions/Stevens-Institution-of-Technology/json/2023-2024/Academic-Catalog.json\n\r\n\t\tContents\r\n\t\n\n\n\nAboutOur History and MissionAccreditationAcademic IntegrityStudent LifeStudent ServicesUndergraduate EducationGraduate EducationTuition, Fees and Other Expenses for Undergraduate StudentsTuition, Fees and Other Expenses for Graduate StudentsFinancing a Stevens EducationSchool of BusinessSchool of Humanities, Arts and Social SciencesSchaefer School of Engineering and Scien

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

In [6]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
prompt = hub.pull("rlm/rag-prompt")
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [16]:
from langchain_core.prompts import ChatPromptTemplate

chat_template = ChatPromptTemplate.from_messages(
    [   
        ("system", "You are an assistant for question-answering tasks related to Stevens Institute Of Technology."),
        ("human", """ 
        Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise.
         
        Answer in markdown format
        If the topic is related to a course then ensure to mention to course numbers and display the result as a table
        Question: {question}
        Context: {context}
        Answer:"""),
    ]
)
chat_template

ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an assistant for question-answering tasks related to Stevens Institute Of Technology.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template=" \n        Use the following pieces of retrieved context to answer the question.\n        If you don't know the answer, just say that you don't know.\n        Use three sentences maximum and keep the answer concise.\n         \n        Answer in markdown format\n        If the topic is related to a course then ensure to mention to course numbers and display the result as a table\n        Question: {question}\n        Context: {context}\n        Answer:"))])

In [17]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# rag_chain = (
#     { "context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )

In [18]:
from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [19]:
response = rag_chain_with_source.invoke("What are the course electives for MS in Machine Learning?")

In [20]:
response

{'context': [Document(page_content='4 machine learning core courses (12 credits)\n\n\n3 machine learning core electives\xa0 (9 credits)\xa0\n\n\n3 general electives (9 credits), which can be any graduate course\n\n\nStudents must maintain a minimum grade of a C or above in any course and a minimum GPA of 3.000.\r\n\t\t\tMachine Learning Core Requirements: Complete Four out of the Five Courses Mentioned Below:\r\n\t\t\n\nCS 541Artificial Intelligence3\n\nCS 559Machine Learning: Fundamentals and Applications3\n\nCS 560Statistical Machine Learning3\n\nCS 583Deep Learning3\n\nCS 584Natural Language Processing3\n\n\r\n\t\t\tMachine Learning Core Electives Requirements: Complete Three of the Courses Mentioned Below\r\n\t\t\n\nCS 513Knowledge Discovery and Data Mining3\n\nCS 5323D Computer Vision3\n\nCS 544Health Informatics3\n\nCS 556Mathematical Foundations of Machine Learning3\n\nCS 558Computer Vision3\n\nCS 582Causal Inference3\n\nCS 589Text Mining and Information Retrieval3\n\nCS 598Visu

In [21]:
to_markdown(response["answer"])

> The course electives for an MS in Machine Learning include three machine learning core electives and three general electives. The machine learning core electives can be chosen from courses such as Knowledge Discovery and Data Mining, 3D Computer Vision, Health Informatics, Mathematical Foundations of Machine Learning, Computer Vision, Causal Inference, Text Mining and Information Retrieval, Visual Information Retrieval, Data Management and Exploration on the Web, and several others. General electives can be any graduate course that meets the program requirements.

In [11]:
response

'The course electives for MS in Machine Learning include: Wireless Networking, Cognitive Radio, Signal Processing, Wireless Security, RF Antenna, AI and Machine Learning.'

In [10]:
# cleanup
vectorstore.delete_collection()