In [1]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4-1106-preview")

In [2]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader, DirectoryLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
import textwrap
from IPython.display import Markdown

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [4]:
# Load, chunk and index the contents of the blog.
# loader = WebBaseLoader(
#     web_paths=("https://stevens.smartcatalogiq.com/en/2023-2024/academic-catalog/department-of-computer-science/graduate-programs/master-of-science-in-machine-learning",),
#     )

loader = DirectoryLoader("../data/text files/", use_multithreading=False, show_progress=True)

docs = loader.load()

 99%|█████████▉| 2781/2799 [02:54<00:01, 15.96it/s] 


In [5]:
docs[0]

Document(page_content='Welcome to the Urban Ocean Observatory!\n\nThe New York Harbor Observing and Prediction System (NYHOPS) was established to permit an assessment of ocean, weather, environmental, and vessel traffic conditions throughout the New York Harbor and New Jersey Coast regions. The system is designed to provide a knowledge of meteorological and oceanographic conditions both in real-time and forecasted out to 72 hours in the Hudson River, the East River, NY/NJ Estuary, Raritan Bay, Long Island Sound and the coastal waters of New Jersey. In this web site you will see graphic images of:\n\nwater level; surface and bottom temperature; surface and bottom salinity; surface and bottom currents; NOAA winds; coastal waves - height, period and direction.\n\nDISCLAIMER: NYHOPS adheres to NOAA standards and guidelines for use and reliability of our forecasts. Click here to view.\n\nData administration and web development are managed by\n\nMr. David Runnels\n\n.\n\n&copy 2006-2022 Davi

In [6]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  return self.fget.__get__(instance, owner)()


In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [8]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
prompt = hub.pull("rlm/rag-prompt")
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [9]:
from langchain_core.prompts import ChatPromptTemplate

chat_template = ChatPromptTemplate.from_messages(
    [   
        ("system", "You are an assistant for question-answering tasks related to Stevens Institute Of Technology."),
        ("human", """ 
        Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know.
        If the topic is related to a course then ensure to mention to course numbers and display the result as a table.
        Answer in markdown format and render tables without code 
        Question: {question}
        Context: {context}
        Answer:"""),
    ]
)
chat_template

ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an assistant for question-answering tasks related to Stevens Institute Of Technology.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template=" \n        Use the following pieces of retrieved context to answer the question.\n        If you don't know the answer, just say that you don't know.\n        If the topic is related to a course then ensure to mention to course numbers and display the result as a table.\n        Answer in markdown format and render tables without code \n        Question: {question}\n        Context: {context}\n        Answer:"))])

In [10]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# rag_chain = (
#     { "context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )

In [11]:
from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [13]:
response = rag_chain_with_source.invoke("What are the core courses  for MS in Machine Learning?")

In [14]:
response

{'context': [Document(page_content='The machine learning certificate program provides a practical foundation that is necessary to be at the forefront deep learning theory. The highly focused curriculum allows you to apply or develop the appropriate skills for real-world applications. All students must take CS 559 Machine Learning: Fundamentals & Applications and then choose three additional courses from the list below.\n\nCOURSE #\n\nCOURSE NAME\n\nCS 559\n\nMachine Learning: Fundamentals & Applications\n\nCS 541\n\nArtificial Intelligence\n\nCS 560\n\nStatistical Machine Learning\n\nCS 582\n\nCausal Inference\n\nCS 583\n\nDeep Learning\n\nCS 584\n\nNatural Language Processing\n\nProgram Director:\n\nEduardo Bonelli\n\n(c)\n\n2024\n\nStevens Institute of Technology', metadata={'source': '..\\data\\text files\\https_  www.stevens.edu program Machine-Learning-Graduate-Certificate.txt'}),
  Document(page_content="View General Admissions Requirements >\n\nCoursework in the Data Science pro

In [15]:
to_markdown(response["answer"])

> The core courses for the MS in Machine Learning include the mandatory course CS 559 Machine Learning: Fundamentals & Applications, and students must then choose three additional courses from the following options: CS 541 Artificial Intelligence, CS 560 Statistical Machine Learning, CS 582 Causal Inference, CS 583 Deep Learning, and CS 584 Natural Language Processing.

In [24]:
# cleanup
vectorstore.delete_collection()