In [1]:
from langchain_openai import ChatOpenAI

# Chat model that produces the final answer at the end of the retrieval chain.
# NOTE(review): presumably authenticates through the OPENAI_API_KEY environment
# variable — confirm it is set before running this cell.
llm = ChatOpenAI(model="gpt-4-1106-preview")

In [2]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader, DirectoryLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
from glob import glob
from utils import to_markdown
from urllib.parse import urlparse
from collections import defaultdict
from utils import to_markdown

In [4]:
def contains_course_info(url):
    """Tell whether a URL's path is nested deeply enough to be kept.

    The path must contain at least five "/" separators, i.e. splitting it on
    "/" yields more than five pieces (the piece before the leading slash is
    empty and counts as one, and so does the piece after a trailing slash).

    Args:
        url: Absolute or relative URL string.

    Returns:
        True when the path meets the depth threshold, otherwise False.
    """
    path = urlparse(url).path
    # n separators always produce n + 1 pieces, so "> 5 pieces" == ">= 5 slashes".
    return path.count("/") >= 5

In [5]:
# Build the list of catalog URLs worth indexing from the raw crawl output.
# A URL is kept only when it belongs to the 2023-2024 catalog, contains none of
# the fragments below, and passes the contains_course_info() depth check.
EXCLUDED_URL_FRAGMENTS = (
    "narrative-courses",
    "javascript",
    ".pdf",
    ".aspx",
    "degree-programs2",
    "#middle",
)

with open("stevens_scraper/crawled_urls.txt") as fp:
    # A set comprehension normalises case and removes duplicates in one pass.
    crawled_urls = {line.strip().lower() for line in fp}

# Sort by (length, url) rather than length alone: set iteration order is
# arbitrary, so ties on length would otherwise come out in a different order on
# every kernel and the exported file would not be reproducible.
files = sorted(
    (
        url
        for url in crawled_urls
        if "2023-2024" in url
        and not any(fragment in url for fragment in EXCLUDED_URL_FRAGMENTS)
        and contains_course_info(url)
    ),
    key=lambda url: (len(url), url),
)
print(len(files))

# "w" is enough: the file is only written, never read back through this handle.
with open("stevens_scraper/crawled_urls_v1.txt", "w") as fp:
    fp.write("\n".join(files))

3434


In [6]:
# Group the filtered URLs by the first path segment that remains once the
# catalog root has been removed from the path. Paths that do not contain the
# root still start with "/", so they are grouped under the empty-string key.
primary = defaultdict(list)
for url in files:
    catalog_path = urlparse(url.lower()).path.replace("/en/2023-2024/academic-catalog/", "")
    section = catalog_path.partition("/")[0]
    primary[section].append(url)

In [7]:
# Top-level catalog sections that belong to a school or department. They are
# compared against the keys of `primary` so that these sections can be left out
# when the remaining sections are listed.
school_department_keys = [
    "school-of-business",
    "department-of-chemical-engineering-and-materials-science",
    "department-of-electrical-and-computer-engineering",
    "department-of-mathematical-sciences",
    "department-of-chemistry-and-chemical-biology",
    "department-of-computer-science",
    "department-of-civil-environmental-and-ocean-engineering",
    "department-of-mechanical-engineering",
    "department-of-physics",
    "school-of-systems-and-enterprises",
    "schaefer-school-of-engineering-and-science",
    "department-of-biomedical-engineering",
    "school-of-humanities-arts-and-social-sciences"
]

In [8]:
# Print the URL count of every section that is neither a school/department
# section nor the empty-string bucket.
for key, urls in primary.items():
    if not key or key in school_department_keys:
        continue
    print(key, len(urls))

courses 2889
financing-a-stevens-education 44
undergraduate-education 77
student-services 26
graduate-education 24
tuition-fees-and-other-expenses-for-undergraduate-students 5
tuition-fees-and-other-expenses-for-graduate-students 5
student-life 6
academic-integrity 2


In [41]:
# Display the URLs grouped under the "graduate-education" section, sorted.
sorted(primary["graduate-education"])

['https://stevens.smartcatalogiq.com/en/2023-2024/academic-catalog/graduate-education/copy-of-applying-for-admissions',
 'https://stevens.smartcatalogiq.com/en/2023-2024/academic-catalog/graduate-education/graduate-procedures-and-requirements',
 'https://stevens.smartcatalogiq.com/en/2023-2024/academic-catalog/graduate-education/graduate-procedures-and-requirements/academic-probation',
 'https://stevens.smartcatalogiq.com/en/2023-2024/academic-catalog/graduate-education/graduate-procedures-and-requirements/academic-standing',
 'https://stevens.smartcatalogiq.com/en/2023-2024/academic-catalog/graduate-education/graduate-procedures-and-requirements/course-options',
 'https://stevens.smartcatalogiq.com/en/2023-2024/academic-catalog/graduate-education/graduate-procedures-and-requirements/course-options/auditing-courses',
 'https://stevens.smartcatalogiq.com/en/2023-2024/academic-catalog/graduate-education/graduate-procedures-and-requirements/course-options/course-by-application',
 'https:/

In [37]:
# Export the URLs of one catalog section to stevens_scraper/<section>.txt.
# The file is opened with "w", not "a": the joined text has no trailing newline,
# so appending on a re-run would glue the last URL of the previous run onto the
# first URL of the new one and duplicate the whole list.
section = "tuition-fees-and-other-expenses-for-undergraduate-students"
with open(f"stevens_scraper/{section}.txt", "w") as fp:
    # Sorted so the exported file is identical from run to run.
    fp.write("\n".join(sorted(primary[section])))

In [11]:
import pandas as pd
# Load the scraped pages of the "financing-a-stevens-education" section.
# The following cells read a "url" column and a "text" column from this frame.
df = pd.read_csv("data/financing-a-stevens-education.csv")

In [12]:
# Every URL is expected to end with "/"; fail fast if that does not hold.
assert df["url"].str.endswith("/").sum() == len(df)

# Drop the trailing slash, then key the rows by URL.
# NOTE: `df` is re-bound to the URL-indexed frame, so this cell cannot be run
# twice in a row — re-run the cell that loads `df` first.
df["url"] = df["url"].str[:-1]
df = df.set_index("url")

# {url: {column: value, ...}} for every remaining column of the frame.
graduate_tuition_fees_info = df.to_dict(orient="index")

In [13]:
# Wrap every scraped page in a LangChain Document.
# The page URL (the dict key) is stored as "source" metadata so that documents
# returned by the retriever can be traced back to the page they came from;
# previously the URL was discarded and retrieved context had no provenance.
docs = [
    Document(page_content=record["text"], metadata={"source": url})
    for url, record in graduate_tuition_fees_info.items()
]

In [14]:
# Embedding model used to vectorise the catalog pages.
#
# `trust_remote_code` is intentionally not enabled: it allows Python code
# shipped inside the model repository to run locally, and
# sentence-transformers/all-mpnet-base-v2 is a stock architecture that loads
# without it.
model_kwargs = {}  # e.g. {"device": "cpu"} to force CPU inference
# Normalised vectors make similarity scores comparable across documents.
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [15]:
# Each page is embedded as one whole document; the RecursiveCharacterTextSplitter
# chunking step (chunk_size=1000, chunk_overlap=200) is currently disabled.
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)

In [16]:
# Similarity-search retriever over the vector store; returns the 5 closest
# documents for each query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
# The stock hub prompt (hub.pull("rlm/rag-prompt")) is not pulled here.

In [17]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt for the answering step: a fixed system role plus a human turn that
# receives the user's `question` and the retrieved `context`.
# NOTE: the human message is a triple-quoted string, so its line breaks and
# leading indentation are part of the template text exactly as written.
chat_template = ChatPromptTemplate.from_messages(
    [   
        ("system", "You are an assistant for question-answering tasks related to Stevens Institute Of Technology."),
        ("human", """ 
        Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know.
        If the topic is related to a course then ensure to mention to course numbers and display the result as a table.
        Answer in markdown format and render tables without code 
        Question: {question}
        Context: {context}
        Answer:"""),
    ]
)
# Display the assembled template for inspection.
chat_template

ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an assistant for question-answering tasks related to Stevens Institute Of Technology.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template=" \n        Use the following pieces of retrieved context to answer the question.\n        If you don't know the answer, just say that you don't know.\n        If the topic is related to a course then ensure to mention to course numbers and display the result as a table.\n        Answer in markdown format and render tables without code \n        Question: {question}\n        Context: {context}\n        Answer:"))])

In [18]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in their original order, separated by one
        blank line; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

# rag_chain = (
#     { "context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )

In [19]:
from langchain_core.runnables import RunnableParallel

# Answering stage: replace the retrieved documents under "context" with their
# joined text, fill the prompt, call the model and return its reply as a string.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda inputs: format_docs(inputs["context"]))
    | chat_template
    | llm
    | StrOutputParser()
)

# Retrieval stage: fetch documents for the query while passing the query itself
# through unchanged as "question".
retrieval_stage = RunnableParallel(context=retriever, question=RunnablePassthrough())

# Full chain: keeps "context" and "question" in the result and adds "answer".
rag_chain_with_source = retrieval_stage.assign(answer=rag_chain_from_docs)

In [29]:
# Run the full chain. The result holds the retrieved documents under "context",
# the original query under "question" and the generated text under "answer".
response = rag_chain_with_source.invoke("Give me detailed information regarding teaching assistantships for graduate students?")

In [31]:
# Show the generated answer through the to_markdown helper imported from utils.
to_markdown(response["answer"])

> | Type | Description | Tuition and Fee Support | Stipend | Work Requirement | Course Load | Appointment Review |
> |------|-------------|-------------------------|---------|------------------|-------------|-------------------|
> | Teaching Assistantships | Provide support for teaching services in academic departments. | Yes | Monthly (amount not specified) | Not specified | Not specified | By the Dean of Academic Administration after academic department recommendation |
> | Graduate Assistantships | Available in every academic department and some non-academic departments for teaching, research, or other services. | Yes, for up to three courses per semester | Yes, plus possible summer support | Up to 20 hours per week, maintain satisfactory academic performance | Reduced, but can still complete master’s in two years or less | By the appropriate department director or principal investigator |
> | Research Assistantships | Provide support for services on sponsored research contracts. | Yes | Monthly (amount not specified) | Not specified | Not specified | By the Office of Graduate Admissions after academic department recommendation |
> | Financial Aid Sources | Includes fellowships, on-campus employment, employer tuition assistance plans, loan funds, deferred payment plans. | Not specified | Not specified | Not specified | Not specified | Consult with department for more information |
> 
> **Note**: Specific amounts for tuition support, fees, and stipends are not provided in the context given. Additional details would be available upon direct consultation with the respective departments or the Office of Graduate Academics.

In [46]:
# Cleanup: delete the Chroma collection that backs `vectorstore`.
# NOTE(review): the retriever and chains built on this store presumably stop
# working afterwards — rebuild the vector store before querying again.
vectorstore.delete_collection()

In [4]:
import json

# Load the chain registry and index it as {chain name: chain description}.
# The file is read inside a `with` block so the handle is closed right away
# instead of being left open for the lifetime of the kernel.
with open("../chains/chain_info.json") as fp:
    chain_records = json.load(fp)

# A separate name is used for the raw records so that `chain_info` only ever
# refers to the name -> description mapping.
chain_info = {record["name"]: record["description"] for record in chain_records}
chain_info

{'financing_a_stevens_education': 'This contains all the information in regards to helping students finance their education at Stevens by providing information regarding financing sources, institution financing programs, financial aid and state financing',
 'tuition_fees_and_other_expenses_for_undergraduate_students': 'This contains all the information regarding tution fees and additional fees for undergraduate students',
 'tuition_fees_and_other_expenses_for_graduate_students': 'This contains all the information regarding tution fees and additional fees for graduate students',
 'student_life_at_stevens': 'This contains all the information regarding student services including sports, code of conduct and student government bodies',
 'student_services_at_stevens': 'This contains all the information regarding academic tutoring, career guidance, counseling, disability accommodations, financial aid management, health services, and housing options to facilitate student success and well-being