In [1]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.vectorstores import FAISS, Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain.retrievers import ParentDocumentRetriever, ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor, LLMChainFilter
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter, CharacterTextSplitter
from langchain.chains.query_constructor.base import AttributeInfo
from operator import itemgetter

from openai import OpenAI
import logging
import os

os.environ["OPENAI_API_KEY"] = "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"
embeddings = OpenAIEmbeddings()
# embeddings = HuggingFaceEmbeddings(model_name="Salesforce/SFR-Embedding-Mistral")

logging.getLogger().setLevel(logging.ERROR)

logging.basicConfig(level=logging.INFO, 
                    format='%(message)s')

In [2]:
import requests
from bs4 import BeautifulSoup
from langchain_community.vectorstores import Chroma

chunks = []
urls = ['https://www.scu.edu/bulletin/undergraduate/chapter-4/Accounting.html',"https://www.scu.edu/bulletin/undergraduate/chapter-5/DepartmentofComputerScienceandEngineering.html"]

for url in urls:
    response = requests.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')
    if("computer" in soup.h1.a.string.lower()):
        abbreviation = "COEN"
    else:
        abbreviation = "accounting"

    # Find all <h3> tags and the subsequent <p> tags
    for h3 in soup.find_all('h3'):
        next_p = h3.find_next_sibling('p')
        if next_p:
            chunks.append((abbreviation+h3.text, next_p.text, soup.h1.a.string.lower(),url))

for chunk in chunks[:3] + chunks[-3:]:
    print(f"Header: {chunk[0]}\nParagraph: {chunk[1]}\n---")


Header: accounting5. Personal Financial Planning
Paragraph: Overview of the tools and information necessary for personal business decision making. Includes analysis of financial services, credit and borrowing, taxes, compensation planning, consumer purchases, housing decisions, the time value of money, savings, and investments. (4 units)
---
Header: accounting11. Introduction to Financial Accounting
Paragraph: An introduction to the language of business, the accounting treatment of business transactions, and the analysis of financial reports. This course shows students how financial analysts use accounting principles to prepare financial statements and how managers, investors, and creditors use balance sheets, income statements, cash flow statements, and financial disclosures to make operating, investing, and financing decisions. Prerequisites: Must be a second-year student and have completed BUSN 70 or 170. Seniors who have not completed BUSN 70 may take this class with department per

In [3]:
#langchain documents creation + metadata tagging
documents_list = []
for chunk in chunks:
    single_document = Document(page_content=chunk[0] + " " + chunk[0] + ": " + chunk[1], metadata={'major': chunk[2],"course":"y","source":chunk[3]})
    documents_list.append(single_document)
for i in documents_list[0:3]:
    print(i)
for i in documents_list[-3:]:
    print(i)

page_content='accounting5. Personal Financial Planning accounting5. Personal Financial Planning: Overview of the tools and information necessary for personal business decision making. Includes analysis of financial services, credit and borrowing, taxes, compensation planning, consumer purchases, housing decisions, the time value of money, savings, and investments. (4 units)' metadata={'major': 'accounting', 'course': 'y', 'source': 'https://www.scu.edu/bulletin/undergraduate/chapter-4/Accounting.html'}
page_content='accounting11. Introduction to Financial Accounting accounting11. Introduction to Financial Accounting: An introduction to the language of business, the accounting treatment of business transactions, and the analysis of financial reports. This course shows students how financial analysts use accounting principles to prepare financial statements and how managers, investors, and creditors use balance sheets, income statements, cash flow statements, and financial disclosures to

In [4]:
winter_file = 'CalendarWinter.txt'
fall_file = 'CalendarFall.txt'
spring_file = 'CalendarSpring.txt'
major_file = "CSEMajor.txt"
minor_file = "CSEMinor.txt"
professor_file = "CSEProfessors.txt"
major_file2 = "AccountingMajor.txt"
professor_file2 = "AccountingProfessors.txt"

#academic calendar
chunks = []
with open(winter_file, 'r') as file:
    for line in file:
        chunks.append("Winter 2024 "+ line.strip())
with open(fall_file, 'r') as file:
    for line in file:
        chunks.append("Fall 2023 "+ line.strip())
with open(spring_file, 'r') as file:
    for line in file:
        chunks.append("Spring 2024 "+ line.strip())
for chunk in chunks:
    single_document = Document(page_content=chunk, metadata={"source":"https://www.scu.edu/media/offices/registrar/2023-24-ugrd-ac-cal-WITH-Summer-2024_3-19-2024pc.pdf"})
    documents_list.append(single_document)

#computer science
chunks = []
with open(major_file, 'r') as file:
    content = file.read()
    chunks.append(content)
with open(minor_file, 'r') as file:
    content = file.read()
    chunks.append(content)
with open(professor_file, 'r') as file:
    content = file.read()
    chunks.append(content)
for chunk in chunks:
    single_document = Document(page_content=chunk, metadata={"source":"https://www.scu.edu/bulletin/undergraduate/chapter-5/computer-science-and-engineering.html#59ffa8ec905c"})
    documents_list.append(single_document)

#accounting
chunks = []
with open(major_file2, 'r') as file:
    content = file.read()
    chunks.append(content)
with open(professor_file2, 'r') as file:
    content = file.read()
    chunks.append(content)
for chunk in chunks:
    single_document = Document(page_content=chunk, metadata={"source":"https://www.scu.edu/bulletin/undergraduate/chapter-4/accounting.html#88aede57b843"})
    documents_list.append(single_document)

In [5]:
for i in documents_list[0:3]:
    print(i)
for i in documents_list[50:53]:
    print(i)
for i in documents_list[-5:]:
    print(i)

page_content='accounting5. Personal Financial Planning accounting5. Personal Financial Planning: Overview of the tools and information necessary for personal business decision making. Includes analysis of financial services, credit and borrowing, taxes, compensation planning, consumer purchases, housing decisions, the time value of money, savings, and investments. (4 units)' metadata={'major': 'accounting', 'course': 'y', 'source': 'https://www.scu.edu/bulletin/undergraduate/chapter-4/Accounting.html'}
page_content='accounting11. Introduction to Financial Accounting accounting11. Introduction to Financial Accounting: An introduction to the language of business, the accounting treatment of business transactions, and the analysis of financial reports. This course shows students how financial analysts use accounting principles to prepare financial statements and how managers, investors, and creditors use balance sheets, income statements, cash flow statements, and financial disclosures to

In [6]:
#vectorstore creation
vectorstore = FAISS.from_documents(documents_list,embedding=embeddings)
vectorstore.save_local("faiss_index") #created index saved in RAG/model

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Loading faiss.
Successfully loaded faiss.


In [7]:
#vectorstore -> metadata-tagged + document compression + document filtering retriever
def advanced_retrievers(vstore, top_k=3):
    # defining metadata fields
    metadata_field_info = [
        AttributeInfo(
            name="major",
            description="Any university major from the list: ['accounting','computer science and engineering']",
            type="string",
        ),
        AttributeInfo(
            name="course",
            description="Whether the query mentions one specific academic course. One of ['y','n']",
            type="string",
        ),
    ]

    # pre-retrieval metadata-tagged search
    document_content_description = "chunks from a university database"
    llm = ChatOpenAI(model_name='gpt-4', temperature=0)
    meta_retriever = SelfQueryRetriever.from_llm(
        llm,
        vstore,
        document_content_description,
        metadata_field_info,
        verbose = True,
        search_kwargs={"k":top_k}
    )
    # # post-retrieval compression
    # compressor = LLMChainExtractor.from_llm(llm)
    # compression_retriever = ContextualCompressionRetriever(
    #     base_compressor=compressor, base_retriever=meta_retriever
    # )
    # post-retrieval filtering
    _filter = LLMChainFilter.from_llm(llm)
    filter_retriever = ContextualCompressionRetriever(
        base_compressor=_filter, base_retriever=meta_retriever
    )

    return filter_retriever


In [8]:
retriever = vectorstore.as_retriever()
retriever.get_relevant_documents("help me create a four year plan")

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[Document(page_content='Spring 2024 Apr 29-May 10 Annual mandatory academic advising period', metadata={'source': 'https://www.scu.edu/media/offices/registrar/2023-24-ugrd-ac-cal-WITH-Summer-2024_3-19-2024pc.pdf'}),
 Document(page_content='Fall 2023 Apr 24-May 5 M-F Mandatory academic advising period', metadata={'source': 'https://www.scu.edu/media/offices/registrar/2023-24-ugrd-ac-cal-WITH-Summer-2024_3-19-2024pc.pdf'}),
 Document(page_content='Winter 2024 Mar 25-29 Spring recess', metadata={'source': 'https://www.scu.edu/media/offices/registrar/2023-24-ugrd-ac-cal-WITH-Summer-2024_3-19-2024pc.pdf'}),
 Document(page_content='Winter 2024 Feb 15-28 Spring registration appointment period', metadata={'source': 'https://www.scu.edu/media/offices/registrar/2023-24-ugrd-ac-cal-WITH-Summer-2024_3-19-2024pc.pdf'})]

In [9]:
# create Langchain RAG chain and query
def retrieve_and_generate(query, super_retriever):
    template = """You are a contract chatbot answering questions. Use the following pieces of context to answer the question at the end. If the answer isn't in the context, say that you didn't find the answer within the document, don't try to make up an answer. End the answer with *source* and *page* of referenced document(s).

    Context:
    {context}

    Question: {question}
    Helpful Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)
    model = ChatOpenAI(model="GPT-4")

    # create langchain chain
    chain = (
        {
            "question": itemgetter("query"),
            "context": itemgetter("query") | super_retriever
        }
        | prompt
        | model
        | StrOutputParser()
    )

    return chain.invoke({'query': query})


## Features & Experiments

In [10]:
a_list = [i for i in range(100)]
a_list.append("COEN")

query = "what are the prerequisites for COEN11?"

for i in a_list:
    if str(i) in query.lower():
        print(query.lower().replace("coen", "computer science and engineering"))

print(query)

what are the prerequisites for computer science and engineering11?
what are the prerequisites for computer science and engineering11?
what are the prerequisites for COEN11?
