In [1]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.vectorstores import FAISS, Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain.retrievers import ParentDocumentRetriever, ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor, LLMChainFilter
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter, CharacterTextSplitter
from langchain.chains.query_constructor.base import AttributeInfo
from operator import itemgetter

from openai import OpenAI
import logging
import os

os.environ["OPENAI_API_KEY"] = "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"
embeddings = OpenAIEmbeddings()
# embeddings = HuggingFaceEmbeddings(model_name="Salesforce/SFR-Embedding-Mistral")

logging.getLogger().setLevel(logging.ERROR)

logging.basicConfig(level=logging.INFO, 
                    format='%(message)s')

In [60]:
import requests
from bs4 import BeautifulSoup
from langchain_community.vectorstores import Chroma

chunks = []
urls = ['https://www.scu.edu/bulletin/undergraduate/chapter-4/Accounting.html',"https://www.scu.edu/bulletin/undergraduate/chapter-5/DepartmentofComputerScienceandEngineering.html"]

for url in urls:
    response = requests.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')

    h1 = soup.find('h1')
    if h1:
        department_name = h1.text.strip()
        description_parts = []

        # Find all immediate p siblings of h1 and join their texts
        next_siblings = h1.find_next_siblings('p')
        for p in next_siblings:
            description_parts.append(p.text.strip())

        # Join all parts of the description into one string
        full_description = " ".join(description_parts)
        print((department_name, full_description, soup.h1.text.lower(), 'your-url-here'))  # Replace 'your-url-here' with the actual URL

    # Now handle the h2 + p + p + ... chunks until "Lower-Division Courses"
    for h2 in soup.find_all('h2'):
        # Stop adding chunks when reaching "Lower-Division Courses" h2 header
        if h2.text.strip() == "Lower-Division Courses":
            break
        
        # Initialize header text
        section_text = [h2.text]

        # Iterate over subsequent siblings until the next h2
        for sibling in h2.find_next_siblings():
            if sibling.name == 'h2':
                break
            if sibling.name == 'p':
                section_text.append(sibling.text)
            elif sibling.name == 'ul':
                # Include list items text, considering each item might contain a <p> tag
                list_items = sibling.find_all('li')
                for item in list_items:
                    if item.p:
                        section_text.append(item.p.text)
                    else:
                        section_text.append(item.text)

        if len(section_text) > 1:
            # Join all section text contents and add to chunks
            chunks.append((" ".join(section_text), soup.h1.a.string.lower(), url))


    # Find all <h3> tags and the subsequent <p> tags
    for h3 in soup.find_all('h3'):
        next_p = h3.find_next_sibling('p')
        if next_p:
            chunks.append((soup.h1.a.string.lower() + " " + h3.text[0:4] + h3.text + ": " + next_p.text, soup.h1.a.string.lower(), url))
# for chunk in chunks[:3] + chunks[-3:]:
#     print(f"Header: {chunk[0]}\nParagraph: {chunk[1]}\n---")

# for chunk in chunks:
#     if "Requirements for " in chunk[0]:
#         print(chunk[0] + chunk[1])


('Accounting', '', '\n\naccounting\n', 'your-url-here')
('Computer Science and Engineering', '', '\n\ncomputer science and engineering\n', 'your-url-here')


In [18]:
#langchain documents creation + metadata tagging
documents_list = []
for chunk in chunks:
    single_document = Document(page_content=chunk[2] + " " + chunk[0][0:4] + chunk[0] + chunk[1], metadata={'major': chunk[2], 'source': chunk[3]})
    documents_list.append(single_document)
for i in documents_list[0:3]:
    print(i)
for i in documents_list[-3:]:
    print(i)

page_content='accounting 5. P5. Personal Financial PlanningOverview of the tools and information necessary for personal business decision making. Includes analysis of financial services, credit and borrowing, taxes, compensation planning, consumer purchases, housing decisions, the time value of money, savings, and investments. (4 units)' metadata={'major': 'accounting', 'source': 'https://www.scu.edu/bulletin/undergraduate/chapter-4/Accounting.html'}
page_content='accounting 11. 11. Introduction to Financial AccountingAn introduction to the language of business, the accounting treatment of business transactions, and the analysis of financial reports. This course shows students how financial analysts use accounting principles to prepare financial statements and how managers, investors, and creditors use balance sheets, income statements, cash flow statements, and financial disclosures to make operating, investing, and financing decisions. Prerequisites: Must be a second-year student and

In [19]:
#vectorstore creation
vectorstore = FAISS.from_documents(documents_list,embedding=embeddings)
vectorstore.save_local("faiss_index") #created index saved in RAG/model

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [20]:
#vectorstore -> metadata-tagged + document compression + document filtering retriever
def advanced_retrievers(vstore, top_k=3):
    # defining metadata fields
    metadata_field_info = [
        AttributeInfo(
            name="major",
            description="Any university major from the list: ['accounting','computer science and engineering']",
            type="string",
        ),
    ]

    # pre-retrieval metadata-tagged search
    document_content_description = "chunks from a university database"
    llm = ChatOpenAI(model_name='gpt-4', temperature=0)
    meta_retriever = SelfQueryRetriever.from_llm(
        llm,
        vstore,
        document_content_description,
        metadata_field_info,
        verbose = True,
        search_kwargs={"k":top_k}
    )
    # # post-retrieval compression
    # compressor = LLMChainExtractor.from_llm(llm)
    # compression_retriever = ContextualCompressionRetriever(
    #     base_compressor=compressor, base_retriever=meta_retriever
    # )
    # post-retrieval filtering
    _filter = LLMChainFilter.from_llm(llm)
    filter_retriever = ContextualCompressionRetriever(
        base_compressor=_filter, base_retriever=meta_retriever
    )

    return filter_retriever


In [29]:
retriever = vectorstore.as_retriever()
retriever.get_relevant_documents("what happens in 11?")

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[Document(page_content='computer science and engineering 11L.11L. Advanced Programming LaboratoryLaboratory for COEN 11. Corequisite: COEN 11. (1 unit)', metadata={'major': 'computer science and engineering', 'source': 'https://www.scu.edu/bulletin/undergraduate/chapter-5/DepartmentofComputerScienceandEngineering.html'}),
 Document(page_content='computer science and engineering 11. 11. Advanced ProgrammingThe C Language: structure and style. Types, operators, and expressions. Control flow. Functions. Pointers, arrays, and strings. Structures and dynamic memory allocation. I/O and file processing. Special operators. Recursion and threads. The Unix environment. Prerequisites: Previous programming experience and/or a grade of C- or better in an introductory computer programming course such as COEN 10, CSCI 10, or OMIS 30. Corequisite: COEN 11L. (4 units)', metadata={'major': 'computer science and engineering', 'source': 'https://www.scu.edu/bulletin/undergraduate/chapter-5/DepartmentofCom

In [13]:
# create Langchain RAG chain and query
def retrieve_and_generate(query, super_retriever):
    template = """You are a contract chatbot answering questions. Use the following pieces of context to answer the question at the end. If the answer isn't in the context, say that you didn't find the answer within the document, don't try to make up an answer. End the answer with *source* and *page* of referenced document(s).

    Context:
    {context}

    Question: {question}
    Helpful Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)
    model = ChatOpenAI(model="GPT-4")

    # create langchain chain
    chain = (
        {
            "question": itemgetter("query"),
            "context": itemgetter("query") | super_retriever
        }
        | prompt
        | model
        | StrOutputParser()
    )

    return chain.invoke({'query': query})


## Features & Experiments

In [14]:
a_list = [i for i in range(100)]
a_list.append("COEN")

query = "what are the prerequisites for COEN11?"

for i in a_list:
    if str(i) in query.lower():
        print(query.lower().replace("coen", "computer science and engineering"))

print(query)

what are the prerequisites for computer science and engineering11?
what are the prerequisites for computer science and engineering11?
what are the prerequisites for COEN11?


In [45]:
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)


<!DOCTYPE HTML>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>Computer Science and Engineering · GitBook</title>
<meta content="IE=edge" http-equiv="X-UA-Compatible">
<meta content="" name="description"/>
<meta content="GitBook 3.2.3" name="generator"/>
<link href="../gitbook/style.css" rel="stylesheet"/>
<link href="../gitbook/gitbook-plugin-highlight/website.css" rel="stylesheet"/>
<link href="../gitbook/gitbook-plugin-search/search.css" rel="stylesheet"/>
<link href="../gitbook/gitbook-plugin-fontsettings/website.css" rel="stylesheet"/>
<meta content="true" name="HandheldFriendly">
<meta content="width=device-width, initial-scale=1, user-scalable=yes" name="viewport"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="black" name="apple-mobile-web-app-status-bar-style"/>
<link href="../gitbook/images/apple-touch-icon-precomposed-152.png" rel="apple-touch-icon-precomposed" sizes="15