# parsing data from SCU Course Catalog gitbook

In [43]:
import os
import glob
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_text_splitters import CharacterTextSplitter

raw_documents = []

html_files = glob.glob(os.path.join('gitbook_data', '**', '*.html'), recursive=True)

# Loop through each HTML file and parse it
for html_file in html_files:
    try:
        loader = UnstructuredHTMLLoader(html_file)
        data = loader.load() 
        raw_documents.append(data[0])
    except Exception as e:
        print(f'Failed to parse {html_file}: {e}')

text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
documents = text_splitter.split_documents(raw_documents)


Created a chunk of size 1834, which is longer than the specified 1500
Created a chunk of size 1609, which is longer than the specified 1500
Created a chunk of size 1609, which is longer than the specified 1500
Created a chunk of size 1898, which is longer than the specified 1500
Created a chunk of size 2687, which is longer than the specified 1500
Created a chunk of size 1774, which is longer than the specified 1500
Created a chunk of size 1587, which is longer than the specified 1500
Created a chunk of size 2298, which is longer than the specified 1500
Created a chunk of size 1548, which is longer than the specified 1500
Created a chunk of size 1733, which is longer than the specified 1500
Created a chunk of size 2002, which is longer than the specified 1500


In [48]:
# printing raw documents from html
print(f"number of documents: {len(raw_documents)}")
print(f"type of documents: {type(raw_documents[0])}")
print("now printing individual documents: \n")

for doc in raw_documents:
    print(f"document length: {len(doc.page_content)}")
    print(doc.page_content[:100])
    print('---')

number of documents: 129
type of documents: <class 'langchain_core.documents.base.Document'>
now printing individual documents: 

document length: 34
SCU Undergraduate Bulletin 2024-25
---
document length: 19
Search the Bulletin
---
document length: 39446
Individual Studies

Modern Languages and Literatures

Department of Mathematics and Computer Science
---
document length: 65313
Anthropology

Biology

Department of Art and Art History

Professors Emeriti: Kathleen Maxwell

Asso
---
document length: 2358
Kids on Campus

Medical and Health Humanities

College of Arts and Sciences

Dean: Daniel Press

Ass
---
document length: 27240
Undergraduate Degrees

Art and Art History

Department of Anthropology

Professors Emeritus: Mary He
---
document length: 37786
Child Studies

Communication

Department of Classics

Professor Emeritus: William S. Greenwalt

Asso
---
document length: 2275
Catholic Studies

International Studies

Gerontology Minor

Director: Patricia M. Simone

Gerontolog
---
d

In [49]:
# printing docs after character splitting
print(f"number of documents: {len(documents)}")
print(f"type of documents: {type(documents[0])}")
print("now printing individual documents: \n")

for doc in documents:
    print(f"document length: {len(doc.page_content)}")
    print(doc.page_content[:100])
    print('---')

number of documents: 1752
type of documents: <class 'langchain_core.documents.base.Document'>
now printing individual documents: 

document length: 34
SCU Undergraduate Bulletin 2024-25
---
document length: 19
Search the Bulletin
---
document length: 780
Individual Studies

Modern Languages and Literatures

Department of Mathematics and Computer Science
---
document length: 1312
Lecturers: Linda Burks, Katelyn Byington, Will Dana, Joshua Grice, Corey Irving, Phillip Jedlovec, M
---
document length: 1118
The Department of Mathematics and Computer Science maintains a program for the discovery, encouragem
---
document length: 1481
Seven approved 5-unit upper-division courses in mathematics (CSCI 162 also permitted), which must in
---
document length: 1441
MATH 122, 123

CSCI 10, 60, 61, 62, 183

CSCI 184 or CSEN/COEN 178

Two courses from CSCI 127, 163; 
---
document length: 1276
MATH 122, CSCI 161, and 163, and CSEN/COEN 177 and 177L

Five additional 4- or 5-unit upper-division
---
docum

In [46]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import os

if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.environ.get("OPENAI_API_KEY")
)

db = FAISS.from_documents(documents, OpenAIEmbeddings())


In [47]:


retriever = db.as_retriever()

# Retrieve the most similar text
retrieved_documents = retriever.invoke("what are the prereqs of coen11?")

# show the retrieved document's content
retrieved_documents[0].page_content

'11L. Advanced Programming Laboratory\n\nLaboratory for CSEN/COEN 11. Corequisite: CSEN/COEN 11. (1 unit)\n\n12. Abstract Data Types and Data Structures\n\nData abstraction: abstract data types, information hiding, interface specification. Basic data structures: stacks, queues, lists, binary trees, hashing, tables, graphs; implementation of abstract data types in the C language. Internal sorting: review of selection, insertion, and exchange sorts; quicksort, heapsort; recursion. Analysis of run-time behavior of algorithms; Big-O notation. Introduction to classes in C++. Credit not allowed for more than one introductory data structures class, such as CSEN/COEN 12 or CSCI 61. Prerequisite: a grade of C- or better in CSEN/COEN 11. Corequisite: CSEN/COEN 12L. Recommended corequisite: CSEN/COEN 19 or MATH 51. (4 units)\n\n12L. Abstract Data Types and Data Structures Laboratory\n\nLaboratory for CSEN/COEN 12. Corequisite: CSEN/COEN 12. (1 unit)\n\n19. Discrete Mathematics\n\nPredicate logic,