In [1]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader

from datetime import datetime
import pickle
import re
from typing import List, Tuple
import json
import textwrap

In [2]:
# https://www.si.umich.edu/programs/master-applied-data-science/curriculum/mads-courses


courses = {
    "501": "Being a Data Scientist",
    "502": "Math Methods I",
    "503": "Data Science Ethics",
    "505": "Data Manipulation",
    "511": "SQL and Databases",
    "515": "Efficient Data Processing",
    "516": "Big Data: Scalable Data Processing",
    "521": "Visual Exploration of Data",
    "522": "Information Visualization I",
    "523": "Communicating Data Science Results",
    "524": "Presenting Uncertainty",
    "532": "Data Mining I",
    "542": "Supervised Learning",
    "543": "Unsupervised Learning",
    "571": "Business SQL",  # No syllabus for this one :(
    "593": "Milestone I",
    "601": "Qualitative Inquiry for Data Scientists",
    "602": "Math Methods II",
    "611": "Database Architecture & Technology",
    "622": "Information Visualization II",
    "630": "Causal Inference",
    "631": "Experiment Design and Analysis",
    "632": "Data Mining II",
    "642": "Deep Learning I",
    "643": "Machine Learning Pipelines",
    "644": "Reinforcement Learning Algorithms",
    "652": "Network Analysis",
    "655": "Applied Natural Language Processing",
    "673": "Cloud Computing",
    "680": "Learning Analytics and Educational Data Science",
    "681": "Health Analytics",
    "682": "Social Media Analytics",
    "685": "Search and Recommender Systems",
    "687": "Introduction to Sports Analytics",
    "688": "Data Science for Social Good",
    "696": "Milestone II",
    "699": "Capstone",
}

In [3]:
documents = {
    "501": "https://www.si.umich.edu/sites/default/files/501%20_0.pdf",
    "502": "https://www.si.umich.edu/sites/default/files/502%20_0.pdf",
    "503": "https://www.si.umich.edu/sites/default/files/503%20_0.pdf",
    "505": "https://www.si.umich.edu/sites/default/files/505%20_1.pdf",
    "511": "https://www.si.umich.edu/sites/default/files/511%20_0.pdf",
    "515": "https://www.si.umich.edu/sites/default/files/515%20_0.pdf",
    "516": "https://www.si.umich.edu/sites/default/files/516%20_0.pdf",
    "521": "https://www.si.umich.edu/sites/default/files/521%20_0.pdf",
    "522": "https://www.si.umich.edu/sites/default/files/522%20_0.pdf",
    "523": "https://www.si.umich.edu/sites/default/files/523%20_0.pdf",
    "524": "https://www.si.umich.edu/sites/default/files/524%20_0.pdf",
    "532": "https://www.si.umich.edu/sites/default/files/532%20_0.pdf",
    "542": "https://www.si.umich.edu/sites/default/files/542%20_0.pdf",
    "543": "https://www.si.umich.edu/sites/default/files/543%20_0.pdf",
    "571": "",
    "593": "https://www.si.umich.edu/sites/default/files/593%20_0.pdf",
    "601": "https://www.si.umich.edu/sites/default/files/601%20_0.pdf",
    "602": "https://www.si.umich.edu/sites/default/files/602%20_0.pdf",
    "611": "https://www.si.umich.edu/sites/default/files/611%20_0.pdf",
    "622": "https://www.si.umich.edu/sites/default/files/622%20_0.pdf",
    "630": "https://www.si.umich.edu/sites/default/files/630%20_0.pdf",
    "631": "https://www.si.umich.edu/sites/default/files/631%20_0.pdf",
    "632": "https://www.si.umich.edu/sites/default/files/632%20_0.pdf",
    "642": "https://www.si.umich.edu/sites/default/files/642%20_0.pdf",
    "643": "https://www.si.umich.edu/sites/default/files/643%20_1.pdf",
    "644": "https://www.si.umich.edu/sites/default/files/644%20_0.pdf",
    "652": "https://www.si.umich.edu/sites/default/files/652%20_0.pdf",
    "655": "https://www.si.umich.edu/sites/default/files/655%20_0.pdf",
    "673": "https://www.si.umich.edu/sites/default/files/673%20_0.pdf",
    "680": "https://www.si.umich.edu/sites/default/files/680%20_0.pdf",
    "681": "https://www.si.umich.edu/sites/default/files/681%20_0.pdf",
    "682": "https://www.si.umich.edu/sites/default/files/682%20_0.pdf",
    "685": "https://www.si.umich.edu/sites/default/files/685%20_0.pdf",
    "687": "https://www.si.umich.edu/sites/default/files/687%20_1.pdf",
    "688": "https://www.si.umich.edu/sites/default/files/688%20_0.pdf",
    "696": "https://www.si.umich.edu/sites/default/files/696%20_0.pdf",
    "699": "https://www.si.umich.edu/sites/default/files/699%20.pdf",
    "handbook": "https://docs.google.com/document/d/1YEOcpdONdme5kmpNEnZpdbJeVFhEIw1pS0wq16QdH1I/edit",
    "advising_guide": "https://docs.google.com/document/d/1A3zdTF0AYQY_zzD2-OlpSHeDxnWqFVEhXl446SyT_pA/edit",
}

In [4]:
class SyllabusLoader(BaseLoader):
    def __init__(self, file_path: str, chunk_size=1500):
        self.file_path = file_path
        self.chunk_size = chunk_size

    def _build_document(self, heading: str, section_text: str, i: int) -> Document:
        file = self.file_path.split("/")[-1]
        heading = heading.replace("#", "").strip()
        course, date = file.split("_")
        course_number = f"SIADS {course}"
        course_title = courses[course]
        section_text = f"{course_title} ({course_number}), {heading}: {section_text}"

        metadata = {
            "source": file,
            "heading": heading,
            "section": f"{i + 1}",
            "course_number": course_number,
            "course_title": course_title,
            "course_date": datetime.strptime(date.replace(".md", ""), "%Y-%m").strftime(
                "%B %Y"
            ),
            "document": documents[course.replace(".md", "")],
        }

        return Document(page_content=section_text, metadata=metadata)

    def load(self) -> List[Document]:
        try:
            with open(self.file_path, encoding="UTF-8") as f:
                text = f.read()
        except Exception as e:
            raise RuntimeError(f"Error loading {self.file_path}") from e

        heading_pattern = r"^(#{1,6})\s*(.*)$"
        headings = [
            (match.start(), match.group())
            for match in re.finditer(heading_pattern, text, flags=re.MULTILINE)
        ]

        sections = []
        for i in range(len(headings)):
            start, heading = headings[i]
            end = headings[i + 1][0] if i + 1 < len(headings) else len(text)
            section_text = text[start:end].replace(heading, "").strip()

            # Skip empty sections
            if not section_text:
                continue

            # There are a ton of links that don't make sense to the model, remove this extra cruft
            section_text = re.sub(
                "Opens in a new tab", "", section_text, flags=re.IGNORECASE
            )

            # ...and get rid of bolded words too
            section_text = section_text.replace("**", "")

            if len(section_text) < self.chunk_size:
                d = self._build_document(heading, section_text, i)
                sections.append(d)
            else:
                # Attempt to split long section_text into chunks based on newlines
                lines = section_text.split("\n")
                chunk = ""
                for line in lines:
                    if len(chunk) + len(line) + 1 <= self.chunk_size:  # +1 for newline
                        chunk += line + "\n"
                    else:
                        # Chunk is larger than what we want, build the doc and start a new one
                        if chunk:  # Check if it's empty
                            d = self._build_document(heading, chunk, i)
                            sections.append(d)
                        chunk = line + "\n"
                # Add the last chunk regardless of size
                if chunk:
                    d = self._build_document(heading, chunk, i)
                    sections.append(d)

        return sections

In [5]:
class MarkdownDocumentLoader(BaseLoader):
    def __init__(self, file_path: str, chunk_size=1500):
        self.file_path = file_path
        self.chunk_size = chunk_size

    def _build_document(self, hierarchy: str, text: str, i: int) -> Document:
        file = self.file_path.split("/")[-1]
        text = f"{hierarchy}: {text}"

        metadata = {
            "source": file,
            "heading": hierarchy,
            "section": f"{i + 1}",
            "course_number": "n/a",
            "course_title": "n/a",
            "course_date": "n/a",
            "document": documents[file.replace(".md", "")],
        }

        return Document(page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        try:
            with open(self.file_path, encoding="UTF-8") as f:
                text = f.read()
        except Exception as e:
            raise RuntimeError(f"Error loading {self.file_path}") from e

        heading_pattern = r"^(#{1,6})\s*(.*)$"
        headings = [
            (match.start(), len(match.group(1)), match.group(2).strip())
            for match in re.finditer(heading_pattern, text, flags=re.MULTILINE)
        ]

        sections = []
        heading_stack = []
        for i in range(len(headings)):
            start, level, heading_text = headings[i]
            end = headings[i + 1][0] if i + 1 < len(headings) else len(text)
            section_text = (
                text[start:end].replace(heading_text, "").replace("#", "").strip()
            )

            while heading_stack and heading_stack[-1][0] >= level:
                heading_stack.pop()
            heading_stack.append((level, heading_text))

            hierarchy = " > ".join([h[1] for h in heading_stack])

            # Skip empty sections
            if not section_text:
                continue

            if len(section_text) < self.chunk_size:
                d = self._build_document(hierarchy, section_text, i)
                sections.append(d)
            else:
                # Attempt to split long section_text into chunks based on newlines
                lines = section_text.split("\n")
                chunk = ""
                for line in lines:
                    if len(chunk) + len(line) + 1 <= self.chunk_size:  # +1 for newline
                        chunk += line + "\n"
                    else:
                        # Chunk is larger than what we want, build the doc and start a new one
                        if chunk:  # Check if it's empty
                            d = self._build_document(hierarchy, chunk, i)
                            sections.append(d)
                        chunk = line + "\n"
                # Add the last chunk regardless of size
                if chunk:
                    d = self._build_document(hierarchy, chunk, i)
                    sections.append(d)

        return sections

In [6]:
syllabus_loader = DirectoryLoader(
    "./documents/syllabi", glob="*.md", show_progress=True, loader_cls=SyllabusLoader
)

syllabus_docs = syllabus_loader.load()


docs_loader = DirectoryLoader(
    "./documents", glob="*.md", show_progress=True, loader_cls=MarkdownDocumentLoader
)

docs = docs_loader.load()


all_docs = syllabus_docs + docs

len(syllabus_docs), len(docs)

100%|██████████| 36/36 [00:00<00:00, 565.80it/s]
100%|██████████| 2/2 [00:00<00:00, 310.90it/s]


(700, 134)

In [7]:
def print_docs(docs: List[Document]) -> None:
    for doc in docs:
        print(textwrap.fill(doc.page_content, 100))
        print("")
        print(json.dumps(doc.metadata, indent=4))
        print("")
        print("=" * 50)
        print("")


print_docs([all_docs[618], all_docs[680], all_docs[740]])

Applied Natural Language Processing (SIADS 655), Library Access: Refer to the U-M Library's
information sheet  on accessing library resources from off-campus. For more information regarding
library support services, please refer to the U-M Library Resources  section of the UMSI Student
Handbook (access to the Student Orientation course required).

{
    "source": "655_2023-12.md",
    "heading": "Library Access",
    "section": "12",
    "course_number": "SIADS 655",
    "course_title": "Applied Natural Language Processing",
    "course_date": "December 2023",
    "document": "https://www.si.umich.edu/sites/default/files/655%20_0.pdf"
}


Supervised Learning (SIADS 542), Quizzes: Each week will also contain a short quiz to test your
knowledge of material in the lectures and readings.

{
    "source": "542_2023-12.md",
    "heading": "Quizzes",
    "section": "12",
    "course_number": "SIADS 542",
    "course_title": "Supervised Learning",
    "course_date": "December 2023",
    "docum

In [8]:
long_docs = [d for d in all_docs if len(d.page_content) > 1500]

print_docs(long_docs)
len(long_docs)

Math Methods I (SIADS 502), Course Syllabus Mads 502: Math Methods For Data Science Course Overview:
There are no prerequisites for this course. This course will review and introduce some mathematical
concepts relevant to applied data science. It will cover important concepts in linear algebra,
probability, and statistics. Week 1 covers some math basics about sets, types of numbers, types of
proofs, functions, derivatives, optimization, exponents, logarithms, Euler's number, and natural
logarithms. It also covers Part 1 of the unit on Linear Algebra, specifically vector operations with
math and with Python and the concepts of span, basis, and linear independence. Week 2 is Part 2 of
the unit on Linear Algebra. It covers matrix operations with math and with Python, the concept of
matrices as linear mappings, determinants, traces, and some basics about eigenvalues and
eigenvectors. Week 3 is the unit on Probability. It covers the concepts of random variables,
introduces some important di

41

In [9]:
persist_directory = "./embeddings"

In [10]:
with open(f"{persist_directory}/documents.pickle", "wb") as handle:
    pickle.dump(all_docs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2", encode_kwargs={"normalize_embeddings": True}
)  # sentence transformers

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [12]:
vectordb = Chroma.from_documents(
    documents=all_docs,
    collection_name="embeddings",
    embedding=embeddings,
    persist_directory=persist_directory,
)

vectordb.persist()
vectordb = None

In [13]:
with open(f"{persist_directory}/embeddings.pickle", "wb") as handle:
    pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [14]:
# Load database from persistent store
with open(f"{persist_directory}/embeddings.pickle", "rb") as handle:
    embeddings = pickle.load(handle)

vectordb = Chroma(
    "embeddings",
    embedding_function=embeddings,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

In [15]:
def print_docs_with_score(docs: List[Tuple[Document, float]]) -> None:
    for doc, score in docs:
        print(textwrap.fill(doc.page_content, 100))
        print("")
        print(json.dumps(doc.metadata, indent=4))
        print("")
        print("Similarity Score:", score)
        print("")
        print("=" * 50)
        print("")

In [16]:
result = vectordb.similarity_search_with_score(
    "Which class involves time series analysis?"
)

print_docs_with_score(result)

Data Mining II (SIADS 632), Learning Outcomes: - Be able to formulate real world data as sequences,
time series, or data streams. - Be able to formulate a real world problem as sequence prediction and
solve it using N-Gram language models. - Be aware of how Hidden Markov Models work. - Extract
patterns from time series data, including trends, seasons, cycles, and outliers. - Measure
similarity between time series. - Conduct time series forecasting using autoregressions. -
Articulate the restriction of data streams and strategies for mining data streams. - Implement
Reservoir sampling, Bloom filter, and lossy counting. - Name real world applications of these data
representations and methods.

{
    "course_date": "October 2022",
    "course_number": "SIADS 632",
    "course_title": "Data Mining II",
    "document": "https://www.si.umich.edu/sites/default/files/632%20_0.pdf",
    "heading": "Learning Outcomes",
    "section": "6",
    "source": "632_2022-10.md"
}

Similarity Score: 0.988

## Ensemble Retriever

In [17]:
import numpy as np

def partition_equally_by_n(n):
  return list(np.ones(n) / n)

partition_equally_by_n(3)

[0.3333333333333333, 0.3333333333333333, 0.3333333333333333]

In [18]:
from langchain.retrievers import TFIDFRetriever, BM25Retriever, EnsembleRetriever

chroma_retriever = vectordb.as_retriever(
    search_type="mmr",  # Maximum marginal relevance
    search_kwargs={"k": 5, "fetch_k": 20},
)

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf_retriever = TFIDFRetriever.from_documents(
    docs, tfidf_params={"stop_words": "english", "min_df": 1}
)

bm25_retriever = BM25Retriever.from_documents(docs)

retrievers = [
    chroma_retriever,
    tfidf_retriever,
    # bm25_retriever
]
weights = partition_equally_by_n(len(retrievers))

ensemble_retriever = EnsembleRetriever(retrievers=retrievers, weights=weights)

In [19]:
ensemble_result = ensemble_retriever.invoke(
    "Which class involves time series analysis?"
)

print_docs(ensemble_result)

Data Mining II (SIADS 632), Learning Outcomes: - Be able to formulate real world data as sequences,
time series, or data streams. - Be able to formulate a real world problem as sequence prediction and
solve it using N-Gram language models. - Be aware of how Hidden Markov Models work. - Extract
patterns from time series data, including trends, seasons, cycles, and outliers. - Measure
similarity between time series. - Conduct time series forecasting using autoregressions. -
Articulate the restriction of data streams and strategies for mining data streams. - Implement
Reservoir sampling, Bloom filter, and lossy counting. - Name real world applications of these data
representations and methods.

{
    "course_date": "October 2022",
    "course_number": "SIADS 632",
    "course_title": "Data Mining II",
    "document": "https://www.si.umich.edu/sites/default/files/632%20_0.pdf",
    "heading": "Learning Outcomes",
    "section": "6",
    "source": "632_2022-10.md"
}


Academics > Visiting (