In [1]:
# !pip install langchain gradio beautifulsoup4 unstructured jq kaleido sentence-transformers llama-cpp-python chromadb==0.4.14 rank_bm25

In [3]:
from langchain_community.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader, TextLoader, JSONLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA, ConversationalRetrievalChain, RetrievalQAWithSourcesChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import LlamaCpp
import glob
import re
import json
import pickle

In [4]:
from datetime import datetime
import re
from typing import List

from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader

# https://www.si.umich.edu/programs/master-applied-data-science/curriculum/mads-courses

courses = {
    "505": "Data Manipulation",
    "502": "Math Methods I",
    "515": "Efficient Data Processing",
    "521": "Visual Exploration of Data",
    "532": "Data Mining I",
    "501": "Being a Data Scientist",
    "511": "SQL and Databases",
    "522": "Information Visualization I",
    "503": "Data Science Ethics",
    "523": "Communicating Data Science Results",
    "542": "Supervised Learning",
    "543": "Unsupervised Learning",
    "611": "Database Architecture & Technology",
    "516": "Big Data: Scalable Data Processing",
    "622": "Information Visualization II",
    "593": "Milestone I",
    "631": "Experiment Design and Analysis",
    "652": "Network Analysis",
    "632": "Data Mining II",
    "673": "Cloud Computing",
    "642": "Deep Learning I",
    "643": "Machine Learning Pipelines",
    "655": "Applied Natural Language Processing",
    "524": "Presenting Uncertainty",
    "601": "Qualitative Inquiry for Data Scientists",
    "630": "Causal Inference",
    "696": "Milestone II",
    "682": "Social Media Analytics",
    "688": "Data Science for Social Good",
    "699": "Capstone",
    # "571": "Business SQL",
    # "602": "Math Methods II",
    # "644": "Reinforcement Learning Algorithms",
    # "680": "Learning Analytics and Educational Data Science",
    # "681": "Health Analytics",
    # "685": "Search and Recommender Systems",
    # "687": "Introduction to Sports Analytics",
}

class SyllabusLoader(BaseLoader):
    def __init__(self, file_path: str):
        self.file_path = file_path

    def load(self) -> List[Document]:
        try:
            with open(self.file_path, encoding="UTF-8") as f:
                text = f.read()
        except Exception as e:
            raise RuntimeError(f"Error loading {self.file_path}") from e

        heading_pattern = r"^(#{1,6})\s*(.*)$"
        headings = [
            (match.start(), match.group())
            for match in re.finditer(heading_pattern, text, flags=re.MULTILINE)
        ]

        sections = []
        for i in range(len(headings)):
            start, heading = headings[i]
            end = headings[i + 1][0] if i + 1 < len(headings) else len(text)
            section_text = text[start:end].replace(heading, "").strip()

            file = self.file_path.split("/")[-1]
            date, course = file.split("_")
            course_number = course.replace(".md", "")

            metadata = {
                "source": file,
                "heading": heading.replace("#", "").strip(),
                "section": f"{i + 1}",
                "course_number": f"SIADS {course_number}",
                "course_title": courses[course_number],
                "course_date": datetime.strptime(date, "%Y-%m").strftime("%B %Y"),
            }

            sections.append(Document(page_content=section_text, metadata=metadata))

        return sections

In [5]:
loader = DirectoryLoader(
    "./parsed_syllabi", glob="*.md", recursive=True, show_progress=True, loader_cls=SyllabusLoader
)
docs = loader.load()

len(docs)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:00<00:00, 230.68it/s]


571

In [27]:
docs[2]

Document(page_content='- This course begins on **Monday, September 27, 2021** and ends on **Sunday, October 24, 2021** .\n- Weekly assignments will be due on **Mondays at 11:59 pm (Ann Arbor, Michigan time-Eastern Standard Time - EST, UTC -5) except for week 4, which will be due on the last day of class, Sunday, October 24 at 11:59pm (Ann Arbor, Michigan time-Eastern Standard Time - EST, UTC -5)**\n\n  **Schedule of Weekly Office Hours via Zoom (Ann Arbor, Michigan time):**\n\nAccess via Live Events from the course menu. All OH meeting has passcode **502**\n\n- Alex McLeod-- Mondays 8-9 AM EST\n- Ben Merrill-- Wednesdays 6-7 PM EST\n- Nhan Le-- Thursdays 1-2 PM EST\n- Rotating (Alex, Nhan, Alex, Ben)-- Saturdays 10-11AM EST', metadata={'source': '2021-10_502.md', 'heading': 'Course Schedule', 'section': '3', 'course_number': 'SIADS 502', 'course_title': 'Math Methods I', 'course_date': 'October 2021'})

In [23]:
docs[46]

Document(page_content='This course will introduce basic concepts and tasks of data mining. It focuses on how to formally represent real-world information as basic data types (itemsets, matrices, and sequences) that facilitate downstream analytics tasks. Students will learn how to characterize each type of data through pattern extraction and similarity measures.\n\nCourse prerequisites: Mathematics Methods for Applied Data Science, Data Manipulation', metadata={'source': '2021-12_532.md', 'heading': '**Course Overview and Prerequisites**', 'section': '2', 'course_number': 'SIADS 532', 'course_title': 'Data Mining I', 'course_date': 'December 2021'})

In [11]:
persist_directory = "../syllabus_loader_embeddings/"

In [10]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
vectordb = Chroma.from_documents(
    documents=docs,
    collection_name="syllabi",
    embedding=embeddings,
    persist_directory=persist_directory,
)

vectordb.persist()
vectordb = None

In [12]:
with open(f"{persist_directory}embeddings.pickle", 'wb') as handle:
    pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

embeddings

FileNotFoundError: [Errno 2] No such file or directory: '../syllabus_loader_embeddings/embeddings.pickle'

In [None]:
with open(f"{persist_directory}embeddings.pickle", 'rb') as handle:
    embeddings = pickle.load(handle)

In [None]:
# Load database from persistent store
vectordb = Chroma(
    "syllabi",
    embedding_function=embeddings,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

In [None]:
vectordb.similarity_search_with_score("Which class involves time series analysis?")

[(Document(page_content='This course extends Data Mining I and introduces additional data representations and tasks involved in mining real world data, with a particular focus on sequence modeling, time series analysis, and mining data streams.\xa0 It introduces how to extract patterns, compute similarities/distances of data, and make predictions under these data representations.', metadata={'course_date': 'February 2023', 'course_number': 'SIADS 632', 'course_title': 'Data Mining II', 'heading': 'Course Overview and Prerequisites', 'source': '2023-02_632.md'}),
  0.9329724311828613),
 (Document(page_content='This course extends Data Mining I and introduces additional data representations and tasks involved in mining real world data, with a particular focus on sequence modeling, time series analysis, and mining data streams.\xa0 It introduces how to extract patterns, compute similarities/distances of data, and make predictions under these data representations.', metadata={'course_date'

In [None]:
import numpy as np

def partition_equally_by_n(n):
  return list(np.ones(n) / n)

partition_equally_by_n(3)

[0.3333333333333333, 0.3333333333333333, 0.3333333333333333]

In [None]:
from langchain.retrievers import TFIDFRetriever, BM25Retriever, EnsembleRetriever

chroma_retriever = vectordb.as_retriever(
    search_type="mmr", # Maximum marginal relevance
    search_kwargs={"k": 5, "fetch_k": 20},
)

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf_retriever = TFIDFRetriever.from_documents(docs, tfidf_params={"stop_words":"english", "min_df": 1})

bm25_retriever = BM25Retriever.from_documents(docs)

retrievers = [
    chroma_retriever,
    tfidf_retriever,
    # bm25_retriever
    ]
weights = partition_equally_by_n(len(retrievers))

ensemble_retriever = EnsembleRetriever(retrievers=retrievers, weights=weights)

In [None]:
ensemble_retriever.invoke("Which class involves time series analysis?")

[Document(page_content='This course extends Data Mining I and introduces additional data representations and tasks involved in mining real world data, with a particular focus on sequence modeling, time series analysis, and mining data streams.\xa0 It introduces how to extract patterns, compute similarities/distances of data, and make predictions under these data representations.', metadata={'source': '2023-02_632.md', 'heading': 'Course Overview and Prerequisites', 'section': '1', 'course_number': 'SIADS 632', 'course_title': 'Data Mining II', 'course_date': 'February 2023'}),
 Document(page_content='- Be able to formulate real world data as sequences, time series, or data streams.\n- Be able to formulate a real world problem as sequence prediction and solve it using N-Gram language models.\n- Be aware of how Hidden Markov Models work.\n- Extract patterns from time series data, including trends, seasons, cycles, and outliers.\n- Measure similarity between time series.\n- Conduct time s