In [None]:
from langchain_community.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader, TextLoader, JSONLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA, ConversationalRetrievalChain, RetrievalQAWithSourcesChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import LlamaCpp
import pickle

In [None]:
# Course catalog for the Master of Applied Data Science (MADS) program.
# Maps each SIADS course number (as a string) to its course title; SyllabusLoader
# uses it to turn the course number found in a syllabus file name into a title.
# Source: https://www.si.umich.edu/programs/master-applied-data-science/curriculum/mads-courses


courses = {
    "501": "Being a Data Scientist",
    "502": "Math Methods I",
    "503": "Data Science Ethics",
    "505": "Data Manipulation",
    "511": "SQL and Databases",
    "515": "Efficient Data Processing",
    "516": "Big Data: Scalable Data Processing",
    "521": "Visual Exploration of Data",
    "522": "Information Visualization I",
    "523": "Communicating Data Science Results",
    "524": "Presenting Uncertainty",
    "532": "Data Mining I",
    "542": "Supervised Learning",
    "543": "Unsupervised Learning",
    "571": "Business SQL", # No syllabus for this one :(
    "593": "Milestone I",
    "601": "Qualitative Inquiry for Data Scientists",
    "602": "Math Methods II",
    "611": "Database Architecture & Technology",
    "622": "Information Visualization II",
    "630": "Causal Inference",
    "631": "Experiment Design and Analysis",
    "632": "Data Mining II",
    "642": "Deep Learning I",
    "643": "Machine Learning Pipelines",
    "644": "Reinforcement Learning Algorithms",
    "652": "Network Analysis",
    "655": "Applied Natural Language Processing",
    "673": "Cloud Computing",
    "680": "Learning Analytics and Educational Data Science",
    "681": "Health Analytics",
    "682": "Social Media Analytics",
    "685": "Search and Recommender Systems",
    "687": "Introduction to Sports Analytics",
    "688": "Data Science for Social Good",
    "696": "Milestone II",
    "699": "Capstone",
}


In [None]:
# Maps each course number (same keys as `courses`) to the URL of its syllabus PDF,
# plus a "handbook" entry for the student handbook. SyllabusLoader copies the
# matching URL into the `document` metadata field of every Document it creates.
documents = {
    "501": "https://www.si.umich.edu/sites/default/files/501%20_0.pdf",
    "502": "https://www.si.umich.edu/sites/default/files/502%20_0.pdf",
    "503": "https://www.si.umich.edu/sites/default/files/503%20_0.pdf",
    "505": "https://www.si.umich.edu/sites/default/files/505%20_1.pdf",
    "511": "https://www.si.umich.edu/sites/default/files/511%20_0.pdf",
    "515": "https://www.si.umich.edu/sites/default/files/515%20_0.pdf",
    "516": "https://www.si.umich.edu/sites/default/files/516%20_0.pdf",
    "521": "https://www.si.umich.edu/sites/default/files/521%20_0.pdf",
    "522": "https://www.si.umich.edu/sites/default/files/522%20_0.pdf",
    "523": "https://www.si.umich.edu/sites/default/files/523%20_0.pdf",
    "524": "https://www.si.umich.edu/sites/default/files/524%20_0.pdf",
    "532": "https://www.si.umich.edu/sites/default/files/532%20_0.pdf",
    "542": "https://www.si.umich.edu/sites/default/files/542%20_0.pdf",
    "543": "https://www.si.umich.edu/sites/default/files/543%20_0.pdf",
    "571": "",  # no syllabus is available for this course, hence no URL
    "593": "https://www.si.umich.edu/sites/default/files/593%20_0.pdf",
    "601": "https://www.si.umich.edu/sites/default/files/601%20_0.pdf",
    "602": "https://www.si.umich.edu/sites/default/files/602%20_0.pdf",
    "611": "https://www.si.umich.edu/sites/default/files/611%20_0.pdf",
    "622": "https://www.si.umich.edu/sites/default/files/622%20_0.pdf",
    "630": "https://www.si.umich.edu/sites/default/files/630%20_0.pdf",
    "631": "https://www.si.umich.edu/sites/default/files/631%20_0.pdf",
    "632": "https://www.si.umich.edu/sites/default/files/632%20_0.pdf",
    "642": "https://www.si.umich.edu/sites/default/files/642%20_0.pdf",
    "643": "https://www.si.umich.edu/sites/default/files/643%20_1.pdf",
    "644": "https://www.si.umich.edu/sites/default/files/644%20_0.pdf",
    "652": "https://www.si.umich.edu/sites/default/files/652%20_0.pdf",
    "655": "https://www.si.umich.edu/sites/default/files/655%20_0.pdf",
    "673": "https://www.si.umich.edu/sites/default/files/673%20_0.pdf",
    "680": "https://www.si.umich.edu/sites/default/files/680%20_0.pdf",
    "681": "https://www.si.umich.edu/sites/default/files/681%20_0.pdf",
    "682": "https://www.si.umich.edu/sites/default/files/682%20_0.pdf",
    "685": "https://www.si.umich.edu/sites/default/files/685%20_0.pdf",
    "687": "https://www.si.umich.edu/sites/default/files/687%20_1.pdf",
    "688": "https://www.si.umich.edu/sites/default/files/688%20_0.pdf",
    "696": "https://www.si.umich.edu/sites/default/files/696%20_0.pdf",
    "699": "https://www.si.umich.edu/sites/default/files/699%20.pdf",
    "handbook": "https://docs.google.com/document/d/1YEOcpdONdme5kmpNEnZpdbJeVFhEIw1pS0wq16QdH1I/edit"
}

In [None]:
import re
from datetime import datetime
from pathlib import Path
from typing import List

from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document


class SyllabusLoader(BaseLoader):
    """Split one Markdown syllabus (or the student handbook) into per-heading Documents.

    The file is cut at every Markdown heading. Each non-empty section becomes a
    ``Document`` whose text is prefixed with
    ``"Context from <course or handbook>, <heading>: "`` and whose metadata
    carries the file name, the heading, the heading's position in the file, the
    course details, and the URL looked up in the module-level ``documents``
    mapping. Text that appears before the first heading is not emitted.

    File names must be either ``handbook.md`` or ``<course>_<YYYY-MM>.md``, where
    ``<course>`` is a key of the module-level ``courses`` and ``documents``
    mappings (for example ``632_2022-10.md``).
    """

    def __init__(self, file_path: str):
        """Remember the path of the Markdown file to load.

        Args:
            file_path: Path to ``handbook.md`` or a ``<course>_<YYYY-MM>.md`` syllabus.
        """
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Read the file and return one Document per non-empty heading section.

        Returns:
            Documents in file order; an empty list when the file has no heading
            followed by content.

        Raises:
            RuntimeError: If the file cannot be opened or decoded as UTF-8.
            ValueError: If the file name is not ``handbook.md`` or
                ``<course>_<YYYY-MM>.md``, or the date part is not ``YYYY-MM``.
            KeyError: If the course number is missing from ``courses`` or ``documents``.
        """
        try:
            with open(self.file_path, encoding="UTF-8") as f:
                text = f.read()
        except Exception as e:
            raise RuntimeError(f"Error loading {self.file_path}") from e

        sections = self._split_sections(text)
        if not sections:
            # Nothing to emit, so the file name never has to be interpreted.
            return []

        # Path.name honours the platform's separators, unlike splitting on "/".
        file = Path(self.file_path).name
        # File-level details are identical for every section: resolve them once.
        context, file_metadata = self._describe_file(file)

        return [
            Document(
                page_content=f"Context from {context}, {heading}: {body}",
                metadata={
                    "source": file,
                    "heading": heading,
                    "section": f"{number}",
                    **file_metadata,
                },
            )
            for number, heading, body in sections
        ]

    @staticmethod
    def _split_sections(text: str) -> List[tuple]:
        """Cut ``text`` at Markdown headings into ``(number, heading, body)`` tuples.

        ``number`` is the 1-based position of the heading among *all* headings in
        the file, so numbering has gaps where empty sections were skipped.
        """
        # NOTE(review): "\s*" also matches newlines, so a heading line with no
        # title absorbs the next non-blank line as its title - confirm intended.
        heading_pattern = r"^(#{1,6})\s*(.*)$"
        matches = list(re.finditer(heading_pattern, text, flags=re.MULTILINE))

        sections = []
        for index, match in enumerate(matches):
            end = matches[index + 1].start() if index + 1 < len(matches) else len(text)

            # Drop the heading by position. A global str.replace would also delete
            # any later occurrence of the same text inside the section body.
            body = text[match.end():end]

            # There are a ton of links that don't make sense to the model, remove
            # this extra cruft *before* the empty check so a section that held
            # nothing else is skipped instead of indexed as an empty document.
            body = re.sub("Opens in a new tab", "", body, flags=re.IGNORECASE).strip()
            if not body:
                continue

            # Strip only the leading/trailing "#" markers so a "#" inside the
            # title (e.g. "C#") survives.
            heading = match.group().strip().strip("#").strip()
            sections.append((index + 1, heading, body))

        return sections

    @staticmethod
    def _describe_file(file: str) -> tuple:
        """Return ``(context label, file-level metadata)`` for a document file name."""
        # Special logic for the handbook: it has no course number, title or date.
        if file == "handbook.md":
            return "MADS Student Handbook", {
                "course_number": "",
                "course_title": "",
                "course_date": "",
                "document": documents["handbook"],
            }

        parts = Path(file).stem.split("_")
        if len(parts) != 2:
            raise ValueError(
                f"Unexpected file name {file!r}: expected 'handbook.md' or '<course>_<YYYY-MM>.md'"
            )
        course, date = parts
        course_number = f"SIADS {course}"
        course_title = courses[course]

        return f"{course_title} ({course_number})", {
            "course_number": course_number,
            "course_title": course_title,
            "course_date": datetime.strptime(date, "%Y-%m").strftime("%B %Y"),
            "document": documents[course],
        }

In [None]:
# Load the Markdown files matched by the glob in ./documents, splitting each
# one into per-heading Documents with SyllabusLoader.
loader = DirectoryLoader(
    "./documents",
    glob="*.md",
    recursive=True,
    show_progress=True,
    loader_cls=SyllabusLoader,
)
docs = loader.load()

# Total number of section-level Documents produced across all files.
len(docs)

100%|██████████| 37/37 [00:00<00:00, 238.92it/s]


713

In [None]:
# Spot-check a single loaded section (prefixed text plus metadata).
# NOTE(review): the saved output below shows a `document` URL that differs from
# documents["handbook"]; re-run this cell to refresh it.
docs[3]

Document(page_content='Context from MADS Student Handbook, Summary of Degree Requirements: To earn a Master of Applied Data Science at the School of Information, you must complete a minimum of 34 credit hours of SIADS coursework, with a cumulative grade point average of a B (3.0) or better, including required coursework and electives.', metadata={'source': 'handbook.md', 'heading': 'Summary of Degree Requirements', 'section': '7', 'course_number': '', 'course_title': '', 'course_date': '', 'document': 'https://www.patricksollars.com/umich/[PUBLIC]_MADS_Student_Handbook.pdf'})

In [None]:
# Directory that holds the persisted Chroma collection and the pickled embeddings.
# Keep the trailing "/": later cells build file paths with f"{persist_directory}...".
persist_directory = "./embeddings/"

In [None]:
# Sentence-Transformers model used to embed every section; the same object is
# passed to Chroma both when the collection is built and when it is reopened.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") # sentence transformers

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Embed every section and write the collection to persist_directory.
# Chroma fixes the HNSW distance function when a collection is created, so the
# metric has to be set here to match the `collection_metadata` used when the
# store is reopened; passing it only at reopen time does not change the index.
# NOTE(review): re-running this cell against an existing persist_directory
# presumably adds the documents a second time - clear the directory first.
vectordb = Chroma.from_documents(
    documents=docs,
    collection_name="embeddings",
    embedding=embeddings,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

vectordb.persist()
# Drop the reference so the store is reopened from disk further down.
vectordb = None

In [None]:
# Save the embeddings wrapper next to the vector store so it can be restored
# later without constructing it again. The path relies on persist_directory
# ending with "/".
with open(f"{persist_directory}embeddings.pickle", 'wb') as handle:
    pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Display the wrapper's configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
# Restore the embeddings wrapper written by the cell above.
# SECURITY: pickle.load can execute arbitrary code; only load a pickle file that
# this notebook produced itself, never one obtained from an untrusted source.
with open(f"{persist_directory}embeddings.pickle", 'rb') as handle:
    embeddings = pickle.load(handle)

In [None]:
# Load database from persistent store
vectordb = Chroma(
    "embeddings",
    embedding_function=embeddings,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

In [None]:
# Query the reopened store and show the matching sections with their scores.
vectordb.similarity_search_with_score("Which class involves time series analysis?")

[(Document(page_content='Context from Data Mining II (SIADS 632), Learning Outcomes: - Be able to formulate real world data as sequences, time series, or data streams.\n- Be able to formulate a real world problem as sequence prediction and solve it using N-Gram language models.\n- Be aware of how Hidden Markov Models work.\n- Extract patterns from time series data, including trends, seasons, cycles, and outliers.\n- Measure similarity between time series.\n- Conduct time series forecasting using autoregressions.\n- Articulate the restriction of data streams and strategies for mining data streams.\n- Implement Reservoir sampling, Bloom filter, and lossy counting.\n- Name real world applications of these data representations and methods.', metadata={'course_date': 'October 2022', 'course_number': 'SIADS 632', 'course_title': 'Data Mining II', 'document': 'https://www.si.umich.edu/sites/default/files/632%20_0.pdf', 'heading': 'Learning Outcomes', 'section': '6', 'source': '632_2022-10.md'

## Optional Ensemble Retriever

In [None]:
import numpy as np

def partition_equally_by_n(n):
    """Return ``n`` equal weights that sum to 1, as built-in floats.

    Args:
        n: Number of weights to produce (e.g. the number of retrievers).

    Returns:
        A list of ``n`` floats, each equal to ``1 / n``; an empty list for ``n == 0``.

    Raises:
        ValueError: If ``n`` is negative (raised by ``np.ones``).
    """
    # ndarray.tolist() converts to plain Python floats; list(ndarray) would keep
    # NumPy scalars, which display as np.float64(...) on NumPy 2.
    return (np.ones(n) / n).tolist()

partition_equally_by_n(3)

[0.3333333333333333, 0.3333333333333333, 0.3333333333333333]

In [None]:
from langchain.retrievers import TFIDFRetriever, BM25Retriever, EnsembleRetriever

# Embedding-based retriever over the Chroma store; k and fetch_k are forwarded
# to the store's search as keyword arguments.
chroma_retriever = vectordb.as_retriever(
    search_type="mmr", # Maximum marginal relevance
    search_kwargs={"k": 5, "fetch_k": 20},
)

# TF-IDF retriever built from the same Documents; tfidf_params presumably map
# onto the TfidfVectorizer options documented at:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf_retriever = TFIDFRetriever.from_documents(docs, tfidf_params={"stop_words":"english", "min_df": 1})

# BM25 retriever is built but currently left out of the ensemble (its entry in
# the list below is commented out).
bm25_retriever = BM25Retriever.from_documents(docs)

# Retrievers that take part in the ensemble; every one gets the same weight.
retrievers = [
    chroma_retriever,
    tfidf_retriever,
    # bm25_retriever
    ]
weights = partition_equally_by_n(len(retrievers))

ensemble_retriever = EnsembleRetriever(retrievers=retrievers, weights=weights)

In [None]:
# Run the same question through the ensemble to compare with the plain vector search.
# NOTE(review): the saved output below lacks the "Context from ..." prefix and the
# `document` metadata key that SyllabusLoader adds - re-run to refresh it.
ensemble_retriever.invoke("Which class involves time series analysis?")

[Document(page_content='This course extends Data Mining I and introduces additional data representations and tasks involved in mining real world data, with a particular focus on sequence modeling, time series analysis, and mining data streams.\xa0 It introduces how to extract patterns, compute similarities/distances of data, and make predictions under these data representations.', metadata={'source': '2023-02_632.md', 'heading': 'Course Overview and Prerequisites', 'section': '1', 'course_number': 'SIADS 632', 'course_title': 'Data Mining II', 'course_date': 'February 2023'}),
 Document(page_content='- Be able to formulate real world data as sequences, time series, or data streams.\n- Be able to formulate a real world problem as sequence prediction and solve it using N-Gram language models.\n- Be aware of how Hidden Markov Models work.\n- Extract patterns from time series data, including trends, seasons, cycles, and outliers.\n- Measure similarity between time series.\n- Conduct time s