In [1]:
from ragatouille import RAGPretrainedModel
from datetime import datetime
import pickle
import re
from typing import List, Tuple
import json
import textwrap
import logging

In [10]:
# Locations used throughout the notebook, all relative to the notebook's folder.
# ColBERT base model checkpoint directory:
colbert_path = "../colbertv2.0/"
# Root directory under which ColBERT indexes are written:
index_root = "../colbert_index/"
# Directory holding the previously processed, pickled documents:
persist_directory = "../embeddings"

In [3]:
# Load in previously processed documents - syllabi and advising.
# The pickle is read from `persist_directory` and bound to `documents`, which the
# following cells iterate over.
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only load
# a documents.pickle that was produced by this project's own processing step.
with open(f"{persist_directory}/documents.pickle", "rb") as handle:
    documents = pickle.load(handle)

In [6]:
# Separate each document into its text and its metadata, producing two parallel
# lists (same order as `documents`) that are handed to RAG.index() further down.
doc_list = [entry.page_content for entry in documents]
metadata_list = [entry.metadata for entry in documents]

In [11]:
# Build the model from the base checkpoint downloaded from Hugging Face
# (https://huggingface.co/colbert-ir/colbertv2.0) and stored at `colbert_path`.
# Indexes are kept under `index_root`.
# Note: this does _not_ recognize the Apple Silicon GPU at this time.
RAG = RAGPretrainedModel.from_pretrained(
    colbert_path,
    index_root=index_root,
)

[Apr 01, 19:58:57] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




In [17]:
# Build a new index called "documents" from the document texts and their metadata.
# Although the documents were already chunked, they are still too long as they stand:
# according to the documentation, 512 is about the maximum useful length, so the
# documents are split again while indexing.
# The call returns the index directory path, which is displayed as the cell output.
RAG.index(
    index_name="documents",
    collection=doc_list,
    document_metadatas=metadata_list,
    split_documents=True,
    max_document_length=512,
)

New index_name received! Updating current index_name (documents) to documents
This is a behaviour change from RAGatouille 0.8.0 onwards.
This works fine for most users and smallish datasets, but can be considerably slower than FAISS and could cause worse results in some situations.
If you're confident with FAISS working on your machine, pass use_faiss=True to revert to the FAISS-using behaviour.
--------------------


[Apr 01, 20:06:17] #> Note: Output directory ../colbert_index/colbert/indexes/documents already exists


[Apr 01, 20:06:18] [0] 		 #> Encoding 835 passages..


100%|███████████████████████████████████████████| 27/27 [02:07<00:00,  4.71s/it]

[Apr 01, 20:08:26] [0] 		 avg_doclen_est = 103.75569152832031 	 len(local_sample) = 835





[Apr 01, 20:08:35] [0] 		 Creating 4,096 partitions.
[Apr 01, 20:08:35] [0] 		 *Estimated* 86,636 embeddings.
[Apr 01, 20:08:35] [0] 		 #> Saving the indexing plan to ../colbert_index/colbert/indexes/documents/plan.json ..
used 20 iterations (12.8253s) to cluster 82305 items into 4096 clusters
[0.031, 0.03, 0.029, 0.026, 0.027, 0.029, 0.029, 0.027, 0.028, 0.027, 0.028, 0.029, 0.03, 0.028, 0.029, 0.03, 0.026, 0.028, 0.026, 0.029, 0.028, 0.03, 0.029, 0.029, 0.028, 0.028, 0.032, 0.029, 0.029, 0.031, 0.032, 0.032, 0.032, 0.028, 0.027, 0.026, 0.03, 0.029, 0.028, 0.034, 0.03, 0.03, 0.028, 0.029, 0.03, 0.028, 0.028, 0.032, 0.031, 0.026, 0.026, 0.028, 0.031, 0.029, 0.028, 0.03, 0.031, 0.03, 0.034, 0.028, 0.029, 0.03, 0.03, 0.029, 0.033, 0.031, 0.03, 0.029, 0.029, 0.029, 0.03, 0.027, 0.03, 0.03, 0.029, 0.029, 0.03, 0.029, 0.03, 0.033, 0.032, 0.03, 0.029, 0.031, 0.029, 0.029, 0.028, 0.029, 0.028, 0.033, 0.029, 0.03, 0.029, 0.032, 0.029, 0.028, 0.033, 0.027, 0.03, 0.029, 0.03, 0.03, 0.028, 0.029,

0it [00:00, ?it/s]

[Apr 01, 20:08:48] [0] 		 #> Encoding 835 passages..



  0%|                                                    | 0/27 [00:00<?, ?it/s][A
  4%|█▋                                          | 1/27 [00:04<02:04,  4.80s/it][A
  7%|███▎                                        | 2/27 [00:09<02:00,  4.81s/it][A
 11%|████▉                                       | 3/27 [00:14<01:55,  4.82s/it][A
 15%|██████▌                                     | 4/27 [00:19<01:50,  4.83s/it][A
 19%|████████▏                                   | 5/27 [00:24<01:46,  4.83s/it][A
 22%|█████████▊                                  | 6/27 [00:28<01:41,  4.83s/it][A
 26%|███████████▍                                | 7/27 [00:33<01:36,  4.84s/it][A
 30%|█████████████                               | 8/27 [00:38<01:32,  4.85s/it][A
 33%|██████████████▋                             | 9/27 [00:43<01:27,  4.86s/it][A
 37%|███████████████▉                           | 10/27 [00:48<01:22,  4.85s/it][A
 41%|█████████████████▌                         | 11/27 [00:53<01:17,  4.84

[Apr 01, 20:11:01] #> Optimizing IVF to store map from centroids to list of pids..
[Apr 01, 20:11:01] #> Building the emb2pid mapping..
[Apr 01, 20:11:01] len(emb2pid) = 86636



100%|███████████████████████████████████| 4096/4096 [00:00<00:00, 175295.84it/s]

[Apr 01, 20:11:01] #> Saved optimized IVF to ../colbert_index/colbert/indexes/documents/ivf.pid.pt





Done indexing!


'../colbert_index/colbert/indexes/documents'

In [20]:
# Run a first query against the index. Expect 30+ seconds of start-up the first
# time this executes; subsequent searches run faster.
results = RAG.search(
    query="Which class involves time series analysis?",
)

In [21]:
# Display the search hits: each entry carries the passage content, score, rank,
# document/passage ids and the metadata of the source document.
results

[{'content': 'Data Mining II (SIADS 632), Syllabus SIADS 632: Data Mining II Course Overview And Prerequisites: This course extends Data Mining I and introduces additional data representations and tasks involved in mining real world data, with a particular focus on sequence modeling, time series analysis, and mining data streams. It introduces how to extract patterns, compute similarities/distances of data, and make predictions under these data representations.',
  'score': 19.4854679107666,
  'rank': 1,
  'document_id': '005fdc07-33d9-4387-9d72-6e60119997d8',
  'passage_id': 237,
  'document_metadata': {'source': '632_2022-10.md',
   'heading': 'Syllabus SIADS 632: Data Mining II Course Overview And Prerequisites',
   'section': '1',
   'course_number': 'SIADS 632',
   'course_title': 'Data Mining II',
   'course_date': 'October 2022',
   'document': 'https://www.si.umich.edu/sites/default/files/632%20_0.pdf'}},
 {'content': 'Data Mining II (SIADS 632), Learning Outcomes: - Be able to

In [22]:
# RAGatouille lets you create a LangChain retriever from the indexed model.
# NOTE(review): k=5 is presumably the number of passages returned per query - confirm.
retriever = RAG.as_langchain_retriever(k=5)

In [23]:
# Try the retriever on an advising-style question; the cell output is a list of
# Document objects with page_content and metadata.
retriever.invoke("What is a backpack?")



[Document(page_content='Class Registration > Q: What is a Backpack?: A: The Backpack is a feature available on [Wolverine Access](https://wolverineaccess.umich.edu/) that works much like the "shopping carts" you have seen on many retail websites. With the Backpack you can prepare for your upcoming registration appointment by filling it with classes you want to take. When it is time to register, you will select one or more classes from your Backpack to register for it. NOTE: Placing a class in your Backpack does not enroll you in that class. You must register for a class to become enrolled in it. It is important to note that receiving an override does not enroll you in the course, you still must register through [Wolverine Access](https://wolverineaccess.umich.edu/) to claim the seat that has been opened for you.', metadata={'source': 'advising_guide.md', 'heading': 'Class Registration > Q: What is a Backpack?', 'section': '21', 'course_number': 'n/a', 'course_title': 'n/a', 'course_dat

In [24]:
# Next step is to add this to the RAG pipeline and check its performance