In [2]:
import os
import json
import pandas as pd
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

In [4]:
# Load the answer key to work with
with open(os.path.join(os.getcwd(), "ce311k_answers", "00_variables_raw.json")) as f:
    answer_key = json.load(f)

del answer_key["cells"][0:2]

In [5]:
# Get the data all cleaned up
questions_ordered = {}
q_num = -1
question, solution, test = False, False, False
for item in answer_key["cells"]:
    # Set the question number
    if item["cell_type"] == "raw":
        if item["source"][0] == "# BEGIN QUESTION\n":
            q_num += 1
    # Ensure naming sequence is created
    if f"q{q_num}" not in questions_ordered.keys():
        questions_ordered[f"q{q_num}"] = {}
        questions_ordered[f"q{q_num}"]["question"] = None
        questions_ordered[f"q{q_num}"]["solution"] = None
        questions_ordered[f"q{q_num}"]["test"] = None
    # Denote what type of info the line is
    if item["cell_type"] == "raw":
        if item["source"][0] == "# BEGIN QUESTION\n":
            question = True
        elif item["source"][0] == "# BEGIN SOLUTION":
            solution = True
        elif item["source"][0] == "# BEGIN TESTS":
            test = True
    # Put the info into the correct place
    elif item["cell_type"] != "raw":
        if question:
            questions_ordered[f"q{q_num}"]["question"] = item["source"]
            question = False
        elif solution:
            questions_ordered[f"q{q_num}"]["solution"] = item["source"][1:-1]
            solution = False
        elif test:
            questions_ordered[f"q{q_num}"]["test"] = item["source"]
            test = False

In [28]:
# Get data set up into dataframe for vectorization
df = pd.DataFrame(
    {
        "question": [
            questions_ordered[i]["question"] for i in questions_ordered.keys()
        ],
        "solution": [
            questions_ordered[i]["solution"] for i in questions_ordered.keys()
        ],
        "test": [questions_ordered[i]["test"] for i in questions_ordered.keys()],
    }
)

# Define the embedding function
model = SentenceTransformer(
    "all-MiniLM-L6-v2"
)  # This is the default embedding function for chroma
questions = []
solutions = []
tests = []
for i in questions_ordered.keys():
    questions.append(questions_ordered[i]["question"])
    solutions.append(questions_ordered[i]["solution"])
    tests.append(questions_ordered[i]["test"])

questions_concat = []
solutions_concat = []
tests_concat = []

def concatenate(iterable, sep=" "):
    sentence = iterable[0]
    for word in iterable[1:]:
        sentence += (sep + word)
    return sentence

for i in range(len(questions)):
    questions_concat.append(concatenate(questions[i]))
for i in range(len(solutions)):
    solutions_concat.append(concatenate(solutions[i]))
for i in range(len(tests)):
    tests_concat.append(concatenate(tests[i]))

question_embeddings = []
solution_embeddings = []
test_embeddings = []
for i in range(len(questions)):
    question_embeddings.append(model.encode(questions_concat[i]).tolist())
for i in range(len(solutions)):
    solution_embeddings.append(model.encode(solutions_concat[i]).tolist())
for i in range(len(tests)):
    test_embeddings.append(model.encode(tests_concat[i]).tolist())

df["question_embeddings"] = question_embeddings
df["solution_embeddings"] = solution_embeddings
df["test_embeddings"] = test_embeddings

In [27]:
print(questions_concat[0])

## Exercise 00
 
 Use the variable `v0` to evaluate the following function: 
 $$
 v0 = \frac{(14.8^2 + 6.5^2)}{3.8^2} + \frac{55}{\sqrt{2} + 14}
 $$


In [30]:
# Set up the chroma client
chroma_client = chromadb.PersistentClient(path="./chroma_save_states")
collection = chroma_client.get_or_create_collection(name="00_variables")

# Add the data to the chroma client
metas = []
qids = []
sids = []
tids = []
for i in range(len(df.index)):
    metas.append({"00_variables": i})
for i in range(len(df.index)):
    qids.append(f'q{i}')
    sids.append(f's{i}')
    tids.append(f't{i}')

collection.add(
    documents=df["question"].tolist(),
    embeddings=question_embeddings,
    ids=qids,
)
collection.add(
    documents=df["solution"].tolist(),
    embeddings=df["solution_embeddings"].tolist(),
    metadatas=metas,
    ids=sids,
)
collection.add(
    documents=df["test"].tolist(),
    embeddings=df["test_embeddings"].tolist(),
    metadatas=metas,
    ids=tids,
)

Add of existing embedding ID: q0
Add of existing embedding ID: q1
Add of existing embedding ID: q2
Add of existing embedding ID: q3
Add of existing embedding ID: q4
Add of existing embedding ID: q5
Add of existing embedding ID: q6
Insert of existing embedding ID: q0
Exception occurred invoking consumer for subscription 1956c1cd7f9d47f7b5767ac66eb18044to topic persistent://default/default/07e62d56-5358-46c4-82ac-d67f64ab2099 row value misused
Add of existing embedding ID: s0
Add of existing embedding ID: s1
Add of existing embedding ID: s2
Add of existing embedding ID: s3
Add of existing embedding ID: s4
Add of existing embedding ID: s5
Add of existing embedding ID: s6
Insert of existing embedding ID: s0
Insert of existing embedding ID: s1


Exception occurred invoking consumer for subscription 1956c1cd7f9d47f7b5767ac66eb18044to topic persistent://default/default/07e62d56-5358-46c4-82ac-d67f64ab2099 row value misused
Add of existing embedding ID: t0
Add of existing embedding ID: t1
Add of existing embedding ID: t2
Add of existing embedding ID: t3
Add of existing embedding ID: t4
Add of existing embedding ID: t5
Add of existing embedding ID: t6
Insert of existing embedding ID: t0
Insert of existing embedding ID: t1
Insert of existing embedding ID: t2
Insert of existing embedding ID: t3
Insert of existing embedding ID: t4
Insert of existing embedding ID: t5
Insert of existing embedding ID: t6
