In [None]:
import os
import json
import pandas as pd
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from openai import OpenAI

In [None]:
# Load the answer key to work with
# The raw notebook JSON lives in ./ce311k_answers under the current working directory.
answer_key_path = os.path.join(os.getcwd(), "ce311k_answers", "00_variables_raw.json")
with open(answer_key_path) as f:
    answer_key = json.load(f)

# Drop the first two cells so that parsing starts at the third one.
del answer_key["cells"][:2]

In [None]:
# Get the data all cleaned up
# Raw cells act as markers ("# BEGIN QUESTION", "# BEGIN SOLUTION",
# "# BEGIN TESTS"). The next non-raw cell that follows a marker holds the
# content for that slot of the current question record.
questions_ordered = {}
q_num = -1
question, solution, test = False, False, False
for item in answer_key["cells"]:
    source = item["source"]

    if item["cell_type"] == "raw":
        # A raw cell with an empty source carries no marker; treat it as
        # "no marker" instead of failing on source[0].
        marker = source[0] if source else None
        if marker == "# BEGIN QUESTION\n":
            # Start a new question record with every slot still empty
            q_num += 1
            questions_ordered[f"q{q_num}"] = {
                "question": None,
                "solution": None,
                "test": None,
            }
            question = True
        elif marker == "# BEGIN SOLUTION":
            solution = True
        elif marker == "# BEGIN TESTS":
            test = True
        continue

    # Content that appears before the first question marker belongs to no
    # question: skip it (and forget any stray marker) rather than creating
    # a placeholder "q-1" record whose slots would all stay None.
    if q_num < 0:
        solution = test = False
        continue

    # Put the info into the correct place; each flag is consumed by exactly
    # one content cell, checked in question -> solution -> test order.
    record = questions_ordered[f"q{q_num}"]
    if question:
        record["question"] = source
        question = False
    elif solution:
        # The first and last lines of the solution cell are dropped
        record["solution"] = source[1:-1]
        solution = False
    elif test:
        record["test"] = source
        test = False

In [None]:
# Get data set up into dataframe for vectorization
# One row per question record, in the insertion order of questions_ordered.
records = list(questions_ordered.values())
df = pd.DataFrame(
    {
        "question": [rec["question"] for rec in records],
        "solution": [rec["solution"] for rec in records],
        "test": [rec["test"] for rec in records],
    }
)

# Define the embedding function
# This is the default embedding function for chroma
model = SentenceTransformer("all-MiniLM-L6-v2")

# Flatten each record's list of source lines into one space-separated string.
questions = [" ".join(str(line) for line in rec["question"]) for rec in records]
solutions = [" ".join(str(line) for line in rec["solution"]) for rec in records]
tests = [" ".join(str(line) for line in rec["test"]) for rec in records]

questions_concat = []
solutions_concat = []
tests_concat = []

def concatenate(iterable, sep=" "):
    """Join the strings in ``iterable`` into a single string.

    Args:
        iterable: Strings to join. A plain string is also accepted, in which
            case its individual characters are joined.
        sep (str, optional): Separator placed between items. Defaults to " ".

    Returns:
        str: The joined text, or an empty string when ``iterable`` is empty.
    """
    # str.join handles the empty and single-item cases without indexing
    return sep.join(iterable)

# NOTE(review): the entries of `questions`, `solutions` and `tests` are already
# single strings, so concatenate() walks them character by character and puts
# the separator between every character. These *_concat lists do not appear to
# be read anywhere else in the visible notebook - confirm whether they are needed.
questions_concat.extend(concatenate(text) for text in questions)
solutions_concat.extend(concatenate(text) for text in solutions)
tests_concat.extend(concatenate(text) for text in tests)

# Encode every text with the sentence-transformer model and keep each
# vector as a plain Python list.
question_embeddings = [model.encode(text).tolist() for text in questions]
solution_embeddings = [model.encode(text).tolist() for text in solutions]
test_embeddings = [model.encode(text).tolist() for text in tests]

print(question_embeddings[0])

# Keep the vectors next to their source text in the dataframe
df["question_embeddings"] = question_embeddings
df["solution_embeddings"] = solution_embeddings
df["test_embeddings"] = test_embeddings

In [None]:
# Set up the chroma client
chroma_client = chromadb.PersistentClient(path="./chroma_save_states")
collection = chroma_client.get_or_create_collection(name="00_variables")

# Add the data to the chroma client
# One metadata record and one id per dataframe row. Ids carry a kind prefix
# (q = question, s = solution, t = test) so all three kinds of document can
# share a single collection.
n_rows = len(df.index)
metas = [{"00_variables": i} for i in range(n_rows)]
qids = [f"q{i}" for i in range(n_rows)]
sids = [f"s{i}" for i in range(n_rows)]
tids = [f"t{i}" for i in range(n_rows)]

# upsert (rather than add) keeps this cell re-runnable: the store is persisted
# on disk, so the same ids are already present on every run after the first.
collection.upsert(
    documents=questions,
    embeddings=question_embeddings,
    metadatas=metas,  # same per-row metadata as the solutions and tests
    ids=qids,
)
collection.upsert(
    documents=solutions,
    embeddings=df["solution_embeddings"].tolist(),
    metadatas=metas,
    ids=sids,
)
collection.upsert(
    documents=tests,
    embeddings=df["test_embeddings"].tolist(),
    metadatas=metas,
    ids=tids,
)

In [None]:
# Query the chromadb and see the context it provides
question = "What variable should I use for question 2?"

# Embed the question with the same model used for the stored documents and
# ask the collection for its top 3 matches.
results = collection.query(
    [model.encode(question).tolist()],
    n_results=3,
)

# Print the associated answers for the query

# First way: show the query result itself (reused here instead of encoding
# and querying the same question a second time)
print(results)

# Second way: fetch the matched entries again by id, including their
# documents, embeddings and metadata
collection.get(
    ids=results["ids"][0],
    include=["documents", "embeddings", "metadatas"],
)

In [None]:
# Prompt fragments. Later cells assemble the final prompt as
#   f'{p1} {question} \n{p2} \n{data} \n{p3}'
# p1 introduces the user's question, p2 introduces the retrieved context,
# and p3 carries the answering rules and the required output format.
p1 = "Consider the following question: "
p2 = "The following information is provided:"
# NOTE(review): p3 asks for JSONL objects with "prompt"/"completion" keys and
# talks about "3 distinct points" and "Q&A generation" - presumably adapted
# from a Q&A-generation prompt; confirm this wording is intended for a help desk.
p3 = """Based solely on the data presented, answer the question as effectively as possible. This is used to answer technical user support tickets submitted by students, so honesty is imperative.

Ensure each answer:
1. Is directly related to the content given.
2. Can be answered with the information from the provided text only.
3. Answers must be technical and focused on questions a student user will ask for technical support. 

Avoid:
- Generating answers on general topics or external knowledge not contained in the text.
- Creating any answers if the information provided does not sufficiently cover 3 distinct points.
- Limit to fewer than 3 Q&A generation if the content is limited. 
- Do not overlap Q&A.

Output should be structured only as JSONL format. When not able to generate do not generate anything:
{"prompt": <Question>, "completion": <answer>}

If the content is not comprehensive enough to form 3 distinct answers, state that insufficient data is provided.
"""

In [None]:
# Authenticate with OpenAI API
# The key is read from the local apiKeys.json file, under its "GPT" entry.
with open("apiKeys.json", "r") as temp:
    keys = json.load(temp)
apiKey = keys["GPT"]
client = OpenAI(api_key=apiKey)

def gpt4(question, tokens=500):
    """Send a prompt to GPT-4 and return the reply text.

    As a side effect, the lines inside any fenced code blocks (delimited by
    lines starting with three backticks) in the reply are printed.

    Args:
        question (str): Full prompt, sent as a single user message.
        tokens (int, optional): Upper bound on generated tokens, passed as
            ``max_tokens``. Defaults to 500.

    Returns:
        str: The reply text; an empty string when the reply has no content.
    """
    # Create the messages
    messages = [{"role": "user", "content": question}]

    response = client.chat.completions.create(
        model="gpt-4", max_tokens=tokens, temperature=0, messages=messages
    )

    # The message content may be None; normalise to "" so the parsing
    # below and the callers always receive a string.
    content = response.choices[0].message.content or ""

    # Collect the lines that sit inside fenced code blocks. A fence line
    # toggles the state and is itself never collected.
    code_parts = []
    in_code_block = False
    for line in content.split("\n"):
        if line.startswith("```"):
            in_code_block = not in_code_block
            continue
        if in_code_block:
            code_parts.append(line)

    # Print the code parts
    for line in code_parts:
        print(line)
    return content

In [None]:
question = "How would I start to solve question 3?"

# Retrieve the top 3 matching stored documents for the question and merge
# them into one context string.
data = concatenate(
    collection.query(
        [model.encode(question).tolist()],
        n_results=3,
    )["documents"][0]
)

prompt = f'{p1} {question} \n{p2} \n{data} \n{p3}'

# 2000 is the token budget for gpt4(): it must be an argument of gpt4(),
# not of print(), otherwise the default budget is used and "2000" is
# printed after the answer.
print(gpt4(prompt, 2000))

In [56]:
def help_desk(collection: "chromadb.Collection"):
    """
    Interactive loop that answers user questions with GPT-4, using documents
    retrieved from a chroma collection as context.

    Each question is embedded with the notebook's sentence-transformer model,
    the top 3 matching documents are merged into one context string, and the
    assembled prompt is sent to gpt4(). The reply is printed.

    Parameters:
    - collection (chromadb.Collection): The chroma collection queried for context documents.

    Returns:
    None
    """
    while True:
        question = input("What question do you have? (Type 'exit' to quit.)")
        # Accept the quit command regardless of surrounding whitespace or case
        if question.strip().lower() == "exit":
            break
        data = concatenate(
            collection.query(
                [model.encode(question).tolist()],
                n_results=3,
            )["documents"][0]
        )
        prompt = f'{p1} {question} \n{p2} \n{data} \n{p3}'
        # The token budget belongs to gpt4(), not to print()
        print(gpt4(prompt, 2000))



In [57]:
# Start the interactive help desk; the loop keeps prompting until the user types 'exit'.
help_desk(collection)

{"prompt": "How would I define the variable in question 2?", "completion": "The variable in question 2 is `x` and it is defined as `2.34`. You can define it in your code by writing `x = 2.34`."}
{"prompt": "How do I evaluate the function `v3`?", "completion": "You can evaluate the function `v3` by substituting the value of `x` into the function. In Python, you can use the `math` library to access the exponential (`exp`) and square root (`sqrt`) functions. Here is an example: `import math\nv3 = math.exp(2*x) / math.sqrt(14 + x**2 - x)`."}
{"prompt": "How can I convert degrees Fahrenheit to degrees Celsius in the Exercise 06?", "completion": "In Exercise 06, you can convert degrees Fahrenheit to degrees Celsius using the formula `T_c = 5*(T_f - 32)/9`. You can define a variable `T_f` as the temperature in Fahrenheit you want to convert, and then calculate `T_c` using the formula. Here is an example in Python: `T_f = 78\nT_c = 5*(T_f - 32)/9`."} 2000
