### setup

In [1]:
import chromadb
import pandas as pd
# import chromadb.utils.embedding_functions as embedding_functions

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Optional Hugging Face embedding function (currently disabled).
# SECURITY: an API token was hardcoded in this commented-out snippet. It has been
# removed here; the leaked token should be revoked. If this is re-enabled, read the
# key from the environment (needs `import os` plus the commented-out
# `embedding_functions` import in the first cell).
# huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
#     api_key=os.environ["HF_API_KEY"],
#     model_name="google-bert/bert-base-uncased"
# )

In [3]:
# Open (or create) the on-disk Chroma database stored under ./database.
client = chromadb.PersistentClient(path="./database")

# The client is persistent, so the collection survives kernel restarts.
# create_collection raises when the name already exists, which breaks
# "Restart & Run All" from the second run onward; get_or_create_collection
# returns the existing collection instead.
# Earlier experiments (disabled) used a "BST_question" collection, optionally
# with embedding_function=huggingface_ef.
collection = client.get_or_create_collection(
    name="Queue_question",
    metadata={"hnsw:space": "cosine"},  # l2 is the default
)

In [4]:
# Load the answers CSV and drop its first column positionally
# (everything from the second column onward is kept).
df = (
    pd.read_csv('data/data_queue_final.csv')
    .iloc[:, 1:]
)
# Preview the first two rows.
df.head(2)

Unnamed: 0,question,student_answer,correct
0,What is the role of a prototype program in pro...,High risk problems are address in the prototyp...,0
1,What is the role of a prototype program in pro...,To simulate portions of the desired final prod...,0


In [5]:
# Sanity check: (rows, columns) of the frame after the first CSV column was dropped.
df.shape

(2297, 3)

In [6]:
# Number of student answers per distinct question text.
df['question'].value_counts()

question
What is a queue?                                                                        79
What is a pointer?                                                                      54
What are the similarities between iteration and recursion?                              31
When defining a recursive function, what are possible causes for infinite recursion?    31
How are overloaded functions differentiated by the compiler?                            31
                                                                                        ..
What is a binary search tree?                                                           24
What is a binary tree?                                                                  24
What is a leaf?                                                                         24
What is the height of a tree?                                                           24
What is a tree?                                                                  

In [7]:
# Label balance: how many answers carry each value of the `correct` column.
df['correct'].value_counts()

correct
0    2240
1      57
Name: count, dtype: int64

In [8]:
# Split the answers by label: `correct` == 1 vs. `correct` == 0.
# Each subset is selected with its own equality mask, so rows with any
# other label value would land in neither frame.
df_correct = df[df['correct'].eq(1)]
df_incorrect = df[df['correct'].eq(0)]

In [9]:
# Verify the correct-answer subset only contains label 1.
df_correct['correct'].value_counts()

correct
1    57
Name: count, dtype: int64

In [10]:
# Verify the incorrect-answer subset only contains label 0.
df_incorrect['correct'].value_counts()

correct
0    2240
Name: count, dtype: int64

### add responses to collection

In [11]:
# add correct responses
# `l` holds the correct student answers. The name is kept as-is because it is a
# notebook-global that other cells may read.
l = df_correct['student_answer'].tolist()
# Correct answers get ids id0 .. id{len(l) - 1}.
ids = [f"id{i}" for i in range(len(l))]
# upsert (rather than add) keeps this cell re-runnable against the persistent
# database: ids that already exist are updated instead of being inserted again.
# The answer list is reused instead of calling tolist() a second time.
collection.upsert(
    documents=l,
    metadatas=[{"correct": "True"} for _ in l],  # label stored as the string "True"
    ids=ids,
)

/home/ioeddk/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [01:28<00:00, 941kiB/s] 


In [12]:
# add incorrect responses
l2 = df_incorrect['student_answer'].tolist()

# Incorrect answers continue the id sequence right after the correct ones.
# The offset is taken from df_correct itself so this cell does not depend on a
# single-letter list variable defined in another cell.
id_offset = len(df_correct)
ids = [f"id{i}" for i in range(id_offset, id_offset + len(l2))]
# upsert (rather than add) keeps this cell re-runnable against the persistent
# database: ids that already exist are updated instead of being inserted again.
collection.upsert(
    documents=l2,
    metadatas=[{"correct": "False"} for _ in l2],  # label stored as the string "False"
    ids=ids,
)

### test queries - correct

In [13]:
# Probe with a textbook-style correct definition of a queue (FIFO).
# The 3 nearest stored answers are kept in `results_c` and displayed.
results_c = collection.query(
    query_texts=["A queue is a data structure that follows FIFO principle, meaning that the first element added to the queue will be the first one to be removed."],
    n_results=3
)
results_c

{'ids': [['id2', 'id48', 'id2166']],
 'distances': [[0.07562458515167236,
   0.07847166061401367,
   0.07950538396835327]],
 'metadatas': [[{'correct': 'True'},
   {'correct': 'True'},
   {'correct': 'False'}]],
 'embeddings': None,
 'documents': [['A queue stores a set of elements in a particular order.  Its principle of operation is FIFO(first in first out), which means the first element inserted is the first one to be removed.',
   'A queue is a data structure that holds a set of objects, which has a FIFO (first in first out) priority.',
   'A queue is a data type that operates under a FILO (First In Last Out) method. This means that the first element inserted into the queue is the first element removed.']],
 'uris': None,
 'data': None}

In [14]:
# Probe with a correct answer phrased informally, without the words
# "queue" or "FIFO"; returns the 3 nearest stored answers.
collection.query(
    query_texts=["Essentially a array of sorts with a specific order of removal: the first element to be added is the first to be removed."],
    n_results=3
)

{'ids': [['id1335', 'id15', 'id56']],
 'distances': [[0.2999531626701355, 0.30400264263153076, 0.3084213137626648]],
 'metadatas': [[{'correct': 'False'},
   {'correct': 'True'},
   {'correct': 'True'}]],
 'embeddings': None,
 'documents': [['is a list of element where the first one to be removed is the last one inserted',
   'object that stores elements in order that follows first in first out.  the first element added is the first one to come off.',
   'A data structure that can store elements, which has the property that the last item added will be the last to be removed (or first-in-first-out).']],
 'uris': None,
 'data': None}

### test queries - incorrect

In [15]:
# Test a "give up" answer: the student says they are not sure.
# Returns the 3 nearest stored answers.
collection.query(
    query_texts=["I'm not sure"],
    n_results=3
)

{'ids': [['id2291', 'id2292', 'id399']],
 'distances': [[0.26199963030841744, 0.529553519650718, 0.6609969139099121]],
 'metadatas': [[{'correct': 'False'},
   {'correct': 'False'},
   {'correct': 'False'}]],
 'embeddings': None,
 'documents': [['I’m not very sure? ',
   "Sorry, I don't know. But please give credit. ",
   'several']],
 'uris': None,
 'data': None}

In [16]:
# Test a second "give up" phrasing: the student says they have no idea.
# Returns the 3 nearest stored answers.
collection.query(
    query_texts=["I have no idea"],
    n_results=3
)

{'ids': [['id2291', 'id256', 'id2292']],
 'distances': [[0.4265668729383062, 0.565169095993042, 0.5688009278043047]],
 'metadatas': [[{'correct': 'False'},
   {'correct': 'False'},
   {'correct': 'False'}]],
 'embeddings': None,
 'documents': [['I’m not very sure? ',
   'NO ANSWER',
   "Sorry, I don't know. But please give credit. "]],
 'uris': None,
 'data': None}

In [17]:
# Test a legitimate but incorrect attempt: the answer names LIFO,
# whereas the stored correct answers describe FIFO.
collection.query(
    query_texts=["I think it's like a type of array which uses LIFO?"],
    n_results=3
)

{'ids': [['id1319', 'id1148', 'id746']],
 'distances': [[0.3585348129272461, 0.4374539256095886, 0.45801055431365967]],
 'metadatas': [[{'correct': 'False'},
   {'correct': 'False'},
   {'correct': 'False'}]],
 'embeddings': None,
 'documents': [['A data structure that stores data using LIFO.',
   'An array based list that uses an implicit ordering scheme, often using pointers.',
   'array  it is the collection of similar data types ex:int a[10]  ten indicates the size of array. [ ] is index of array, we can give only integer values to array of a.  where as string mean collection of group of characters.  string declarations have a datatype usually causes storage to be allocated in memory that is capable of holding some predetermined number of symbols.    However  Arrays can be declared to contain values of any non reference data type. Multiple arrarys of the same type ']],
 'uris': None,
 'data': None}

In [18]:
# Test an answer that is related to the domain but very clearly not a
# legitimate attempt: it describes a tree rather than a queue.
collection.query(
    query_texts=["it is a binary tree with black and red nodes"],
    n_results=3
)

{'ids': [['id1703', 'id1700', 'id1692']],
 'distances': [[0.269878625869751, 0.31100910902023315, 0.32862555980682373]],
 'metadatas': [[{'correct': 'False'},
   {'correct': 'False'},
   {'correct': 'False'}]],
 'embeddings': None,
 'documents': [['it is a binary tree where each node has a unique key, the left child of a node has only values less than that node, and the right child of each node has higher values than that node.',
   'a binary tree with a special organization of data. where the left child of the element is less than it, and the right child is larger than it.',
   'a binary tree in which the data is in order from left to right.']],
 'uris': None,
 'data': None}

In [19]:
# Test complete random gibberish: unrelated words with no sentence structure.
collection.query(
    query_texts=['kangaroo banana hospital'],
    n_results=3
)

{'ids': [['id2202', 'id2281', 'id2288']],
 'distances': [[0.78337374237208, 0.7900680229388891, 0.7927037885660632]],
 'metadatas': [[{'correct': 'False'},
   {'correct': 'False'},
   {'correct': 'False'}]],
 'embeddings': None,
 'documents': [['it is where you visit the parent then you vistit the children',
   'A queue is a common surgical tool used in most hospitals.',
   'A queue is a bouncy ball where the snowman likes to smile and participate in the lightsaber battle at TreeHacks.']],
 'uris': None,
 'data': None}

In [20]:
# Test a sentence that is grammatically correct but irrelevant to the question.
collection.query(
    query_texts=['Have you visited your parents recently?'],
    n_results=3
)

{'ids': [['id2202', 'id2290', 'id2208']],
 'distances': [[0.5675092412830393, 0.6989030639374829, 0.7312304429639536]],
 'metadatas': [[{'correct': 'False'},
   {'correct': 'False'},
   {'correct': 'False'}]],
 'embeddings': None,
 'documents': [['it is where you visit the parent then you vistit the children',
   'it is ur mum',
   'go to the bottom of the left sub tree and visit the parent and then its children']],
 'uris': None,
 'data': None}

### test queries - maybe's

In [21]:
# Borderline answer: mentions a particular order but never states
# first-in-first-out explicitly.
# NOTE: this exact sentence is already stored in the collection, so the nearest
# hit in the recorded output is the sentence itself (distance ~0).
collection.query(
    query_texts=["a queue is a list of objects in a particular order that is read one at a time starting at the first followed by the second and so on."],
    n_results=3
)

{'ids': [['id1475', 'id5', 'id35']],
 'distances': [[-2.384185791015625e-07,
   0.11749064922332764,
   0.12926483154296875]],
 'metadatas': [[{'correct': 'False'},
   {'correct': 'True'},
   {'correct': 'True'}]],
 'embeddings': None,
 'documents': [['a queue is a list of objects in a particular order that is read one at a time starting at the first followed by the second and so on.',
   'a queue is a data structure that stores elements in a First in First out order.',
   'A queue is a First in First out data structure much like a line for a movie theatre.  The first object in line is the first object to be handled or enacted upon.']],
 'uris': None,
 'data': None}

In [22]:
# Borderline answer: the student hedges between FIFO and LIFO.
collection.query(
    query_texts=["a queue either uses FIFO or LIFO, I'm not sure"],
    n_results=3
)

{'ids': [['id10', 'id2287', 'id33']],
 'distances': [[0.1650996208190918, 0.1709868877665951, 0.18880122900009155]],
 'metadatas': [[{'correct': 'True'},
   {'correct': 'False'},
   {'correct': 'True'}]],
 'embeddings': None,
 'documents': [['A queue is a FIFO data structure.',
   'A queue is a LIFO data structure.',
   'A queue is like a stack except it follows FIFO (First in first out) .']],
 'uris': None,
 'data': None}

In [23]:
# Disabled Hugging Face embedding setup (same snippet as the commented-out code
# near the top of the notebook).
# SECURITY: an API token was hardcoded in this commented-out snippet. It has been
# removed here; the leaked token should be revoked. If this is re-enabled, read the
# key from the environment (needs `import os`).
# import chromadb.utils.embedding_functions as embedding_functions
# huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
#     api_key=os.environ["HF_API_KEY"],
#     model_name="google-bert/bert-base-uncased"
# )