In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus  import stopwords
import re

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC


# Fetch the NLTK data packages this notebook relies on. The recorded output
# shows the call simply reports "already up-to-date" when a package is cached.
nltk.download('punkt')      # tokenizer models -- presumably what nltk.word_tokenize needs
nltk.download('wordnet')    # lexicon consulted by WordNetLemmatizer
nltk.download('stopwords')  # word lists behind stopwords.words('english')


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prabinapokharel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prabinapokharel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prabinapokharel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the labelled student answers and discard the leading column
# (presumably a saved row index -- TODO confirm against the CSV).
df = (
    pd.read_csv("data/data_queue_final.csv", index_col=False)
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)
df

Unnamed: 0,question,student_answer,correct
0,What is the role of a prototype program in pro...,High risk problems are address in the prototyp...,0
1,What is the role of a prototype program in pro...,To simulate portions of the desired final prod...,0
2,What is the role of a prototype program in pro...,A prototype program simulates the behaviors of...,0
3,What is the role of a prototype program in pro...,Defined in the Specification phase a prototype...,0
4,What is the role of a prototype program in pro...,It is used to let the users have a first idea ...,0
...,...,...,...
2292,What is a queue?,A first in first out data structure,1
2293,What is a queue?,"A queue is a stack of sequenced tasks, underta...",0
2294,What is a queue?,A queue in computer science is the 81st or 113...,0
2295,What is a queue?,a queue is a abstract data type with a private...,1


In [3]:
# Keep the untouched answer text, in row order, for display later on.
responses_OG = df.loc[:, 'student_answer'].to_list()

In [4]:
# Split the raw answers by their grade. A boolean mask selects all matching
# rows in one vectorised step instead of one scalar .loc lookup per row.
correct_responses_OG = df.loc[df['correct'] == 1, 'student_answer'].tolist()
incorrect_responses_OG = df.loc[df['correct'] == 0, 'student_answer'].tolist()
len(incorrect_responses_OG)

2240

In [5]:
# The question under review and the reference answer that student
# responses are compared against.
query_question = "What is a queue?"
query_response_OG = (
    "A queue is a data structure that follows FIFO principle, meaning that the first element added to the queue will be the first one to be removed."
)

In [6]:
def cleaning(text):
    """Normalise a piece of free text for bag-of-words vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. replace every character that is not ``a``-``z`` with a space
         (digits, punctuation and non-ASCII letters are all dropped);
      3. tokenise with ``nltk.word_tokenize``;
      4. discard English stop words;
      5. lemmatise the remaining tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string when
        nothing survives).
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z]', ' ', lowered)
    tokens = nltk.word_tokenize(letters_only)

    # Load the stop-word list once per call and keep it in a set: the original
    # re-read the corpus and scanned a list for every single token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

In [7]:
# Normalise every answer. NOTE: this overwrites df['student_answer'] in place;
# the raw text is still available in responses_OG.
df['student_answer'] = df['student_answer'].apply(cleaning)

responses = df['student_answer'].tolist()

# Split the cleaned answers by grade with a boolean mask (one vectorised
# selection instead of a scalar .loc lookup per row).
correct_responses = df.loc[df['correct'] == 1, 'student_answer'].tolist()
incorrect_responses = df.loc[df['correct'] == 0, 'student_answer'].tolist()

In [8]:
# Run the reference answer through the same normalisation as the student answers.
query_response = cleaning(text=query_response_OG)

In [23]:
# Single-step pipeline: TF-IDF over unigrams only.
pipeline = Pipeline(
    steps=[('vectorizer', TfidfVectorizer(ngram_range=(1, 1)))]
)

In [25]:
# Rank every student response by TF-IDF cosine similarity to the reference answer.
TOP_K = 3             # maximum number of responses to print
MIN_SIMILARITY = 0.5  # responses scoring below this are not printed

# fit_transform learns the vocabulary / IDF weights and vectorises the corpus in
# a single pass (fit followed by transform would process the corpus twice).
student_responses_vectors = pipeline.fit_transform(responses)
query_response_vector = pipeline.transform([query_response])

cosine_similarities = cosine_similarity(query_response_vector, student_responses_vectors)

# Pair each score with the original (uncleaned) text for display. This relies on
# responses_OG and responses being in the same row order.
response_similarities = sorted(
    zip(responses_OG, cosine_similarities[0]),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top student responses based on cosine similarity with the query response:")
for i, (response, similarity) in enumerate(response_similarities[:TOP_K], 1):
    # The list is sorted descending, so nothing after the first miss can qualify.
    if similarity < MIN_SIMILARITY:
        break
    print(f"{i}. Response: {response}")
    print(f"   Cosine Similarity: {similarity}")


Top student responses based on cosine similarity with the query response:
1. Response: A Queue is a "first in first out" data structure, such that the first element added is also the first removed.
   Cosine Similarity: 0.6514737216657435
2. Response: A queue is a first in first out data structure.
   Cosine Similarity: 0.6011037668877947
3. Response: A queue stores a set of elements in a particular order.  Its principle of operation is FIFO(first in first out), which means the first element inserted is the first one to be removed.
   Cosine Similarity: 0.583007547420584


In [19]:
# Single-step pipeline: TF-IDF over unigrams and bigrams.
pipeline = Pipeline(
    steps=[('vectorizer', TfidfVectorizer(ngram_range=(1, 2)))]
)

In [20]:
# Rank every student response by TF-IDF cosine similarity to the reference answer.
TOP_K = 3             # maximum number of responses to print
MIN_SIMILARITY = 0.5  # responses scoring below this are not printed

# fit_transform learns the vocabulary / IDF weights and vectorises the corpus in
# a single pass (fit followed by transform would process the corpus twice).
student_responses_vectors = pipeline.fit_transform(responses)
query_response_vector = pipeline.transform([query_response])

cosine_similarities = cosine_similarity(query_response_vector, student_responses_vectors)

# Pair each score with the original (uncleaned) text for display. This relies on
# responses_OG and responses being in the same row order.
response_similarities = sorted(
    zip(responses_OG, cosine_similarities[0]),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top student responses based on cosine similarity with the query response:")
for i, (response, similarity) in enumerate(response_similarities[:TOP_K], 1):
    # The list is sorted descending, so nothing after the first miss can qualify.
    if similarity < MIN_SIMILARITY:
        break
    print(f"{i}. Response: {response}")
    print(f"   Cosine Similarity: {similarity}")


Top 3 student responses based on cosine similarity with the query response:
1. Response: A Queue is a "first in first out" data structure, such that the first element added is also the first removed.
   Cosine Similarity: 0.4554544465374513
2. Response: A queue is a first in first out data structure.
   Cosine Similarity: 0.41581856401842565
3. Response: A data structure in C++ where the the first element in the queue is the first element taken out of the queue.
   Cosine Similarity: 0.40311361642995935
