# Import Dependencies

In [18]:
import pandas as pd

# Read data file to dict of pandas dataframes

In [19]:
db_file = './data/question_database_schema.xlsx'
dict_of_dfs = pd.read_excel(db_file, sheet_name = None)

print(dict_of_dfs.keys())

dict_keys(['learning_objectives', 'course_objective_description', 'questions', 'answer_choices', 'exams', 'exams_learning_objectives', 'distractor_type', 'student_question_responses'])


In [20]:
print(dict_of_dfs['exams'])

   exam_id  exam_number exam_form_id
0       1A            1            A
1       1B            1            B
2       2A            2            A
3       2B            2            B
4       2C            2            C
5       3A            3            A
6       3B            3            B
7       3C            3            C
8       4A            4            A
9       4B            4            B
10      4C            4            C


# Exam descriptions

### Number of exams: 4

- Exam 1: 
    - 2 forms (A, B)
    - 16 Questions
      - 2 with 5 options
      - 14 with 4 options
- Exam 2: 3 forms (A, B, C)
    - 16 Questions
      - 1 with 3 options
      - 15 with 4 options
- Exam 3: 3 forms (A, B, C)
    - 17 Questions
      - 17 with 4 options
- Exam 4: 3 forms (A, B, C)
    - 26 Questions
      - 25 with 4 options
      - 1 with 3 options (which form of the exam do you have?)


In [21]:
completed_answer_choices = dict_of_dfs['answer_choices']
student_responses        = dict_of_dfs['student_question_responses']
#convert option_id to number indexed at 1.
if 'A' in pd.unique(completed_answer_choices['option_id']):
    completed_answer_choices['option_id'] = completed_answer_choices['option_id'] = [ ord(letter) - 64 for letter in completed_answer_choices['option_id'] ]

student_responses = student_responses[student_responses['question_id'].isin(pd.unique(completed_answer_choices['question_id']))]

student_responses_with_details = pd.merge(
    left=student_responses, 
    right=completed_answer_choices,
    how='left',
    left_on=['question_id', 'selected_option'],
    right_on=['question_id', 'option_id'],
)

number_of_distractors_chosen = len(student_responses_with_details[student_responses_with_details['is_distractor'] > .5])
number_of_correct_answers_chosen = len(student_responses_with_details[student_responses_with_details['is_distractor'] < .5])
number_of_questions = len(student_responses_with_details)

print(f"Number of questions: {number_of_questions}.\nNumber of correct answers chosen: {number_of_correct_answers_chosen}.\nNumber of distractors chosen: {number_of_distractors_chosen}")

Number of questions: 16853.
Number of correct answers chosen: 12535.
Number of distractors chosen: 4247


In [22]:
number_of_solution_based_distractors = len(student_responses_with_details[student_responses_with_details['distractor_type'] == 2])

In [23]:
print(f"Number of distractors chosen: {number_of_distractors_chosen}. Number of solution-based distractors chosen: {number_of_solution_based_distractors}")

Number of distractors chosen: 4247. Number of solution-based distractors chosen: 1799


In [24]:
completed_answer_choices

Unnamed: 0,question_id,option_id,is_distractor,distractor_type,distractor_learning_objective_understanding,notes
0,1A01,1,1,2,66.0,Ignore middle term
1,1A01,2,1,2,66.0,Ignore middle term
2,1A01,3,1,5,66.0,7*8=56 vs 54
3,1A01,4,1,5,66.0,7*8=56 vs 54
4,1A01,5,0,0,66.0,Trinomial is prime
...,...,...,...,...,...,...
820,4C25,4,1,6,,
821,4C26,1,1,1,,
822,4C26,2,1,7,,
823,4C26,3,0,0,,


In [34]:
exam_question_distractor_count_frame = completed_answer_choices.groupby(by = 'distractor_type').count()["question_id"].reset_index().rename(columns = {"question_id": "count", "distractor_type": "distractor_id"})

exam_question_distractor_count_frame = pd.merge(
    left=exam_question_distractor_count_frame, 
    right=dict_of_dfs['distractor_type'],
    how='left',
    left_on=['distractor_id'],
    right_on=['distractor_id'],
)
display(exam_question_distractor_count_frame)

Unnamed: 0,distractor_id,count,distractor_type,explanation
0,0,206,solution,Solution based on expected student thinking fo...
1,1,242,solution_based,Answer based on manipulating the solution that...
2,2,211,question_learning_objective_understanding,Answer based on level of conception associated...
3,3,1,other_learning_objective_understanding,Answer based on level of conception associated...
4,4,3,representation,Answer based on a different representation tha...
5,5,34,mechanics_based,Answer based on mechanical error with expected...
6,6,40,unsure,It be like that sometimes
7,7,85,distractor_based,Answer based on manipulating another option
8,8,3,concept-based doppleganger,Answer based on conception but modified to loo...


In [33]:
student_responses_distractor_selection_counts = student_responses_with_details.groupby(by = 'distractor_type').count()["question_id"].reset_index().rename(columns = {"question_id": "count", "distractor_type": "distractor_id"})
student_responses_distractor_selection_counts = pd.merge(
    left=student_responses_distractor_selection_counts, 
    right=dict_of_dfs['distractor_type'],
    how='left',
    left_on=['distractor_id'],
    right_on=['distractor_id'],
)
student_responses_distractor_selection_counts["percent"] = student_responses_distractor_selection_counts["count"] / student_responses_distractor_selection_counts["count"].sum() 
display(student_responses_distractor_selection_counts)

Unnamed: 0,distractor_id,count,distractor_type,explanation,percent
0,0.0,12535,solution,Solution based on expected student thinking fo...,0.746931
1,1.0,1613,solution_based,Answer based on manipulating the solution that...,0.096115
2,2.0,1799,question_learning_objective_understanding,Answer based on level of conception associated...,0.107198
3,3.0,9,other_learning_objective_understanding,Answer based on level of conception associated...,0.000536
4,4.0,32,representation,Answer based on a different representation tha...,0.001907
5,5.0,264,mechanics_based,Answer based on mechanical error with expected...,0.015731
6,6.0,207,unsure,It be like that sometimes,0.012335
7,7.0,305,distractor_based,Answer based on manipulating another option,0.018174
8,8.0,18,concept-based doppleganger,Answer based on conception but modified to loo...,0.001073
