## Start

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import glob
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import tqdm


# DPI used when drawing figures and when writing them with savefig.
plt.rcParams.update({'figure.dpi': 100, 'savefig.dpi': 600})
# plt.style.use('ggplot')

In [None]:
# Collect the question records of every topic directory whose path does not contain 'Demo'.
question_records = []
for topic in glob.glob('../questions/topics/*'):
    if 'Demo' in topic:
        continue
    with open(f"{topic}/questions.yaml") as f:
        question_records.extend(yaml.safe_load(f))

all_qs = pd.DataFrame(question_records)

# Question Id -> the 'Correct' entry of that question's MultipleChoice record.
correct_answers = {
    qid: choices['Correct']
    for qid, choices in zip(all_qs['Id'], all_qs['MultipleChoice'])
}

In [None]:
# One record per user file; nested test/schedule data is converted to pandas/numpy objects.
user_records = []

for file in glob.glob('../backend/users/*.yaml'):
    if 'demo' in file:
        continue

    with open(file) as f:
        data = yaml.safe_load(f)

    if 'pretest' in data:
        data['pretest'] = pd.DataFrame(data['pretest'])

    # Users without a question schedule are left out of the table entirely.
    if 'questionSchedule' not in data:
        continue

    # Flatten the day-by-day schedule into one table, tagging every question
    # with its day index and its position within that day.
    flat_schedule = []
    for day_idx, day in enumerate(data['questionSchedule']):
        for pos_in_day, q in enumerate(day):
            q['day'] = day_idx
            q['numInDay'] = pos_in_day
            flat_schedule.append(q)
    data['questionSchedule'] = pd.DataFrame(flat_schedule)

    for test_key in ('posttestA', 'posttestB'):
        if test_key in data:
            data[test_key] = pd.DataFrame(data[test_key])

    if 'sleepData' in data:
        data['sleepData'] = np.array([float(d['numHours']) for d in data['sleepData']])

    user_records.append(data)

all_user_data = pd.DataFrame(user_records)

In [None]:
# bad_emails = ['madison.evans@som.umaryland.edu', 'puja.patel@som.umaryland.edu', 'kran2@jh.edu', 'charles1@usf.edu']
bad_emails = []

# Users whose status shows the study portion is complete, minus any excluded emails.
study_done_statuses = ['posttestDone', 'studyDone', 'posttestPartADone']
has_finished_study = all_user_data['status'].isin(study_done_statuses)
is_excluded = all_user_data['email'].isin(bad_emails)
finished_study_data = all_user_data[has_finished_study & ~is_excluded]
finished_users = finished_study_data['email'].unique()
print(f"Participants who finished study portion: {len(finished_study_data)}\n{finished_users}")

In [None]:
# Users with status 'posttestDone', minus excluded emails. reset_index() gives
# the table a fresh 0..n-1 index and keeps the old one as an 'index' column.
has_finished_posttests = all_user_data['status'].isin(['posttestDone'])
not_excluded = ~all_user_data['email'].isin(bad_emails)
finished_post_data = all_user_data[has_finished_posttests & not_excluded].reset_index()
finished_post_emails = finished_post_data['email'].to_list()
print(f"Participants who finished all post-tests: {len(finished_post_data)}\n{finished_post_emails}")

## Fraction contested

Calculating fraction contested overall and per user

In [None]:
# Load the contested evaluations into a table (columns used later: user, QID, userResponse, score).
with open('../backend/scoring/contestedEvaluations.yaml') as f:
    contested_records = yaml.safe_load(f)
contested_evaluations = pd.DataFrame(contested_records)

In [None]:
# NOTE(review): presumably the number of study questions each user answered - confirm.
total_questions = 180

# Overall rate: contested evaluations from finishers over all responses finishers gave.
from_finishers = contested_evaluations['user'].isin(finished_users)
num_contested = len(contested_evaluations[from_finishers])
frac_contested = num_contested / (len(finished_users) * total_questions)
print(f"Percentage of overall responses contested: {100 * frac_contested:.2f}%")

# Per-user rate: each finisher's contested count over total_questions.
frac_contested_per_user = {}
for user in finished_users:
    contested_by_user = (contested_evaluations['user'] == user).sum()
    frac_contested_per_user[user] = contested_by_user / total_questions

pct_contested_per_user = np.array(list(frac_contested_per_user.values())) * 100
plt.title('Percentage of responses contested per user')
plt.bar(frac_contested_per_user.keys(), pct_contested_per_user)
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Attach the correct answer of each contested question; an unknown QID raises KeyError.
contested_evaluations['correct'] = list(map(correct_answers.__getitem__, contested_evaluations['QID']))
contested_evaluations

In [None]:
import docx

# Build a Word document with one page per contested evaluation for manual review.
document = docx.Document()

# Question text for each contested evaluation; .item() requires exactly one matching question.
questions = contested_evaluations['QID'].map(
    lambda qid: all_qs.loc[all_qs['Id'] == qid, 'Question'].item())

for i, qid in enumerate(contested_evaluations["QID"]):
    entry = contested_evaluations.loc[i]
    document.add_paragraph(f"QID: {entry['QID']}")

    # Everything before the last space of the QID is used as the topic directory name.
    topic = qid.rpartition(' ')[0]
    figs = all_qs.loc[all_qs["Id"] == qid, "Figures"].item()
    if isinstance(figs, list):
        for fig_name in figs:
            document.add_picture(f"../questions/topics/{topic}/{fig_name}", height=docx.shared.Inches(2))

    page_lines = (
        f"Question: {questions[i]}",
        f"User Response: {entry['userResponse']}",
        f"Correct Response: {entry['correct']}",
        "Was User Correct?: ",
        "Comments: ",
        f"SAIL Score: {entry['score']}",
        f"User: {entry['user']}",
    )
    for line in page_lines:
        document.add_paragraph(line)
    document.add_page_break()

document.save("contested_evaluations.docx")

## Reliability of voice transcription

Calculating the fraction of voice responses whose transcription the participant edited, overall and per user

In [None]:
# NOTE(review): presumably the number of voice-modality questions per user - confirm.
num_voice_responses = 60


def _edited_voice_rows(schedule):
    """Return the voice-modality rows whose userResponse differs from originalResponse."""
    is_voice = schedule['modality'] == 'voice'
    was_edited = schedule['userResponse'] != schedule['originalResponse']
    return schedule[is_voice & was_edited]


voice_edited_responses = finished_study_data['questionSchedule'].apply(_edited_voice_rows)

In [None]:
# Edited voice responses per user, as a fraction of num_voice_responses.
frac_edited_per_user = voice_edited_responses.apply(len) / num_voice_responses

overall_pct_edited = 100 * frac_edited_per_user.mean()
print(f"Percentage of overall responses edited: {overall_pct_edited:.2f}%")

pct_edited_per_user = frac_edited_per_user * 100
plt.title('Percentage of responses edited per user')
plt.bar(finished_study_data['email'], pct_edited_per_user)
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Inspect the edited voice responses of one example participant.
is_example_user = finished_study_data['email'] == 'andrewbharris@jhmi.edu'
voice_edited_responses[is_example_user].squeeze()


## Study Performance per Modality

In [None]:
def calc_study_accuracy(qs, modality=None):
    """Return the mean of the ``score`` column of a question-schedule table.

    Args:
        qs: DataFrame with a ``score`` column and, when ``modality`` is
            given, a ``modality`` column.
        modality: If not None, only rows whose ``modality`` equals this
            value are averaged.

    Returns:
        The mean score; NaN when no row is selected.
    """
    if modality is None:
        scores = qs['score']
    else:
        scores = qs.loc[qs['modality'] == modality, 'score']
    return scores.mean()

# Per-user study accuracy for each modality (None = all modalities pooled).
# Each series is computed once and reused for both the mean and its SEM.
per_user_accuracy = {
    modality: finished_study_data['questionSchedule'].apply(
        lambda qs, modality=modality: calc_study_accuracy(qs, modality))
    for modality in [None, "voice", "voiceless", "mc"]
}

overall = per_user_accuracy[None].mean()
voice = per_user_accuracy['voice'].mean()
voiceless = per_user_accuracy['voiceless'].mean()
mc = per_user_accuracy['mc'].mean()

# Standard error of the mean across users, in the same order as the bars below.
sems = [accuracy.sem() for accuracy in per_user_accuracy.values()]

plt.title('Accuracy during study per modality')
plt.bar(['Overall', 'Voice', 'Voiceless', 'MC'], [overall, voice, voiceless, mc], yerr=sems, alpha=1, ecolor='black', capsize=10)
# The original ended with an argument-less plt.plot(), which draws nothing;
# show the figure explicitly like the other plotting cells do.
plt.show()


## Improvement per Modality

In [None]:
# Sanity check: compare, element-wise, the automated recall scores of the first
# two participants on part A of their first post-test.
# (Fixed: the first line read `.iloc|[0]`, which raises a TypeError.)
recall1 = pd.DataFrame(finished_post_data.iloc[0]["first_posttest"]["A"])["automated_recall_score"]
recall2 = pd.DataFrame(finished_post_data.iloc[1]["first_posttest"]["A"])["automated_recall_score"]
recall1.to_numpy() == recall2.to_numpy()

In [None]:
def calc_test_accuracies(test_type, modality, test_method, data=None, answers=None):
    """Compute each participant's accuracy on one test for one study modality.

    Args:
        test_type: Column of the participant table that holds the test. A
            dotted name such as ``'first_posttest.A'`` selects key ``'A'``
            inside the ``'first_posttest'`` column.
        modality: ``'all'`` to count every question in the participant's
            ``questionSchedule``; otherwise only questions whose schedule
            ``modality`` equals this value are counted.
        test_method: ``'recognition'`` scores a question as correct when its
            ``response`` equals the correct answer; ``'recall'`` averages
            the ``automated_recall_score`` values.
        data: Participant table with one row per participant. Defaults to
            the notebook-level ``finished_post_data``.
        answers: Mapping from QID to correct answer. Defaults to the
            notebook-level ``correct_answers``.

    Returns:
        A list with one accuracy per row of ``data``, in row order. The entry
        is NaN when none of the test's questions belong to the modality.

    Raises:
        ValueError: If ``test_method`` is not ``'recognition'`` or ``'recall'``.
    """
    # Fail fast: previously an unknown method left `accuracy` unbound.
    if test_method not in ('recognition', 'recall'):
        raise ValueError(
            f"test_method must be 'recognition' or 'recall', got {test_method!r}")
    if data is None:
        data = finished_post_data
    if answers is None:
        answers = correct_answers

    name_parts = test_type.split(".")

    accs = []
    for _, row in data.iterrows():
        schedule = row['questionSchedule']
        if modality == 'all':
            scheduled_qids = schedule['qid']
        else:
            scheduled_qids = schedule.loc[schedule['modality'] == modality, 'qid']
        # A set gives constant-time membership tests in the filter below.
        modality_qids = set(scheduled_qids)

        test = row[name_parts[0]]
        if len(name_parts) > 1:
            test = test[name_parts[1]]

        # The test may be a DataFrame or a list of dicts; normalise to records.
        records = pd.DataFrame(test).to_dict('records')
        selected = [rec for rec in records if rec['QID'] in modality_qids]

        if test_method == 'recognition':
            outcomes = [rec['response'] == answers[rec['QID']] for rec in selected]
        else:
            outcomes = [rec['automated_recall_score'] for rec in selected]

        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        accs.append(np.mean(outcomes) if outcomes else np.nan)

    return accs

pretest_acc = calc_test_accuracies('pretest', 'all', test_method="recognition")
posttest_recall_acc = []
posttest_recognition_acc = []

# (recall test, recognition test) names for the three post-tests, in order.
# The third post-test lives in the top-level posttestA / posttestB columns.
posttest_sources = [
    ('first_posttest.A', 'first_posttest.B'),
    ('second_posttest.A', 'second_posttest.B'),
    ('posttestA', 'posttestB'),
]
study_modalities = ['all', 'voice', 'voiceless', 'mc']

for recall_testname, recog_testname in posttest_sources:
    recall_by_modality = {}
    for modality in study_modalities:
        recall_by_modality[modality] = calc_test_accuracies(
            recall_testname, modality, test_method='recall')
    posttest_recall_acc.append(recall_by_modality)

    recognition_by_modality = {}
    for modality in study_modalities:
        recognition_by_modality[modality] = calc_test_accuracies(
            recog_testname, modality, test_method="recognition")
    posttest_recognition_acc.append(recognition_by_modality)



In [None]:
bar_width = 0.5

# One bar per participant with their pre-test recognition accuracy.
participant_emails = finished_post_data['email']
plt.bar(participant_emails, pretest_acc, alpha=1, ecolor='black', capsize=10)
plt.title('Pre-Test Baseline Recognition Scores per User')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Mean share of pre-test questions answered incorrectly, scaled by 90.
# NOTE(review): 90 is presumably the number of pre-test questions - confirm.
mean_pretest_acc = np.mean(pretest_acc)
print("Avg # of questions on post-test:", (1 - mean_pretest_acc) * 90)

In [None]:
# One panel per modality; each participant gets a recognition bar and a recall
# bar side by side for the first post-test.
fig, ax = plt.subplots(1, 4, figsize=(16, 4), dpi=300)
bar_width = 0.375
participant_slots = np.arange(len(finished_post_data))
first_recognition = posttest_recognition_acc[0]
first_recall = posttest_recall_acc[0]

for panel, modality in zip(ax, ['all', 'voice', 'voiceless', 'mc']):
    panel.set_title(f'{modality}')
    panel.set_ylim(0, 1.1)
    panel.bar(participant_slots + bar_width, first_recognition[modality],
              width=bar_width, label='post-test recognition')
    panel.bar(participant_slots + (2 * bar_width), first_recall[modality],
              width=bar_width, label='post-test recall')

    # Tick sits on the boundary between the two bars of each participant.
    panel.set_xticks(participant_slots + 1.5 * bar_width,
                     finished_post_data['email'], rotation=45, ha='right')

plt.suptitle("Post-Test #1")
plt.legend(bbox_to_anchor=(1.1, 1.05))
plt.show()

In [None]:
# bar_width = 0.25
# mean_pretest_acc = {modality: np.mean(accs) for modality, accs in pretest_acc.items()}
# mean_posttest_recall_acc = {modality: np.mean(accs) for modality, accs in posttest_recall_acc.items()}
# mean_posttest_recognition_acc = {modality: np.mean(accs) for modality, accs in posttest_recognition_acc.items()}

# def print_dict(title, d):
#     print(f"{title}:")
#     for k, v in d.items():
#         print(f"  {k}: {v * 100:.2f}%")
# print_dict("Recall", mean_posttest_recall_acc)
# print_dict("Recognition", mean_posttest_recognition_acc)

# plt.figure(figsize=(8, 6))
# plt.bar(np.arange(4), mean_pretest_acc.values(), width=bar_width, label='pre-test recognition')
# plt.bar(np.arange(4) + bar_width, mean_posttest_recognition_acc.values(), width=bar_width, label='post-test recognition')
# plt.bar(np.arange(4) + 2 * bar_width, mean_posttest_recall_acc.values(), width=bar_width, label='post-test recall')
# plt.xticks(np.arange(4) + bar_width, mean_pretest_acc.keys())
# plt.legend()
# plt.show()

In [None]:
def _summarize_per_test(per_test_accs, stat):
    """Apply ``stat`` to every modality's accuracy list, for each of the three post-tests."""
    return [{modality: stat(accs) for modality, accs in per_test_accs[i].items()} for i in range(3)]


# Mean and standard error of the per-user accuracies, per post-test and modality.
mean_posttest_recall_acc = _summarize_per_test(posttest_recall_acc, np.mean)
mean_posttest_recall_sem = _summarize_per_test(posttest_recall_acc, scipy.stats.sem)
mean_posttest_recognition_acc = _summarize_per_test(posttest_recognition_acc, np.mean)
mean_posttest_recognition_sem = _summarize_per_test(posttest_recognition_acc, scipy.stats.sem)

# Recall: one panel per post-test, one bar per modality, SEM as error bars.
fig, ax = plt.subplots(1, 3, figsize=(16, 4), dpi=300)
for i, panel in enumerate(ax):
    means = mean_posttest_recall_acc[i]
    errors = mean_posttest_recall_sem[i]
    panel.bar(np.arange(4), list(means.values()), yerr=list(errors.values()),
              alpha=1, ecolor='black', capsize=10)
    panel.axis(ymin=0, ymax=0.8)
    panel.set_xticks(np.arange(4), list(means.keys()))
    panel.set_title(f"Test {i + 1}")

fig.suptitle('Post-Test Recall Scores per Learning Modality')
fig.show()

# Recognition: same layout, full 0-1 y-range.
fig, ax = plt.subplots(1, 3, figsize=(16, 4), dpi=300)
for i, panel in enumerate(ax):
    means = mean_posttest_recognition_acc[i]
    errors = mean_posttest_recognition_sem[i]
    panel.bar(np.arange(4), list(means.values()), yerr=list(errors.values()),
              alpha=1, ecolor='black', capsize=10)
    panel.axis(ymin=0, ymax=1)
    panel.set_xticks(np.arange(4), list(means.keys()))
    panel.set_title(f"Test {i + 1}")

fig.suptitle('Post-Test Recognition Scores per Learning Modality')
fig.show()

In [None]:
import seaborn as sns

# Mean and standard error of the per-user accuracies, per post-test and modality.
mean_posttest_recall_acc = [{modality: np.mean(accs) for modality, accs in posttest_recall_acc[i].items()} for i in range(3)]
mean_posttest_recall_sem = [{modality: scipy.stats.sem(accs) for modality, accs in posttest_recall_acc[i].items()} for i in range(3)]
mean_posttest_recognition_acc = [{modality: np.mean(accs) for modality, accs in posttest_recognition_acc[i].items()} for i in range(3)]
mean_posttest_recognition_sem = [{modality: scipy.stats.sem(accs) for modality, accs in posttest_recognition_acc[i].items()} for i in range(3)]

# Recall distributions: one panel per post-test, one violin per modality.
fig, ax = plt.subplots(1, 3, figsize=(16, 4), dpi=300)
for i in range(3):
    accs_by_modality = posttest_recall_acc[i]
    # violinplot draws the n-th dataset at x = n by default, so the tick labels
    # must use the same 1..n positions (they were previously at 0.5..3.5).
    violin_positions = np.arange(1, len(accs_by_modality) + 1)
    ax[i].violinplot(list(accs_by_modality.values()), positions=violin_positions)
    ax[i].axis(ymin=0, ymax=1)
    ax[i].set_xticks(violin_positions, list(accs_by_modality.keys()))
    ax[i].set_title(f"Test {i + 1}")

fig.align_xlabels()
fig.suptitle('Post-Test Recall Scores per Learning Modality')
fig.show()

# Recognition distributions. Fixed: this figure previously plotted
# posttest_recall_acc again, duplicating the recall figure under a
# "Recognition" title.
fig, ax = plt.subplots(1, 3, figsize=(16, 4), dpi=300)
for i in range(3):
    accs_by_modality = posttest_recognition_acc[i]
    violin_positions = np.arange(1, len(accs_by_modality) + 1)
    ax[i].violinplot(list(accs_by_modality.values()), positions=violin_positions)
    ax[i].axis(ymin=0, ymax=1)
    ax[i].set_xticks(violin_positions, list(accs_by_modality.keys()))
    ax[i].set_title(f"Test {i + 1}")
fig.suptitle('Post-Test Recognition Scores per Learning Modality')
fig.show()

# Kolmogorov-Smirnov test of every accuracy distribution against 'norm'.
# NOTE(review): with no distribution parameters, kstest compares against the
# standard normal N(0, 1). Accuracies lie in [0, 1], so these p-values do not
# test normality of the data as-is; consider standardising the sample or using
# scipy.stats.shapiro. Behaviour is left unchanged here.
for label, per_test_accs in (("Recall", posttest_recall_acc),
                             ("Recognition", posttest_recognition_acc)):
    for i in range(3):
        print(f"{label} Test {i + 1}")
        for key, accs in per_test_accs[i].items():
            ks_result = scipy.stats.kstest(accs, cdf='norm')
            print(f"{key} p-value: {ks_result[1]}")



In [None]:
# Create list of all post-test recall responses as (response record, participant email)
# pairs, covering part A of the first, second and final post-test.
qs = []
for _, participant in finished_post_data.iterrows():
    participant_email = participant["email"]
    recall_answers = [
        *participant["first_posttest"]["A"],
        *participant["second_posttest"]["A"],
        *participant["posttestA"].to_dict("records"),
    ]
    qs.extend((q, participant_email) for q in recall_answers)

In [None]:
# Combine duplicate responses
def similar(q1, q2):
    """Return whether two responses answer the same question with the same text.

    Both arguments are mappings with ``'QID'`` and ``'response'`` keys; the
    responses are only compared when the QIDs are equal.
    """
    if q1['QID'] != q2['QID']:
        return False
    return q1['response'] == q2['response']

# Link every pair of responses that `similar` treats as duplicates.
edges = []
for i in range(len(qs)):
    for j in range(i + 1, len(qs)):
        if similar(qs[i][0], qs[j][0]):
            edges.append((i, j))

g = nx.Graph()
# Register every response as a node before adding edges. A graph built from
# the edge list alone contains only responses that have at least one
# duplicate, so every unique response was silently dropped from clustered_qs
# (and therefore from the grading document).
g.add_nodes_from(range(len(qs)))
g.add_edges_from(edges)

# One cluster per connected component; members are kept in original order.
clustered_qs = [[qs[i] for i in sorted(group)] for group in nx.connected_components(g)]
print(f"Original questions: {len(qs)}, reduced questions: {len(clustered_qs)}")
if clustered_qs:  # avoid dividing by zero when there are no responses at all
    print(f"{len(qs) / len(clustered_qs):.2f}x reduction in questions to grade!")

In [None]:
# Spot-check one cluster of duplicate responses (index 30 is an arbitrary example).
clustered_qs[30]

In [None]:
import docx

def add_question(q_group, document):
    """Append one grading page for a group of identical responses to ``document``.

    Args:
        q_group: List of ``(response record, user email)`` pairs. The first
            pair's record supplies the QID, response text and automated
            score; the emails of all pairs are listed under "Users".
        document: The docx document that receives the paragraphs, any
            question figures and a trailing page break.
    """
    q = q_group[0][0]
    qid = q["QID"]
    document.add_paragraph(f"QID: {qid}")

    # Everything before the last space of the QID is used as the topic directory name.
    topic = qid.rpartition(' ')[0]
    # .item() below requires exactly one question with this Id.
    is_this_question = all_qs["Id"] == qid

    figs = all_qs["Figures"][is_this_question].item()
    if isinstance(figs, list):
        for fig_name in figs:
            document.add_picture(f"../questions/topics/{topic}/{fig_name}", height=docx.shared.Inches(2))

    document.add_paragraph(f"Question: {all_qs['Question'][is_this_question].item()}")
    document.add_paragraph(f"User Response: {q['response']}")
    document.add_paragraph(f"Correct Response: {all_qs['MultipleChoice'][is_this_question].item()['Correct']}")
    document.add_paragraph("Was User Correct?: ")
    document.add_paragraph("Comments: ")
    document.add_paragraph(f"SAIL Score: {q['automated_recall_score']}")

    user_emails = ', '.join(member[1] for member in q_group)
    document.add_paragraph(f"Users: {user_emails}")
    document.add_page_break()

In [None]:
# Write document for posttest questions.
# The clusters are shuffled in place with a fixed seed before being written.
shuffle_rng = np.random.default_rng(42)
shuffle_rng.shuffle(clustered_qs)

document = docx.Document()
for q_group in tqdm.tqdm(clustered_qs):
    add_question(q_group, document)

document.save("posttest_answers_to_grade.docx")

## Forgetting Curve

In [None]:
# Mean recall (left) and recognition (right) accuracy per modality across the
# three post-tests. SEM error bars are currently not drawn.
fig, ax = plt.subplots(1, 2, figsize=(16, 4), dpi=300)
modalities = ['voice', 'voiceless', 'mc']
test_labels = ['Test 1', 'Test 2', 'Test 3']
recall_panel, recognition_panel = ax[0], ax[1]

for modality in modalities:
    recall_curve = [mean_posttest_recall_acc[i][modality] for i in range(3)]
    # Only the recall lines carry labels, so each modality appears once in the legend.
    recall_panel.errorbar(test_labels, recall_curve, fmt='o-', label=modality)
recall_panel.axis(ymin=0, ymax=0.5)
recall_panel.set_title("Post-Test Recall Scores per Modality over Time")

for modality in modalities:
    recognition_curve = [mean_posttest_recognition_acc[i][modality] for i in range(3)]
    recognition_panel.errorbar(test_labels, recognition_curve, fmt='o-')
recognition_panel.set_xticks(np.arange(3), ['Test 1', 'Test 2', 'Test 3'])
recognition_panel.axis(ymin=0, ymax=0.8)
recognition_panel.set_title("Post-Test Recognition Scores per Modality over Time")

fig.legend()
fig.show()
fig.savefig('forgetting_curve_axis=0.png', facecolor='white')

## Automatic Free Response Grading

In [None]:
import os
import sys

# Put the backend scoring directory on the import path so its modules can be imported by name.
sys.path.append(os.path.abspath('../backend/scoring'))

In [None]:
import score

# Build the scorer from the backend scoring directory, with verbose output off.
scorer = score.new_scorer(root='../backend/scoring', verbose=False)

In [None]:
# Score every recall response (part A of each post-test) with the automatic
# scorer and store the result next to the response as 'automated_recall_score'.
for i in tqdm.trange(len(finished_post_data)):
    participant = finished_post_data.iloc[i]
    for posttest_iteration in ['first_posttest', 'second_posttest', "posttestA"]:
        if posttest_iteration == "posttestA":
            # The final post-test is held as a DataFrame, so write row by row
            # into the score column.
            posttestA = participant[posttest_iteration]
            for j in range(len(posttestA)):
                answer = posttestA.iloc[j]
                posttestA.at[j, "automated_recall_score"] = scorer.score(answer['QID'], answer["response"])
        else:
            # The first and second post-tests keep part A under key 'A' as a
            # sequence of mutable records, updated in place.
            posttestA = participant[posttest_iteration]['A']
            for j in range(len(posttestA)):
                answer = posttestA[j]
                answer["automated_recall_score"] = scorer.score(answer['QID'], answer["response"])

In [None]:
# Spot-check the scored final post-test (part A) of the participant at index 2.
pd.DataFrame(finished_post_data["posttestA"][2])

In [None]:
# Spot-check the scored first post-test (part A) of the participant at index 0.
pd.DataFrame(finished_post_data["first_posttest"][0]["A"])

In [None]:
# Copy the automated recall scores back into each participant's YAML file.
# Every file is re-read, updated in memory and then overwritten in place.
for posttest_iteration in ['posttestA', 'first_posttest', 'second_posttest']:
    print(f"Writing scores for {posttest_iteration}")
    tests_by_email = list(zip(finished_post_data['email'], finished_post_data[posttest_iteration]))
    for email, posttest in tqdm.tqdm(tests_by_email):
        user_path = f'../backend/users/{email}.yaml'
        with open(user_path) as f:
            user_data = yaml.safe_load(f)

        if posttest_iteration == "posttestA":
            # DataFrame rows -> dicts; the score is cast to a plain int before dumping.
            posttest_array = [posttest.iloc[k].to_dict() for k in range(posttest.shape[0])]
            for q, posttest_q in zip(user_data[posttest_iteration], posttest_array):
                q['automated_recall_score'] = int(posttest_q["automated_recall_score"])
        else:
            for q, posttest_q in zip(user_data[posttest_iteration]["A"], posttest["A"]):
                q['automated_recall_score'] = posttest_q["automated_recall_score"]

        with open(user_path, 'w') as f:
            yaml.dump(user_data, f)

## Question Difficulties

In [None]:
# Histogram of the 'Percent who answered correctly' column of question_difficulties.csv.
question_difficulties = pd.read_csv('question_difficulties.csv')
pct_answered_correctly = question_difficulties['Percent who answered correctly']
plt.hist(pct_answered_correctly, bins=30)
plt.title('Percentage who answered question correctly')
plt.show()

## Sleep Data

In [None]:
# Show the sleepData entry of every participant who finished all post-tests.
finished_post_data['sleepData'].to_numpy()