# Empirical Analysis
As motivation for the paper, we study the occurrence of overpractice and the empirical solution paths.

In [None]:
# imports
import os
os.chdir("..")
import json
import numpy as np
from collections import defaultdict
from src.util import read_file
from src.bkt_inference import BKTInference

In [None]:
# definitions
# Shared constants for the notebook cells.
DATE_STRING = "%Y-%m-%d %H:%M:%S"  # presumably a strftime/strptime pattern
BASE_PATH = "./Path/"
COL_NAMES = [
    "Anon Student Id",
    "Problem Hierarchy",
    "Problem Name",
    "KC (Default)",
    "Opportunity (Default)",
    "First Attempt",
    "Step Name",
]
# Zero-initialised counter template with one entry per knowledge component;
# the key order here determines the order of every per-KC report.
KC_COUNTS = dict.fromkeys(
    (
        "subtraction-var",
        "subtraction-const",
        "divide",
        "division-simple",
        "combine-like-var",
        "combine-like-const",
        "cancel-var",
        "cancel-const",
        "distribute-multiplication",
    ),
    0,
)
BKTParas = "./data/bkt_paras_tutorshop.csv"  # path handed to BKTInference
SKC = "Opportunity (Single-KC)"  # column used for the per-student order check
MASTERY_THRESHOLD = 0.95  # BKT prediction at/above this counts as mastered

In [None]:
# load data
# Read the combined log and report how many distinct problems it contains,
# not counting names that carry a "_start" or "_end" marker.
df = read_file('./data/apta-combined.txt')
real_problem_count = sum(
    1
    for name in df["Problem Name"].unique()
    if not ("_start" in name or "_end" in name)
)
print("Number of problems:", real_problem_count)
df.head(3)

### Extract students' solution paths

In [None]:
# extract student sequences
# For every student, collect the distinct problems in order of first
# appearance (rows whose name contains "_start"/"_end" are ignored) and keep
# only students who worked on at least 10 problems.
sequences = {}
for student in df["Anon Student Id"].unique():
    student_rows = df[df["Anon Student Id"] == student]
    # dict.fromkeys removes duplicates while preserving first-seen order
    problems = list(dict.fromkeys(
        name for name in student_rows["Problem Name"]
        if not ("_start" in name or "_end" in name)
    ))
    if len(problems) >= 10:
        sequences[student] = problems

# get max length_seq
# Longest sequence first; ties are broken by the larger student id.
seqs = sorted(
    ((len(problems), student) for student, problems in sequences.items()),
    reverse=True,
)
ref_req = sequences[seqs[0][1]]
print("max len:", len(ref_req))
print("mean len:", round(np.mean([len(problems) for problems in sequences.values()])))
print(ref_req)

### Study mastery and under/over-practice

In [None]:
# prepare student data
# Build, per retained student and per KC, the running attempt numbers and the
# matching 'success' values in the order the rows appear.
student_data = {}
for s_id in sequences:
    s_df = df[df["Anon Student Id"] == s_id]
    opportunity = s_df[SKC].values
    # the single-KC opportunity column must be strictly increasing per student
    assert np.all(opportunity[:-1] < opportunity[1:]), "order check"

    # preprocess student attempts
    per_kc = student_data[s_id] = {
        k: {"att_number": [], "correct": []} for k in KC_COUNTS
    }
    for kc_field, correct in s_df[['KC (Default)', 'success']].values:
        # one row may reference several KCs joined by "~~"
        for kc in kc_field.split("~~"):
            record = per_kc[kc]
            record["att_number"].append(len(record["att_number"]) + 1)
            record["correct"].append(correct)

In [None]:
# NOTE: Average number of opportunity per KC
# Mean number of recorded attempts per student for each KC, followed by the
# sum of those means.
print("Average number of opportunity per KC\n")
kc_means = {
    kc: np.mean([len(record[kc]["correct"]) for record in student_data.values()])
    for kc in KC_COUNTS
}
for kc, avg in kc_means.items():
    print(kc, round(avg, 1))
print("")
totals = sum(kc_means.values())
print(round(totals, 4))

In [None]:
# NOTE: Percent of students that achieved mastery for each KC
# A student counts as having mastered a KC when the last value returned by
# manual_bkt for that KC is at or above MASTERY_THRESHOLD.
print("Percent of students that achieved mastery for each KC\n")
bkt_inf = BKTInference(para_path=BKTParas)
masteries = {k: [] for k in KC_COUNTS}
for record in student_data.values():
    for k, reached in masteries.items():
        final_pred = bkt_inf.manual_bkt(k, record[k]["correct"])[-1]
        reached.append(final_pred >= MASTERY_THRESHOLD)

for k, reached in masteries.items():
    print(k, round(np.mean(reached), 3))

In [None]:
# NOTE: Number of over/under-practice for each KC
# Over-practice for a (student, KC) pair is the number of predictions that
# follow the point where the final, uninterrupted run of predictions at or
# above MASTERY_THRESHOLD begins, i.e. (length of that trailing run) - 1.
print("Number of over/under-practice for each KC\n")
# NOTE(review): `under` is initialised but never filled in this notebook.
under = {k: [] for k in KC_COUNTS}
over = {k: [] for k in KC_COUNTS}
for s_id in student_data:
    for k in KC_COUNTS:
        preds = bkt_inf.manual_bkt(k, student_data[s_id][k]["correct"])

        # determine overpractice
        # Count predictions from the end until the first one below threshold.
        # Counting explicitly (instead of reusing a loop index after the loop)
        # keeps the result right when `preds` is empty and when every
        # prediction is already at/above the threshold.
        trailing_mastered = 0
        for mastery in reversed(preds):
            if mastery < MASTERY_THRESHOLD:  # most recent below
                break
            trailing_mastered += 1
        # the first prediction of the run is where mastery is reached
        overpractice = max(0, trailing_mastered - 1)
        over[k].append(overpractice)

print("\nAverage Overpractice")
for k in KC_COUNTS:
    positive = [v for v in over[k] if v > 0]
    # avoid numpy's empty-mean warning; the printed value is still nan
    mean_positive = round(np.mean(positive), 1) if positive else float("nan")
    print(k, round(np.mean(over[k]), 1), "--", np.sum(positive), "--", mean_positive)

### Take most common empirical solution as reference

In [None]:
# NOTE: For each problem find most common correct solution
# A "solution" is the sequence of "KC (Default)" values of one student on one
# problem, counted only when every "First Attempt" on that problem is
# "correct". Sequences are counted as tuples rather than stringified lists,
# so KC names never have to be parsed back out of a repr.
f_df = read_file("./data/apta-combined.txt")
path_counts = defaultdict(lambda: defaultdict(int))
for s_id in f_df["Anon Student Id"].unique():
    s_df = f_df[f_df["Anon Student Id"] == s_id]
    # the last problem of each student is left out of the count
    for q_id in s_df["Problem Name"].unique()[:-1]:
        q_df = s_df[q_id == s_df["Problem Name"]]
        if np.all(q_df["First Attempt"].values == "correct"):
            path_counts[q_id][tuple(q_df["KC (Default)"])] += 1

# find common solution
# Pick the most frequent path per problem; on equal frequency the longer path
# wins. The selected path is stored (not the last one iterated).
com_sol_path = {}
for q, counts in path_counts.items():
    com_sol, com_c = (), 0
    for sol, count in counts.items():
        if count > com_c or (count == com_c and len(sol) > len(com_sol)):
            com_sol, com_c = sol, count
    com_sol_path[q] = list(com_sol)

In [None]:
# aggregated opportunity counts over time
# Walk the reference problem sequence and keep a running total of KC
# opportunities, taking each problem's KCs from its stored solution path.
# Entry 0 is the all-zero start state; problems without a stored path are
# reported and add no entry.
aggregated_counts = [KC_COUNTS.copy()]
for q in ref_req:
    if q not in com_sol_path:
        print("misses " + q)
        continue
    running = dict(aggregated_counts[-1])
    step_kcs = (kc for step in com_sol_path[q] for kc in step.split("~~"))
    for kc in step_kcs:
        running[kc] += 1
    aggregated_counts.append(running)
problem_counts = [len(solved) for solved in sequences.values()]

In [None]:
# Median, 75th and 25th percentile of the per-student problem counts.
solved_summary = (
    ("Median solved", np.median(problem_counts)),
    ("Quantile solved", np.percentile(problem_counts, 75)),
    ("Quantile solved", np.percentile(problem_counts, 25)),
)
for label, value in solved_summary:
    print(label, value)

In [None]:
# Percentiles of the per-student problem counts at 0, 5, ..., 100.
percentile_grid = list(range(0, 105, 5))
print("Quantile solved", np.percentile(problem_counts, percentile_grid))

In [None]:
# store basic KC steps
# Split every step of each problem's solution path into its individual KCs
# ("~~" separates KCs within one step) and write the mapping to disk as JSON.
basic_kc_steps = {
    problem: [step.split("~~") for step in steps]
    for problem, steps in com_sol_path.items()
}
with open("./data/common_solution_path.json", "w") as out:
    json.dump(basic_kc_steps, out, indent=4)
basic_kc_steps