In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

# === Functions ===

def load_student_data(filepath):
    """
    Read the student-by-microgoal ability matrix from a CSV file.

    The first CSV column becomes the row index. Entries equal to -2 are
    treated as missing and replaced with NaN. Any row that still contains a
    value outside the closed interval [-2, 2] is discarded, and the values of
    the surviving rows are clipped to that interval.

    Returns the cleaned DataFrame, with NaN marking the missing entries.
    """
    abilities = pd.read_csv(filepath, index_col=0).replace(-2, np.nan)

    # A cell is acceptable when it is missing or lies inside [-2, 2].
    within_range = (abilities >= -2) & (abilities <= 2)
    acceptable = within_range | abilities.isna()
    keep_row = acceptable.all(axis=1)

    return abilities.loc[keep_row].clip(lower=-2, upper=2)

def train_als(R, mask, n_factors=20, n_iters=10, reg=0.1, random_state=None):
    """
    Factorize a partially observed matrix with alternating least squares.

    Finds factor matrices U (rows) and V (columns) such that ``U @ V.T``
    approximates ``R`` on the observed entries, using ridge-regularized
    least-squares updates that alternate between the rows of U and the
    rows of V.

    Parameters
    ----------
    R : array-like, shape (n_users, n_items)
        Target matrix. Entries where ``mask`` is False are never read, so
        they may hold any placeholder value.
    mask : array-like, shape (n_users, n_items)
        Truthy where ``R`` is observed. It is coerced to a boolean array so
        that a 0/1 integer mask is not misread as fancy row indices.
    n_factors : int
        Number of latent factors (columns of U and V).
    n_iters : int
        Number of full alternating sweeps (all rows of U, then all rows of V).
    reg : float
        Ridge penalty added to the diagonal of each normal-equation matrix.
    random_state : None, int or numpy.random.Generator, optional
        Seed or generator for the factor initialisation. With the default
        ``None`` the global ``np.random`` state is used, so a preceding
        ``np.random.seed(...)`` call still controls the result.

    Returns
    -------
    (U, V) : tuple of ndarrays with shapes (n_users, n_factors) and
        (n_items, n_factors).

    Raises
    ------
    ValueError
        If ``R`` is not two-dimensional or ``mask`` has a different shape.
    numpy.linalg.LinAlgError
        If a normal-equation matrix is singular (possible when ``reg`` is 0).
    """
    R = np.asarray(R, dtype=float)
    mask = np.asarray(mask, dtype=bool)
    if R.ndim != 2:
        raise ValueError(f"R must be 2-dimensional, got shape {R.shape}.")
    if mask.shape != R.shape:
        raise ValueError(
            f"mask shape {mask.shape} does not match R shape {R.shape}."
        )

    n_users, n_items = R.shape
    if random_state is None:
        # Keep drawing from the global stream so existing seeded runs are unchanged.
        U = np.random.rand(n_users, n_factors)
        V = np.random.rand(n_items, n_factors)
    else:
        rng = np.random.default_rng(random_state)
        U = rng.random((n_users, n_factors))
        V = rng.random((n_items, n_factors))
    ridge = reg * np.eye(n_factors)

    for _ in range(n_iters):
        # Fix V and solve one ridge regression per row of U.
        for i in range(n_users):
            observed = mask[i]
            V_obs = V[observed]
            U[i] = np.linalg.solve(V_obs.T @ V_obs + ridge, V_obs.T @ R[i, observed])
        # Fix U and solve one ridge regression per row of V.
        for j in range(n_items):
            observed = mask[:, j]
            U_obs = U[observed]
            V[j] = np.linalg.solve(U_obs.T @ U_obs + ridge, U_obs.T @ R[observed, j])
    return U, V

# === Data Loading ===

# Cleaned student x microgoal matrix; NaN marks a missing entry.
df = load_student_data('student_abilities_pivoted.csv')
values = np.array(df.values, dtype=float)
# True wherever an ability score was actually observed.
mask_full = np.logical_not(np.isnan(values))
# Global mean over the observed scores, used to center the matrix.
mu = np.mean(values[mask_full])

# === Cross-Validation Setup ===

factors_list = [10, 20, 30]
iters_list = [10, 30, 50]
n_folds = 2
reg = 0.01
random_seed = 42

# Shuffle the coordinates of all observed entries once, reproducibly, and
# cut them into n_folds disjoint held-out sets.
coords = list(zip(*np.where(mask_full)))
np.random.seed(random_seed)
np.random.shuffle(coords)
folds = np.array_split(coords, n_folds)

# The train/test masks depend only on the fold, not on the hyper-parameters,
# so build them once (vectorized) instead of once per grid cell.
fold_masks = []
for fold in folds:
    fold = np.asarray(fold, dtype=int).reshape(-1, 2)
    mask_test = np.zeros_like(mask_full, dtype=bool)
    mask_test[fold[:, 0], fold[:, 1]] = True
    fold_masks.append((mask_full & ~mask_test, mask_test))

# rmse_grid[i, j] = mean held-out RMSE for factors_list[i], iters_list[j].
rmse_grid = np.zeros((len(factors_list), len(iters_list)))

# NOTE(review): mu is the mean over ALL observed entries, including the ones
# held out in each fold, so a small amount of test information leaks into the
# centering step.
for i, n_factors in enumerate(factors_list):
    for j, n_iters in enumerate(iters_list):
        fold_rmses = []
        for k, (mask_train, mask_test) in enumerate(fold_masks):
            # Held-out entries are replaced by mu, i.e. zero after centering;
            # train_als only reads entries where mask_train is True.
            R_train = np.where(mask_train, values, mu) - mu
            U, V = train_als(R_train, mask_train, n_factors=n_factors, n_iters=n_iters, reg=reg)
            preds = U @ V.T + mu

            diffs = preds[mask_test] - values[mask_test]
            mse = np.mean(diffs ** 2)
            fold_rmses.append(np.sqrt(mse))
            print(f"Fold {k+1}/{n_folds}, n_factors={n_factors}, n_iters={n_iters}, RMSE={fold_rmses[-1]:.4f}")
        rmse_grid[i, j] = np.mean(fold_rmses)

# === Visualization ===

# Heat map of the mean cross-validated RMSE: rows are n_factors, columns are
# n_iters, in the order of factors_list / iters_list.
fig, ax = plt.subplots()
cax = ax.imshow(rmse_grid, origin='lower', interpolation='nearest')
ax.set_xticks(range(len(iters_list)))
ax.set_xticklabels(iters_list)
ax.set_yticks(range(len(factors_list)))
ax.set_yticklabels(factors_list)
ax.set_xlabel('n_iters')
ax.set_ylabel('n_factors')
# Take the fold count from n_folds so the title cannot drift from the setup.
ax.set_title(f'{n_folds}-Fold CV: Root Mean Squared Error')
fig.colorbar(cax, ax=ax, label='RMSE')
plt.show()

# === Optimal Parameters and Baseline Comparison ===

# Grid cell with the lowest mean cross-validated RMSE.
min_idx = np.unravel_index(rmse_grid.argmin(), rmse_grid.shape)
best_f, best_it = factors_list[min_idx[0]], iters_list[min_idx[1]]
best_rmse = rmse_grid[min_idx]

# Demonstration split: reshuffle the observed coordinates and hide the first 20%.
coords_demo = coords.copy()
np.random.shuffle(coords_demo)
n_hide_demo = int(len(coords_demo) * 0.2)
hidden = np.array(coords_demo[:n_hide_demo], dtype=int).reshape(-1, 2)
mask_test_demo = np.zeros_like(mask_full, dtype=bool)
mask_test_demo[hidden[:, 0], hidden[:, 1]] = True
mask_demo = mask_full & ~mask_test_demo

# Baseline: predict the global mean everywhere, scored on the hidden entries.
# Note that best_rmse comes from the CV folds while baseline_rmse comes from
# this demonstration split, so the two numbers are measured on different
# held-out sets.
baseline_preds = np.full_like(values, mu)
diffs_baseline = baseline_preds[mask_test_demo] - values[mask_test_demo]
baseline_rmse = np.sqrt(np.mean(diffs_baseline ** 2))
improvement = (baseline_rmse - best_rmse) / baseline_rmse * 100

print(f"Optimal parameters: n_factors={best_f}, n_iters={best_it}, CV RMSE={best_rmse:.4f}")
print(f"Baseline RMSE (mean imputation): {baseline_rmse:.4f}")
print(f"Improvement over baseline: {improvement:.1f}% lower RMSE\n")

# === Classification at threshold of 1 ===

# Refit on the demonstration split with the selected hyper-parameters.
R_train_demo = np.where(mask_demo, values, mu) - mu
U_demo, V_demo = train_als(R_train_demo, mask_demo, n_factors=best_f, n_iters=best_it, reg=reg)
preds_demo = U_demo @ V_demo.T + mu

# A microgoal counts as passed when its score is AT LEAST the threshold: the
# same `>=` rule used by get_completed_microgoals and the recommender. A
# strict `>` would label scores of exactly 1.0 as not passed.
pass_threshold = 1
true_bin = (values[mask_test_demo] >= pass_threshold).astype(int)
pred_bin = (preds_demo[mask_test_demo] >= pass_threshold).astype(int)

accuracy = accuracy_score(true_bin, pred_bin)
precision = precision_score(true_bin, pred_bin)
recall = recall_score(true_bin, pred_bin)
f1 = f1_score(true_bin, pred_bin)

print(f"\nClassification metrics (threshold=1):")
print(f"  Accuracy:  {accuracy:.2f}")
print(f"  Precision: {precision:.2f}")
print(f"  Recall:    {recall:.2f}")
print(f"  F1 Score:  {f1:.2f}")

# Rows are the actual class, columns the predicted class (0 = below, 1 = passed).
cm = confusion_matrix(true_bin, pred_bin)
fig_cm, ax_cm = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
ax_cm.set_title('Confusion Matrix (Threshold = 1)')
plt.show()

# === Distribution Plots ===

# Histogram of the signed (not absolute) prediction error on the hidden
# entries of the demonstration split.
errors = preds_demo[mask_test_demo] - values[mask_test_demo]
fig_err, ax_err = plt.subplots(figsize=(10, 5))
ax_err.hist(errors.ravel(), bins=400, alpha=0.7, color='purple')
ax_err.set_title('Distribution of Prediction Error (ELO scores)')
ax_err.set_xlabel('Prediction Error')
ax_err.set_ylabel('Frequency')
ax_err.set_xlim(-2.5, 2.5)
# 21 ticks across the visible range, i.e. one every 0.25.
ax_err.set_xticks(np.linspace(-2.5, 2.5, num=21))
ax_err.grid(True)
plt.show()


Fold 1/2, n_factors=10, n_iters=10, RMSE=0.5648
Fold 2/2, n_factors=10, n_iters=10, RMSE=0.5570
Fold 1/2, n_factors=10, n_iters=30, RMSE=0.5774
Fold 2/2, n_factors=10, n_iters=30, RMSE=0.5703


KeyboardInterrupt: 

In [8]:
import numpy as np

# Retrain on this demonstration split
R_train_demo = np.where(mask_demo, values, mu) - mu
U_demo, V_demo = train_als(R_train_demo, mask_demo, n_factors=best_f, n_iters=best_it, reg=reg)
preds_demo = U_demo @ V_demo.T + mu

# Only coordinates flagged in mask_test_demo were hidden from training.
# coords_demo holds EVERY observed coordinate, so iterating over it directly
# would mix training entries into the "held-out" statistics below.
held_out = [(u, v) for (u, v) in coords_demo if mask_test_demo[u, v]]

# Print a few examples from the held-out set
print("Sample held-out predictions:")
for (u, v) in held_out[:5]:
    user = df.index[u]
    microgoal = df.columns[v]
    true = values[u, v]
    pred = preds_demo[u, v]
    print(f"  User {user}, microgoal {microgoal}: true = {true:.2f}, pred = {pred:.2f}")

# Absolute error for every held-out point, kept as
# (row, column, abs_error, true, pred) tuples.
diffs = []
for (u, v) in held_out:
    true = values[u, v]
    pred = preds_demo[u, v]
    diff = abs(true - pred)
    diffs.append((u, v, diff, true, pred))

# Ascending by absolute error: best predictions first, worst last.
diffs_sorted = sorted(diffs, key=lambda x: x[2])

n = 5  # number of examples to show
# Best (smallest error)
print(f"\nTop {n} best predictions (smallest absolute error):")
for u, v, diff, true, pred in diffs_sorted[:n]:
    user = df.index[u]
    microgoal = df.columns[v]
    print(f"  User {user}, microgoal {microgoal}: true = {true:.2f}, pred = {pred:.2f}, abs_error = {diff:.2f}")

# Worst (largest error)
print(f"\nTop {n} worst predictions (largest absolute error):")
for u, v, diff, true, pred in diffs_sorted[-n:][::-1]:
    user = df.index[u]
    microgoal = df.columns[v]
    print(f"  User {user}, microgoal {microgoal}: true = {true:.2f}, pred = {pred:.2f}, abs_error = {diff:.2f}")


Sample held-out predictions:
  User 5013c1c3de27ed6fdde0e30b09f69c48010f2de90a75831bfff108dbe0353fee, microgoal 5174: true = -0.86, pred = -1.29
  User bc0fa55c991dc265795596921f14d24e54550cf365016bdf212c9f6f8ec33d2d, microgoal 7893: true = -1.26, pred = -1.73
  User 002b70d9358932ebbf1ed852747f170c408ba041f1ae6b715499a33a0efde073, microgoal 12377: true = 1.00, pred = 0.93
  User e7ad8dbf26e11d0e3f49c8870d13533556ec56c5b61e8a83f79997d643c0602a, microgoal 5644: true = 1.00, pred = 0.76
  User 838503ea7eb8ff5db27364052ef39738a886529c62bd43b9a4c2fa63fddb0e52, microgoal 13061: true = 1.00, pred = 1.14

Top 5 best predictions (smallest absolute error):
  User 0b1fd491b2516fc5773f88aafd2c8f874c33a3ce9cc2a14e1985fd51a33e2cea, microgoal 13356: true = 1.00, pred = 1.00, abs_error = 0.00
  User 4ed6b342099305b5151409844fa002187bf20ff51979b8893adf589ef1324254, microgoal 13749: true = 1.00, pred = 1.00, abs_error = 0.00
  User a92cd7dff4b459f4d2ff06b5ac399500d21751e2ca547cddf8f08649ed70f312, micro

### making the predictions

In [12]:
def get_completed_microgoals(user_id, df, microgoal_names, pass_threshold=1.0):
    """
    List the microgoals a student has passed.

    A microgoal counts as passed when the student's ability score in ``df``
    is at least ``pass_threshold``; missing scores never qualify.

    Returns a list of (microgoal_id, name, elo_score) tuples in column
    order. The name is looked up in ``microgoal_names`` under the string
    form of the id and falls back to '<unknown>'.

    Raises ValueError if ``user_id`` is not in ``df.index``.
    """
    if user_id not in df.index:
        raise ValueError(f"User {user_id} not found in data.")

    abilities = df.loc[user_id]
    passed = abilities[abilities >= pass_threshold].dropna()

    return [
        (goal_id, microgoal_names.get(str(goal_id), '<unknown>'), elo)
        for goal_id, elo in zip(passed.index.tolist(), passed.values.tolist())
    ]


def recommend_microgoals_within_completed_islands(user_id, df, U, V, mu, microgoal_names, df_hierarchy, threshold=1.0, top_n=10):
    """
    Recommend unattempted microgoals from islands the user has already started.

    An island qualifies when the user has at least one microgoal on it with a
    score >= ``threshold``. Among the microgoals of qualifying islands that
    the user has no score for, those whose predicted score
    (``U[user] @ V.T + mu``) is >= ``threshold`` are ranked by predicted
    score, highest first, and the first ``top_n`` are returned.

    Parameters
    ----------
    user_id : label present in ``df.index``.
    df : DataFrame of ability scores (rows = users, columns = microgoal ids,
        NaN = not attempted).
    U, V : factor matrices whose rows align with ``df`` rows / columns.
    mu : offset added back to the factor-model prediction.
    microgoal_names : mapping with ``.get(str_id, default)`` giving the name.
    df_hierarchy : DataFrame with a 'microgoal_id' column whose index
        identifies the island each microgoal belongs to.
    threshold : minimum score, both for "completed" and for a prediction to
        be recommended.
    top_n : maximum number of recommendations.

    Returns
    -------
    list of (microgoal_id, name, predicted_score) tuples. Predicted scores
    are returned as computed; they are not clipped to any range.

    Raises
    ------
    ValueError
        If ``user_id`` is not in ``df.index``.
    """
    if user_id not in df.index:
        raise ValueError(f"User {user_id} not found in data.")

    # Microgoals the user has completed (NaN never satisfies >=).
    user_row = df.loc[user_id]
    completed_ids = set(user_row[user_row >= threshold].dropna().index.astype(str))

    # Islands holding at least one completed microgoal, found with a single
    # vectorized membership test instead of a row-by-row iterrows() scan.
    hierarchy_ids = df_hierarchy['microgoal_id']
    is_completed = hierarchy_ids.isin(completed_ids).to_numpy()
    completed_islands = set(df_hierarchy.index[is_completed])
    if not completed_islands:
        return []

    # Every microgoal that lives on one of those islands.
    allowed_microgoal_ids = {
        mid
        for island, mid in zip(df_hierarchy.index, hierarchy_ids)
        if island in completed_islands
    }

    # Predicted score of this user for every microgoal column.
    idx_user = df.index.get_loc(user_id)
    preds = U[idx_user] @ V.T + mu

    # Read only this user's row rather than materializing the whole matrix.
    attempted = ~np.isnan(df.iloc[idx_user].to_numpy(dtype=float))
    unseen = np.where(~attempted)[0]
    unseen_ids = df.columns[unseen].astype(str)

    # Keep unseen microgoals on allowed islands that are predicted to pass.
    candidates = [
        i for i, mid in zip(unseen, unseen_ids)
        if mid in allowed_microgoal_ids and preds[i] >= threshold
    ]
    ranked = sorted(candidates, key=lambda i: preds[i], reverse=True)
    top_idxs = ranked[:top_n]

    ids = df.columns[top_idxs].tolist()
    scores = [preds[i] for i in top_idxs]
    names = [microgoal_names.get(str(mid), '<unknown>') for mid in ids]
    return list(zip(ids, names, scores))

def load_microgoal_features(filepath):
    """
    Load the microgoal features table, indexed by its (world, island) pair.

    The CSV must provide 'name', 'world' and 'island' columns. If it has no
    'microgoal_id' column, the first column is used as the id. Ids are always
    returned as strings.

    Returns a DataFrame with columns ['microgoal_id', 'name'] and a
    (world, island) MultiIndex, sorted by that index. Note that this is a
    DataFrame, not a Series mapping id -> name.

    Raises KeyError if 'name', 'world' or 'island' is missing.
    """
    mf = pd.read_csv(filepath, dtype={'microgoal_id': str})
    if 'microgoal_id' not in mf.columns:
        # Fall back to treating the first column as the id column.
        mf = mf.rename(columns={mf.columns[0]: 'microgoal_id'})
    # The dtype hint above does not apply to a renamed column, so force str here.
    mf['microgoal_id'] = mf['microgoal_id'].astype(str)

    return (
        mf[['microgoal_id', 'name', 'world', 'island']]
        .set_index(['world', 'island'], drop=True)
        .sort_index()
    )

if __name__ == '__main__':
    # File paths
    student_filepath = 'student_abilities_pivoted.csv'
    microgoal_feat_filepath = 'microgoal_features.csv'

    # Load data: ability matrix, island hierarchy, and an id -> name lookup
    df_students = load_student_data(student_filepath)
    df_hierarchy = load_microgoal_features(microgoal_feat_filepath)
    microgoal_names = df_hierarchy.set_index('microgoal_id')['name']

    # Keep only the microgoals present in both the ability matrix and the
    # features table
    common_ids = set(df_students.columns.astype(str)).intersection(microgoal_names.index.astype(str))
    df_students = df_students.loc[:, df_students.columns.intersection(common_ids)]
    microgoal_names = microgoal_names.loc[list(common_ids)]

    # compute global mean on observed ELOs
    mask = ~df_students.isna().values
    mu = df_students.values[mask].mean()

    # center rating matrix: fill missing with mu then subtract mu
    R_centered = df_students.fillna(mu).values - mu

    # train ALS on centered ratings; seed the global RNG first because
    # train_als draws its initial factors from np.random, so the
    # recommendations would otherwise change from run to run
    np.random.seed(42)
    U, V = train_als(R_centered, mask, n_factors=20, n_iters=15, reg=0.1)

    # choose student
    example_user = df_students.index[1]

    # print completed microgoals
    completed = get_completed_microgoals(example_user, df_students, microgoal_names)
    print(f"Completed microgoals for {example_user} (ELO score):")
    for mid, name, score in completed:
        print(f"  {mid}: {name} (ELO: {score:.2f})")

    # print recommendations with predictions re-centered; the header reports
    # how many recommendations were actually returned (at most top_n)
    recs = recommend_microgoals_within_completed_islands(example_user, df_students, U, V, mu, microgoal_names, df_hierarchy,
                                 threshold=1.0, top_n=20)
    print(f"\nTop {len(recs)} recommendations for {example_user} (predicted ELO):")
    for mid, name, score in recs:
        print(f"  {mid}: {name} (predicted ELO: {score:.2f})")

Completed microgoals for 000cc5ae202de8cbd7711ab4147cf15e8286e48a076be97ff139fc92ce9565f0 (ELO score):
  13997: Optellen tot 10 (ELO: 1.09)
  13648: Herkennen van erbij-situaties t/m 10 (ELO: 1.05)
  5154: Tijd aangeven op analoge klok met hele uren (ELO: 1.01)

Top 5 recommendations for 000cc5ae202de8cbd7711ab4147cf15e8286e48a076be97ff139fc92ce9565f0 (predicted ELO):
  5167: Tijd aangeven op analoge klok met 10 en 5 minuten (predicted ELO: 5.05)
  5174: Tijd aangeven op analoge klok met minuten (predicted ELO: 3.62)
  5162: Tijd aangeven op analoge klok met kwartieren (predicted ELO: 1.62)


In [13]:
# Completed microgoals of the example user, one per line.
print(f"Completed microgoals for {example_user}:")
for goal_id, goal_name, elo in completed:
    print(f"  {goal_id}: {goal_name} (ELO: {elo:.2f})")

# Recommended microgoals, with a note on how they relate to the completed ones.
print(
    f"\nRecommended microgoals for {example_user} (predicted ELO):",
    "These are from islands where the user has already completed at least one microgoal.\n",
    sep="\n",
)
for goal_id, goal_name, predicted in recs:
    print(f"  {goal_id}: {goal_name} (predicted ELO: {predicted:.2f})")

Completed microgoals for 000cc5ae202de8cbd7711ab4147cf15e8286e48a076be97ff139fc92ce9565f0:
  13997: Optellen tot 10 (ELO: 1.09)
  13648: Herkennen van erbij-situaties t/m 10 (ELO: 1.05)
  5154: Tijd aangeven op analoge klok met hele uren (ELO: 1.01)

Recommended microgoals for 000cc5ae202de8cbd7711ab4147cf15e8286e48a076be97ff139fc92ce9565f0 (predicted ELO):
These are from islands where the user has already completed at least one microgoal.

  5167: Tijd aangeven op analoge klok met 10 en 5 minuten (predicted ELO: 5.05)
  5174: Tijd aangeven op analoge klok met minuten (predicted ELO: 3.62)
  5162: Tijd aangeven op analoge klok met kwartieren (predicted ELO: 1.62)


In [14]:
# For every recommendation, report the island(s) it belongs to and which of
# the user's completed microgoals sit on the same island.
for rec_mid, rec_name, rec_score in recs:
    # Index labels of df_hierarchy identify the island(s) of this microgoal.
    is_rec = df_hierarchy['microgoal_id'] == rec_mid
    rec_islands = df_hierarchy.index[is_rec].tolist()
    print(f"\nRecommended: {rec_mid}: {rec_name} (predicted ELO: {rec_score:.2f})")
    for island in rec_islands:
        # Ids of all microgoals on this island.
        island_mids = set(df_hierarchy.loc[[island], 'microgoal_id'])
        completed_on_island = [entry for entry in completed if entry[0] in island_mids]
        print(f"  From island: {island}")
        if not completed_on_island:
            print("  No completed microgoals on this island.")
            continue
        print("  Already completed on this island:")
        for cmid, cname, cscore in completed_on_island:
            print(f"    {cmid}: {cname} (ELO: {cscore:.2f})")


Recommended: 5167: Tijd aangeven op analoge klok met 10 en 5 minuten (predicted ELO: 5.05)
  From island: ("['Tijd & Datum']", "['Klok zetten analoog']")
  Already completed on this island:
    5154: Tijd aangeven op analoge klok met hele uren (ELO: 1.01)

Recommended: 5174: Tijd aangeven op analoge klok met minuten (predicted ELO: 3.62)
  From island: ("['Tijd & Datum']", "['Klok zetten analoog']")
  Already completed on this island:
    5154: Tijd aangeven op analoge klok met hele uren (ELO: 1.01)

Recommended: 5162: Tijd aangeven op analoge klok met kwartieren (predicted ELO: 1.62)
  From island: ("['Tijd & Datum']", "['Klok zetten analoog']")
  Already completed on this island:
    5154: Tijd aangeven op analoge klok met hele uren (ELO: 1.01)
