In [1]:
import os, csv, json, pickle, warnings, random
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# --------------------------
# Paths
# --------------------------
DATA_PATH = r"C:\Users\sagni\Downloads\SkillTracer Knowledge Tracing\archive (1)\2012-2013-data-with-predictions-4-final.csv"
OUTPUT_DIR = r"C:\Users\sagni\Downloads\SkillTracer Knowledge Tracing"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --------------------------
# Reproducibility
# --------------------------
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# --------------------------
# Robust reader (CSV or Excel-in-disguise)
# --------------------------
def is_zip_or_xlsx(path):
    try:
        with open(path, "rb") as f:
            return f.read(2) == b"PK"
    except Exception:
        return False

def robust_read_any(path):
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    if is_zip_or_xlsx(path):
        import openpyxl
        df = pd.read_excel(path, engine="openpyxl")
        print(f"[INFO] Loaded as Excel: {os.path.basename(path)} shape={df.shape}")
        return df
    encodings = ["utf-8", "utf-8-sig", "cp1252", "latin1"]
    delimiters = [";", ",", "\t", "|"]
    # sniff delimiter quickly
    try:
        with open(path, "rb") as f:
            head = f.read(8192).decode("latin1", errors="ignore")
        try:
            sn = csv.Sniffer().sniff(head)
            if sn.delimiter in delimiters:
                delimiters = [sn.delimiter] + [d for d in delimiters if d != sn.delimiter]
        except Exception:
            pass
    except Exception:
        pass
    last_err = None
    for enc in encodings:
        for sep in delimiters:
            try:
                df = pd.read_csv(path, encoding=enc, sep=sep, engine="python")
                if df.shape[1] > 1:
                    print(f"[INFO] Loaded as CSV enc='{enc}', sep='{sep}', shape={df.shape}")
                    return df
            except Exception as e:
                last_err = e
                continue
    raise RuntimeError(f"Could not parse {path}. Last error: {last_err}")

df = robust_read_any(DATA_PATH)

# --------------------------
# Column mapping (auto-detect common names)
# --------------------------
def pick_col(candidates, cols):
    for c in candidates:
        if c in cols:
            return c
    lc = {c.lower(): c for c in cols}
    for c in candidates:
        if c.lower() in lc:
            return lc[c.lower()]
    return None

cols = list(df.columns)
student_col = pick_col(["student_id","user_id","Anon Student Id","Anon StudentID","student","sid"], cols)
skill_col   = pick_col(["skill_id","skill","tag","KC(SubSkills)","KC","skill_name","concept_id"], cols)
correct_col = pick_col(["correct","is_correct","Correct First Attempt","answered_correctly","label"], cols)
time_col    = pick_col(["timestamp","start_time","order_id","time","Time"], cols)

if student_col is None or skill_col is None or correct_col is None:
    raise ValueError(f"Missing key columns in {cols} — need student_id-like, skill_id/tag-like, and correct-like columns.")

# Normalize correctness to {0,1}
df[correct_col] = pd.to_numeric(df[correct_col], errors="coerce")
df[correct_col] = (df[correct_col] > 0).astype(int)

# If multiple skills per row, take the first
def take_first_skill(v):
    if pd.isna(v): return np.nan
    s = str(v)
    for sep in ["~~","; ", ";", ",", "|"]:
        if sep in s:
            return s.split(sep)[0]
    return s

df[skill_col] = df[skill_col].apply(take_first_skill)

# Sort per student by time (if available)
if time_col is not None:
    try:
        df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
        df = df.sort_values([student_col, time_col]).reset_index(drop=True)
    except Exception:
        df = df.sort_values([student_col]).reset_index(drop=True)
else:
    df = df.sort_values([student_col]).reset_index(drop=True)

# Drop missing essentials
df = df.dropna(subset=[student_col, skill_col, correct_col])

# --------------------------
# Build integer encoders & save as preprocessor.pkl
# --------------------------
students = df[student_col].astype(str).unique().tolist()
skills   = df[skill_col].astype(str).unique().tolist()

student2idx = {s:i for i,s in enumerate(students)}
idx2student = {i:s for s,i in student2idx.items()}
skill2idx   = {s:i for i,s in enumerate(skills)}
idx2skill   = {i:s for s,i in skill2idx.items()}

df["_sid"]  = df[student_col].astype(str).map(student2idx)
df["_kid"]  = df[skill_col].astype(str).map(skill2idx)
df["_corr"] = df[correct_col].astype(int)

n_skills = len(skill2idx)
print(f"[INFO] n_students={len(student2idx)}, n_skills={n_skills}, rows={len(df)}")

# --------------------------
# Build sequences per student for DKT
# token = 1 + skill_id + correctness * n_skills (vocab: 1..2*n_skills; 0 is PAD)
# Predict next correctness at each timestep
# --------------------------
MAX_LEN = 200

def build_sequences_for_student(sub):
    kid = sub["_kid"].values.astype(np.int32)
    corr = sub["_corr"].values.astype(np.int32)
    if len(kid) < 2:  # need at least 2 interactions to create (x->y)
        return []
    tokens = (kid[:-1] + corr[:-1]*n_skills + 1).astype(np.int32)
    y      = corr[1:].astype(np.int32)
    chunks = []
    for start in range(0, len(tokens), MAX_LEN):
        xt = tokens[start:start+MAX_LEN]
        yt = y[start:start+MAX_LEN]
        if len(xt) == 0: 
            continue
        pad = MAX_LEN - len(xt)
        xt = np.pad(xt, (0,pad), constant_values=0)              # (T,)
        yt = np.pad(yt, (0,pad), constant_values=-1).astype(int) # (T,) with -1 as pad label
        mask = (yt != -1).astype(np.float32)                     # (T,)
        yt = np.where(yt==-1, 0, yt).astype(np.float32)          # replace pad with 0 for loss
        chunks.append((xt, yt, mask))
    return chunks

# Split by student (to avoid leakage)
sid_all = df["_sid"].unique()
train_sids, test_sids = train_test_split(sid_all, test_size=0.15, random_state=SEED)
train_sids, val_sids  = train_test_split(train_sids, test_size=0.1765, random_state=SEED)  # ~0.70/0.15/0.15

def collect(sids):
    X, Y, W = [], [], []
    for sid in sids:
        sub = df[df["_sid"] == sid]
        for xt, yt, mw in build_sequences_for_student(sub):
            X.append(xt)
            Y.append(yt[:, None])   # (T,1) to match Dense(1) output
            W.append(mw)            # (T,)
    if len(X)==0: return None, None, None
    return np.stack(X), np.stack(Y), np.stack(W)

X_train, Y_train, W_train = collect(train_sids)
X_val,   Y_val,   W_val   = collect(val_sids)

for name, arr in [("X_train",X_train),("X_val",X_val)]:
    if arr is None:
        raise RuntimeError(f"{name} is empty — check column mapping or data quality.")
    print(f"[INFO] {name}={arr.shape}")

# --------------------------
# Save preprocessors & label encoder (.pkl)
# --------------------------
preproc = {
    "student2idx": student2idx,
    "idx2student": idx2student,
    "skill2idx":   skill2idx,
    "idx2skill":   idx2skill,
    "n_skills":    n_skills,
    "max_len":     MAX_LEN,
    "token_pad_id": 0,
    "token_vocab_size": 2*n_skills + 1,
    "token_rule": "token = 1 + skill_id + correctness * n_skills",
    "columns": {
        "student": student_col,
        "skill":   skill_col,
        "correct": correct_col,
        "time":    time_col
    }
}
with open(os.path.join(OUTPUT_DIR, "preprocessor.pkl"), "wb") as f:
    pickle.dump(preproc, f)

le = LabelEncoder().fit([0,1])
with open(os.path.join(OUTPUT_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(le, f)

print("[INFO] Saved preprocessor.pkl and label_encoder.pkl")

# --------------------------
# Build & train DKT model
# --------------------------
VOCAB_SIZE   = preproc["token_vocab_size"]
EMBED_DIM    = 64
HIDDEN_UNITS = 128

inp = keras.Input(shape=(MAX_LEN,), dtype="int32")
x = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM, mask_zero=True)(inp)
x = layers.LSTM(HIDDEN_UNITS, return_sequences=True)(x)
x = layers.Dropout(0.2)(x)
out = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inp, out)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
model.summary()

early = keras.callbacks.EarlyStopping(monitor="val_accuracy", mode="max", patience=5, restore_best_weights=True)

history = model.fit(
    X_train, Y_train,
    sample_weight=W_train,
    validation_data=(X_val, Y_val, W_val),
    epochs=30,
    batch_size=64,
    callbacks=[early],
    verbose=1
)

# --------------------------
# Save model (.h5) and config (.yaml or JSON fallback)
# --------------------------
h5_path = os.path.join(OUTPUT_DIR, "model.h5")
model.save(h5_path)
print(f"[INFO] Saved model.h5 -> {h5_path}")

# YAML config
saved_yaml = False
try:
    import yaml
    cfg = model.get_config()
    with open(os.path.join(OUTPUT_DIR, "model_config.yaml"), "w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, sort_keys=False)
    saved_yaml = True
    print(f"[INFO] Saved model_config.yaml -> {OUTPUT_DIR}")
except Exception as e:
    print("[WARN] Could not write YAML (install pyyaml). Saving JSON instead:", e)

if not saved_yaml:
    with open(os.path.join(OUTPUT_DIR, "model_config.json"), "w", encoding="utf-8") as f:
        json.dump(model.get_config(), f, indent=2)
    print(f"[INFO] Saved model_config.json -> {OUTPUT_DIR}")


[INFO] Loaded as CSV enc='utf-8', sep=',', shape=(6123270, 35)
[INFO] n_students=29018, n_skills=265, rows=2711813
[INFO] X_train=(24031, 200)
[INFO] X_val=(5141, 200)
[INFO] Saved preprocessor.pkl and label_encoder.pkl


Epoch 1/30
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 167ms/step - accuracy: 0.3467 - loss: 0.5873 - val_accuracy: 0.3579 - val_loss: 0.5660
Epoch 2/30
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 165ms/step - accuracy: 0.3685 - loss: 0.5673 - val_accuracy: 0.3592 - val_loss: 0.5637
Epoch 3/30
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 167ms/step - accuracy: 0.3715 - loss: 0.5650 - val_accuracy: 0.3647 - val_loss: 0.5615
Epoch 4/30
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 169ms/step - accuracy: 0.3725 - loss: 0.5629 - val_accuracy: 0.3640 - val_loss: 0.5604
Epoch 5/30
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 169ms/step - accuracy: 0.3740 - loss: 0.5619 - val_accuracy: 0.3615 - val_loss: 0.5601
Epoch 6/30
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 172ms/step - accuracy: 0.3746 - loss: 0.5611 - val_accuracy: 0.3605 - val_loss: 0.5601
Epoch 7/30



[INFO] Saved model.h5 -> C:\Users\sagni\Downloads\SkillTracer Knowledge Tracing\model.h5
[INFO] Saved model_config.yaml -> C:\Users\sagni\Downloads\SkillTracer Knowledge Tracing
