In [None]:
# Cognitive Skills & Student Performance - Analysis Notebook

import os
import json
import math
import numpy as np
import pandas as pd
from pathlib import Path

# Project layout. ROOT is the parent of the current working directory, so every
# path below is resolved relative to wherever the kernel was started.
ROOT = Path('..').resolve()
DATA_CSV = ROOT.joinpath('data', 'students.csv')
PUBLIC_ANALYSIS = ROOT.joinpath('public', 'analysis')
PUBLIC_DATA = ROOT.joinpath('public', 'data')

# Create both output folders up front; this is a no-op when they already exist.
for out_dir in (PUBLIC_ANALYSIS, PUBLIC_DATA):
    out_dir.mkdir(parents=True, exist_ok=True)

# Read the student table that every later cell works from.
df = pd.read_csv(DATA_CSV)
print('Loaded rows:', len(df))

# Last expression of the cell: show the first rows as the cell output.
df.head()


In [1]:
# EDA: basic stats and correlations
#
# Prints summary statistics and the pairwise correlation of the numeric columns,
# then writes the correlations to correlations.json for the dashboard.

numeric_cols = ['attention','focus','comprehension','retention','engagement_time','assessment_score']

summary = df[numeric_cols].describe().T
print(summary)

corr = df[numeric_cols].corr()

# Rank every column by its correlation with the target once, and reuse the
# result for both the printout and the export.
score_corr = corr['assessment_score'].sort_values(ascending=False)
print("\nCorrelation with assessment_score:\n", score_corr)


def _nan_to_none(value):
    """Return ``value`` with float NaNs replaced by ``None``, recursing into dicts.

    ``corr()`` yields NaN for a column with no variance, and ``json.dump`` would
    write that as the bare token ``NaN``, which is not valid JSON and is rejected
    by strict parsers such as ``JSON.parse``. ``None`` is written as ``null``.
    """
    if isinstance(value, dict):
        return {key: _nan_to_none(item) for key, item in value.items()}
    if isinstance(value, float) and math.isnan(value):
        return None
    return value


# Export correlation matrix to JSON for dashboard use
corr_out = _nan_to_none({
    'matrix': corr.round(3).to_dict(),
    'sorted_vs_score': score_corr.round(3).to_dict(),
})
with open(PUBLIC_ANALYSIS / 'correlations.json', 'w') as f:
    # allow_nan=False makes the writer fail loudly instead of emitting invalid JSON.
    json.dump(corr_out, f, indent=2, allow_nan=False)
print('Wrote', PUBLIC_ANALYSIS / 'correlations.json')


NameError: name 'df' is not defined

In [None]:
# Simple regression model to predict assessment_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression

# Predictors and target for the linear model.
X = df[['attention','focus','comprehension','retention','engagement_time']]
y = df['assessment_score']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows only.
preds = model.predict(X_test)
r2 = r2_score(y_test, preds)
mae = mean_absolute_error(y_test, preds)

print('R2:', round(r2, 3), ' MAE:', round(mae, 2))

# Export model coefficients and metrics
# Cast each coefficient to a built-in float, as is already done for the
# intercept and the metrics, so the export does not depend on the numpy scalar
# type of model.coef_.
coefs = {name: float(weight) for name, weight in zip(X.columns, model.coef_)}
metrics = {'r2': float(r2), 'mae': float(mae)}
with open(PUBLIC_ANALYSIS / 'model.json', 'w') as f:
    json.dump({'coefficients': coefs, 'intercept': float(model.intercept_), 'metrics': metrics}, f, indent=2)
print('Wrote', PUBLIC_ANALYSIS / 'model.json')


In [None]:
# Clustering: derive learning personas
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Columns the clustering is based on.
features = ['attention','focus','comprehension','retention','engagement_time']

# Standardise the features before clustering; the scaler is kept so the
# centroids can be mapped back to the original units below.
scaler = StandardScaler()
Z = scaler.fit_transform(df[features])

# Number of personas to derive.
k = 3
km = KMeans(n_clusters=k, n_init='auto', random_state=42)
labels = km.fit_predict(Z)

# Attach the cluster label to every student row (adds/overwrites a df column).
df['persona'] = labels

# Mean of each feature, plus the assessment score, per persona.
summary_cols = features + ['assessment_score']
persona_summary = df.groupby('persona')[summary_cols].mean().round(1)
print(persona_summary)

# Export per-student personas and cluster centroids
# NOTE(review): this writes student names under the public/ tree - confirm that
# exposing them there is intended.
export = df[['student_id','name','class','persona']].to_dict(orient='records')

# Cluster centres live in standardised space; undo the scaling so the exported
# values are in the original feature units, one dict per centre.
centroids = scaler.inverse_transform(km.cluster_centers_)
centroids_out = [dict(zip(features, map(float, center))) for center in centroids]

payload = {'personas': export, 'centroids': centroids_out}
personas_path = PUBLIC_ANALYSIS / 'personas.json'
with open(personas_path, 'w') as f:
    json.dump(payload, f, indent=2)
print('Wrote', personas_path)
