In [5]:
from pathlib import Path

import numpy as np
import pandas as pd

np.random.seed(42)

# Parameters
n_apprenants = 200        # number of learners
n_essais = 20             # trials per learner
n_total = n_apprenants * n_essais
response_time_max = 60    # upper bound of the simulated response time

# Creating columns.
# The order of the random draws below determines the generated values for a
# given seed, so keep it stable if the dataset must stay reproducible.
ID_s = np.repeat(np.arange(1, n_apprenants+1), n_essais)
response_time = np.random.uniform(low=0, high=response_time_max, size=n_total)
errors = np.random.poisson(lam=1.5, size=n_total)
# NOTE(review): Mean_HR uses the same (loc=50, scale=10) as HRV_SDNN below —
# possibly a copy-paste; confirm the intended heart-rate distribution.
Mean_HR = np.random.normal(loc=50, scale=10, size=n_total)
HRV_SDNN = np.random.normal(loc=50, scale=10, size=n_total)
HRV_RMSSD = np.random.normal(loc=35, scale=8, size=n_total)
EEG_alpha = np.random.normal(loc=8, scale=2, size=n_total)
EEG_beta = np.random.normal(loc=15, scale=3, size=n_total)
subjective_values = np.random.randint(1, 8, size=n_total)  # integers 1..7

# Let's simulate a simple label: weak/medium/high cognitive load.
# Simple logic for now: the higher the response time and error rate, the higher
# the cognitive load. It will be more complex in reality, though.
#
# Every term is brought to a 0-1 range before weighting. On raw values
# response_time (0-60) would swamp errors (a handful of counts) and the
# self-report term (at most 1), making the 0.4 / 0.4 / 0.2 weights meaningless.
response_time_unit = response_time / response_time_max
# Guard against a zero maximum (no errors at all) to avoid dividing by zero.
errors_unit = errors / max(errors.max(), 1)
subjective_unit = subjective_values / 7

cognitive_load_simple = 0.4*response_time_unit + 0.4*errors_unit + 0.2*subjective_unit
labels_cls = pd.qcut(cognitive_load_simple, q=3, labels=["weak", "medium", "high"])

# =============================================
# Build a more complex cognitive-load label that combines all the features
#    Directions:
#      ↑ response_time, ↑ errors, ↑ Mean_HR, ↑ EEG_beta, ↑ self_report  => ↑ load
#      ↑ HRV_SDNN, ↑ HRV_RMSSD, ↑ EEG_alpha                              => ↓ load
# =============================================

def scale_01(x):
    """Min-max scale ``x`` to the [0, 1] range, ignoring NaNs.

    Parameters
    ----------
    x : array-like
        Numeric values; converted to a float NumPy array.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``x``. The smallest non-NaN value
        maps to 0 and the largest to 1. NaN entries stay NaN. If all non-NaN
        values are equal, they map to 0. Empty or all-NaN input is returned
        as a float copy, unchanged.
    """
    x = np.asarray(x, dtype=float)
    # No min/max exists for empty or all-NaN input: np.nanmin would raise on
    # the former and warn on the latter, so return early.
    if x.size == 0 or np.isnan(x).all():
        return x.copy()
    xmin, xmax = np.nanmin(x), np.nanmax(x)
    if xmax == xmin:
        # Constant input: avoid 0/0 and keep NaNs as NaN, consistent with the
        # general case below.
        return np.where(np.isnan(x), np.nan, 0.0)
    return (x - xmin) / (xmax - xmin)

# Put every feature on the same 0-1 scale so the weights are comparable
rt_s, err_s, hr_s, sdnn_s, rmssd_s, alpha_s, beta_s, subj_s = map(
    scale_01,
    (
        response_time,
        errors,
        Mean_HR,
        HRV_SDNN,
        HRV_RMSSD,
        EEG_alpha,
        EEG_beta,
        subjective_values,
    ),
)

# Features that lower the load are flipped: a high scaled value becomes a low contribution
neg_sdnn, neg_rmssd, neg_alpha = (1.0 - scaled for scaled in (sdnn_s, rmssd_s, alpha_s))

# Adjustable weights; they add up to 1.0
weights = dict(
    rt=0.20,      # response time
    err=0.20,     # errors
    hr=0.10,      # Mean_HR
    beta=0.15,    # EEG_beta
    subj=0.10,    # self-report measures
    sdnn=0.10,    # HRV_SDNN (inverse)
    rmssd=0.075,  # HRV_RMSSD (inverse)
    alpha=0.075,  # EEG_alpha (inverse)
)

# Contribution of each feature, keyed like `weights`
contributions = {
    "rt": rt_s,
    "err": err_s,
    "hr": hr_s,
    "beta": beta_s,
    "subj": subj_s,
    "sdnn": neg_sdnn,
    "rmssd": neg_rmssd,
    "alpha": neg_alpha,
}

# Weighted sum, accumulated in the key order of `weights`
score_complex = sum(weights[key] * contributions[key] for key in weights)

# Split the score into terciles to obtain the three load classes
cognitive_load = pd.qcut(score_complex, q=3, labels=["weak", "medium", "high"])

# Assemble the final table: one row per trial, columns in the order listed
df = pd.DataFrame(dict([
    ("learner_id", ID_s),
    ("response_time", response_time),
    ("errors", errors),
    ("Mean_HR", Mean_HR),
    ("HRV_SDNN", HRV_SDNN),
    ("HRV_RMSSD", HRV_RMSSD),
    ("EEG_alpha", EEG_alpha),
    ("EEG_beta", EEG_beta),
    ("self_report", subjective_values),
    ("cognitive_load_simple", labels_cls),
    ("cognitive_load", cognitive_load),
]))

In [6]:
# Write the simulated dataset to disk. The target folder is created first so
# that a missing ../data directory does not make to_csv fail with an OSError.
output_path = Path("../data/cog_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

In [7]:
# Preview the first five rows of the generated dataset
df.head(n=5)

Unnamed: 0,learner_id,response_time,errors,Mean_HR,HRV_SDNN,HRV_RMSSD,EEG_alpha,EEG_beta,self_report,cognitive_load_simple,cognitive_load
0,1,22.472407,3,45.929841,74.359851,29.370104,7.093684,14.01282,2,medium,weak
1,1,57.042858,0,68.507811,36.649838,35.110686,8.12613,16.154998,3,high,high
2,1,43.919637,1,38.340258,49.145812,29.238697,9.950088,13.688801,5,high,medium
3,1,35.919509,2,51.026905,69.417461,35.565576,6.792518,18.671761,1,medium,medium
4,1,9.361118,0,52.478463,61.041752,31.956252,5.655177,22.65552,1,weak,weak
