In [None]:
import numpy as np
import pandas as pd
import json

### json

In [None]:
# Location of the raw JSON export read by this notebook.
file_path = "/data/data_full.json"

# JSON documents are UTF-8 by specification; pass the encoding explicitly so the
# result does not depend on the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [None]:
# Flatten the nested JSON into one record per trial: participant-level fields
# (identifier, date, condition) are repeated on every trial row.
trial_fields = ("trial", "end_time", "start_time", "target", "entered", "part", "block")

records = [
    {
        "identifier": participant_id,
        "date": session["date"],
        "condition": session["log"]["condition"],
        **{field: trial_log[field] for field in trial_fields},
    }
    for participant_id, session in data.items()
    for trial_log in session["log"]["trials"]
]

# one DataFrame row per trial record
df = pd.DataFrame(records)

In [None]:
# filtering rows from the pilot: keep only rows dated on or after 2023-08-10
# (a per-identifier date listing that was computed here and then discarded has been removed)
df['date'] = pd.to_datetime(df['date'])
df = df.loc[df['date'] >= '2023-08-10']

In [None]:
# drop trial number 32, which is an empty trial (backend issue)
is_empty_trial = df['trial'].eq(32)
df = df.loc[~is_empty_trial]


### metrics

In [None]:
# calculate wpm per trial
# from pypl: WPM is calculated by dividing characters per second by five and then multiplying by 60.

def calculate_wpm(row):
    """Return the typing speed of one trial in words per minute (WPM).

    WPM = (characters per second / 5) * 60, i.e. five characters count as one word.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'entered' (the typed text), 'start_time' and 'end_time'.

    Returns
    -------
    float
        The WPM of the trial, or NaN when the elapsed time is zero, negative or
        NaN, since no meaningful rate can be computed in that case.
    """
    characters_typed = len(row['entered'])  # num of characters typed
    # NOTE(review): assumes both timestamps are expressed in seconds -- confirm against the logger
    time_taken_sec = row['end_time'] - row['start_time']
    # `not (x > 0)` also catches NaN durations, not just zero/negative ones
    if not time_taken_sec > 0:
        return float('nan')
    return ((characters_typed / time_taken_sec) / 5) * 60

# evaluate calculate_wpm once per row (trial) and store the result as a new column
df['wpm'] = df.apply(calculate_wpm, axis='columns')

In [None]:
# character error rate (from jiwer)

import jiwer

def calculate_cer(row):
    """Return the character error rate of one trial, as computed by ``jiwer.cer``.

    The row's 'target' text is passed as the reference and its 'entered' text
    as the hypothesis.
    """
    reference, hypothesis = row['target'], row['entered']
    return jiwer.cer(reference, hypothesis)

# evaluate calculate_cer once per row (trial) and store the result as a new column
df['cer'] = df.apply(calculate_cer, axis='columns')

In [None]:
# low-quality data filtering: remove every row of one hard-coded participant identifier

is_low_quality = df['identifier'].eq("5689069036109824")
df = df.loc[~is_low_quality]

In [None]:
# exclude every participant whose mean 'cer' over their trials exceeds 0.05 (as preregistered),
# then renumber the remaining rows from 0

mean_cer_by_identifier = df.groupby('identifier')['cer'].mean()
identifiers_to_filter = mean_cer_by_identifier.loc[lambda mean_cer: mean_cer > 0.05].index
df = df.loc[~df['identifier'].isin(identifiers_to_filter)].reset_index(drop=True)

In [None]:
# number of distinct participants remaining after filtering
df['identifier'].nunique()

### conditions

In [None]:
def lookup_keyboard(identifier, part):
    """Return the keyboard a participant used during a given part of the session.

    The value is read from ``data[identifier]["log"]["ux"][part]["keyboard"]``.
    NaN is returned when the identifier has no entry in ``data``, so such rows
    stay empty in the resulting column instead of raising.
    """
    value = data.get(str(identifier), {})
    if not value:
        return np.nan
    return value["log"]["ux"][part]["keyboard"]


# Every row already carries the `part` of its own trial, so use it directly
# rather than looking the trial up again by using its trial number as a list
# position (which silently picks the wrong trial, or skips the row, whenever
# trial numbers and list positions disagree).
df['keyboard'] = [
    lookup_keyboard(identifier, part)
    for identifier, part in zip(df['identifier'], df['part'])
]

In [None]:
# summary statistics of per-trial wpm
wpm_summary = df['wpm'].describe()
print(wpm_summary)

In [None]:
# summary statistics of per-trial cer
cer_summary = df['cer'].describe()
print(cer_summary)

In [None]:
# mean and standard deviation of each metric per keyboard, printed in the order
# cognitive wpm, baseline wpm, cognitive cer, baseline cer
summary_parts = []
for metric in ('wpm', 'cer'):
    for keyboard_name in ('cognitive', 'baseline'):
        metric_values = df.loc[df["keyboard"] == keyboard_name, metric]
        # every entry after the first starts on a new line
        line_break = '\n' if summary_parts else ''
        summary_parts += [
            f'{line_break}{keyboard_name} {metric} MEAN:', metric_values.mean(),
            ' SD:', metric_values.std(),
        ]
print(*summary_parts)

### tests

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# histogram of per-trial wpm with a kernel density estimate overlaid
wpm_axes = sns.histplot(df, x='wpm', kde=True)
wpm_axes.set_title('distribution of wpm')
wpm_axes.set_xlabel('wpm')
wpm_axes.set_ylabel('frequency')
plt.show()

In [None]:
# histogram of per-trial cer with a kernel density estimate overlaid
cer_axes = sns.histplot(df, x='cer', kde=True)
cer_axes.set_title('distribution of cer')
cer_axes.set_xlabel('cer')
cer_axes.set_ylabel('frequency')
plt.show()

##### wpm

In [None]:
# block assignment: recompute 'block' (1-4) from the trial number, replacing the
# value read from the JSON. Each block covers two inclusive trial ranges;
# trials outside every range get None.

trial_number = df['trial']
block_trial_ranges = {
    1: ((0, 7), (33, 40)),
    2: ((8, 15), (41, 48)),
    3: ((16, 23), (49, 56)),
    4: ((24, 31), (57, 64)),
}
block_conditions = [
    trial_number.between(*first_range) | trial_number.between(*second_range)
    for first_range, second_range in block_trial_ranges.values()
]

df['block'] = np.select(block_conditions, list(block_trial_ranges), default=None)

In [None]:
import pingouin as pg

# Two-way repeated-measures ANOVA on wpm with 'keyboard' and 'block' as
# within-subject factors and 'identifier' as the subject.
aov_interaction = pg.rm_anova(data = df, dv = 'wpm', within = ['keyboard', 'block'], subject = 'identifier')

# Partial eta-squared of the 'keyboard' main effect.
# By definition np2 = SS_effect / (SS_effect + SS_error). It is computed here
# from the F statistic and its degrees of freedom, which is algebraically the
# same: (F * df1) / (F * df1 + df2). The earlier version divided by
# SS_keyboard + SS_block, i.e. it used the SS of a different effect in place of
# the error SS.
keyboard_effect = aov_interaction.loc[aov_interaction['Source'] == 'keyboard'].iloc[0]
f_times_df1 = keyboard_effect['F'] * keyboard_effect['ddof1']
eta_squared = f_times_df1 / (f_times_df1 + keyboard_effect['ddof2'])

# Cohen's d derived from partial eta-squared: Cohen's f = sqrt(np2 / (1 - np2))
# and d = 2 * f. The d = 2f conversion only holds for a factor with exactly two
# levels, so d is reported as NaN otherwise.
k = df['keyboard'].nunique()  # Number of levels in the 'keyboard' factor
cohen_f = np.sqrt(eta_squared / (1 - eta_squared))
cohen_d = 2 * cohen_f if k == 2 else np.nan

# Print ANOVA results, partial eta-squared, and Cohen's d
print(aov_interaction)
print(f"Partial Eta-Squared (η²): {eta_squared}")
print(f"Cohen's d: {cohen_d}")

In [None]:
# mean wpm per block, one line per keyboard, with standard-deviation error bars
wpm_interaction_axes = sns.pointplot(data=df, x='block', y='wpm', hue='keyboard', ci='sd')
wpm_interaction_axes.set_title('interaction plot of wpm for different keyboard conditions and blocks')
wpm_interaction_axes.set_xlabel('block')
wpm_interaction_axes.set_ylabel('wpm')
plt.show()

In [None]:
from statsmodels.stats.multicomp import MultiComparison

# Tukey HSD pairwise comparison of wpm between blocks.
# NOTE(review): the inputs are the trial-level rows of df, so repeated trials of
# the same participant enter the test as separate observations -- confirm this
# is intended.
multi_comp = MultiComparison(data=df['wpm'], groups=df['block'])
post_hoc_res = multi_comp.tukeyhsd(alpha=0.05)
print(post_hoc_res.summary())

##### cer

In [None]:
# Two-way repeated-measures ANOVA on cer with 'keyboard' and 'block' as
# within-subject factors and 'identifier' as the subject.
cer_interaction = pg.rm_anova(
    data=df,
    dv='cer',
    within=['keyboard', 'block'],
    subject='identifier',
)
print(cer_interaction)

In [None]:
# mean cer per block, one line per keyboard, with standard-deviation error bars
cer_interaction_axes = sns.pointplot(data=df, x='block', y='cer', hue='keyboard', ci='sd')
cer_interaction_axes.set_title('interaction plot of cer for different keyboard conditions and blocks')
cer_interaction_axes.set_xlabel('block')
cer_interaction_axes.set_ylabel('cer')
plt.show()

In [None]:
# write the filtered, annotated trial table to disk without the row index
output_csv = 'df_full_filtered.csv'
df.to_csv(output_csv, index=False)