In [1]:
import os 

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Working with one file

In [2]:
# Load one participant's tab-separated keystroke log ('5_keystrokes.txt') for a single-file walkthrough.
keys_df = pd.read_csv('data/files/5_keystrokes.txt', sep='\t')

Notes:

$PT_t$ -> Press Time at timestep t\
$RT_t$ -> Release Time at timestep t

* The paper uses four latency features:
  * Hold Latency (HL)
  * Inter-key Latency (IL)
  * Press Latency (PL)
  * Release Latency (RL)

$$
\begin{aligned}
HL &= RT_t - PT_t \\
IL &= PT_t - RT_{t-1} \\
PL &= PT_t - PT_{t-1} \\
RL &= RT_t - RT_{t-1}
\end{aligned}
$$

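So for a sequence of $T$ keystrokes we collect $T-1$ values of each lag-based latency feature (IL, PL, RL), because the first keystroke has no predecessor; HL needs no previous keystroke.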

In [3]:
# NOTE(review): this globally silences pandas' chained-assignment warning. This
# cell no longer needs it (the slice below is copied explicitly), but it is kept
# because later cells pass filtered slices of keys_df to the imported helper,
# which may assign columns on them — confirm and remove if unneeded.
pd.set_option('mode.chained_assignment', None)

section_ids = keys_df['TEST_SECTION_ID'].unique().tolist()

# Work on an explicit copy of the first test section so the new columns are
# added to an independent frame rather than to a filtered slice of keys_df.
keys_df_example = keys_df[keys_df['TEST_SECTION_ID'] == section_ids[0]].copy()

# Previous-keystroke press/release times. shift(1) leaves a NaN in the first
# row (no predecessor); that row falls back to its own time, after which the
# column can be cast back to int64.
keys_df_example['PRESS_TIME_lag'] = (
    keys_df_example['PRESS_TIME']
    .shift(1)
    .fillna(keys_df_example['PRESS_TIME'])
    .astype('int64')
)
keys_df_example['RELEASE_TIME_lag'] = (
    keys_df_example['RELEASE_TIME']
    .shift(1)
    .fillna(keys_df_example['RELEASE_TIME'])
    .astype('int64')
)

In [4]:
# Derive the four latency features from the press/release times and their
# one-step lags: hold (HL), inter-key (IL), press (PL) and release (RL) latency.
keys_df_example = keys_df_example.assign(
    HL=lambda d: d['RELEASE_TIME'] - d['PRESS_TIME'],
    IL=lambda d: d['PRESS_TIME'] - d['RELEASE_TIME_lag'],
    PL=lambda d: d['PRESS_TIME'] - d['PRESS_TIME_lag'],
    RL=lambda d: d['RELEASE_TIME'] - d['RELEASE_TIME_lag'],
)

In [5]:
# Show which test section the example frame belongs to (first unique TEST_SECTION_ID value).
keys_df_example.TEST_SECTION_ID.unique()[0]

7

In [6]:
from preprocessing_utils import extract_latency_features_from_df as p

In [7]:
# Select the rows of the second test section (section_ids[1]) as input for the imported extractor.
test_case = keys_df[keys_df['TEST_SECTION_ID'] == section_ids[1]]

In [8]:
# Run the imported extractor on one section, passing the names of the press-time,
# release-time, keycode, participant-id and section-id columns.
# The result is indexed by 'keycode_ids', 'hl', 'il', 'pl' and 'rl' in the following cell.
output = p(test_case,
 'PRESS_TIME',
 'RELEASE_TIME',
 'KEYCODE',
 'PARTICIPANT_ID',
 'TEST_SECTION_ID')

In [9]:
# Sanity check: report the length of each extracted sequence so they can be compared.
tuple(len(output[key]) for key in ('keycode_ids', 'hl', 'il', 'pl', 'rl'))

(47, 47, 47, 47, 47)

### Extracting features from entire dataset & saving in one format

In [10]:
# Load the participant metadata and keep only participants whose LAYOUT is
# 'qwerty' and whose KEYBOARD_TYPE is 'full' or 'laptop'.
meta_df = pd.read_csv('data/metadata_participants.txt', sep='\t')
meta_df = meta_df.loc[
    meta_df['LAYOUT'].eq('qwerty')
    & meta_df['KEYBOARD_TYPE'].isin(['full', 'laptop'])
]

# Participant ids whose keystroke files will be processed.
ids_of_interest = meta_df['PARTICIPANT_ID'].tolist()

In [11]:
# Build the list of keystroke files to read: the files expected for the selected
# participants (QWERTY layout, full or laptop keyboard) that actually exist on disk.
expected_files = {f'{participant}_keystrokes.txt' for participant in ids_of_interest}
keystrokes_real_files = os.listdir('data/files/')

# Sorted (lexicographically, by file name) so the processing order is deterministic.
keystrokes_data_files = sorted(expected_files.intersection(keystrokes_real_files))

In [30]:
from tqdm import tqdm

# Read every selected keystroke file and split it into one DataFrame per test
# section (sections in ascending id order). A file that cannot be read or split
# is recorded in failed_files and contributes no frames at all.
dataframes = []
failed_files = []
for path in tqdm(keystrokes_data_files):
    data_path = os.path.join(os.getcwd(), f'data/files/{path}')
    try:
        df = pd.read_csv(data_path, sep='\t')
        section_frames = [
            df[df.TEST_SECTION_ID == section_id]
            for section_id in sorted(df.TEST_SECTION_ID.unique())
        ]
    except Exception:
        # Catch ordinary errors only, so Ctrl-C / kernel interrupts still stop
        # this long-running loop instead of being counted as a failed file.
        failed_files.append(data_path)
    else:
        # Append only after the whole file was processed successfully.
        dataframes.extend(section_frames)

# Number of files that could not be loaded.
len(failed_files)

100%|██████████| 162056/162056 [13:37<00:00, 198.25it/s] 


6048

In [46]:
# Wrap the per-section DataFrames in a one-column frame ('frames') so the
# extractor can be applied to it row by row.
final_dataset = pd.DataFrame()
final_dataset['frames'] = dataframes 

In [52]:
from pandarallel import pandarallel
# Initialise pandarallel with its progress bar disabled; `parallel_apply` is called on final_dataset further down.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Smoke test: draw 100 seeded random samples of 5000 frames each, run the
# extractor on every row and report the seed and error message of any sample
# that raises. The extracted result itself is discarded.
for i in range(100):
    try:
        sample = final_dataset.sample(5000, random_state=i)
        sample.apply(
            lambda row: p(
                row.frames,
                'PRESS_TIME',
                'RELEASE_TIME',
                'KEYCODE',
                'PARTICIPANT_ID',
                'TEST_SECTION_ID',
            ),
            axis='columns',
            result_type='expand',
        )
    except Exception as err:
        print(i, '\n', err)
        


In [74]:
# Re-run the extractor frame by frame on `sample` (as left behind by the
# sampling loop) and print the position of every frame that makes it fail.
for idx, frame in enumerate(sample.frames):
    try:
        p(frame,
          'PRESS_TIME',
          'RELEASE_TIME',
          'KEYCODE',
          'PARTICIPANT_ID',
          'TEST_SECTION_ID')
    except Exception:
        # Catch ordinary errors only, so Ctrl-C / kernel interrupts still abort the loop.
        print(idx)


25


In [53]:
# Run the extractor on every frame in parallel and expand its result into columns.
# NOTE(review): the recorded run of this cell ended in IntCastingNaNError, so at
# least one frame presumably contains missing values the extractor cannot cast
# to integers — confirm and filter or fix those frames first.
# NOTE(review): the result overwrites final_dataset, so the 'frames' column is
# gone afterwards and this cell cannot be re-run without rebuilding it.
final_dataset = final_dataset.parallel_apply(
    lambda row: p(
        row.frames,
        'PRESS_TIME',
        'RELEASE_TIME',
        'KEYCODE',
        'PARTICIPANT_ID',
        'TEST_SECTION_ID',
    ),
    axis='columns',
    result_type='expand',
)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [34]:
# Fraction of keystroke files that loaded successfully; the remaining ~3.7% (6048 of 162056 files) failed to load.
(len(keystrokes_data_files) - len(failed_files))/len(keystrokes_data_files)

0.9626795675568939

### Saving dataset in huggingface

In [38]:
from datasets import Dataset

In [40]:
# Convert the extracted-features frame into a Hugging Face `datasets.Dataset`.
# NOTE(review): this cell's execution count (40) precedes the cells that build
# final_dataset (46, 53), so the recorded output likely comes from an earlier
# run — re-run the notebook top to bottom to confirm.
dataset_136Mkeystrokes_features = Dataset.from_pandas(final_dataset)

In [41]:
# Display the dataset summary (feature names and row count).
dataset_136Mkeystrokes_features

Dataset({
    features: ['participant_id', 'section_id', 'keycode_ids', 'hl', 'il', 'pl', 'rl'],
    num_rows: 1455
})