In [2]:
import os 

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Working with one file

In [3]:
keys_df = pd.read_csv('data/files/5_keystrokes.txt', sep='\t')

Notes:

$PT_t$ -> Press Time at timestep t\
$RT_t$ -> Release Time at timestamp t

* In paper they used four latency features:
* * Hold Latency                      
* * Inter-key Latency
* * Press Latency
* * Release Latency

$$
HL = RT_t - PT_t \\
IL = PT_t - RT_{t-1} \\
PL = PT_t - PT_{t-1} \\
RL = RT_t - RT_{t-1}
$$

So for a sequence with timestamp T we will collect t-1 latency features

In [4]:
pd.set_option('mode.chained_assignment', None)

section_ids = keys_df['TEST_SECTION_ID'].unique().tolist()
keys_df_example = keys_df[keys_df['TEST_SECTION_ID'] == section_ids[0]]

keys_df_example['PRESS_TIME_lag'] = keys_df_example['PRESS_TIME'].shift(1)
keys_df_example['PRESS_TIME_lag'] = np.where(keys_df_example['PRESS_TIME_lag'].isna(), keys_df_example['PRESS_TIME'], keys_df_example['PRESS_TIME_lag'])

keys_df_example['RELEASE_TIME_lag'] = keys_df_example['RELEASE_TIME'].shift(1)
keys_df_example['RELEASE_TIME_lag'] = np.where(keys_df_example['RELEASE_TIME_lag'].isna(), keys_df_example['RELEASE_TIME'], keys_df_example['RELEASE_TIME_lag'])

keys_df_example['PRESS_TIME_lag'] = keys_df_example['PRESS_TIME_lag'].astype('int64')
keys_df_example['RELEASE_TIME_lag'] = keys_df_example['RELEASE_TIME_lag'].astype('int64')

In [5]:
keys_df_example['HL'] =  keys_df_example['RELEASE_TIME'] - keys_df_example['PRESS_TIME']
keys_df_example['IL'] = keys_df_example['PRESS_TIME'] - keys_df_example['RELEASE_TIME_lag']
keys_df_example['PL'] = keys_df_example['PRESS_TIME'] - keys_df_example['PRESS_TIME_lag']
keys_df_example['RL'] = keys_df_example['RELEASE_TIME'] - keys_df_example['RELEASE_TIME_lag']

In [6]:
keys_df_example.TEST_SECTION_ID.unique()[0]

7

In [7]:
from preprocessing_utils import extract_latency_features_from_df as p

In [8]:
test_case = keys_df[keys_df['TEST_SECTION_ID'] == section_ids[1]]

In [9]:
output = p(test_case,
 'PRESS_TIME',
 'RELEASE_TIME',
 'KEYCODE',
 'PARTICIPANT_ID',
 'TEST_SECTION_ID')

In [10]:
(len(output['keycode_ids']),
len(output['hl']),
len(output['il']),
len(output['pl']),
len(output['rl']) )

(47, 47, 47, 47, 47)

### Extracting features from entire dataset & saving in one format

In [11]:
def find_bad_data(df):
    if any(df.isna().sum().drop('LETTER') > 0):
        return 1
    else:
        return 0
    
    
def frac_of_keycode_nan(df):
    tot = df.shape[0]
    nan_num = df.KEYCODE.isna().sum()
    if tot == 0:
        return 0.0
    return nan_num/tot

def check_stuff(df):
    output = dict(
        is_bad = find_bad_data(df),
        frac_nan_kc = frac_of_keycode(nan(df))    )

In [12]:
meta_df = pd.read_csv('data/metadata_participants.txt', sep='\t')
meta_df = meta_df[meta_df['LAYOUT'] =='qwerty']
meta_df = meta_df[meta_df['KEYBOARD_TYPE'].isin(['full', 'laptop'])]


ids_of_interest = meta_df['PARTICIPANT_ID'].tolist()

In [13]:
# reading all of the files, that are were typed in qwerty and on laptop/pc
keystrokes_data_files = [f'{i}_keystrokes.txt' for i in ids_of_interest]
keystrokes_real_files = os.listdir('data/files/')

keystrokes_data_files = sorted(list(set(keystrokes_data_files).intersection(set(keystrokes_real_files))))

In [14]:
from tqdm import tqdm
dataframes = []
failed_files = []
for path in tqdm(keystrokes_data_files):
    data_path = os.path.join(os.getcwd(), fr'data/files/{path}')
    try:
        df = pd.read_csv(data_path, sep='\t')
        for id in sorted(df.TEST_SECTION_ID.unique()):
            curr_df = df[df.TEST_SECTION_ID == id]
            dataframes.append(curr_df)
    except:
        failed_files.append(data_path)


len(failed_files)

100%|██████████| 162056/162056 [25:46<00:00, 104.81it/s] 


6048

In [15]:
final_dataset = pd.DataFrame()
final_dataset['frames'] = dataframes 

In [22]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [25]:
final_dataset['is_bad'] = final_dataset.frames.parallel_apply(find_bad_data)
final_dataset['keycode_nan_frac'] = final_dataset.frames.parallel_apply(frac_of_keycode_nan)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=97478), Label(value='0 / 97478')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=97478), Label(value='0 / 97478')))…

In [29]:
final_dataset = final_dataset[final_dataset['is_bad'] == 0]

In [36]:
print(f'{100 - final_dataset.shape[0]/len(dataframes) * 100 :.3f}% datapoints lostlost ')
print(f'{final_dataset.shape[0]} keystrokes in final dataset')

0.011% datapoints lostlost 
2339211 keystrokes in final dataset


In [37]:
features_df = final_dataset.parallel_apply(lambda x: p(x.frames,
                                                        'PRESS_TIME',
                                                        'RELEASE_TIME',
                                                        'KEYCODE',
                                                        'PARTICIPANT_ID',
                                                        'TEST_SECTION_ID'),
                                            axis='columns',
                                            result_type='expand')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=97468), Label(value='0 / 97468')))…

In [44]:
features_df['participant_id'] = features_df['participant_id'].astype('int64')

### Saving dataset in huggingface

In [49]:
from datasets import Dataset

In [54]:
dataset_136Mkeystrokes_features = Dataset.from_pandas(features_df)
dataset_136Mkeystrokes_features = dataset_136Mkeystrokes_features.remove_columns('__index_level_0__')

In [55]:
dataset_136Mkeystrokes_features

Dataset({
    features: ['participant_id', 'section_id', 'keycode_ids', 'hl', 'il', 'pl', 'rl'],
    num_rows: 2339211
})

In [57]:
import huggingface_hub

In [58]:
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [59]:
dataset_136Mkeystrokes_features.push_to_hub('rokset3/136Mkeystrokes_features')

Pushing dataset shards to the dataset hub:   0%|          | 0/10 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [72]:
final_dataset_concat = pd.concat(final_dataset.frames.tolist())

In [77]:
final_dataset_concat

Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SENTENCE,USER_INPUT,KEYSTROKE_ID,PRESS_TIME,RELEASE_TIME,LETTER,KEYCODE
0,100001.0,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891207.0,1.473275e+12,1.473275e+12,SHIFT,16.0
1,100001.0,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891214.0,1.473275e+12,1.473275e+12,W,87.0
2,100001.0,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891219.0,1.473275e+12,1.473275e+12,a,65.0
3,100001.0,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891226.0,1.473275e+12,1.473275e+12,s,83.0
4,100001.0,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891231.0,1.473275e+12,1.473275e+12,,32.0
...,...,...,...,...,...,...,...,...,...
655,99998.0,1091363,I'm not close enough right now to the discussion.,I'm not close enough right now to the discussion.,51911078.0,1.473276e+12,1.473276e+12,s,83.0
656,99998.0,1091363,I'm not close enough right now to the discussion.,I'm not close enough right now to the discussion.,51911083.0,1.473276e+12,1.473276e+12,i,73.0
657,99998.0,1091363,I'm not close enough right now to the discussion.,I'm not close enough right now to the discussion.,51911089.0,1.473276e+12,1.473276e+12,o,79.0
658,99998.0,1091363,I'm not close enough right now to the discussion.,I'm not close enough right now to the discussion.,51911095.0,1.473276e+12,1.473276e+12,n,78.0


In [78]:
final_dataset_concat.PRESS_TIME = final_dataset_concat.PRESS_TIME.astype('int64')
final_dataset_concat.RELEASE_TIME = final_dataset_concat.RELEASE_TIME.astype('int64')
final_dataset_concat.KEYSTROKE_ID = final_dataset_concat.KEYSTROKE_ID.astype('int64')
final_dataset_concat.PARTICIPANT_ID = final_dataset_concat.PARTICIPANT_ID.astype(int)

In [80]:
final_ds = Dataset.from_pandas(final_dataset_concat)
final_ds = final_ds.remove_columns('__index_level_0__')

In [81]:
dataset_136Mkeystrokes_features.push_to_hub('rokset3/136Mkeystrokes')

Pushing dataset shards to the dataset hub:   0%|          | 0/10 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/234 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/598 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.
