In [1]:
import os 

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Working with one file

In [29]:
# Load the keystroke log of a single participant (tab-separated text file).
keys_df = pd.read_csv(
    'data/files/5_keystrokes.txt',
    sep='\t',
)

Notes:

$PT_t$ -> Press Time at timestep t\
$RT_t$ -> Release Time at timestep t

* In the paper, four latency features are used:
  * Hold Latency (HL)
  * Inter-key Latency (IL)
  * Press Latency (PL)
  * Release Latency (RL)

$$
\begin{aligned}
HL &= RT_t - PT_t \\
IL &= PT_t - RT_{t-1} \\
PL &= PT_t - PT_{t-1} \\
RL &= RT_t - RT_{t-1}
\end{aligned}
$$

So for a sequence of $T$ keystrokes we will collect $T-1$ latency features, since IL, PL and RL each need the previous keystroke ($t-1$).

In [3]:
# NOTE: this silences pandas' chained-assignment warning for the whole session.
# It is kept so the rest of the notebook behaves as before, but this cell no
# longer depends on it because the section is copied explicitly below.
pd.set_option('mode.chained_assignment', None)

section_ids = keys_df['TEST_SECTION_ID'].unique().tolist()
# Take an independent copy of the first test section so that the new columns
# are never written into a filtered slice of `keys_df`.
keys_df_example = keys_df[keys_df['TEST_SECTION_ID'] == section_ids[0]].copy()

# Lagged timestamps: each row gets the previous row's value. Rows without a
# previous value (the first keystroke) fall back to their own timestamp, so
# the differences against the lag are 0 there. The shift introduces NaN (and
# therefore floats), hence the cast back to int64.
for time_col in ('PRESS_TIME', 'RELEASE_TIME'):
    keys_df_example[f'{time_col}_lag'] = (
        keys_df_example[time_col]
        .shift(1)
        .fillna(keys_df_example[time_col])
        .astype('int64')
    )

In [4]:
# Latency features as (minuend column, subtrahend column) pairs:
#   HL = RT_t - PT_t,      IL = PT_t - RT_{t-1},
#   PL = PT_t - PT_{t-1},  RL = RT_t - RT_{t-1}
latency_columns = {
    'HL': ('RELEASE_TIME', 'PRESS_TIME'),
    'IL': ('PRESS_TIME', 'RELEASE_TIME_lag'),
    'PL': ('PRESS_TIME', 'PRESS_TIME_lag'),
    'RL': ('RELEASE_TIME', 'RELEASE_TIME_lag'),
}
for feature_name, (minuend, subtrahend) in latency_columns.items():
    keys_df_example[feature_name] = keys_df_example[minuend] - keys_df_example[subtrahend]

In [5]:
# Show which test section the example frame holds (first distinct id).
keys_df_example['TEST_SECTION_ID'].unique()[0]

7

In [6]:
from preprocessing_utils import extract_latency_features_from_df as p

In [7]:
# Rows of the second test section, used as input for the imported extractor.
is_second_section = keys_df['TEST_SECTION_ID'] == section_ids[1]
test_case = keys_df.loc[is_second_section]

In [8]:
# Run the imported extractor (`p` is `extract_latency_features_from_df`) on the
# single-section frame. The string arguments are column names of `test_case`;
# presumably they identify the press-time, release-time, keycode, participant
# and test-section columns in that order -- TODO confirm against
# preprocessing_utils.
output = p(test_case,
 'PRESS_TIME',
 'RELEASE_TIME',
 'KEYCODE',
 'PARTICIPANT_ID',
 'TEST_SECTION_ID')

In [9]:
# Sanity check: lengths of the keycode sequence and of each latency sequence.
tuple(len(output[key]) for key in ('keycode_ids', 'hl', 'il', 'pl', 'rl'))

(47, 47, 47, 47, 47)

### Extracting features from entire dataset & saving in one format

In [10]:
# Participant metadata, restricted to QWERTY layouts typed on a full-size or
# laptop keyboard.
meta_df = pd.read_csv('data/metadata_participants.txt', sep='\t')
is_qwerty = meta_df['LAYOUT'] == 'qwerty'
has_selected_keyboard = meta_df['KEYBOARD_TYPE'].isin(['full', 'laptop'])
meta_df = meta_df[is_qwerty & has_selected_keyboard]

ids_of_interest = meta_df['PARTICIPANT_ID'].tolist()

In [11]:
# Keystroke files of the selected participants (QWERTY layout, full/laptop
# keyboard), restricted to the files that actually exist on disk.
keystrokes_real_files = os.listdir('data/files/')
expected_file_names = {f'{participant_id}_keystrokes.txt' for participant_id in ids_of_interest}

keystrokes_data_files = sorted(expected_file_names.intersection(keystrokes_real_files))

In [42]:
from tqdm import tqdm

# Read every selected keystroke file and split it into one DataFrame per
# TEST_SECTION_ID. Files that cannot be read or split are recorded instead of
# aborting the whole run.
dataframes = []    # one DataFrame per (file, test section)
failed_files = []  # full paths of the files that raised while loading
for path in tqdm(keystrokes_data_files):
    data_path = os.path.join(os.getcwd(), fr'data/files/{path}')
    try:
        df = pd.read_csv(data_path, sep='\t')
        # groupby yields the sections in ascending id order with rows in their
        # original order, in a single pass instead of one boolean filter per
        # section.
        for section_id, curr_df in df.groupby('TEST_SECTION_ID', sort=True):
            dataframes.append(curr_df)
    except Exception:
        # Best effort: remember the unreadable file and move on. `Exception`
        # (rather than a bare `except:`) lets KeyboardInterrupt/SystemExit
        # propagate, so interrupting this long loop actually stops it.
        failed_files.append(data_path)


len(failed_files)

  2%|▏         | 3132/162056 [00:41<32:20, 81.88it/s]

  2%|▏         | 3459/162056 [00:45<33:54, 77.95it/s]

In [15]:
# Fraction of the selected keystroke files that loaded successfully.
# The remainder failed to load and is treated as corrupted (~3.7% according to
# the recorded output; re-check after a full run of the loading cell).
(len(keystrokes_data_files) - len(failed_files))/len(keystrokes_data_files)

0.962667226144049

In [36]:
# Split the single-participant frame into one DataFrame per test section, in
# ascending TEST_SECTION_ID order. A comprehension avoids leaking a loop
# variable named `id`, which would shadow the builtin for the rest of the
# session.
concat = [
    keys_df[keys_df['TEST_SECTION_ID'] == section_id]
    for section_id in sorted(keys_df['TEST_SECTION_ID'].unique())
]

Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SENTENCE,USER_INPUT,KEYSTROKE_ID,PRESS_TIME,RELEASE_TIME,LETTER,KEYCODE
32,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,326,1471934395727,1471934395871,SHIFT,16
33,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,328,1471934395836,1471934395973,W,87
34,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,330,1471934395973,1471934396026,i,73
35,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,333,1471934396119,1471934396192,l,76
36,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,336,1471934396285,1471934396358,l,76
37,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,338,1471934396436,1471934396509,,32
38,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,347,1471934396608,1471934396672,y,89
39,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,348,1471934396739,1471934396824,o,79
40,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,349,1471934396872,1471934396957,u,85
41,5,10,Will you and KB be around this afternoon?,Will you and KB be around thus afternoon?,351,1471934397052,1471934397142,,32
