In [1]:
import zipfile
import io
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle

In [2]:
# Shifted characters and, position by position, the unshifted character that
# sits on the same key. The two strings must stay aligned and of equal length.
_SHIFTED_CHARS = (
    '~!@#$%^&*()_+'
    'QWERTYUIOP{}|'
    'ASDFGHJKL:"'
    'ZXCVBNM<>?'
)
_UNSHIFTED_CHARS = (
    '`1234567890-='
    'qwertyuiop[]\\'
    "asdfghjkl;'"
    'zxcvbnm,./'
)

# map upper case (shifted) characters to their lowercase (unshifted) keys
uppers = dict(zip(_SHIFTED_CHARS, _UNSHIFTED_CHARS))

# characters in the keystroke dataset
chars = set(
    " !?',-."
    '0123456789'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
)

# all keyboard characters: every printable key in both shift states,
# whitespace keys, and the named (non-printing) keys
keyboard = (
    set(_SHIFTED_CHARS)
    | set(_UNSHIFTED_CHARS)
    | {'\t', '\n', ' '}
    | {f'f{n}' for n in range(1, 13)}
    | {f'NUM_{n}' for n in range(10)}
    | {
        'ESC', 'BKSP', 'CAPS_LOCK', 'SHIFT', 'CTRL', 'WIN', 'ALT', 'MENU',
        'PRT_SC', 'SCR_LK', 'PAUSE',
        'INSERT', 'HOME', 'PG_UP', 'DELETE', 'END', 'PG_DOWN',
        'ARW_UP', 'ARW_LEFT', 'ARW_DOWN', 'ARW_RIGHT',
        'NUM_LK', 'NUM_DIVIDE', 'NUM_MULT', 'NUM_SUB', 'NUM_ADD',
    }
)

In [3]:
# chain[previous_key][key] -> list of time differences observed for that key pair;
# filled in place by extract_durations and shared across all processed files
chain = {}

def extract_durations(df):
    """Accumulate the time differences between consecutive key presses into `chain`.

    Parameters
    ----------
    df : pandas.DataFrame
        Exactly two columns, in this order: the pressed key and the time
        difference to the previous row (NaN when there is no difference).

    Returns
    -------
    None
        The module-level `chain` dict is updated in place.

    Notes
    -----
    Keys are normalised (shifted characters mapped through `uppers`, then
    lower-cased) before they are used either as the current key or as the
    remembered previous key. A row whose key is not in `keyboard` breaks the
    sequence: the next row has no previous key and records nothing.
    """
    # keep track of previous (already normalised) key
    prev = ''
    # itertuples yields plain (key, time_diff) tuples without building a Series per row
    for char, time_diff in df.itertuples(index=False, name=None):
        # only record keyboard keys
        if char not in keyboard:
            prev = ''
            continue
        # normalise up front so that `prev` is normalised even for rows without a
        # time difference (e.g. the first row of a file); otherwise a leading 'H'
        # would end up as an upper-case key in `chain`
        char = uppers.get(char, char).lower()
        has_interval = not np.isnan(time_diff)
        if prev and has_interval:
            # create a list of time differences for every key pair
            chain.setdefault(prev, {}).setdefault(char, []).append(time_diff)
        prev = char

def handle_df(data):
    """Parse one tab-separated keystroke file into key / time-difference rows.

    Parameters
    ----------
    data : bytes
        Raw file contents. Decoded as latin-1; the first non-empty line is
        the header and must name at least `PRESS_TIME` and `LETTER`.

    Returns
    -------
    pandas.DataFrame
        Columns `LETTER` and `TIME_DIFFERENCE`, ordered by numeric
        `PRESS_TIME`. `TIME_DIFFERENCE` is the difference to the previous
        row's `PRESS_TIME` (NaN for the first row).

    Raises
    ------
    ValueError
        If the file has no lines, or a `PRESS_TIME` value is not numeric.
    KeyError
        If the header lacks `PRESS_TIME` or `LETTER`.
    """
    # skip empty lines: the trailing newline would otherwise yield an all-None row
    rows = [line.split('\t') for line in data.decode('latin-1').split('\n') if line]
    if not rows:
        raise ValueError('keystroke file is empty')

    # convert file data to dataframe; the first row holds the column names
    table = pd.DataFrame(rows)
    table.columns = table.iloc[0]
    keystrokes = table.iloc[1:].copy()

    # convert BEFORE sorting: the raw values are strings, and a string sort is
    # lexicographic ('10' < '9'), which scrambles the order and the differences
    keystrokes['PRESS_TIME'] = pd.to_numeric(keystrokes['PRESS_TIME'])
    # stable sort keeps the file order for keys with identical press times
    keystrokes = keystrokes.sort_values('PRESS_TIME', kind='mergesort')
    keystrokes['TIME_DIFFERENCE'] = keystrokes['PRESS_TIME'].diff()
    return keystrokes[['LETTER', 'TIME_DIFFERENCE']]

In [4]:
failed = 0
total = 0
# file name -> repr of the exception that made it fail, so that failures can be
# inspected after the run instead of only being counted
failures = {}

with zipfile.ZipFile('Keystrokes.zip', 'r') as f:
    # iterate through zip file without fully decompressing
    # this is faster since there's less seek time
    keystroke_files = [name for name in f.namelist() if name.endswith('_keystrokes.txt')]
    for name in (pbar := tqdm(keystroke_files)):
        try:
            df = handle_df(f.read(name))
            extract_durations(df)
        except Exception as e:
            # best effort: one unreadable file must not abort the whole run,
            # but keep the reason around for later inspection
            failed += 1
            failures[name] = repr(e)
        total += 1
        pbar.set_description(f'failed %: {round(100 * failed / total, 1)}')

failed %: 0.0: 100%|███████████████████| 168593/168593 [2:29:12<00:00, 18.83it/s]


In [5]:
# persist the accumulated key-pair time differences for later use
with open('diffs.pkl', 'wb') as pickle_file:
    pickle.dump(chain, pickle_file)