In [1]:
# !pip install -q datasets

import re
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
import json

# Load the blog corpus, merge its train and validation splits into a single
# dataset, then drop the listed metadata columns.
ds = load_dataset("blog_authorship_corpus")
ds = concatenate_datasets([ds[split] for split in ('train', 'validation')]).remove_columns(
    ['date', 'gender', 'age', 'horoscope', 'job']
)

# Lookup from each shifted character to the character the same key types
# without shift. Each pair below is one keyboard row: (shifted, unshifted),
# aligned position by position.
uppers = {
    shifted: base
    for shifted_row, base_row in (
        ('~!@#$%^&*()_+', '`1234567890-='),
        ('QWERTYUIOP{}|', 'qwertyuiop[]\\'),
        ('ASDFGHJKL:"', "asdfghjkl;'"),
        ('ZXCVBNM<>?', 'zxcvbnm,./'),
    )
    for shifted, base in zip(shifted_row, base_row)
}

# Set of recognised keys. Printable keys are represented by the character they
# type (shifted and unshifted variants are separate members); every other key
# is represented by a short lowercase name.
keyboard = {
    # escape and function keys f1..f12
    'esc', *(f'f{n}' for n in range(1, 13)),

    # named keys of the main block
    'bksp', 'caps_lock', 'shift', 'ctrl', 'win', 'alt', 'menu',

    # shifted characters, one string per row
    *'~!@#$%^&*()_+',
    *'QWERTYUIOP{}|',
    *'ASDFGHJKL:"',
    *'ZXCVBNM<>?',

    # unshifted characters, one string per row
    *'`1234567890-=',
    *'qwertyuiop[]\\',
    *"asdfghjkl;'",
    *'zxcvbnm,./',

    # whitespace characters: tab, newline, space
    '\t', '\n', ' ',

    # system and navigation keys
    'prt_sc', 'scr_lk', 'pause',
    'insert', 'home', 'pg_up', 'delete', 'end', 'pg_down',
    'arw_up', 'arw_left', 'arw_down', 'arw_right',

    # number pad: operators plus num_0..num_9
    'num_lk', 'num_divide', 'num_mult', 'num_sub', 'num_add',
    *(f'num_{n}' for n in range(10)),
}

Reusing dataset blog_authorship_corpus (/home/spence/.cache/huggingface/datasets/blog_authorship_corpus/blog_authorship_corpus/1.0.0/6f5d78241afd8313111956f877a57db7a0e9fc6718255dc85df0928197feb683)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def process(text: str) -> str:
    """Mask every character that is not a member of ``keyboard``.

    Args:
        text: Raw text to sanitise.

    Returns:
        A string with one output character per input character: the input
        character itself when it is in the module-level ``keyboard`` set,
        otherwise the placeholder 'Ĩ'.
    """
    return ''.join(symbol if symbol in keyboard else 'Ĩ' for symbol in text)

In [3]:
# Table of observed key-to-key transitions:
#   chain[src]['FREQ'] -> total number of transitions counted out of key `src`
#   chain[src][dst]    -> number of times key `dst` was typed right after `src`
# Shifted characters are filed under their unshifted physical key (via `uppers`).
# The pseudo-key 'SHIFT' is the source for a shifted character that does not
# directly follow another shifted character.
# 'SHIFT' and 'FREQ' must be spelled identically wherever they are read or
# written, otherwise the first shifted character raises KeyError.
chain = {
    'SHIFT': {'FREQ': 0}
}


def _count_transition(src: str, dst: str) -> None:
    """Record one observed ``src`` -> ``dst`` key transition in ``chain``."""
    # setdefault creates the entry the first time `src` is seen and otherwise
    # returns the existing one, so earlier counts are never discarded
    entry = chain.setdefault(src, {'FREQ': 0})
    entry['FREQ'] += 1
    entry[dst] = entry.get(dst, 0) + 1


for blog in tqdm(ds['text']):
    # transitions never span two blog posts
    prev = ''

    # process() replaces every non-keyboard character with the 'Ĩ' sentinel
    for char in process(blog):
        if char == 'Ĩ':
            # a non-keyboard character breaks the chain on both sides: nothing is
            # counted into it, and nothing is counted out of it (so no 'Ĩ' entry
            # ever needs to be removed from the table afterwards)
            prev = ''
            continue

        if char in uppers:
            # shifted character: if the previous character was shifted as well,
            # assume shift was held down and count key -> key; otherwise count
            # the transition out of the shift key itself
            _count_transition(uppers[prev] if prev in uppers else 'SHIFT', uppers[char])
        elif prev:
            # plain character: file it under the physical key of the previous
            # character, so a shifted predecessor ('A') counts as its key ('a')
            _count_transition(uppers.get(prev, prev), char)

        prev = char

100%|██████████████████████████████████| 727712/727712 [09:48<00:00, 1236.27it/s]


In [4]:
# Serialise the transition table to common_pairs.json.
with open('common_pairs.json', 'w') as f:
    f.write(json.dumps(chain))

[Wrist Angle](https://pubmed.ncbi.nlm.nih.gov/10424183/):
    
    right: 23.4 degrees, left: 19.9 degrees
    avg: 21.65 degrees

[Finger Strength](https://www.researchgate.net/publication/2423272_A_System_For_Measuring_Finger_Forces_During_Grasping):
    
    At 20 degrees, ranked from strongest to weakest:
        1. Thumb
        2. Middle
        3. Index
        4. Ring
        5. Pinky
        
![Fingers on Keyboard](https://upload.wikimedia.org/wikipedia/commons/9/93/Finger_position_on_a_keyboard.png)