In [19]:
#does the main math
#calculates total travel distance for all words in src_words_alpha.txt by layout
#calculates percent of characters that are in the home row for each word by layout

import pandas as pd
from collections import Counter
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.set_option('mode.chained_assignment', None)

# keep_default_na=False stops pandas from converting strings such as "NA" or
# "NULL" into NaN while parsing the CSV files.
df, df_dist, df_home = (
    pd.read_csv(csv_name, keep_default_na=False)
    for csv_name in ('words_alpha.csv', 'pair_distances.csv', 'src_key_coords.csv')
)
# Keep only the key rows whose home_row flag is set.
df_home = df_home.loc[df_home['home_row']]

layouts = ['qwerty', 'dvorak', 'colemak', 'workman']

    words  length
0      AA       2
1     AAA       3
2     AAH       3
3   AAHED       5
4  AAHING       6
(370077, 2)
   distance qwerty dvorak colemak workman
0      0.00     QQ     ''      QQ      QQ
1     19.05     QW     ',      QW      QD
2     38.10     QE     '.      QF      QR
3     57.15     QR     'P      QP      QW
4     76.20     QT     'Y      QG      QB
    row  row_index   x-coord  y-coord qwerty dvorak colemak workman  home_row
10    2          1    4.7625    19.05      A      A       A       A      True
11    2          2   23.8125    19.05      S      O       R       S      True
12    2          3   42.8625    19.05      D      E       S       H      True
13    2          4   61.9125    19.05      F      U       T       T      True
16    2          7  119.0625    19.05      J      H       N       N      True


In [20]:
#break a word into its overlapping two-character pairs to calculate travel
#returned as a Counter so a pair that occurs more than once keeps its count
def splitwords(word):
    """Count every adjacent character pair in ``word``.

    The result is a Counter keyed by two-character strings; a word shorter
    than two characters produces an empty Counter.
    """
    adjacent = zip(word, word[1:])
    return Counter(first + second for first, second in adjacent)

In [21]:
# Store each word's pair counts in a new column, then preview the frame.
df['pairs'] = [splitwords(word) for word in df['words']]
print(df.head())

    words  length                                          pairs
0      AA       2                                      {'AA': 1}
1     AAA       3                                      {'AA': 2}
2     AAH       3                             {'AA': 1, 'AH': 1}
3   AAHED       5           {'AA': 1, 'AH': 1, 'HE': 1, 'ED': 1}
4  AAHING       6  {'AA': 1, 'AH': 1, 'HI': 1, 'IN': 1, 'NG': 1}


In [24]:
#calculates travel distance for each word based on layout
#the pair -> distance lookup is built once per layout and reused across calls,
#instead of re-scanning the whole distance table for every word
_pair_lookup_cache = {}


def _pair_lookup(table, layout):
    """Return ``{pair: summed distance}`` for the ``layout`` column of ``table``."""
    cached = _pair_lookup_cache.get(layout)
    # Rebuild when a different table object is supplied (e.g. the CSV was re-read).
    # In-place edits to an already-seen table are NOT detected.
    if cached is None or cached[0] is not table:
        # Summing per pair keeps the old isin(...).sum() result if a pair is listed twice.
        totals = table.groupby(layout, sort=False)['distance'].sum()
        cached = (table, totals.to_dict())
        _pair_lookup_cache[layout] = cached
    return cached[1]


def wdist(letters, layout, distances=None):
    """Total travel distance for one word on one layout.

    Parameters
    ----------
    letters : mapping of str -> int
        Character pairs of the word and how often each occurs.
    layout : str
        Name of the column in the distance table holding that layout's pairs.
    distances : pandas.DataFrame, optional
        Table with a ``'distance'`` column and one column per layout.
        Defaults to the notebook-level ``df_dist``.

    Returns
    -------
    float
        Sum of ``distance * count`` over all pairs, rounded to 2 decimals.
        Pairs that are not in the table contribute 0, and an empty mapping
        gives 0.0.
    """
    if distances is None:
        distances = df_dist
    lookup = _pair_lookup(distances, layout)
    total = sum(
        (lookup.get(pair, 0.0) * count for pair, count in letters.items()),
        0.0,
    )
    return round(total, 2)

In [25]:
#run the distance calculation once per layout, one result column per layout
#took ~4 minutes per layout on last run
for ly in layouts:
    df[ly] = [wdist(pair_counts, layout=ly) for pair_counts in df['pairs']]
    print(ly + ' completed')
print()
print(df.head())

qwerty completed
dvorak completed
colemak completed
workman completed

    words  length                                          pairs  qwerty  \
0      AA       2                                      {'AA': 1}    0.00   
1     AAA       3                                      {'AA': 2}    0.00   
2     AAH       3                             {'AA': 1, 'AH': 1}   95.25   
3   AAHED       5           {'AA': 1, 'AH': 1, 'HE': 1, 'ED': 1}  179.67   
4  AAHING       6  {'AA': 1, 'AH': 1, 'HI': 1, 'IN': 1, 'NG': 1}  212.92   

   dvorak  colemak  workman  
0    0.00     0.00     0.00  
1    0.00     0.00     0.00  
2  114.30    95.25    38.10  
3  247.65   190.50   253.93  
4  275.51   237.41   266.70  


In [26]:
#calculate percentage of characters in each word in the home row
def home_row(word, layout):
    """Return the percentage of characters of ``word`` that appear in ``layout``.

    Despite its name, ``layout`` is the collection of home-row keys for one
    keyboard layout (anything supporting ``in``), not the layout's name.

    The result is between 0 and 100, rounded to 2 decimals. An empty ``word``
    gives 0.0 rather than raising ZeroDivisionError.
    """
    if len(word) == 0:
        return 0.0
    hits = sum(char in layout for char in word)
    return round((hits / len(word)) * 100, 2)

In [27]:
# Add one "<layout>_home" column per layout: share of each word's characters
# that appear in that layout's home-row keys.
for ly in layouts:
    lylist = df_home[ly].to_list()
    colname = ly + '_home'
    df[colname] = [home_row(word, layout=lylist) for word in df['words']]
    print(ly + ' completed')
print()
print(df.head())

qwerty completed
dvorak completed
colemak completed
workman completed

    words  length                                          pairs  qwerty  \
0      AA       2                                      {'AA': 1}    0.00   
1     AAA       3                                      {'AA': 2}    0.00   
2     AAH       3                             {'AA': 1, 'AH': 1}   95.25   
3   AAHED       5           {'AA': 1, 'AH': 1, 'HE': 1, 'ED': 1}  179.67   
4  AAHING       6  {'AA': 1, 'AH': 1, 'HI': 1, 'IN': 1, 'NG': 1}  212.92   

   dvorak  colemak  workman  qwerty_home  dvorak_home  colemak_home  \
0    0.00     0.00     0.00       100.00       100.00        100.00   
1    0.00     0.00     0.00       100.00       100.00        100.00   
2  114.30    95.25    38.10        66.67       100.00         66.67   
3  247.65   190.50   253.93        60.00        80.00         60.00   
4  275.51   237.41   266.70        33.33        66.67         66.67   

   workman_home  
0        100.00  
1        

In [30]:
#drop pairs column, only needed for calculations
#rebinding with errors='ignore' (instead of inplace=True) lets this cell be
#re-run without a KeyError once the column is already gone
df = df.drop(columns='pairs', errors='ignore')
print(df.head())
print(df.shape)

    words  length  qwerty  dvorak  colemak  workman  qwerty_home  dvorak_home  \
0      AA       2    0.00    0.00     0.00     0.00       100.00       100.00   
1     AAA       3    0.00    0.00     0.00     0.00       100.00       100.00   
2     AAH       3   95.25  114.30    95.25    38.10        66.67       100.00   
3   AAHED       5  179.67  247.65   190.50   253.93        60.00        80.00   
4  AAHING       6  212.92  275.51   237.41   266.70        33.33        66.67   

   colemak_home  workman_home  
0        100.00        100.00  
1        100.00        100.00  
2         66.67        100.00  
3         60.00         80.00  
4         66.67         83.33  
(370077, 10)


In [31]:
#write the per-word results to disk, leaving the row index out of the file
df.to_csv(path_or_buf='word_distances.csv', index=False)
print('done')

done
