In [730]:
import numpy as np
import pandas as pd
import random, math
from sklearn.linear_model import LinearRegression, LogisticRegression

In [486]:
def find_min_edit_distance(word1, word2):
    w1 = "#" + word1
    w2 = "#" + word2
    D = np.zeros([len(w1), len(w2)], dtype=np.int8)
    
    for i in range(len(w1)):
        D[i, 0] = i
    for j in range(len(w2)):
        D[0, j] = j
    
    for i in range(1, len(w1)):
        for j in range(1, len(w2)):
            left = D[i-1, j] + 1
            right = D[i, j-1] + 1
            diagonal = D[i-1, j-1] + (2 if w1[i] != w2[j] else 0)
            D[i, j] = min(diagonal, min(left, right))
            
    return D[len(word1), len(word2)]

In [487]:
def get_all_words(df, restrict_list=[]):
    all_words = {}
    unique_words = {}
    prob_words = {}
    for i, column in enumerate(list(df.columns.values)):
        if i in restrict_list:
            continue
        for word in df[column].values:
            if len(str(word)) > 3:
                word_to_use = str(word).lower().strip("\n")
                if len(word_to_use) >= 5 and word_to_use[-1] == "s":
                    word_to_use = word_to_use[:-1]
                    
                if word_to_use not in all_words:
                    all_words[word_to_use] = 0
                all_words[word_to_use] += 1
                
                if word_to_use not in unique_words:
                    unique_words[word_to_use] = 0
                if word.lower() != word:
                    unique_words[word_to_use] += 1
    
    for word in all_words:
        prob_words[word] = unique_words[word] / all_words[word]
    
    return all_words, unique_words, prob_words

In [488]:
def get_min_edit_matrix(all_words):
    min_edit_matrix = {}
    for word1 in all_words:
        if word1 not in min_edit_matrix:
            min_edit_matrix[word1] = {}
        for word2 in all_words:
            if word2 not in min_edit_matrix:
                min_edit_matrix[word2] = {}
            if word1 != word2:
                if word2 not in min_edit_matrix[word1]:
                    min_edit_matrix[word1][word2] = find_min_edit_distance(word1, word2)
                if word1 not in min_edit_matrix[word2]:
                    min_edit_matrix[word2][word1] = min_edit_matrix[word1][word2]
    return min_edit_matrix

In [229]:
data = pd.read_csv("boggle_results.csv", header=0)

In [230]:
word_frequencies, word_unique_frequencies, word_probabilities = get_all_words(data)
sorted_frequencies = list(reversed(sorted(zip(list(word_frequencies.values()), list(word_frequencies.keys())))))
sorted_unique_frequencies = list(reversed(sorted(zip(list(word_unique_frequencies.values()), list(word_unique_frequencies.keys())))))


In [231]:
min_edit_matrix = get_min_edit_matrix(word_frequencies)

In [232]:
max_distance = 0
pair = None
for word in min_edit_matrix:
    for w2 in min_edit_matrix[word]:
        if min_edit_matrix[word][w2] > max_distance:
            max_distance = min_edit_matrix[word][w2]
            pair = (word, w2)

In [676]:
val_columns = random.sample(range(len(data.columns.values)), k=8)
wf_train, wuf_train, wp_train = get_all_words(data, restrict_list=val_columns)
columns = data.columns.values
val_words = []
targets = []
for column_idx in val_columns:
    column = data[columns[column_idx]]
    for word in column:
        word_to_use = str(word).lower().strip("\n")
        if len(word_to_use) >= 5 and word_to_use[-1] == "s":
            word_to_use = word_to_use[:-1]
        if len(word_to_use) > 3:
            val_words.append(word_to_use)
            target = 1 if word.lower() != word else 0
            targets.append(target)

In [677]:
word_to_idx = {}
idx_to_word = []
for i, word in enumerate(word_frequencies):
    word_to_idx[word] = i
    idx_to_word.append(word)

In [678]:
data_matrix = np.zeros([len(val_words), max_distance + 1])
for i, word in enumerate(val_words):
    counts = np.zeros(max_distance + 1)
    for word2 in wf_train:
        dist = min_edit_matrix[word][word2] if word != word2 else 0
        counts[dist] += wf_train[word2]
    for word2 in wp_train:
        dist = min_edit_matrix[word][word2] if word != word2 else 0
        data_matrix[i, dist] += wp_train[word2] * wf_train[word2] / counts[dist]
    for j, count in enumerate(counts):
        if count == 0:
            if j < max_distance - 1:
                data_matrix[i, j] = data_matrix[i, j]

In [669]:
big_data_matrix = np.zeros([len(word_frequencies), max_distance + 1])
for i, word in enumerate(idx_to_word):
    counts = np.zeros(max_distance + 1)
    for word2 in word_frequencies:
        dist = min_edit_matrix[word][word2] if word != word2 else 0
        counts[dist] += word_frequencies[word2]
    for word2 in word_probabilities:
        dist = min_edit_matrix[word][word2] if word != word2 else 0
        big_data_matrix[i, dist] += word_probabilities[word2] * word_frequencies[word2] / counts[dist]
    for j, count in enumerate(counts):
        if count == 0:
            if j < max_distance - 1:
                big_data_matrix[i, j] = big_data_matrix[i, j]

In [679]:
num_elements = 4
regressor = LogisticRegression(fit_intercept=False)
output = regressor.fit(data_matrix[:, 0:num_elements], targets)
coeffs = output.coef_[0]
print(output.score(data_matrix[:, 0:num_elements], targets))
print()
for coeff in coeffs:
    print(coeff)
print()

0.6712328767123288

0.622760092075081
0.10234231729263789
-0.06755913819216534
-1.2697556438218494



In [683]:
def make_prediction(word):
    row = np.expand_dims(big_data_matrix[word_to_idx[word], 0:num_elements], 0)
    return output.predict_proba(row)[0][1]

In [692]:
odds_dict = {}
for word in word_frequencies:
    odds_dict[word] = make_prediction(word)
odds_list = list(reversed(sorted(zip(list(odds_dict.values()), list(odds_dict.keys())))))

In [700]:
for word, pct in odds_list:
    print(word, "\t", pct)

0.6737296112796849 	 slight
0.6737296112796849 	 groping
0.6681362734719496 	 camp
0.6587083026678903 	 carnal
0.6508460278628737 	 tunnel
0.6508460278628737 	 teff
0.6508460278628737 	 splinch
0.6508460278628737 	 slug
0.6508460278628737 	 senior
0.6508460278628737 	 retooled
0.6508460278628737 	 relish
0.6508460278628737 	 queered
0.6508460278628737 	 pinnie
0.6508460278628737 	 oilier
0.6508460278628737 	 ohio
0.6508460278628737 	 miff
0.6508460278628737 	 meow
0.6508460278628737 	 fish
0.6508460278628737 	 exit
0.6508460278628737 	 asunder
0.6508460278628737 	 agora
0.6508460278628737 	 achy
0.6499423986157273 	 mist
0.6489245279614925 	 behead
0.6477693019710207 	 gnat
0.6469981888479942 	 richen
0.6469981888479942 	 pimp
0.6457112993057914 	 pupa
0.6457112993057914 	 puma
0.6446802654997703 	 magi
0.643992162154727 	 maim
0.6431311961531576 	 gust
0.6392454620351391 	 dial
0.636799247953042 	 mart
0.6361449321985859 	 tonner
0.6353414081277496 	 wormed
0.6353414081277496 	 worm
0

0.4652020652521191 	 glen
0.4651529533454284 	 noted
0.4632446012288753 	 scoot
0.46299043489372615 	 hotel
0.4626558983547904 	 trade
0.4625799526560085 	 toon
0.46252981830843165 	 deer
0.46251019597304094 	 trawl
0.46175944982519596 	 went
0.4616294333645304 	 here
0.46040322763014857 	 scented
0.46028601001880143 	 lied
0.4596844594290055 	 grid
0.45887779810098805 	 tooled
0.45870299894726557 	 dote
0.45856599922256963 	 lout
0.4584406178780365 	 fair
0.45837318676913524 	 feed
0.4580656573476904 	 redo
0.4580064681415298 	 tees
0.4569280290352233 	 tong
0.45621858692936756 	 canted
0.45611193096593927 	 shill
0.45601729907639965 	 teen
0.4555731286887945 	 feet
0.4555202618905996 	 aged
0.455382899465872 	 tarry
0.4552131671902127 	 nape
0.4551693203920844 	 cane
0.4547755204450339 	 tooler
0.4533193756851993 	 seen
0.45223626733740263 	 rust
0.4519371993595509 	 tented
0.45186269357491227 	 earn
0.4508368798192254 	 hole
0.45069553634093573 	 moat
0.44949589694191744 	 goner
0.4

0.3422690250815257 	 cent
0.34214465177956926 	 hnad
0.34132446136411804 	 sender
0.3410215936088221 	 peon
0.3409629159714135 	 tory
0.340625645019316 	 newt
0.3403126005148028 	 leaden
0.3400478339655344 	 peni
0.33987939069276374 	 tartar
0.33987939069276374 	 nord
0.3398673654706141 	 lichen
0.3398673654706141 	 flew
0.3397852500493794 	 lard
0.33963883420886765 	 dame
0.33963883420886765 	 beer
0.338879296505026 	 jader
0.33879753856865646 	 list
0.33879753856865646 	 beamer
0.33856827070813383 	 tars
0.3384114425888704 	 mail
0.33797526016406165 	 form
0.33782992718745725 	 fell
0.33760443075778995 	 canned
0.3375974581180378 	 veal
0.337345705176349 	 repeal
0.337345705176349 	 pell
0.33716593808821954 	 melt
0.3370311436659849 	 roam
0.3370311436659849 	 lost
0.3369263218792612 	 racing
0.3365909999766777 	 zest
0.3362797723749839 	 swam
0.3362797723749839 	 city
0.3360883256169992 	 dearie
0.3360883256169992 	 coil
0.33551583179143735 	 rape
0.3345825317517574 	 nial
0.3345515

In [720]:
points = [0, 0, 0, 0, 1, 2, 3, 5, 8, 11, 11, 11]
expected_points = {}
for i, column in enumerate(list(data.columns.values)):
        for word in data[column].values:
            if len(str(word)) > 3:
                word_to_use = str(word).lower().strip("\n")
                if len(word_to_use) >= 5 and word_to_use[-1] == "s":
                    word_to_use = word_to_use[:-1]
                
                if word.lower() not in expected_points:
                    expected_points[word.lower().strip("\n \t")] = points[len(word.lower().strip("\n \t"))] * odds[word_to_use]
ep_list = list(reversed(sorted(zip(list(expected_points.values()), list(expected_points.keys())))))

In [721]:
for word, ep in ep_list:
    print(word, "\t", ep)

6.69443032652205 	 fasteners
5.20676822290299 	 retooled
5.082731265021997 	 friended
4.367947034614377 	 recanted
4.199672966883692 	 rentiers
3.954962477873689 	 retooler
3.6145252269617663 	 requested
3.3686480563984245 	 groping
3.2542301393143687 	 splinch
3.2542301393143687 	 queered
3.2542301393143687 	 pinnies
3.2542301393143687 	 asunder
3.234990944239971 	 richens
3.1767070406387483 	 croatia
3.1767070406387483 	 angelic
3.114447612292845 	 watered
3.006798979416219 	 steered
2.9385109122647783 	 thrones
2.927585806506956 	 rotated
2.781342725189349 	 rotates
2.769695729473471 	 sinewed
2.750407052033581 	 roasted
2.744499986140104 	 fainter
2.7255369092715673 	 torquer
2.6867030024736485 	 heating
2.658449835069142 	 streams
2.6540588605868503 	 tearier
2.650578306225138 	 painter
2.641551293946365 	 scooted
2.6287456196085572 	 requests
2.6287456196085572 	 repealer
2.6287456196085572 	 repealed
2.6270364832357873 	 leading
2.5 	 mittens
2.4637431493024464 	 parrots
2.45406

0.8493573427541681 	 rinse
0.8488887000296064 	 hones
0.8483029475589016 	 snore
0.8472941020066282 	 steep
0.8462437153566471 	 wheat
0.8453091704522435 	 sails
0.8450955745327714 	 gager
0.845075663687498 	 cooty
0.8445560180201339 	 there
0.8444744360179601 	 frees
0.844361440675012 	 rater
0.8443148921415993 	 creed
0.8432029550430379 	 cares
0.8430683659108774 	 gents
0.8415247009760771 	 moled
0.8385205197960837 	 tides
0.8375361445154542 	 fades
0.8369224890196842 	 eases
0.835455559213112 	 poler
0.8352574411205662 	 chafer
0.8346304195201613 	 rents
0.8343755182525776 	 feels
0.8343590789848426 	 spine
0.8323376141646165 	 sheet
0.8322984563251218 	 quote
0.8309460760714485 	 assed
0.827161206947147 	 whole
0.8247573139633326 	 heeds
0.8241028681709573 	 greet
0.8236094064513105 	 notes
0.8230841729166758 	 wined
0.8214034259662035 	 danes
0.8213528023125398 	 stags
0.8206822801882369 	 mages
0.8205700011493692 	 fumed
0.8197752460808161 	 plait
0.8194384607224662 	 rhone
0.81

0.4244071362451225 	 sine
0.42265458522612176 	 sail
0.42223721800898006 	 free
0.4221127057544305 	 sent
0.4217792142901789 	 rail
0.42173937488944085 	 rone
0.42160147752151894 	 care
0.4215341829554387 	 gent
0.42100156895400287 	 late
0.42018956507458943 	 mote
0.41926025989804183 	 tide
0.41919965029367 	 dole
0.41910885784050406 	 agin
0.41869272028550253 	 cells
0.41865598743637855 	 earl
0.4184612445098421 	 ease
0.4183485031920282 	 pita
0.4181371654346198 	 shed
0.41800931181047785 	 mice
0.41789596703854637 	 pole
0.41777453583451335 	 dong
0.41731520976008063 	 rent
0.4171877591262888 	 feel
0.4164980535763586 	 loot
0.41624038498038096 	 poor
0.41611703451157145 	 wine
0.4152563472209909 	 wait
0.41504351076191515 	 wise
0.4149781824934739 	 shag
0.4149767059989395 	 case
0.414144894866097 	 sane
0.4135658746372042 	 near
0.41255001887807097 	 trod
0.4123786569816663 	 heed
0.41180470322565527 	 note
0.4116188033576658 	 slid
0.4111741156627438 	 root
0.41070171298310176 	

In [722]:
game_exp = {}
for game in data.columns.values:
    game_exp_pts = 0
    game_pts = 0
    for word in data[game]:
        if len(str(word).strip("\n ")) < 4:
            continue
        stripped_word = word.strip("\n ")
        game_exp_pts += expected_points[stripped_word.lower()]
        game_pts += 0 if stripped_word[0] == stripped_word[0].lower() else points[len(stripped_word)]
    game_exp[game] = (game_exp_pts, game_pts)

In [723]:
for game in game_exp:
    print(game, "\t", game_exp[game][0], "\t", game_exp[game][1])

Sam-G1 	 13.446750228443205 	 9
Sam-G2 	 4.181379384052316 	 2
Sam-G3 	 16.607874963675677 	 18
Sam-G4 	 15.375738796272822 	 14
Sam-G5 	 16.54220415634709 	 8
Sam-G6 	 7.59457730251567 	 7
Sam-G7 	 13.244339096117184 	 12
Will-G1 	 10.27481466463975 	 14
Will-G2 	 20.273399524163526 	 23
Will-G3 	 20.036708681496165 	 14
Will-G4 	 13.227630200086624 	 16
Will-G5 	 21.885122706088577 	 25
Will-G6 	 8.090933944819497 	 8
Will-G7 	 11.75834425133094 	 6
Will-G8 	 5.664027936229411 	 5
Will-G9 	 12.470576388190104 	 18
Sam-G8 	 14.190557076000859 	 15
Sam-G9 	 18.778134338923483 	 20
Sam-G10 	 25.065843468036473 	 31
Sam-G11 	 13.806229321085638 	 9
Sam-G12 	 9.474319864061949 	 0
Sam-G13 	 18.93545378204995 	 18
Sam-G14 	 10.794891375425744 	 8
Sam-G15 	 14.705911592750232 	 8
Sam-G16 	 19.250840134349147 	 20
Sam-G17 	 7.832946921645441 	 6
Sam-G18 	 10.096096666195214 	 6
Sam-G19 	 3.3703220304325328 	 2
Sam-G20 	 22.78266621015379 	 22
Sam-G21 	 31.103476586279562 	 36
Sam-G22 	 10.13

In [731]:
def get_cum_stats(person):
    total_games = 0
    total_pts = 0
    total_exp_pts = 0
    for game in game_exp:
        if person in game:
            total_games += 1
            total_pts += game_exp[game][1]
            total_exp_pts += game_exp[game][0]
    avg_pts = total_pts / total_games
    avg_exp_pts = total_exp_pts / total_games
    
    diff = 0
    exp_diff = 0
    for game in game_exp:
        if person in game:
            diff += (game_exp[game][1] - avg_pts)**2
            exp_diff += (game_exp[game][0] - avg_exp_pts)**2
    var = math.sqrt(diff / total_games)
    exp_var = math.sqrt(exp_diff / total_games)
            
    return avg_exp_pts,  avg_pts, exp_var, var

In [732]:
will_stats = get_cum_stats("Will")

(15.70140115088097, 15.897435897435898, 6.533693589376612, 9.700105396717571)

In [733]:
sam_stats = get_cum_stats("Sam")

(13.666063321571471, 11.805555555555555, 6.243679205335406, 8.39252128863829)

In [734]:
rachel_stats = get_cum_stats("Rachel")

(12.813060221891753, 10.255813953488373, 6.00988765943056, 7.436844239625869)

In [737]:
most_egregious = None
big_neg_gap = 0
most_lucky = None
big_pos_gap = 0
for game in game_exp:
    exp_pts, pts = game_exp[game]
    gap = pts - exp_pts
    if gap < big_neg_gap:
        most_egregious = game
        big_neg_gap = gap
    elif gap > big_pos_gap:
        most_lucky = game
        big_pos_gap = gap

In [739]:
print(most_egregious, "\t", big_neg_gap, "\t", game_exp[most_egregious][0], "\t", game_exp[most_egregious][1])
print(most_lucky, "\t", big_pos_gap, "\t", game_exp[most_lucky][0], "\t", game_exp[most_lucky][1])

Rachel-G18 	 -10.678760395271375 	 24.678760395271375 	 14
Will-G26 	 18.672376439026674 	 22.327623560973326 	 41


In [None]:
import matplotlib.pyplot as plt

pl