In [1]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
from itertools import combinations

https://stackoverflow.com/questions/21979970/how-to-use-word2vec-to-calculate-the-similarity-distance-by-giving-2-words

https://radimrehurek.com/gensim/auto_examples/index.html#documentation

https://fasttext.cc/docs/en/english-vectors.html

https://hannibunny.github.io/nlpbook/05representations/01WordEmbeddingImplementation.html

https://datascience.stackexchange.com/questions/12872/how-can-i-get-a-measure-of-the-semantic-similarity-of-words

In [2]:
%%time
model = KeyedVectors.load_word2vec_format('crawl-300d-2M.vec')

CPU times: user 2min 6s, sys: 3.42 s, total: 2min 9s
Wall time: 2min 10s


Ways to measure:

Convex hull volume

Average Linkage: Maximize average similarity

Complete Linkage: Maximize lowest similarity

Single Linkage: Maximize highest similarity

In [3]:
def pairwise_values(input_words):
    sims = []
    for i in range(len(input_words)-1):
        for j in range(len(input_words)-i-1):
            word1 = input_words[i]
            word2 = input_words[i+j+1]
            sim = model.similarity(word1, word2)
            sims.append(sim)
        
    return sims

In [4]:
def distance_metric(similarities, method="average"):
    if method=="average":
        return np.mean(similarities)
    elif method=="complete":
        return min(similarities)
    elif method=="single":
        return max(similarities)
    else:
        raise ValueError("Method must be 'average', 'complete', or 'single'")

Step 1:

Find potential first group

Step 2:

Eliminate first four from total set

Step 3:

Find potential second group

Step 4:

Eliminate second four from total set

Step 5:

Find potential third (and therefore fourth) group

Step 6:

Calculate overall score of four groups (we want least similarity between groups/most dissimilarity or (less likely) highest within-group similarity)

Step 7:

Repeat with slightly different groups (maybe choose second-best and so on for potential first?)

In [5]:
def remove_combos(df, remove_words):
    """Input remove_words as a set"""
    bool_series = df['combo'].apply(lambda x: False if len(set(x).intersection(remove_words)) > 0 else True) #this line can be more efficient
    return df[bool_series]

In [6]:
def keep_combos(group_df, guess):
    """Input a group dataframe and the correct guess, returns df with only possible combinations"""
    bool_series = group_df['Groups'].apply(lambda x: any(t == guess for t in x))
    return group_df[~bool_series]

In [7]:
def score_groups(df, group_li, metric='average'):
    score_df = df[df['combo'].isin(group_li)]
    if metric=="average":
        return np.mean(score_df['distance'])
    if metric=="min":
        return min(score_df['distance'])
    elif metric=="max":
        return max(score_df['distance'])
    else:
        raise ValueError("Method must be 'average', 'min', or 'max'")

In [8]:
def groupings_score(df, groups_correct=0, iterations=1, metric='average'):
    group_scores = []
    group_lists = []
    iterations = min(len(df), iterations)
    for i in range(iterations):
        group_list = []
        group1 = df.iloc[i]['combo']
        group_list.append(group1)
        
        if groups_correct < 3:
            df_no1 = remove_combos(df, set(group1)) #this line is creating an empty dataframe bc every line of the df has at least one of the words in set(group1)
            #print(len(df_no1))
            group2 = df_no1.iloc[0]['combo']
            group_list.append(group2)
            
            if groups_correct < 2:
                df_no2 = remove_combos(df_no1, set(group2))
                group3 = df_no2.iloc[0]['combo']
                group_list.append(group3)
                
                if groups_correct < 1:
                    df_no3 = remove_combos(df_no2, set(group3))
                    group4 = df_no3.iloc[0]['combo']
                    group_list.append(group4)
        
        group_score = score_groups(df, group_list, metric)
        
        group_lists.append(group_list)
        group_scores.append(group_score)
        
    group_df = pd.DataFrame({'Groups': group_lists, 'Score': group_scores}).sort_values('Score', ascending=False)
    top_guess = group_df.iloc[0]['Groups'][0]
    return top_guess, group_df

In [9]:
def combo_generator(word_list, words_per_class, distance="average"):
    combos = list(combinations(word_list, words_per_class))
    df = pd.DataFrame({"combo": combos})
    distances = []
    for c in combos:
        distances.append(distance_metric(pairwise_values(c), distance))
    df['distance'] = distances
    df = df.sort_values('distance', ascending=False)
    return df

In [11]:
def optimizer(word_list, words_per_class=4, distance="average", metric="average", iterations=1):
    
    ### Generates word combinations and assesses their collective pairwise distances based on the chosen distance metric
    df0 = combo_generator(word_list, words_per_class) #Generate the 1820 combos
    
    ### Greedy algorithm for choosing best groupings
    guess, group_df0 = groupings_score(df0, num_correct, iterations, metric) #Generate the top [iterations] number of groupings of combos out of 63,063,000 possible
    
    ### Information-Based Optimizer
    num_correct = 0
    print(f"Guess: {guess}")
    info_df = pd.DataFrame(columns=['Guess', 'Feedback'])
    while num_correct < 4:
        feedback = int(input("Feedback: Input 4 if all 4 were correct, 3 if 3 were correct, 2 if 2 were correct, 0 if no feedback received, -1 if fail"))
        if feedback==4: #increment num_correct, add to info_df, generate new (smaller) word_list by removing guessed words, generate new groupings dataframe out of (34,650|70|1) possible
            num_correct += 1
            info_df = info_df.append({'Guess': guess, 'Feedback': 4})
            if num_correct == 4:
                print("Solved!")
                break
            word_list = list(set(word_list) - set(guess))
            df = combo_generator(word_list, words_per_class)
            print(len(df))
            guess, group_df = groupings_score(df, num_correct, iterations, metric)
            
        elif feedback==3: #this one isn't working
            #this will need to factor in the possibility of getting 3 right and then 3 right again
            info_df = info_df.append({'Guess': guess, 'Feedback': 3})
            
            choose3 = list(combinations(guess, 3))
            remaining = list(set(word_list) - set(guess))
            next_options = []
            for g in choose3:
                for w in remaining:
                    next_option = tuple(list(g) + [w])
                    next_options.append(next_option)

            # groups3_df = group_df[group_df['Groups'].apply(lambda x: any(sorted(item) in [sorted(t) for t in next_options] for item in x))]
            # guess = groups3_df.iloc[0]['Groups'][0]
            df = combo_generator(word_list, words_per_class)
            guess, group_df = groupings_score(df, num_correct, iterations, metric)
            group_df = group_df[group_df['Groups'].apply(lambda x: any(sorted(item) in [sorted(t) for t in next_options] for item in x))]
            guess = group_df.iloc[0]['Groups'][0]
            
        elif feedback==2: #haven't edited this one
            choose2 = list(combinations(guess, 2))
            remaining = list(set(word_list) - set(guess))
            remaining_words = list(combinations(remaining, 2))
            next_options = []
            for g in choose2:
                for w in remaining_words:
                    next_option = g + tuple(w)
                    next_options.append(next_option)
            groups2_df = group_df[group_df['Groups'].apply(lambda x: any(sorted(item) in [sorted(t) for t in next_options] for item in x))]
            guess = groups2_df.iloc[0]['Groups'][0]
            
        elif feedback==0: #could be 2 + 2, could be 2 + 1 + 1, could be 1 + 1 + 1 + 1 but we don't know
            guess = group_df.iloc[guess_num]['Groups'][0]
            
        else:
            print("Sad.")
            break
        guess_num += 1
        print(f"Guess: {guess}")
    print("Yay.")
    
    
    return None


In [278]:
combo_t = combo_generator(today, 4)
combo_t['combo'].apply(lambda x: sorted(x) in [sorted(t) for t in next_options])

Unnamed: 0,combo,distance
1550,"(icon, legend, symbol, glyph)",0.408283
1535,"(icon, legend, character, symbol)",0.384329
1586,"(icon, character, symbol, glyph)",0.371889
1494,"(icon, popular, legend, symbol)",0.360979
1538,"(icon, legend, character, glyph)",0.332129
...,...,...
1700,"(legend, hot, lipid, levy)",0.069065
1405,"(fine, popular, assess, glyph)",0.068898
1703,"(legend, hot, lipid, assess)",0.063975
624,"(hippo, fine, legend, assess)",0.063751


In [13]:
today = ['handsome', 'hippo', 'in', 'charge',
        'fine', 'icon', 'popular', 'legend',
        'hot', 'character', 'lipid', 'levy',
        'symbol', 'big', 'assess', 'glyph']

In [9]:
dec14 = ['fox', 'screen', 'lining', 'spoon',
        'split', 'divide', 'fork', 'part',
        'badger', 'bug', 'hound', 'nag',
        'arena', 'field', 'dome', 'bowl']

In [14]:
optimizer(today, distance="complete", metric="max", iterations=500)

Guess: ('icon', 'legend', 'symbol', 'glyph')


Feedback: Input 4 if all 4 were correct, 3 if 3 were correct, 2 if 2 were correct, 0 if no feedback received, -1 if fail 4


495
Guess: ('popular', 'handsome', 'hot', 'big')


Feedback: Input 4 if all 4 were correct, 3 if 3 were correct, 2 if 2 were correct, 0 if no feedback received, -1 if fail 4


70
Guess: ('fine', 'assess', 'charge', 'levy')


Feedback: Input 4 if all 4 were correct, 3 if 3 were correct, 2 if 2 were correct, 0 if no feedback received, -1 if fail 4


1
Guess: ('hippo', 'lipid', 'character', 'in')


Feedback: Input 4 if all 4 were correct, 3 if 3 were correct, 2 if 2 were correct, 0 if no feedback received, -1 if fail 4


Solved!
Yay.


In [60]:
groups_t.head()

Unnamed: 0,Groups,Score
0,"[(icon, character, symbol, glyph), (handsome, ...",0.181978
1,"[(handsome, popular, hot, big), (icon, charact...",0.181978
35,"[(in, charge, lipid, assess), (icon, character...",0.181978
38,"[(charge, lipid, levy, assess), (icon, charact...",0.180525
4,"[(icon, legend, symbol, glyph), (handsome, pop...",0.173597


In [47]:
options_t[0:5]

[('icon', 'character', 'symbol', 'assess'),
 ('icon', 'character', 'symbol', 'hippo'),
 ('icon', 'character', 'symbol', 'popular'),
 ('icon', 'character', 'symbol', 'in'),
 ('icon', 'character', 'symbol', 'lipid')]

In [59]:
groups_t[groups_t['Groups'].apply(lambda x: any(sorted(item) in [sorted(t) for t in options_t] for item in x))]

Unnamed: 0,Groups,Score
4,"[(icon, legend, symbol, glyph), (handsome, pop...",0.173597
3,"[(legend, character, symbol, glyph), (handsome...",0.173597
2,"[(icon, legend, character, symbol), (handsome,...",0.173059
159,"[(in, fine, character, assess), (handsome, pop...",0.168559
123,"[(in, charge, character, assess), (handsome, p...",0.167978
...,...,...
489,"[(fine, icon, legend, hot), (in, charge, lipid...",0.099756
91,"[(popular, legend, character, big), (in, charg...",0.096373
84,"[(handsome, legend, character, big), (in, char...",0.096373
408,"[(icon, hot, character, glyph), (handsome, pop...",0.093658


In [None]:
any(sorted(item) in [sorted(t) for t in target_tuples] for item in x)

In [24]:
list(set(words_t) - set(('icon', 'character', 'symbol', 'glyph')))

['assess',
 'hippo',
 'popular',
 'in',
 'lipid',
 'big',
 'fine',
 'levy',
 'hot',
 'legend',
 'handsome',
 'charge']

In [15]:
list(combinations([('icon', 'character', 'symbol'), 'handsome', 'hippo'], 2))

[(('icon', 'character', 'symbol'), 'handsome'),
 (('icon', 'character', 'symbol'), 'hippo'),
 ('handsome', 'hippo')]