In [1]:
import gensim.downloader

# Pretrained word vectors fetched by name through gensim's downloader API;
# `w2v_model` is indexed by token elsewhere in this notebook.
w2v_model = gensim.downloader.load('word2vec-google-news-300')


In [2]:
import numpy as np

def word2vec(word):
    """Return the ``w2v_model`` vector for the last token of ``word``.

    ``word`` is lower-cased and split on whitespace; only the final token is
    looked up, so a multi-word entry is represented by its last word.

    If the token is missing from ``w2v_model`` -- or ``word`` is empty or
    whitespace-only and therefore has no tokens -- a message is printed and
    the vector for ``"hello"`` is returned as a placeholder.

    NOTE(review): the lookup key is always lower-case; confirm that matches
    the casing of the model's vocabulary.
    """
    tokens = word.lower().split()
    try:
        return w2v_model[tokens[-1]]
    except (KeyError, IndexError):
        # KeyError: token not in the vocabulary.
        # IndexError: no tokens at all (empty / whitespace-only input).
        print(f"Word {word} not found in word2vec")
        return w2v_model["hello"]
    
def similarity(vec1: np.ndarray, vec2: np.ndarray):
    """Cosine similarity of two vectors: dot(vec1, vec2) / (|vec1| * |vec2|)."""
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return vec1.dot(vec2) / (norm1 * norm2)

In [3]:
import torch
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

# Checkpoint identifier shared by the tokenizer and the encoder below.
model_name = "thenlper/gte-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentenceformer = AutoModel.from_pretrained(model_name)
# Put the encoder in evaluation mode; it is only used to produce embeddings here.
sentenceformer.eval()

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean of the hidden states along dim 1, counting only unmasked positions.

    Positions where ``attention_mask`` is zero are filled with 0.0 before
    summing, and the sum is divided by the number of non-zero mask entries
    per row.
    """
    keep = attention_mask[..., None].bool()
    masked_hidden = last_hidden_states.masked_fill(~keep, 0.0)
    token_counts = attention_mask.sum(dim=1)[..., None]
    return masked_hidden.sum(dim=1) / token_counts

def word2vec(word):
    """Embed ``word`` with the ``sentenceformer`` encoder and return a numpy array.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the encoder, and the last hidden states are mean-pooled with
    ``average_pool`` using the attention mask. The first row of the pooled
    batch is returned, i.e. the embedding of the single input.
    """
    encoded = tokenizer(word, max_length=512, padding=True, truncation=True, return_tensors='pt')

    # Inference only: disable autograd so no graph is built for the forward pass.
    with torch.no_grad():
        model_output = sentenceformer(**encoded)
        embedding = average_pool(model_output.last_hidden_state, encoded['attention_mask'])

    # detach() is kept as a cheap safeguard before converting to numpy.
    return embedding.detach().numpy()[0]

def similarity(vec1: np.ndarray, vec2: np.ndarray):
    """Return the cosine of the angle between ``vec1`` and ``vec2``."""
    # Product of the two Euclidean norms normalises the raw dot product.
    magnitudes = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return vec1.dot(vec2) / magnitudes


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from simulator import NYTConnections

import numpy as np
import networkx as nx

from itertools import combinations
from queue import PriorityQueue



def solve(game: "NYTConnections"):
    """Play a Connections game by guessing the best-scoring groups of four words.

    Every word in ``game.starting`` is embedded with ``word2vec`` and every
    unordered pair is scored with ``similarity``. Each possible group of four
    words then gets a score:

        score = within - without

    where ``within`` is the sum of the six pair similarities inside the group
    and ``without`` is the sum of the *squared* similarities of every pair
    with exactly one word inside the group.

    Groups are guessed from highest to lowest score. Each guess is used once,
    whether right or wrong. After a correct guess, every remaining candidate
    sharing a word with it is discarded. Guessing stops when all entries of
    ``game.solved`` are truthy, or when no candidates are left.

    Returns ``game.mistakes_remaining``.
    """
    vecs = {word: word2vec(word) for word in game.starting}
    words = list(vecs)

    # Pairwise similarity keyed by the unordered pair of words.
    weight = {
        frozenset((first, second)): similarity(vecs[first], vecs[second])
        for first, second in combinations(words, 2)
    }

    # Score EVERY SINGLE group of four words.
    candidates = []
    for group in combinations(words, 4):
        members = set(group)
        outsiders = [word for word in words if word not in members]

        within = sum(weight[frozenset(pair)] for pair in combinations(group, 2))
        without = sum(
            weight[frozenset((inside, outside))] ** 2
            for inside in group
            for outside in outsiders
        )

        candidates.append((within - without, members, within, without))

    # Best score first. A sorted list stays correctly ordered when entries are
    # filtered out, unlike a heap whose backing list is edited in place.
    candidates.sort(key=lambda candidate: candidate[0], reverse=True)

    while not all(game.solved.values()):
        if not candidates:
            # Every candidate has been tried or ruled out; nothing left to guess.
            break

        score, guess, within, without = candidates.pop(0)
        correct = game.guess(guess)

        print(f"Guessed {guess}: {correct}, score: {score}, within: {within}, without: {without}")

        # if correct, drop all other candidates that reuse any of the guessed words
        if correct:
            candidates = [
                candidate for candidate in candidates
                if not guess.intersection(candidate[1])
            ]

    return game.mistakes_remaining

In [5]:
from simulator import NYTConnections
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def solve(game: "NYTConnections", n_clusters: int = 10, random_state=None):
    """Cluster the game's words with k-means and print the resulting groups.

    Each word in ``game.starting`` is embedded with ``word2vec`` and the
    vectors are clustered with ``KMeans``. The clusters are printed as a dict
    mapping cluster label to the list of words assigned to it.

    Args:
        game: object exposing the puzzle words as ``game.starting``.
        n_clusters: requested number of clusters; capped at the number of
            words, since there cannot be more clusters than samples.
        random_state: forwarded to ``KMeans``; pass an int for repeatable
            clusterings.

    Returns:
        None. No guesses are made against ``game``.
    """
    words = list(game.starting)
    if not words:
        print({})
        return

    # One vector per entry of `words`, so label i always belongs to words[i].
    vecslist = [word2vec(word) for word in words]

    clustering = KMeans(
        n_clusters=min(n_clusters, len(words)),
        n_init='auto',
        random_state=random_state,
    )
    clustering.fit(vecslist)

    clusters = {}
    for word, label in zip(words, clustering.labels_):
        clusters.setdefault(label, []).append(word)

    print(clusters)


In [122]:
# Value returned by solve() for each game played, in play order.
mistake_counts = []

# Puzzle numbers to run; range(204, 205) covers just game 204.
for i in range(204, 205):
    try:
        game = NYTConnections(i)
    except ValueError:
        # Stop at the first puzzle the simulator refuses to build
        # (presumably an unknown puzzle number -- confirm against simulator).
        break
    mistakes_remaining = solve(game)

    mistake_counts.append(mistakes_remaining)
    print(f"Game {i}: {mistakes_remaining} mistakes remaining")
    # break


# avg = sum(mistake_counts) / len(mistake_counts)
# win_rate = sum([1 for count in mistake_counts if count >= 0]) / len(mistake_counts)
# print(f"Average mistakes remaining: {avg} for {len(mistake_counts)} games")
# print(f"Win rate: {win_rate}")
# print(mistake_counts)




{3: ['SWIFT', 'DASH', 'LARK', 'DART', 'BUMPER', 'JAY', 'HANCOCK'], 9: ['HOLIDAY'], 0: ['CARDINAL'], 2: ['HOOD'], 1: ['ZIP'], 6: ['MONK'], 8: ['TRUNK'], 4: ['PARKER'], 5: ['TIRE'], 7: ['BOLT']}
Game 200: None mistakes remaining
