In [2]:
import json
import os
import numpy as np

In [3]:
# filepath on Ishan's Mac -> change if someone else uses this
file_path = "/Users/ishan//Desktop/cs224n/02.json"

if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)

        print(data)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
else:
    print("File does not exist or is empty.")

{'acrossmap': None, 'admin': False, 'answers': {'across': ['HERB', 'CROW', 'HAVE', 'AMOR', 'HOPE', 'TOLET', 'LIBERATED', 'INLET', 'ELEVATOR', 'LOIRE', 'RYDER', 'ASTERN', 'EASTER', 'IVAN', 'ASP', 'FLEECE', 'NANA', 'DELAYED', 'UNAGING', 'IRAN', 'GUARDS', 'NAY', 'TEST', 'ACCESS', 'TENREC', 'ETAPE', 'STELE', 'REASONER', 'TOROS', 'TELLSOVER', 'ADEPT', 'ETAL', 'LIVE', 'RODE', 'DENY', 'SLED'], 'down': ['HALER', 'EMILY', 'ROBED', 'BREVE', 'CHAT', 'ROTO', 'OPERATE', 'WED', 'HONORING', 'ALLINVAIN', 'VEER', 'ETTE', 'TILE', 'RAREFY', 'SECURE', 'TRENDS', 'ALEGAR', 'SEDUCE', 'ANNA', 'NAGY', 'ADIT', 'SERE', 'PLASTERED', 'ANTELOPE', 'ASSESS', 'ACCRETE', 'NEST', 'TOOLS', 'ANVIL', 'PEEVE', 'ERRED', 'STAR', 'TODO', 'ELAN', 'ALLY', 'TED']}, 'author': 'Martha J. DeWitt', 'autowrap': None, 'bbars': None, 'circles': None, 'clues': {'across': ['1. Fennel or sweet cicely', '5. Eat ___ (suffer humiliation)', '9. "To ___ and to Hold," Johnston novel', '13. Cupid', '14. Lange from Conn.', '15. House sign', '16. W

In [4]:
# utilizing Berkeley's crosswords struct with a few additional functions for what we need

class Crossword:
    def __init__(self, data):
        self.data = data
        self.across_clues = {}
        self.down_clues = {}
        self.clue_to_positions = {}
        self.solution_dict = {}
        self.clue_grid = None
        self.neighbors = {}

    def initialize_solution_map(self):
        # first do across
        clues = self.data['clues']['across']
        answers = self.data['answers']['across']
        for i, clue in enumerate(clues):
            period_idx = clue.find('.')
            num, rest = clue[:period_idx], clue[period_idx+1:]
            self.solution_dict[f"{num}A"] = answers[i]

        # now do down
        clues = self.data['clues']['down']
        answers = self.data['answers']['down']
        for i, clue in enumerate(clues):
            period_idx = clue.find('.')
            num, rest = clue[:period_idx], clue[period_idx+1:]
            self.solution_dict[f"{num}D"] = answers[i]


    def initialize_clues(self):
        """
        Take in dictionary representing crossword and fill in dictionaries that hole clue codes (i.e. 1a/3d/18a/etc) 
        and map to corresppnding clue.
        """
        for clue in self.data['clues']['across']:
            period_idx = clue.find('.')
            num, rest = clue[:period_idx], clue[period_idx+1:]
            self.across_clues[f"{num}A"] = rest

        for clue in self.data['clues']['down']:
            period_idx = clue.find('.')
            num, rest = clue[:period_idx], clue[period_idx+1:]
            self.down_clues[f"{num}D"] = rest

    def initialize_clue_positions_mapping(self):
        """
        Take clue dictionary from self.across_clues and self.down_clues in the form {'1A': clue, etc ...}, 
        build a dictionary that maps clue ID to coordinates in grid
        """
        # first do across
        for clue in self.across_clues:
            num = int(clue[:-1])
            answer_len = len(self.solution_dict[clue])
            start = list(self.data['gridnums']).index(num)
            row, col = start // 15, start % 15 
            # this is across, so now that we have a start index, add corresponding coord to map
            coords = []
            for i in range(answer_len):
                coords.append((row, col + i))
            self.clue_to_positions[clue] = coords

        # now do down
        for clue in self.down_clues:
            num = int(clue[:-1])
            answer_len = len(self.solution_dict[clue])
            start = list(self.data['gridnums']).index(num)
            row, col = start // 15, start % 15 # convert from 1D array index to grid coord
            # this is across, so now that we have a start index, add corresponding coord to map
            coords = []
            for i in range(answer_len):
                coords.append((row + i, col))
            self.clue_to_positions[clue] = coords
    

    def initialize_clue_grid(self):
        """
        Represent a grid in the form of each cell being filled into to show what clue it corresponds to.
        For example:
        grid = [[('1A, 1D'), ('1A, 2D')],
                [('2A, 1D'), ('2A, 2D')]]
        """

        grid = [
            [[None, None] for _ in range(15)] for _ in range(15)
        ]
        
        for clue in self.across_clues.keys():
            coords = self.clue_to_positions[clue]
            for (x, y) in coords:
                grid[x][y][0] = clue

        for clue in self.down_clues.keys():
            coords = self.clue_to_positions[clue]
            for (x, y) in coords:
                grid[x][y][1] = clue

        self.clue_grid = grid


    def initialize(self):
        self.initialize_clues()
        self.initialize_solution_map()
        self.initialize_clue_positions_mapping()
        self.initialize_clue_grid()



In [5]:
trial = Crossword(data)

In [6]:
trial.initialize()

In [7]:
solutions = trial.solution_dict
down_answers = {}
across_answers = {}
for item in solutions:
    if item[-1] == 'D':
        down_answers[item] = solutions[item]
    else:
        across_answers[item] = solutions[item]


In [8]:
#trial.across_clues
down_clues = trial.down_clues
across_clues = trial.across_clues

In [9]:
inputs = []

for item in across_clues:
    clue = across_clues[item]
    ans = across_answers[item]
    length = len(ans)
    input_text = str(clue) + ',' + ' ' + str(length) + ','
    inputs.append(input_text)

for item in down_clues:
    clue = down_clues[item]
    ans = down_answers[item]
    length = len(ans)
    input_text = str(clue) + ',' + ' ' + str(length) + ','
    inputs.append(input_text)



print(inputs)


    
    

[' Fennel or sweet cicely, 4,', ' Eat ___ (suffer humiliation), 4,', ' "To ___ and to Hold," Johnston novel, 4,', ' Cupid, 4,', ' Lange from Conn., 4,', ' House sign, 5,', ' What NOW wants women to be, 9,', ' Ocean arm, 5,', ' Follower of grain or freight, 8,', " Orleans's river, 5,", ' ___ Cup (golf prize), 5,', " Boatman's backward, 6,", ' March 26, 1978, 6,', ' Pavlov, 4,', ' Relative of a daboia, 3,', ' Defraud, 6,', ' Pram pusher, 4,', ' Put off, 7,', ' Describing eternal youth, 7,', " Pahlavi's country, 4,", ' Cerberus et al., 6,', ' Aye neutralizer, 3,', ' Put to the ___, 4,', ' Passageway, 6,', ' Madagascar mammal, 6,', ' Storehouse of a sort, 5,', ' Inscribed pillar, 5,', ' Newscaster, 8,', ' Bulls, in Barcelona, 5,', ' Repeats a report, 9,', ' Proficient, 5,', ' Abbr. often used on deeds, 4,', ' Kind of wire, 4,', ' Harassed, 4,', ' Abjure, 4,', ' Pung or monoski, 4,', " Item in a Czech's wallet, 5,", ' Girl in "Our Town", 5,', ' Togate, 5,', ' Longest modern musical note, 5,

In [23]:
across_keys = list(trial.across_clues.keys())
down_keys = list(trial.down_clues.keys())
all_keys = across_keys + down_keys
print(all_keys)




['1A', '5A', '9A', '13A', '14A', '15A', '16A', '18A', '19A', '20A', '21A', '22A', '25A', '28A', '31A', '34A', '35A', '36A', '38A', '40A', '41A', '43A', '44A', '45A', '46A', '48A', '53A', '55A', '58A', '59A', '60A', '61A', '62A', '63A', '64A', '65A', '1D', '2D', '3D', '4D', '5D', '6D', '7D', '8D', '9D', '10D', '11D', '12D', '15D', '17D', '23D', '24D', '26D', '27D', '29D', '30D', '31D', '32D', '33D', '37D', '39D', '42D', '47D', '49D', '50D', '51D', '52D', '53D', '54D', '56D', '57D', '59D']


In [50]:
# work on unique completions

import openai 

!pip install openai==0.28

openai.api_key = 'sk-proj-8oLvnNGJLnlgW4SQOoHwT3BlbkFJ8c24SWE59CoO4sTxlDC7'

with open('/Users/ishan/Desktop/cs224n/fine_tuned_model_name.txt', 'r') as f:
    fine_tuned_model = f.read().strip()

def generate_unique_completions(prompt, model, num_completions=5, max_tokens=50):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        n=num_completions,  
        stop=None,
        temperature=0.9,  
        top_p=0.9  
    )
    completions = set()
    for choice in response['choices']:
        completions.add(choice['message']['content'].strip())
        if len(completions) >= num_completions:
            break
    
    return list(completions)[:num_completions]  




candidates = {key: [] for key in all_keys}


for idx, prompt in enumerate(inputs):
    completions = generate_unique_completions(prompt, fine_tuned_model, num_completions=5)
    for completion in completions:
        candidates[all_keys[idx]].append(completion)

print(candidates)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'1A': ['HERB'], '5A': ['CROW'], '9A': ['HAVE'], '13A': ['AMOR'], '14A': ['ANNE', 'REPW', 'HANK', 'INES', 'VERA'], '15A': ['LEOIV', 'SCORP'], '16A': ['ASSERTIVE'], '18A': ['YEMEN', 'FJORD'], '19A': ['CARTELOT', 'CARTELLI'], '20A': ['TEDES', 'FROST', 'PEEDE', 'TEJAS', 'WABAS'], '21A': ['RYDER'], '22A': ['ASTERN'], '25A': ['EASTER'], '28A': ['IVAN'], '31A': ['ASP'], '34A': ['SHORTC', 'FLIMFL', 'BILKED'], '35A': ['MUMS', 'YANK'], '36A': ['DEFERRE', 'DEFERRED'], '38A': ['AGELESS'], '40A': ['IRAN'], '41A': ['HOUNDS', 'HELLDO'], '43A': ['NAY', 'NAK'], '44A': ['TEST', 'SWORD', 'ROUE'], '45A': ['THROAT'], '46A': ['LEMURS'], '48A': ['ABBEY', 'CROCK', 'CELLAR', 'FATCA'], '53A': ['STELE'], '55A': ['ANCHORMAN', 'ANCHORER'], '58A': ['TOROS'], '59A': ['CONFIRMS', 'ECHOESIT'], '60A': ['ADEPT', 'ABLES'], '61A': ['TENR', 'ETAL', 'FIPS', 'LISA'], '62A': ['BARB', 'LOOP'], '63A': ['RIDOF', 'RODE', 'RANAT'], '64A': ['FORGO', 'FOGO'], '65A': ['ECHO', 'ITEM', 'SLED', 'TOYS', 'ALPINE'], '1D': ['KRONE', 'KORUN

In [30]:
# get bi-encoder

from sentence_transformers import SentenceTransformer, util
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
bi_encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')



In [51]:
print(candidates)

{'1A': ['HERB'], '5A': ['CROW'], '9A': ['HAVE'], '13A': ['AMOR'], '14A': ['ANNE', 'REPW', 'HANK', 'INES', 'VERA'], '15A': ['LEOIV', 'SCORP'], '16A': ['ASSERTIVE'], '18A': ['YEMEN', 'FJORD'], '19A': ['CARTELOT', 'CARTELLI'], '20A': ['TEDES', 'FROST', 'PEEDE', 'TEJAS', 'WABAS'], '21A': ['RYDER'], '22A': ['ASTERN'], '25A': ['EASTER'], '28A': ['IVAN'], '31A': ['ASP'], '34A': ['SHORTC', 'FLIMFL', 'BILKED'], '35A': ['MUMS', 'YANK'], '36A': ['DEFERRE', 'DEFERRED'], '38A': ['AGELESS'], '40A': ['IRAN'], '41A': ['HOUNDS', 'HELLDO'], '43A': ['NAY', 'NAK'], '44A': ['TEST', 'SWORD', 'ROUE'], '45A': ['THROAT'], '46A': ['LEMURS'], '48A': ['ABBEY', 'CROCK', 'CELLAR', 'FATCA'], '53A': ['STELE'], '55A': ['ANCHORMAN', 'ANCHORER'], '58A': ['TOROS'], '59A': ['CONFIRMS', 'ECHOESIT'], '60A': ['ADEPT', 'ABLES'], '61A': ['TENR', 'ETAL', 'FIPS', 'LISA'], '62A': ['BARB', 'LOOP'], '63A': ['RIDOF', 'RODE', 'RANAT'], '64A': ['FORGO', 'FOGO'], '65A': ['ECHO', 'ITEM', 'SLED', 'TOYS', 'ALPINE'], '1D': ['KRONE', 'KORUN

In [44]:

# I downloaded sentence_transformers and then this should work

def biencoder(clue, answers):

    def encode_texts(bi_encoder, texts):
        return bi_encoder.encode(texts)
    
    def calculate_similarity(clue_embedding, answer_embeddings):
        return util.dot_score(clue_embedding, answer_embeddings)[0].cpu().numpy()
    
    def softmax(x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()
    
    clue_embedding = encode_texts(bi_encoder, [clue])[0]
    answer_embeddings = encode_texts(bi_encoder, answers)
    
    similarity_scores = calculate_similarity(clue_embedding, answer_embeddings)
    probabilities = softmax(similarity_scores)
    
    answer_probabilities = list(zip(answers, probabilities))
    
    
    for answer, probability in answer_probabilities:
        print(f"Answer: {answer}, Probability: {probability:.4f}")

confidence_ratings = {}

# Iterate over the first 5 items in all_clues_actual and call biencoder with each clue and its candidates
for i, (key, clue) in enumerate(all_clues_actual.items()):
    if i >= 5:
        break
    candidate_answers = candidates.get(key, [])
    if candidate_answers: 
        confidence_ratings[key] = biencoder(clue, candidate_answers)

# pass this rating thing to bi-encoder
print(confidence_ratings)
   

    


Answer: HERB, Probability: 1.0000
Answer: CROW, Probability: 1.0000
Answer: HAVE, Probability: 1.0000
Answer: AMOR, Probability: 1.0000
Answer: MELA, Probability: 0.0000
Answer: EELI, Probability: 0.0000
Answer: TOMS, Probability: 0.1278
Answer: ALAN, Probability: 0.8721
{'1A': None, '5A': None, '9A': None, '13A': None, '14A': None}


In [52]:
all_clues = []
for (key, val) in trial.across_clues.items():
    all_clues.append(val)

for (key, val) in trial.down_clues.items():
    all_clues.append(val)

all_clues_actual = {}
for i in range(len(all_clues)):
    all_clues_actual[all_keys[i]] = all_clues[i]

print(all_clues_actual)

{'1A': ' Fennel or sweet cicely', '5A': ' Eat ___ (suffer humiliation)', '9A': ' "To ___ and to Hold," Johnston novel', '13A': ' Cupid', '14A': ' Lange from Conn.', '15A': ' House sign', '16A': ' What NOW wants women to be', '18A': ' Ocean arm', '19A': ' Follower of grain or freight', '20A': " Orleans's river", '21A': ' ___ Cup (golf prize)', '22A': " Boatman's backward", '25A': ' March 26, 1978', '28A': ' Pavlov', '31A': ' Relative of a daboia', '34A': ' Defraud', '35A': ' Pram pusher', '36A': ' Put off', '38A': ' Describing eternal youth', '40A': " Pahlavi's country", '41A': ' Cerberus et al.', '43A': ' Aye neutralizer', '44A': ' Put to the ___', '45A': ' Passageway', '46A': ' Madagascar mammal', '48A': ' Storehouse of a sort', '53A': ' Inscribed pillar', '55A': ' Newscaster', '58A': ' Bulls, in Barcelona', '59A': ' Repeats a report', '60A': ' Proficient', '61A': ' Abbr. often used on deeds', '62A': ' Kind of wire', '63A': ' Harassed', '64A': ' Abjure', '65A': ' Pung or monoski', '1D

In [53]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

bi_encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def biencoder(clue, answers):
    def encode_texts(bi_encoder, texts):
        return bi_encoder.encode(texts)
    
    def calculate_similarity(clue_embedding, answer_embeddings):
        return util.dot_score(clue_embedding, answer_embeddings)[0].cpu().numpy()
    
    def softmax(x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()
    
    clue_embedding = encode_texts(bi_encoder, [clue])[0]
    answer_embeddings = encode_texts(bi_encoder, answers)
    
    similarity_scores = calculate_similarity(clue_embedding, answer_embeddings)
    probabilities = softmax(similarity_scores)
    
    answer_probabilities = {answer: prob for answer, prob in zip(answers, probabilities)}
    
    return answer_probabilities


confidence_ratings = {}


for i, (key, clue) in enumerate(all_clues_actual.items()):
    candidate_answers = candidates.get(key, [])
    if candidate_answers:  
        confidence_ratings[key] = biencoder(clue, candidate_answers)

print(confidence_ratings)


{'1A': {'HERB': 1.0}, '5A': {'CROW': 1.0}, '9A': {'HAVE': 1.0}, '13A': {'AMOR': 1.0}, '14A': {'ANNE': 0.48943096, 'REPW': 0.059694137, 'HANK': 0.09849533, 'INES': 0.0008574594, 'VERA': 0.35152215}, '15A': {'LEOIV': 0.36919644, 'SCORP': 0.6308035}, '16A': {'ASSERTIVE': 1.0}, '18A': {'YEMEN': 1.954321e-06, 'FJORD': 0.9999981}, '19A': {'CARTELOT': 0.7712528, 'CARTELLI': 0.22874717}, '20A': {'TEDES': 0.0004267744, 'FROST': 1.9153504e-07, 'PEEDE': 0.0014748239, 'TEJAS': 0.8140384, 'WABAS': 0.18405981}, '21A': {'RYDER': 1.0}, '22A': {'ASTERN': 1.0}, '25A': {'EASTER': 1.0}, '28A': {'IVAN': 1.0}, '31A': {'ASP': 1.0}, '34A': {'SHORTC': 2.4104588e-06, 'FLIMFL': 0.26476982, 'BILKED': 0.7352277}, '35A': {'MUMS': 0.00059641973, 'YANK': 0.9994036}, '36A': {'DEFERRE': 0.16892241, 'DEFERRED': 0.8310776}, '38A': {'AGELESS': 1.0}, '40A': {'IRAN': 1.0}, '41A': {'HOUNDS': 0.9979923, 'HELLDO': 0.002007778}, '43A': {'NAY': 0.9999647, 'NAK': 3.529474e-05}, '44A': {'TEST': 0.034744713, 'SWORD': 0.94785655, 'R