In [1]:
import os
from openai import OpenAI

if not os.environ["OPENAI_API_KEY"]: 
    os.environ["OPENAI_API_KEY"]= '<REDACTED>'

MODEL = "gpt-4o-mini"

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [2]:
import json

def read_jsonl_and_transform(file_path):
    result_dict = {}

    with open(file_path, 'r') as file:
        for line in file:
            record = json.loads(line)
            keyword = record.pop('keyword')
            result_dict[keyword] = record

    return result_dict

def read_json_to_dict(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

keyword_labels, results = read_jsonl_and_transform('20qs-data/labeled_keywords.jsonl'), read_json_to_dict('20qs-data/labeling_results.jsonl')

In [4]:
with open('20qs-data/keywords_data/similar_keywords_cleaned.txt', 'r') as file:
    lines = file.readlines()

once_similar_keywords = [line.strip() for line in lines]
print(once_similar_keywords[:50])

['ATV', 'Acetone', 'Acorn', 'Acrylic panel', 'Acrylic sheet', 'Action camera', 'Adhesive', 'Adrenal gland', 'Agar-agar', 'Air Conditioning Unit', 'Air compressor', 'Air horn', 'Air purifier', 'Air purifiers', 'Airbrush', 'Almond', 'Almond Extract', 'Almond butter', 'Alternator', 'Aluminium sheet', 'Aluminum foil', 'Amber', 'American Robin', 'Amphibian', 'Amplifier', 'Anemometer', 'Ankle Weights', 'Ankle bracelet', 'Ant', 'Antenna', 'Antique vase', 'Anxiety medication', 'Appetizer', 'Apple cider', 'Apple tart', 'Appliances', 'Arbor', 'Area rug', 'Assembly line', 'Athletic shoes', 'Audio interface', 'Augmented Reality Glasses', 'B B gun ammo', 'Backpack', 'Backpacks', 'Bacon', 'Bacteria', 'Badge holder', 'Badge reel', 'Baguette']


In [5]:
things_list = [word for word in keyword_labels if keyword_labels[word]['place'] == 'no']

In [3]:
def make_similar_keyword(keyword):
    
    sys_prompt = ("You are an assistant tasked with generating a keyword similar to a given keyword for a game of 20 Questions. "
                  "The similar keyword should belong to the same general category but must not be a synonym or extremely similar. "
                  "Here are some guiding examples:\n"
                  "Example 1:\n Keyword: Mechanical pencil\n Similar Keyword: Ballpoint pen\n"
                  "Example 2:\n Keyword: Surgical mask\n Similar Keyword: Forceps\n"
                  "Example 3:\n Keyword: Honda Civic\n Similar Keyword: Toyota Highlander\n"
                  "Example 4:\n Keyword: Chocolate Mousse\n Similar Keyword: Vanilla cake\n"
                  "Only output the similar keyword and nothing else.\n")
    user_msg = f"Keyword:{keyword}\nSimilar Keyword:"
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_msg}
        ],
        max_tokens=10
    )
    response = response.choices[0].message.content

    return response

In [8]:
from tqdm import tqdm
import random

similar_keywords = []
for keyword in tqdm(random.choices(once_similar_keywords, k = 600), desc = 'keywords generated'):
    similar_keyword = make_similar_keyword(keyword)
    similar_keywords.append(similar_keyword)    

keywords generated: 100%|██████████| 600/600 [04:31<00:00,  2.21it/s]


In [9]:
with open('20qs-data/keywords_data/keywords.txt', 'r') as file:
    original_words = file.readlines()

original_keywords = [line.strip() for line in original_words]

with open('20qs-data/keywords_data/twice_removed_similar_keywords.txt', 'r') as file:
    round_1 = file.readlines()
    
round_1_words = [line.strip() for line in original_words]

similar_keywords = list(set(similar_keywords).difference(set(original_keywords)).difference(set(once_similar_keywords).difference(round_1_words)))
print(similar_keywords)

['Toggle bolt', 'Serrano pepper', 'Fabric sofa', 'ID badge holder', 'Plaster cast', 'Rain gutter', 'Curling stone', 'Drumsticks', 'Hiker', 'Sputum', 'Boston fern', 'amber fossil', 'Hair tie', 'Oregano', 'Aluminum', 'Geothermal energy', 'Workbench', 'Tennis net', 'Smart television', 'Fabric loveseat', 'Mosaic lamp', 'Flat iron', 'Treehouse', 'Goose', 'Car wax', 'Seashell', 'Saxophone', 'Medical record', 'Paper plate', 'Elastic bandage', 'Crowd barrier', 'Craft adhesive', 'Neon sign', 'Gel', 'emerald', 'gummy worms', 'Apple sauce', 'Scrunchies', 'drivetrain', 'Fox', 'Nail', 'Greek yogurt container', 'Mongoose', 'Cottage pie', 'Airsoft pellets', 'Northern Cardinal', 'Motorhome', 'Bathtub Mat', 'Partition wall', 'Hydrangea shrub', 'Composting container', 'Loofah', 'Serving platters', 'Pelican', 'Bronze statue', 'Insecticide', 'License plate number', 'Hamster', 'Capo', 'Smart speakers', 'Mineral water', 'Phone case', 'label', 'Vase Arrangements', 'Gutter', 'dried apricot', 'Backpack cooler'

In [10]:
print(len(similar_keywords))

348


In [11]:
with open('20qs-data/twice_removed_more_similar_keywords.txt', 'w') as file:
    for item in similar_keywords:
        file.write(f"{item}\n")