In [1]:
%%capture
# Use the %pip magic: it installs into the environment of the running kernel
# and, unlike `!{sys.executable} -m pip ...`, does not depend on `sys` having
# been imported before this first cell runs (a failure %%capture would hide).
%pip install openai
%pip install litellm

In [1]:
import getpass
import os
import re

from openai import OpenAI

# Look the key up with .get(): indexing os.environ directly raises KeyError
# when the variable is unset, which is exactly the case this guard is for.
# When it is missing (or empty), prompt for it instead of embedding a secret
# in the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")

# Model name shared by the labeling calls in this notebook.
MODEL = "gpt-4o-mini"

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

Open keywords data:

In [2]:
def read_file_to_list(file_path):
    """Return the lines of a text file as a list of strings.

    The whole file is read at once and split on line boundaries, so the
    returned strings carry no trailing newline characters.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line (empty list for an empty file).
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents.splitlines()

# Load the keywords and discard the first line of the file
# (presumably a header row — TODO confirm against keywords.txt).
keywords_list = read_file_to_list('20qs-data/keywords.txt')[1:]
print(f"Number of keywords in data: {len(keywords_list)}")

Number of keywords in data: 2046


## Labeling Functions

In [4]:
from collections import Counter
from tqdm import tqdm

def get_answer(label, keyword, model=None):
    """Ask the chat model one yes/no question about a single keyword.

    Args:
        label: Mapping with a 'question' key holding the question text.
        keyword: The keyword the question is asked about.
        model: Optional model name. Defaults to the notebook-level MODEL
            constant so the model is configured in one place.

    Returns:
        The raw text of the first completion choice, or an empty string when
        the API returns no text content, so callers can always treat the
        result as a string.
    """
    prompt = f"Keyword: {keyword}\n\nQuestion: {label['question']}\n\n"
    prompt += "Please provide an answer to the question based on the keyword. ONLY ANSWER Yes OR No. IF UNSURE, CHOOSE MOST LIKELY ANSWER FROM Yes OR No. "

    response = client.chat.completions.create(
        model=model or MODEL,
        messages=[
            {"role": "user", "content": prompt}
        ],
        # The reply is expected to be a single word, so keep the budget small.
        max_tokens=10
    )

    # The message content is nullable in the API response; normalise it to ''
    # so that string methods applied by the caller cannot fail on None.
    return response.choices[0].message.content or ""


def _classify_answer(answer):
    """Reduce a raw model reply to 'yes', 'no', or 'err'.

    The first standalone word "yes" or "no" (case-insensitive) decides the
    result. Matching whole words instead of substrings keeps replies such as
    "Unknown" or "I cannot say" from being counted as "no". A missing reply
    (None) or one containing neither word maps to 'err'.
    """
    match = re.search(r"\b(yes|no)\b", (answer or "").lower())
    return match.group(1) if match else 'err'


def label_keywords(label, keywords, labels, results):
    """Label every keyword with a yes/no answer to one question.

    Each keyword is sent to get_answer(); the reply is normalised to 'yes',
    'no' or 'err' and stored under the label's tag. Summary statistics for the
    label are printed and recorded.

    Args:
        label: Mapping with 'tag' (key to store the answer under) and
            'question' (text passed on to get_answer and printed).
        keywords: Iterable of keywords to label.
        labels: Dict mapping keyword -> {tag: answer}. Updated in place.
        results: Dict mapping tag -> summary statistics. Updated in place.

    Returns:
        The same (labels, results) objects that were passed in.
    """
    label_tag = label['tag']
    yes_no_counts = Counter()

    for keyword in tqdm(keywords, desc = 'label_keywords', position = 0, leave = True, ncols = 100):
        answer = _classify_answer(get_answer(label, keyword))
        yes_no_counts[answer] += 1
        # Add to the keyword's existing labels, creating the entry if needed.
        labels.setdefault(keyword, {})[label_tag] = answer

    # Calculate percentages (error answers are part of the denominator)
    total_answers = sum(yes_no_counts.values())
    yes_percentage = (yes_no_counts['yes'] / total_answers) * 100 if total_answers > 0 else 0
    no_percentage = (yes_no_counts['no'] / total_answers) * 100 if total_answers > 0 else 0
    err_percentage = (yes_no_counts['err'] / total_answers) * 100 if total_answers > 0 else 0

    # Print results
    print(label['question'])
    print(f"Yes answers: {yes_no_counts['yes']}, percentage: {yes_percentage:.2f}%")
    print(f"No answers: {yes_no_counts['no']}, percentage: {no_percentage:.2f}%")
    print(f"Error answers: {yes_no_counts['err']}, percentage: {err_percentage:.2f}%")

    # Save results
    results[label_tag] = {
        'yes_percentage': yes_percentage,
        'no_percentage': no_percentage,
        'yes_counts': yes_no_counts['yes'],
        'no_counts': yes_no_counts['no'],
        'err_counts': yes_no_counts['err']
    }

    return labels, results

Load the previously saved keyword labels and labeling results so that new labels can be added to them:

In [4]:
import json

def read_jsonl_and_transform(file_path):
    """Load a JSON Lines file into a dict keyed by each record's 'keyword'.

    Every non-blank line is parsed as one JSON object. Its 'keyword' entry is
    removed from the object and used as the key; the remaining entries become
    the value. If a keyword occurs more than once, the last record wins.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        Dict mapping keyword -> dict of the record's remaining fields.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no 'keyword' entry.
    """
    result_dict = {}

    with open(file_path, 'r') as file:
        for line in file:
            # Blank lines (e.g. from hand edits or a trailing newline) are not
            # records; skip them instead of failing in json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            keyword = record.pop('keyword')
            result_dict[keyword] = record

    return result_dict

def read_json_to_dict(file_path):
    """Parse a file containing a single JSON document and return the result.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    with open(file_path, 'r') as handle:
        return json.loads(handle.read())

# Restore both pieces of saved state: per-keyword labels (JSON Lines) and the
# per-tag summary, which is one JSON document despite its .jsonl extension.
keyword_labels, results = (
    read_jsonl_and_transform('20qs-data/labeled_keywords.jsonl'),
    read_json_to_dict('20qs-data/labeling_results.jsonl'),
)

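OR start without any labels (run this cell *instead of* the previous one — running both discards the labels that were just loaded):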

In [5]:
# Start from scratch: two separate empty dicts. Running this after the
# loading cell rebinds both names and so drops whatever was loaded.
keyword_labels, results = {}, {}

Label places/things:

In [7]:
# First split: ask for every keyword whether it is a place.
places_label = dict(tag='place', question="Is it a place?")
keyword_labels, results = label_keywords(
    places_label, keywords_list, keyword_labels, results
)

label_keywords: 100%|███████████████████████████████████████████| 2046/2046 [16:02<00:00,  2.13it/s]

Is it a place?
Yes answers: 710, percentage: 34.70%
No answers: 1336, percentage: 65.30%
Error answers: 0, percentage: 0.00%





In [8]:
# Partition the keywords by their 'place' answer. Keywords whose answer is
# neither 'yes' nor 'no' (i.e. 'err') end up in neither list.
keyword_places = []
keyword_things = []
for keyword in keywords_list:
    place_answer = keyword_labels[keyword]['place']
    if place_answer == 'yes':
        keyword_places.append(keyword)
    elif place_answer == 'no':
        keyword_things.append(keyword)

print(f"Number of keywords labeled as things: {len(keyword_things)}.")

Number of keywords labeled as things: 1336.


## Things Branch

Labels to apply:

In [11]:
# Questions applied to the "things" branch, written as (tag, question) pairs
# and expanded into the {'tag': ..., 'question': ...} dicts label_keywords uses.
labels = [
    {'tag': tag, 'question': question}
    for tag, question in (
        ('scientific research', "Is it related to scientific research?"),
        ('food, drinks, cooking', "Is it broadly related to food, drinks or cooking?"),
        ('handheld', "Is it something that can be held in someone's hand?"),
        ('food', "Is it a food?"),
        ('furniture', "Is it furniture?"),
        ('man-made', "Is it man-made?"),
        ('industry, manufacturing', "Is it related to industrial production or manufacturing?"),
        ('agriculture', "Is it related to agricultural production?"),
        ('arts, media', "Is it broadly related to arts or media?"),
        ('safety', "Is it related to safety?"),
        ('medicine', "Is it broadly related to medicine?"),
        ('transportation, vehicles', "Is it related to transportation or vehicles?"),
        ('electronics, technology', "Is it related to electronics or technology?"),
        ('clothing, accessories', "Is it related to clothing or accessories?"),
        ('natural resource, material', "Is it a natural resource or material?"),
        ('natural phenomenon', "Is it a natural phenomenon or natural feature?"),
        ('living', "Is it a living thing?"),
        ('animal', "Is it an animal?"),
        ('plant', "Is it a plant?"),
        ('cleaning, hygiene', "Is it related to cleaning or hygiene?"),
        ('entertainment, sports', "Is it broadly related to entertainment or sports?"),
    )
]

In [14]:
# Apply every question to the keywords in the "things" branch; each pass
# issues one get_answer call per keyword.
for label in tqdm(labels, desc='labels', ncols=150):
    keyword_labels, results = label_keywords(
        label, keyword_things, keyword_labels, results
    )

labels:   0%|                                                                                                                  | 0/21 [00:00<?, ?it/s]

label_keywords:   0%|                                                      | 0/1336 [00:03<?, ?it/s]
labels:   0%|                                                                                                                  | 0/21 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [16]:
# Partition the "things" by their 'scientific research' answer; 'err'
# answers land in neither list.
keyword_science = []
keyword_not_science = []
for keyword in keyword_things:
    science_answer = keyword_labels[keyword]['scientific research']
    if science_answer == 'yes':
        keyword_science.append(keyword)
    elif science_answer == 'no':
        keyword_not_science.append(keyword)

# Preview only the first 100 entries to keep the output readable.
print(keyword_not_science[:100])


['accent chair', 'adjustable bench', 'Advertisement', 'air handler', 'Alarm system', 'amazon echo', 'analog stick', 'ankle bracelet', 'aperol spritz', 'Apple pie', 'Aprons', 'archery target', 'arrival board', 'atm', 'attic ladder', 'audio guide', 'autograph', 'backgammon board', 'baggage conveyor', 'Baguette', 'ballpoint pen', 'bandana', 'bank statement', 'bar towel', 'barbell collar', 'Barber Chair', 'Bath Mat', 'beach chair', 'beach umbrella', 'Beanbag', 'Bed frame', 'bedspread', 'beer coaster', 'beer stein', 'beer tap', 'Blinds', 'Bobby Pins', 'Bollards', 'book cover', 'bookbag', 'bookend', 'Bookends', 'bracelet', 'bread basket', 'Bread knife', 'bread maker', 'Bread pudding', 'Brewery merchandise', 'Briefcase', 'brochure stand', 'Brownies', 'buckle', 'Bumper sticker', 'bunk bed', 'butter knife', 'button', 'cabinet', 'cable tie', 'cable tray', 'camp stove', 'candle holder', 'canning jar', 'cannoli', 'Cash Register', 'casket', 'Cat carrier', 'centerpiece', 'chair cushion', 'Champagne 

## Places Branch
- TODO

## Save as JSONL

In [11]:
def make_list_from_dict(original_dict):
    """Flatten a {keyword: fields} mapping into a list of records.

    Each record starts with a "keyword" entry holding the outer key and is
    then updated with the inner fields, in the order the outer mapping yields
    its items.

    Args:
        original_dict: Mapping of keyword -> dict of that keyword's fields.

    Returns:
        A list with one new dict per keyword.
    """
    def _as_record(keyword, fields):
        # "keyword" goes first; the inner fields follow it.
        record = {"keyword": keyword}
        record.update(fields)
        return record

    return [_as_record(keyword, fields) for keyword, fields in original_dict.items()]

# Flatten keyword_labels into one record per keyword, ready to be written out.
labeled_keywords_list = make_list_from_dict(keyword_labels)

In [12]:
# Per-keyword labels: one JSON object per line (JSON Lines).
with open("20qs-data/labeled_keywords.jsonl", "w") as labels_file:
    for record in labeled_keywords_list:
        json.dump(record, labels_file)
        labels_file.write('\n')

# Per-tag summary: written as a single JSON document even though the file
# name ends in .jsonl.
with open("20qs-data/labeling_results.jsonl", "w") as results_file:
    json.dump(results, results_file)