In [1]:
import numpy as np
import openai
from openai import OpenAI

import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import time
import pickle
import json
from sklearn.linear_model import LinearRegression
from sklearn.metrics import precision_recall_fscore_support, f1_score, hamming_loss


# Data

In [2]:
# load list with categories of interest
filename = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\data\categories_of_interest.txt"
with open(filename, 'r') as file:
    cats = json.load(file)

# load the dictionary to map category column names to gpt names
filename = r'C:\\Users\\conix\\Dropbox\\FNRS project taxonomy\\methods in taxonomy\\data\\category_names_dict.txt'
with open(filename, 'r') as file:
    category_names = json.load(file)

# descriptions of the various categories, used in the first try with gpt4omini
filename = r'C:\\Users\\conix\\Dropbox\\FNRS project taxonomy\\methods in taxonomy\\data\\methods_description_gpt4omini_first_try.txt'
with open(filename, 'r') as file:
    methods_old = json.load(file)

# descriptions of the various categories, used in the second try with gpt4omini
filename = r'C:\\Users\\conix\\Dropbox\\FNRS project taxonomy\\methods in taxonomy\\data\\methods_description_gpt4omini_second_try.txt'
with open(filename, 'r') as file:
    methods = json.load(file)

# hierarchical classification of the categories
filename = r"C:\Users\conix\Dropbox\FNRS project taxonomy\methods in taxonomy\data\classification_categories.txt"
with open(filename, "r") as file:
    classif = json.load(file)

In [3]:
# load data from lr

with open("methods_paper_files/upload_data_lr.pkl", "rb") as f:
    data = pickle.load(f)

df = data['og_data']
test_idx = data['test data']['test_idx']
y_test = data['test data']['y_test']
X_test = data['test data']['X_test']

# limit to test data
df = df.iloc[test_idx]

# just make sure the data is correct
assert df[cats].equals(y_test)
assert X_test.equals(df['displayed_text'])
assert len(df) == len(df['id'].unique())


# Estimate price

In [4]:
def estimate_price(input_cost, output_cost, num_par, tokens_par, tokens_query, cat):
    
    batch_discount = 0.5
    cost = (((tokens_par + tokens_query) * cat * num_par * batch_discount) * input_cost +
        (num_par * cat * 1 * output_cost))

    print(f"Price for {num_par} samples for {cat} categories: {cost}")




num_par = len(df)
tokens_par = np.mean([len(i)/4 for i in X_test])
tokens_query = 220 # see below for this value
cat = len(cats)

estimate_price(input_cost = 2 / 1e6, output_cost = 8 / 1e6, num_par = num_par, tokens_par = tokens_par, tokens_query = tokens_query, cat = cat)


Price for 304 samples for 38 categories: 5.800101499999999


# set up openAI API access

In [5]:
# key linked to our account

key = 'YOUR_API_KEY_HERE'
client = OpenAI(api_key = key)


# Cheap base model: gpt 4.1-mini with the batch api

We'll use the batch API to get a 50% discount. See [here](https://cookbook.openai.com/examples/batch_processing). Due to rate limits, we have to do this in multiple batches. Check the OpenAI website for this -- rate limits depend on usage tier and model.

In [26]:
# function to split up the annotated paragraphs into batches.

def create_task(row_id, cat, para, system_prompt):
    """Create a single task dictionary for the given chunk."""
    custom_id = f"{row_id}___{cat}"
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4.1-mini",
            "temperature": 0,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": para}
            ],
        }
    }

## Create the batches

In [28]:
# dictionary that stores all the batches
tasks = {}

# list that stores the requests within each batch
# there is one request for each category for each annotated paragraph
tasks_list = []

batch_count = 0

for _, row in tqdm(df[['id', 'displayed_text']].iterrows()):

    # the user prompt is the concrete material that the general task has to be applied to
    # in our case the annotated paragraph
    para = row["displayed_text"]
    for cat, descr in methods.items():

        # the system prompt gives general instructions
        # in this case: the category name and description as well as the output we want
        system_prompt = f'''Imagine you are a professional taxonomist. Your goal is to judge whether a paragraph from the methods section of a taxonomic research paper discusses {cat}. {descr} You will be provided with a paragraph, and you will output a "1" if yes, and a "0" if not. This is the only output needed. Note, the method may be mentioned either directly or implied by the content. However, please make sure you are conservative. That is, only output "1" if you are very sure that this method was applied. If the paragraph is consistent with the method not having been used, please output "0".'''

        

        task = create_task(row_id=row['id'], cat=cat, para=para, system_prompt=system_prompt)
        tasks_list.append(task)
        
        # Check if our accumulated tasks exceeds the 2 million tokens threshold.
        # this is also always lower than the 50 000 requests allowed per batch
        total_length = sum(len(t['body']['messages'][0]['content']) + len(t['body']['messages'][1]['content'])
                           for t in tasks_list)

        # assume 4 characters per token on average, and keep some spare room below 2 million the be sure
        if ((total_length / 4) > 1850000):
            tasks[f"batch_{batch_count}"] = tasks_list.copy()
            tasks_list = []
            batch_count += 1

# Save any remaining tasks.
if tasks_list:
    tasks[f"batch_{batch_count}"] = tasks_list

304it [00:04, 65.19it/s]


In [29]:
# print total number of requests
print(sum(len(v) for v in tasks.values()))

# check if all batches are within two million tokens
problems = []

for key, batch in tasks.items():
    tokens = np.array([
        len(i['body']['messages'][0]['content']) + len(i['body']['messages'][1]['content'])
        for i in batch
    ])
    total_tokens = tokens.sum() / 4
    if total_tokens > 2000000:
        problems.append(key)

assert not problems, f"Batches {problems} exceed 2 million tokens."
print("All batches within 2 million tokens")

# check if all custom ids are unique, otherwise the api throws an error
problems = []

for task in tasks.keys():
    custom_ids = [i['custom_id'] for i in tasks[task]]
    if len(custom_ids) != len(set(custom_ids)):
        problems.append(task)

assert not problems, f"Duplicate custom_ids found in tasks: {problems}"
print("All tasks have unique custom_ids")

11552
All batches within 2 million tokens
All tasks have unique custom_ids


In [32]:
# write the tasks to json files to upload to the batch api

for batch in tasks.keys():
    file_name = f'methods_paper_files/batches/batch_tasks_taxonomy_41mini_{batch}.jsonl'

    with open(file_name, 'w') as file:
        for obj in tasks[batch]:
            file.write(json.dumps(obj) + '\n')

## Make the API calls and collect the data

In [35]:
# loop through batches, and save the results as they are returned and as a df

batches = list(tasks.keys())

for batch in batches:  
    filename = f'methods_paper_files/batches/batch_tasks_taxonomy_41mini_{batch}.jsonl'

    # upload the file
    with open(filename, "rb") as f:
        batch_file = client.files.create(
            file=f,
            purpose="batch"
        )

    # start the job
    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    # get the id to check progress
    job_id = batch_job.id 
    status = client.batches.retrieve(job_id).status
    print('started:')
    print(client.batches.retrieve(job_id))
    print('')
    
    # Wait until the batch job is finished.
    while status in ('in_progress', 'finalizing', 'validating'):
        time.sleep(120)
        status = client.batches.retrieve(job_id).status

    # notify if the hob is completed, and get the result
    if status == 'completed':
        print('completed:')
        print(client.batches.retrieve(job_id))
        result_file_id = client.batches.retrieve(job_id).output_file_id
        result = client.files.content(result_file_id).content

    # indicate if there was an error for a batch
    # typically because of rate limits
    else:
        # Optionally handle other statuses or errors
        print('status was:')
        print(status)
        print(batch)
        print(client.batches.retrieve(job_id))
        print('---')
        continue

    

    # Write the result to a file.
    result_file_name = f'methods_paper_files/results/batch_tasks_taxonomy_41mini_{batch}.jsonl'
    with open(result_file_name, 'wb') as file:
        file.write(result)

    # Read and parse the JSON lines of the result
    results = []
    with open(result_file_name, 'r') as file:
        for line in file:
            json_object = json.loads(line.strip())
            results.append(json_object)

    # for each paragraph, make a dict entry with a dict for each category
    para_ids = list(set([i['custom_id'].split('___')[0] for i in tasks[batch]]))
    result_dct = {para_id:{method:0 for method in methods.keys()} for para_id in para_ids}
    
    # store the results in the dictionaries
    for i in results:
        cat = i['custom_id'].split('___')[1]
        doc_id = i['custom_id'].split('___')[0]
        responded_value = int(i['response']['body']['choices'][0]['message']['content'])
        if responded_value != 0:
            result_dct[doc_id][cat] = responded_value

    # Save to a pickle file.
    pickle_filename = f'methods_paper_files/results/results_41mini_{batch}.pkl'
    with open(pickle_filename, "wb") as file:
        pickle.dump(result_dct, file)
    print('saved!')
    print('---')


started:
Batch(id='batch_6877a44b46248190856448c3b2810add', completion_window='24h', created_at=1752671307, endpoint='/v1/chat/completions', input_file_id='file-AsAiGRU78HhpcsGyEenEnJ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1752757707, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

completed:
Batch(id='batch_6877a44b46248190856448c3b2810add', completion_window='24h', created_at=1752671307, endpoint='/v1/chat/completions', input_file_id='file-AsAiGRU78HhpcsGyEenEnJ', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1752672921, error_file_id=None, errors=None, expired_at=None, expires_at=1752757707, failed_at=None, finalizing_at=1752672454, in_progress_at=1752671310, metadata=None, output_file_id='file-9cjJrXXsRwKb7TRqoPY

## Save the data

In [36]:
all_batches = {}

count = 0
for batch in batches:
    with open(f"methods_paper_files/results/results_41mini_{batch}.pkl", "rb") as file:
        dct = pickle.load(file)
        count += len(dct)
        all_batches.update(dct)


# store the results
df_results = pd.DataFrame(all_batches).T.astype('int').sort_index()


In [14]:
# the classification is hierarchical and leaf categories should have all parent categories
# update the gpt results accordingly

def recursive_update(df, classif):
    # Go through each item in the classification
    for key, value in classif.items():
        if isinstance(value, list):
            # First process subcategories (go deeper into the hierarchy)
            for v in value:
                if isinstance(v, dict):
                    # Recursively update subcategories first
                    recursive_update(df, v)
            
            # After processing subcategories, update the current category
            sub_categories = [v if isinstance(v, str) else list(v.keys())[0] for v in value]
            df.loc[df[sub_categories].eq(1).any(axis=1), key] = 1
    
    # Return the DataFrame to allow for chaining if needed
    return df

# for that, we have to update the category names in the classification dictionary
def replace_terms(obj, mapping):
    """
    Recursively replaces strings in a nested structure (dicts/lists)
    using the provided mapping dictionary.
    """
    # If the object is a dictionary, iterate over its keys and values.
    if isinstance(obj, dict):
        new_dict = {}
        for key, value in obj.items():
            # Replace key if it's a string.
            new_key = mapping.get(key, key) if isinstance(key, str) else key
            # Recursively replace strings in the value.
            new_dict[new_key] = replace_terms(value, mapping)
        return new_dict
    
    # If the object is a list, process each item.
    elif isinstance(obj, list):
        return [replace_terms(item, mapping) for item in obj]
    
    # If the object is a string, replace it if it exists in the mapping.
    elif isinstance(obj, str):
        return mapping.get(obj, obj)
    
    # For other types, return the object unchanged.
    else:
        return obj

# update using the classification
classif = replace_terms(classif, category_names)
df_results = recursive_update(df_results, classif)

# update categories that are not hierarchically related
df_results['interbreeding or reproduction'] = np.where(df_results['reproductive morphology'] == 1, 1, df_results['interbreeding or reproduction'])
df_results['phylogenetic methods'] = np.where(((df_results['phylogenetic species delimitation'] == 1) | (df_results['phylogenetic tree reconstruction methods'] == 1) | (df_results['distance based phylogenetic tree inference'] == 1) | (df_results['character based phylogenetic tree inference'] == 1) | (df_results['phylogenetic analysis and reasoning with phenotypic data'] == 1)), 1, df_results['phylogenetic methods'])

# drop singletons, as that's a meaningless hierarchy level
df_results = df_results.drop('singletons', axis=1)

In [39]:
# save the results to analyze them in a different notebook


with open("methods_paper_files/results/gpt_41mini_results_df.pkl", "wb") as f:
    pickle.dump(df_results, f)

# Expensive reasoning model: o3



## Get the data through the api

In [4]:
# function to split up the annotated paragraphs into batches.
# had to remove temperature and add max completion tokens
def create_task(row_id, cat, para, system_prompt):
    """Create a single task dictionary for the given chunk."""
    custom_id = f"{row_id}___{cat}"
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4.1-2025-04-14",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": para}
            ],
        }
    }

In [5]:
import tiktoken
from tqdm import tqdm

enc                 = tiktoken.encoding_for_model("o3")   # needs tiktoken ≥0.9.0
TOKENS_PER_MESSAGE  = 3    # chat markup  (<|role|>, etc.)
ASSISTANT_PRIMER    = 3    # final <|assistant|> header
COMPLETION_TOKENS   = 2    # you only want “0”/“1”
MAX_BATCH_TOKENS    = 85000  # org‑level hard limit for o‑family

def cost_in_tokens(task: dict) -> int:
    """Exact token cost of one chat request, incl. output budget."""
    msgs = task["body"]["messages"]
    prompt = sum(len(enc.encode(m["content"])) for m in msgs)
    overhead = TOKENS_PER_MESSAGE * len(msgs) + ASSISTANT_PRIMER
    return prompt + overhead + COMPLETION_TOKENS

In [6]:
# dictionary that stores all the batches
tasks = {}

# list that stores the requests within each batch
# there is one request for each category for each annotated paragraph
tasks_list = []

used_tokens  = 0
batch_count  = 0

for _, row in tqdm(df[['id', 'displayed_text']].iterrows()):
    para = row["displayed_text"]
    for cat, descr in methods.items():

        system_prompt = (
            f"Imagine you are a professional taxonomist. Your goal is to judge whether a paragraph "
            f"from the methods section of a taxonomic research paper discusses {cat}. {descr} "
            'You will be provided with a paragraph, and you will output a "1" if yes, and a "0" if not. '
            "This is the only output needed. Note, the method may be mentioned either directly or implied by "
            "the content. However, please make sure you are conservative. That is, only output \"1\" if you are "
            "very sure that this method was applied. If the paragraph is consistent with the method not having "
            "been used, please output \"0\"."
        )

        task = create_task(
            row_id=row['id'],
            cat=cat,
            para=para,
            system_prompt=system_prompt
        )

        t_cost = cost_in_tokens(task)

        # flush if adding this task would blow the 90 k quota
        if used_tokens + t_cost > MAX_BATCH_TOKENS:
            tasks[f"batch_{batch_count}"] = tasks_list
            tasks_list   = []
            used_tokens  = 0
            batch_count += 1

        tasks_list.append(task)
        used_tokens += t_cost

# Save any remaining tasks.
if tasks_list:
    tasks[f"batch_{batch_count}"] = tasks_list

304it [00:03, 82.96it/s]


In [30]:
# print total number of requests
print(sum(len(v) for v in tasks.values()))

# check if all batches are within two million tokens
problems = []

for key, batch in tasks.items():
    tokens = np.array([
        len(i['body']['messages'][0]['content']) + len(i['body']['messages'][1]['content'])
        for i in batch
    ])
    total_tokens = tokens.sum() / 4
    if total_tokens > 90000:
        problems.append(key)

assert not problems, f"Batches {problems} exceed 90 k tokens."
print("All batches within the token limit")

# check if all custom ids are unique, otherwise the api throws an error
problems = []

for task in tasks.keys():
    custom_ids = [i['custom_id'] for i in tasks[task]]
    if len(custom_ids) != len(set(custom_ids)):
        problems.append(task)

assert not problems, f"Duplicate custom_ids found in tasks: {problems}"
print("All tasks have unique custom_ids")

11552


AssertionError: Batches ['batch_0', 'batch_2', 'batch_3', 'batch_4', 'batch_5', 'batch_6', 'batch_7', 'batch_8', 'batch_9', 'batch_10', 'batch_11', 'batch_13', 'batch_14', 'batch_15', 'batch_16', 'batch_17', 'batch_18', 'batch_21', 'batch_23', 'batch_24', 'batch_25', 'batch_26', 'batch_27', 'batch_28', 'batch_29', 'batch_30', 'batch_31', 'batch_33', 'batch_35', 'batch_36', 'batch_37', 'batch_38', 'batch_39', 'batch_40', 'batch_41', 'batch_42', 'batch_43', 'batch_45', 'batch_47', 'batch_48', 'batch_49', 'batch_50', 'batch_51', 'batch_52', 'batch_53', 'batch_54', 'batch_55', 'batch_56', 'batch_57', 'batch_59', 'batch_60', 'batch_61'] exceed 90 k tokens.

In [31]:
# write the tasks to json files to upload to the batch api

for batch in tasks.keys():
    file_name = f'methods_paper_files/batches/batch_tasks_taxonomy_41_{batch}.jsonl'

    with open(file_name, 'w') as file:
        for obj in tasks[batch]:
            file.write(json.dumps(obj) + '\n')

print(len(tasks))

62


## Make the api calls

In [63]:
# loop through batches, and save the results as they are returned and as a df

batches = list(tasks.keys())
for batch in batches:

    filename = f'methods_paper_files/batches/batch_tasks_taxonomy_41_{batch}.jsonl'

    # upload the file
    with open(filename, "rb") as f:
        batch_file = client.files.create(
            file=f,
            purpose="batch"
        )

    # start the job
    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    # get the id to check progress
    job_id = batch_job.id 
    status = client.batches.retrieve(job_id).status
    print('started:')
    print(client.batches.retrieve(job_id))
    print('')
    
    # Wait until the batch job is finished.
    while status in ('in_progress', 'finalizing', 'validating'):
        time.sleep(120)
        status = client.batches.retrieve(job_id).status

    # notify if the job is completed, and get the result
    if status == 'completed':
        print('completed:')
        print(client.batches.retrieve(job_id))
        result_file_id = client.batches.retrieve(job_id).output_file_id
        result = client.files.content(result_file_id).content
        done.append(batch)

    # indicate if there was an error for a batch
    # typically because of rate limits
    else:
        # Optionally handle other statuses or errors
        print('status was:')
        print(status)
        print(batch)
        print(client.batches.retrieve(job_id))
        print('---')
        continue

    

    # Write the result to a file.
    result_file_name = f'methods_paper_files/results/batch_tasks_taxonomy_41_{batch}.jsonl'
    with open(result_file_name, 'wb') as file:
        file.write(result)

    # Read and parse the JSON lines of the result
    results = []
    with open(result_file_name, 'r') as file:
        for line in file:
            json_object = json.loads(line.strip())
            results.append(json_object)

    # for each paragraph, make a dict entry with a dict for each category
    para_ids = list(set([i['custom_id'].split('___')[0] for i in tasks[batch]]))
    result_dct = {para_id:{method:0 for method in methods.keys()} for para_id in para_ids}
    
    # store the results in the dictionaries
    for i in results:
        cat = i['custom_id'].split('___')[1]
        doc_id = i['custom_id'].split('___')[0]
        responded_value = int(i['response']['body']['choices'][0]['message']['content'])
        if responded_value != 0:
            result_dct[doc_id][cat] = responded_value

    # Save to a pickle file.
    pickle_filename = f'methods_paper_files/results/results_41_{batch}.pkl'
    with open(pickle_filename, "wb") as file:
        pickle.dump(result_dct, file)
    print('saved!')
    print('---')

started:
Batch(id='batch_687bc736e7308190bb55b7ed5c692ff8', completion_window='24h', created_at=1752942390, endpoint='/v1/chat/completions', input_file_id='file-RMpSi8WjHXAKgadhTYbmYF', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1753028790, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

status was:
failed
batch_0
Batch(id='batch_687bc736e7308190bb55b7ed5c692ff8', completion_window='24h', created_at=1752942390, endpoint='/v1/chat/completions', input_file_id='file-RMpSi8WjHXAKgadhTYbmYF', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4.1-2025-04-14 in organization org-H9yaeVFAhw5Mg1OR4p

## save the results

In [10]:
all_batches = {}

count = 0
for batch in batches:
    with open(f"methods_paper_files/results/results_41_{batch}.pkl", "rb") as file:
        dct = pickle.load(file)
        count += len(dct)
        all_batches.update(dct)


# store the results
df_results = pd.DataFrame(all_batches).T.astype('int').sort_index()

In [15]:
# save the results to analyze them in a different notebook

# update using the classification
classif = replace_terms(classif, category_names)
df_results = recursive_update(df_results, classif)

# update categories that are not hierarchically related
df_results['interbreeding or reproduction'] = np.where(df_results['reproductive morphology'] == 1, 1, df_results['interbreeding or reproduction'])
df_results['phylogenetic methods'] = np.where(((df_results['phylogenetic species delimitation'] == 1) | (df_results['phylogenetic tree reconstruction methods'] == 1) | (df_results['distance based phylogenetic tree inference'] == 1) | (df_results['character based phylogenetic tree inference'] == 1) | (df_results['phylogenetic analysis and reasoning with phenotypic data'] == 1)), 1, df_results['phylogenetic methods'])

# drop singletons, as that's a meaningless hierarchy level
df_results = df_results.drop('singletons', axis=1)

with open("methods_paper_files/results/gpt_41_results_df.pkl", "wb") as f:
    pickle.dump(df_results, f)