# SETUP

In [32]:
import pandas as pd
import openai
from tqdm import tqdm
import time
import json
import os
from dotenv import load_dotenv

In [33]:
# TODO input csv path

csv_path = "dfposts_for_classification.csv"

In [34]:
def make_prompt(text):
    return f"""POST:
{text}


TASKS:
First, your task is to evaluate how political this post is, using the following 4-point scale:

0 - Not political: No references to political, social, or civic issues. Purely about food, cooking, or personal updates

1 - Slightly political: Indirect or mild references to civic life, e.g., calls for kindness, unity, or vague references to current events.

2 - Moderately political: Mentions or endorses political causes or events (e.g., mentions Gaza, gender equality, food justice), but not confrontational or polarizing

3 - Strongly political: Clearly takes a stance, uses activist language, references specific political actors, calls to action (e.g., “Free Palestine,” “End the blockade,” “Boycott XYZ”).

Classify the caption into one of the four categories. Also briefly explain your rating in 1-2 sentences.

Next, your task is to evaluate how much the post is about food, cooking, or recipes, using the following 4-point scale:

0 - Not about food at all: No mention or visual reference to food, cooking, ingredients, or meals. The post may focus on personal life, holidays, politics, or other unrelated topics.

1 - Slightly about food: Food is mentioned or visible, but only briefly or indirectly (e.g., just hashtags like #food or #dinner, or a food-related photo without explanation).

2 - Mostly about food: The post primarily focuses on food, ingredients, or cooking, but without much detail. It may reference a dish, show a plate, or mention a product, but lacks process or recipe-level information.

3 - Fully about food: The post clearly centers on food or cooking. It includes specific details about preparation, ingredients, techniques, or the full recipe. May also include culinary tips or product reviews within a food-focused context.

Classify the post into one of the four categories. Also briefly explain your rating in 1-2 sentences.
"""

In [42]:
def make_jsonl(input_csv_path, output_base_path, max_file_size_mb=150):
    inputdf = pd.read_csv(input_csv_path)
    
    def row_to_dict(row):
        return {
            "custom_id": str(row["postid"]),  # ensure it's a string
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4.1-mini-2025-04-14",
                "messages": [
                    {"role": "system", "content": "You are a media researcher evaluating Instagram posts by chefs. Each post may include elements related to politics, food, or both."},
                    {"role": "user", "content": make_prompt(row["lowertranslated"])}
                ],
                "max_tokens": 200
            }
        }

    rows = inputdf.apply(row_to_dict, axis=1)

    # File writing setup
    max_bytes = max_file_size_mb * 1024 * 1024
    file_index = 0
    current_file_size = 0
    current_file = open(f"{output_base_path}_{file_index}.jsonl", "w", encoding="utf-8")

    for row_dict in rows:
        line = json.dumps(row_dict, ensure_ascii=False) + "\n"
        encoded = line.encode("utf-8")
        line_size = len(encoded)

        if current_file_size + line_size > max_bytes:
            current_file.close()
            file_index += 1
            current_file_size = 0
            current_file = open(f"{output_base_path}_{file_index}.jsonl", "w", encoding="utf-8")

        current_file.write(line)
        current_file_size += line_size

    current_file.close()
    print(f"Done! {file_index + 1} file(s) written under {max_file_size_mb + 50}MB each.")


# Run below to make batch jsonl files

In [43]:
output_jsonl = "dfposts_batch"

make_jsonl(csv_path, output_jsonl)

Done! 3 file(s) written under 200MB each.


# openai batch api

In [48]:
# UPLOAD FILES

from openai import OpenAI
load_dotenv()
client = OpenAI(api_key=os.getenv("BENS_OPENAI_KEY"))

# Where to log everything
output_log = "upload_and_batch_log.txt"

# Loop over all .jsonl files in current directory
with open(output_log, "w", encoding="utf-8") as log_file:
    for filename in sorted(os.listdir(".")):
        if filename.endswith("2.jsonl"): # CHANGE THE ENDS WITH TO UPLOAD SPECIFIC OR ALL FILES
            print(f"Uploading {filename}...")

            try:
                with open(filename, "rb") as f:
                    file_response = client.files.create(file=f, purpose="batch")

                file_id = file_response.id
                print(f"Uploaded {filename} → file ID: {file_id}")

                # Submit batch job
                batch_response = client.batches.create(
                    input_file_id=file_id,
                    endpoint="/v1/chat/completions",
                    completion_window="24h",
                    metadata={"description": f"batch job for {filename}"}
                )

                print(f"Batch created - batch ID: {batch_response.id}")

                # Log both
                log_file.write(f"# {filename}\n")
                log_file.write(f"file_id: {file_id}\n")
                log_file.write(f"batch_id: {batch_response.id}\n\n")

            except Exception as e:
                print(f"Error with {filename}: {e}")
                log_file.write(f"# ERROR uploading or batching {filename}: {e}\n\n")

print("All files processed. Logs saved to", output_log)

Uploading dfposts_batch_2.jsonl...
Uploaded dfposts_batch_2.jsonl → file ID: file-7WhH6sKPUhkrrUSeaEdG8L
Batch created - batch ID: batch_68377cd300648190aa735d51290b9477
All files processed. Logs saved to upload_and_batch_log.txt


In [52]:
load_dotenv()
client = OpenAI(api_key=os.getenv("BENS_OPENAI_KEY"))

batch = client.batches.retrieve("batch_68377cd300648190aa735d51290b9477")
print(batch)

client.batches.list(limit=10)

batches = client.batches.list()

# Cancel each batch
'''
for batch in batches.data:
    try:
        client.batches.cancel(batch.id)
        print(f"Cancelled batch {batch.id}")
    except Exception as e:
        print(f"Failed to cancel batch {batch.id}: {e}")
        '''


Batch(id='batch_68377cd300648190aa735d51290b9477', completion_window='24h', created_at=1748466899, endpoint='/v1/chat/completions', input_file_id='file-7WhH6sKPUhkrrUSeaEdG8L', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1748553299, failed_at=None, finalizing_at=None, in_progress_at=1748466901, metadata={'description': 'batch job for dfposts_batch_2.jsonl'}, output_file_id=None, request_counts=BatchRequestCounts(completed=3773, failed=0, total=3785))


'\nfor batch in batches.data:\n    try:\n        client.batches.cancel(batch.id)\n        print(f"Cancelled batch {batch.id}")\n    except Exception as e:\n        print(f"Failed to cancel batch {batch.id}: {e}")\n        '