In [None]:
import json

from dotenv import load_dotenv
from openai import OpenAI

_ = load_dotenv()

In [None]:
fake_data = [
    {"university": "Tsinghua University", "major": "Computer Science", "gpa": 3.8},
    {"university": "Peking University", "major": "Electrical Engineering", "gpa": 85},
    {
        "university": "Stanford University",
        "major": "Mechanical Engineering",
        "gpa": 4.0,
    },
    {"university": "Unknown University", "major": "Philosophy", "gpa": 75},
    {"university": "Fudan University", "major": "CS", "gpa": 3.2},
]

print(fake_data)

[{'university': 'Tsinghua University', 'major': 'Computer Science', 'gpa': 3.8}, {'university': 'Peking University', 'major': 'Electrical Engineering', 'gpa': 85}, {'university': 'Stanford University', 'major': 'Mechanical Engineering', 'gpa': 4.0}, {'university': 'Unknown University', 'major': 'Philosophy', 'gpa': 75}, {'university': 'Fudan University', 'major': 'CS', 'gpa': 3.2}]


In [5]:
prompt = """Return the cleaned data into json foramts.
1. clean the key 'university' into f these categories ['985', '211', 'g5', 'other'] 
2. clean the jey 'major' into these categories ['CS', 'EE', 'ME', 'other'] 
3. clean the key 'gpa' from different scale to 4.0 scale

data: {data}
"""

format = {
    "format": {
        "type": "json_schema",
        "name": "cleaned_data",
        "schema": {
            "type": "object",
            "properties": {
                "university": {"type": "string", "enum": ["985", "211", "g5", "other"]},
                "major": {"type": "string", "enum": ["CS", "EE", "ME", "other"]},
                "gpa": {
                    "type": "number",
                },
            },
            "required": ["university", "major", "gpa"],
            "additionalProperties": False,
        },
        "strict": True,
    }
}

In [None]:
def prepare_jsonl_file(
    data_items,
    output_path,
    model_name,
    system_prompt,
    data_cleaning_prompt,
    output_format,
):
    with open(output_path, "w", encoding="utf-8") as f:
        for idx, item in enumerate(data_items):
            full_prompt = f"{data_cleaning_prompt}\nData: {str(item)}"
            request = {
                "custom_id": f"request_{idx}",
                "method": "POST",
                "url": "/v1/responses",
                "body": {
                    "model": model_name,
                    "input": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": full_prompt},
                    ],
                    "text": output_format,
                },
            }
            f.write(json.dumps(request) + "\n")

In [24]:
prepare_jsonl_file(
    data_items=fake_data,
    output_path="batch_requests.jsonl",
    model_name="gpt-4o-mini",
    system_prompt="You are a helpful data cleaner.",
    data_cleaning_prompt=prompt,
    output_format=format,
)

In [None]:
client = OpenAI()

batch_input_file = client.files.create(
    file=open("batch_requests.jsonl", "rb"), purpose="batch"
)

print(batch_input_file)

FileObject(id='file-2GSXnRLwRynXFE2Et2CvR1', bytes=4744, created_at=1744820383, filename='batch_requests.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


In [26]:
batch_input_file_id = batch_input_file.id
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/responses",
    completion_window="24h",
    metadata={
        "description": "test batch data cleaning",
    },
)

Batch(id='batch_67ffd8bc3abc8190bdd7ac0f67fcb03c', completion_window='24h', created_at=1744820412, endpoint='/v1/responses', input_file_id='file-2GSXnRLwRynXFE2Et2CvR1', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1744906812, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'test batch data cleaning'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [28]:
client.batches.retrieve("batch_67ffd8bc3abc8190bdd7ac0f67fcb03c")

Batch(id='batch_67ffd8bc3abc8190bdd7ac0f67fcb03c', completion_window='24h', created_at=1744820412, endpoint='/v1/responses', input_file_id='file-2GSXnRLwRynXFE2Et2CvR1', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1744906812, failed_at=None, finalizing_at=None, in_progress_at=1744820413, metadata={'description': 'test batch data cleaning'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=5))

In [1]:
from llmdatacleaner._utils.settings import Settings

In [None]:
Settings().model_dump()

Settings(openai_key='xxxx')

In [1]:
import os

os.getenv("OPENAI_API_KEY")

'xxxx'