## Our Prompt
You are a helpful assistant that takes an image caption, generates an edit instruction, and then produces a modified caption based on the edit.

Caption 1: A beach with palm trees and clear blue water
Caption 2: A city street with cars and people walking
Caption 3: A dog lying on the grass



Please respond with the following format for each caption:
Edit 1: ...
Output 1: ...
Edit 2: ...
Output 2: ...
Edit 3: ...
Output 3: ...


In [1]:
# Number of captions grouped into a single batched prompt.
BATCH_SIZE = 6


In [2]:
import json

# How many caption triplets are packed into one fine-tuning example.
BATCH_SIZE = 6
# JSON file that is read as the source of the triplets.
INPUT_FILE = "caption_edit_triplets.json"
# File that receives one JSON-encoded batched example per line.
OUTPUT_FILE = "batched_prompts_finetune_data.jsonl"

# Parse the whole input file. The batching code below slices `data` and reads
# 'input_caption', 'edit_instruction' and 'output_caption' from each element.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

def make_batch(batch, batch_num):
    """Build one chat-format fine-tuning example from a group of caption triplets.

    Args:
        batch: Sequence of dicts, each providing 'input_caption',
            'edit_instruction' and 'output_caption'.
        batch_num: Accepted from the caller but not used by this function.

    Returns:
        A dict ``{"messages": [...]}`` holding a system, a user and an
        assistant message, in that order.
    """
    # User turn: the numbered input captions, one per line.
    user_content = "\n".join(
        f"Input {number}: {triplet['input_caption']}"
        for number, triplet in enumerate(batch, start=1)
    )

    # Assistant turn: one Edit/Output pair per caption, pairs separated by a
    # blank line.
    assistant_content = "\n\n".join(
        f"Edit {number}: {triplet['edit_instruction']}\nOutput {number}: {triplet['output_caption']}"
        for number, triplet in enumerate(batch, start=1)
    )

    return {
        "messages": [
            {
                "role": "system",
                "content": "You are an image caption editor assistant. For each input, generate an edit instruction and the updated output caption.",
            },
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ]
    }

# Write batched output: one JSON-encoded example per line.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    batch_starts = range(0, len(data), BATCH_SIZE)
    for batch_num, start in enumerate(batch_starts, start=1):
        # The final slice may hold fewer than BATCH_SIZE items.
        example = make_batch(data[start:start + BATCH_SIZE], batch_num)
        f.write(json.dumps(example) + "\n")

print(f"Successfully wrote batched fine-tuning file to {OUTPUT_FILE}")


Successfully wrote batched fine-tuning file to batched_prompts_finetune_data.jsonl


# Make calls to OpenAI

In [2]:
import threading
from concurrent.futures import ThreadPoolExecutor

In [3]:
import os

import openai

# Never store the key in the notebook: read it from the environment so it
# stays out of version control and saved outputs. Fails fast with a KeyError
# when OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
# Model identifier passed as `model=` to the chat completion requests below.
fine_tuned_model_batch = "ft:gpt-3.5-turbo-0125:<<model-name>>"

In [5]:
import json
import re

def save_records(result, out_file_name):
    """Append records to a file, one JSON document per line.

    Args:
        result: Iterable of JSON-serialisable records.
        out_file_name: Path of the file to append to (created when missing).
    """
    with open(out_file_name, "a", encoding="utf-8") as sink:
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        sink.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in result
        )
    
def parse_gpt_response_to_json(input_captions, response_text, file_name, thread_id):
    """Extract Edit/Output pairs from a model reply and append them to a file.

    The reply is expected to contain lines of the form ``Edit N: ...`` and
    ``Output N: ...``. Each caption is paired with the edit and output that
    carry its own 1-based number, so a missing or truncated line only drops
    the affected caption instead of shifting every later pair onto the wrong
    caption.

    Args:
        input_captions: Captions that were sent in the request, in order.
        response_text: Raw text of the model reply.
        file_name: Prefix of the output file name.
        thread_id: Identifier appended to the output file name.

    Returns:
        None. Complete records are appended to
        ``{file_name}-thread{thread_id}.json`` via ``save_records``; nothing
        is written when no complete record is found.
    """
    # Index the captured text by the number in its label. setdefault keeps the
    # first occurrence if the model repeats a number.
    edits_by_number = {}
    for number, text in re.findall(r'Edit (\d+):\s*(.*)', response_text):
        edits_by_number.setdefault(int(number), text)

    outputs_by_number = {}
    for number, text in re.findall(r'Output (\d+):\s*(.*)', response_text):
        outputs_by_number.setdefault(int(number), text)

    results = []
    for number, caption in enumerate(input_captions, start=1):
        # Keep a caption only when both halves of its pair are present.
        if number in edits_by_number and number in outputs_by_number:
            results.append({
                "input_caption": caption,
                "edit_instruction": edits_by_number[number],
                "output_caption": outputs_by_number[number]
            })

    if results:
        out_file_name = f'{file_name}-thread{thread_id}.json'
        save_records(results, out_file_name)

        


    

In [6]:
def batch_list(lst, chunk_size):
    """Cut a sequence into consecutive slices of at most ``chunk_size`` items.

    The last slice holds whatever remains and may be shorter than the others.
    An empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(lst), chunk_size):
        pieces.append(lst[start:start + chunk_size])
    return pieces

In [7]:
def execute_prompt(captions, thread_id):
    """Send captions to the fine-tuned model in batches and save the parsed replies.

    The captions are split into groups of ``BATCH_SIZE``; each group becomes
    one chat completion request whose reply is handed to
    ``parse_gpt_response_to_json``.

    Args:
        captions: List of input captions to process.
        thread_id: Identifier of the calling worker; forwarded to the parser,
            which uses it in the output file name.

    Returns:
        None.

    NOTE(review): ``max_tokens=200`` is shared by the whole batch. A reply cut
    off at that limit yields fewer parsed records than captions sent —
    consider inspecting ``finish_reason`` if records go missing.
    """
    # The system message is identical for every request, so build it once.
    system_message = {
        "role": "system",
        "content": "You are an image caption editor assistant. For each input, generate an edit instruction and the updated output caption."
    }

    for batch in batch_list(captions, BATCH_SIZE):
        user_message = {
            "role": "user",
            "content": '\n'.join(
                f'Input {i+1}: {caption}' for i, caption in enumerate(batch)
            )
        }

        response = openai.chat.completions.create(
            model=fine_tuned_model_batch,
            messages=[system_message, user_message],
            temperature=0.7,
            max_tokens=200
        )

        reply = response.choices[0].message.content
        if reply is None:
            # The message content is optional in the API response, and the
            # regex-based parser only accepts strings. Skip this batch rather
            # than let a TypeError abort the rest of the worker's captions.
            print(f"thread {thread_id}: empty reply, skipped a batch of {len(batch)} captions")
            continue

        parse_gpt_response_to_json(batch, reply, 'gpt-prompt-result-1', thread_id)
    
    
        

In [8]:
# a = [
#     "A beach with palm trees and clear blue water",
#     "A city street with cars and people walking",
#     "A cat sitting on a windowsill looking outside",
#     "A beach with palm trees and skyscraper in the background",
#     "A moonlight painting by pablo picasso",
#     "A scenic view of rome during rainy season",
#     "A scenic dick of rome during rainy season",
# ]

# execute_prompt(a, 1)


In [9]:
def split_chunks(data, n):
    """Divide ``data`` into ``n`` consecutive parts.

    The first ``n - 1`` parts each hold ``len(data) // n`` items; the last
    part takes everything that is left, including the remainder.
    """
    base_size = len(data) // n
    parts = []
    for position in range(n):
        lower = position * base_size
        if position < n - 1:
            parts.append(data[lower:lower + base_size])
        else:
            # Last part: absorb the remainder.
            parts.append(data[lower:])
    return parts

In [10]:
# Load the JSON file of input prompts (downloaded from the LAION dataset).
# NOTE: this rebinds `data`, the name the fine-tuning cell above also uses.
with open('high-aesthetic-text-urls.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# List of input captions: the "TEXT" field of every entry.
input_captions = [entry["TEXT"] for entry in data]

# Show the first ten captions as a quick sanity check.
print(input_captions[:10])

['Amazing Oil Painting by South Indian Legend Ilaiyaraaja (6)', 'Harold Roth - Forest Avenue, The Bronx, 1937 - Howard Greenberg Gallery', 'Anthony Bourdain on Food Porn, YouTube Stars and His Intolerance of Gluten-Free Diets | Adweek', 'Panoramic view of famous Hallstatt lakeside town during winter sunrise on a beautiful cold sunny day at Christmas time, Salzkammergut, Austria', 'Flooding Painting - Storm Malta by John or Giovanni Schranz', 'The church of St. Andrew on the Vuoksa River, Russia', 'Friedrich & Romanticism pictures Wall Art as Canvas, Acrylic or Metal Print Napoleon I.als Erster Konsul/Ingres', 'Emily Blunt by Peter Lindbergh - More at http://cine-mania.it', 'World War 1 Propaganda Poster Daddy What Did You Do In The Great War? Print by R Muirhead Art', 'Édouard Manet 1832-1883 | French Realist/Impressionist Painter']


In [11]:
# Captions before this index are excluded from the run.
start_index = 8000
# Number of worker threads, and therefore the number of chunks.
num_threads = 6

# Slice into a new name instead of overwriting `input_captions`: re-running
# this cell then always yields the same chunks rather than dropping another
# `start_index` captions each time.
pending_captions = input_captions[start_index:]
chunks = split_chunks(pending_captions, num_threads)

In [12]:
len(chunks[2])

11166

#### Manual control: skip the first 1100 captions of every chunk

In [13]:
# Number of leading captions to drop from every chunk.
captions_to_skip = 1100

# Apply the same offset to every chunk, however many chunks there are.
# NOTE: this trims again on every execution — run it once after `chunks` has
# been (re)created.
chunks = [chunk[captions_to_skip:] for chunk in chunks]

In [14]:
len(chunks[0])

10066

In [None]:
# Run one worker per chunk. The chunk's position is passed as the thread id,
# which ends up in the name of the file that worker appends to.
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [
        executor.submit(execute_prompt, chunk, i)
        for i, chunk in enumerate(chunks)
    ]

    # result() blocks until the worker is done and re-raises any exception it
    # hit, so a failed worker is not silently lost.
    for future in futures:
        future.result()

# Merge the results of all threads into one JSON file

In [46]:
# Phrases that mark an edit instruction as unusable. The merge cell below
# lower-cases both sides and drops every record whose 'edit_instruction'
# contains any of these phrases as a substring.
# NOTE(review): several entries are repeated or already covered by a shorter
# entry (e.g. "remove all text"); harmless for matching, but worth pruning.
bogus_data = [
                "remove copyright details",    
                "remove the title and artist information",
                "remove the artist's name",
                "remove artist name",
                "remove the artist name",
                "remove artist's name",
                "remove hashtags",
                "remove the size information",
                "remove the price",
                "remove the user tag",
                "remove the date",
                # Any instruction containing a straight double quote ...
                "\"",
                # ... or a left curly double quote is dropped as well.
                "“",
                "remove the year",
                "remove the numerical count",
                "remove the release year",
                "remove the mention of",
                "remove the location information",
                "remove the content keywords",
                "remove all text and information",
                "remove the mention of the sequel",
                "remove the name of the WASP member",
                "remove the resolution",
                "remove the version number",
                "remove the caption text",
                "remove the tags",
                "remove the quotation marks",
                "remove the artist's nationality",
                "remove the artist’s name",
                "remove location details",
                "remove the painter's name",
                "remove the Item #",
                "remove the credit line",
                "remove the photographer’s name",
                "remove the producer's name",
                "remove the photographer's name",
                "remove the stretched canvas print",
                "remove the dimensions and the artist's name",
                "remove the image ID at the end",
                "capitalize the first letter of each word",
                "change to a large mural",
                "remove the museum name",
                "remove resolution specifications",
                "remove gibberish code",
                "remove all tags",
                "remove quotes",
                "Remove the attribution text",
                "Remove all hashtags",
                "Remove the dimensions from the caption",
                "remove the frame",
                "remove the location metadata",
                "remove the URL",
                "remove the designer's name",
                "remove the workshop information",
                "remove the measurement information",
                "remove the names of the individuals",
                "remove the source name",
                "remove the source information",
                "remove the quotes",
                "remove the photo credit",
                "remove the quote attribution",
                "remove the detailed description",
                "remove artist’s name",
                "remove the game title",
                "remove the source and question",
                "remove the website",
                "remove the description",
                "remove the unnecessary word",         
                "remove the photographer name",
                "remove the director and studio names",
                "remove the gallery name and date",
                "remove the course information",
                "remove photographer name",
                "remove artist information",
                "remove the text after the title",
                "remove description",
                "remove website",
                "remove the size and date",
                "remove the dimensions",
                "remove the photographer name",
                "remove the photographer credit",
                "remove text after the first pipe symbol",
                "remove the attribution information",
                "remove the non-English title",
                "remove parentheses and the words",
                "remove the artist’s nationality",
                "remove the Pinterest source",
                "remove the iStock attribution",
                "remove the stock photo attribution",
                "remove the photographer attribution",
                "remove the location description",
                "remove the Japanese text",
                "remove the source credit",
                "Add the information that she became an Expressionist in her later years.",
                "Remove the last sentence.",
                "remove the URL",
                "remove the ellipsis",
                "remove the text description",
                "remove the French text",
                "remove the birth and death years",
                "remove the HTML tags",
                "remove the details of the update",
                "remove the location",
                "remove the artist nationality",
                "remove the entire paragraph",
                "remove the Twitter handle and the text after it",
                "remove location information",
                "remove publication information",
                "remove quotation marks",
                "remove the copyright information",
                "remove the movie reference",
                "remove all text after the colon",
                "remove the image name and views information",
                "remove all quotation marks",
                "remove the text within quotes",  
                "remove the question",
                "remove the artist information",
                "make the caption lowercase and remove the HTML tags",
                "remove all text",
                "remove all text",
                "remove the Pinterest reference",
                "remove the ID number",
                "remove the alternative title",
                "remove fog, mist, and storm",
                "remove the name of the photographer and the contest details",
                "remove tags",
                
            ]

In [47]:
# Per-thread result files (one JSON record per line) to be merged.
temp_out_file_names = ['gpt-prompt-result-1.json',
                       'gpt-prompt-result-1-thread0.json',
                       'gpt-prompt-result-1-thread1.json',
                       'gpt-prompt-result-1-thread2.json',
                       'gpt-prompt-result-1-thread3.json',
                       'gpt-prompt-result-1-thread4.json',
                       'gpt-prompt-result-1-thread5.json',
                       'batch-gpt-response-thread0.json',
                       'batch-gpt-response-thread1.json',
                       'batch-gpt-response-thread2.json',
                       'batch-gpt-response-thread3.json',
                       'batch-gpt-response-thread4.json',
                       'batch-gpt-response-thread5.json',
                       'batch-gpt-response-thread6.json',
                       'batch-gpt-response-thread7.json',
                      ]

# Lower-case the filter phrases once, dropping repeats while keeping their
# order, instead of re-lowercasing every phrase for every record.
bogus_patterns = list(dict.fromkeys(text.lower() for text in bogus_data))

merged_records = []

for file_path in temp_out_file_names:
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines between records.
                continue
            record = json.loads(line)

            # Drop the record when its instruction contains any bogus phrase
            # (case-insensitive substring match).
            instruction = record['edit_instruction'].lower()
            if any(pattern in instruction for pattern in bogus_patterns):
                continue

            # Ids are consecutive over the records that are kept.
            record['id'] = len(merged_records)
            merged_records.append(record)

# Write to a single well-formatted JSON file
with open("gpt-prompt-merged-pilot.json", "w", encoding="utf-8") as out_file:
    json.dump(merged_records, out_file, ensure_ascii=False, indent=2)

In [48]:
# Number of records that were kept by the merge step.
print("total prompts: ", len(merged_records))

total prompts:  37790
