In [None]:
pip install git+https://github.com/huggingface/transformers

In [2]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

# Single source of truth for the checkpoint, so the model and its processor
# can never be loaded from two different repositories by accident.
MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"

# default: Load the model on the available device(s)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID, dtype="auto", device_map="auto"
)

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(MODEL_ID)


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
def generate_fun(title, rendered_text, image_base64, max_new_tokens=128):
    """Ask the vision-language model for a short image-generation instruction.

    Builds a single user message containing the instruction text (with
    ``title`` and ``rendered_text`` interpolated) and the image, runs it
    through the module-level ``processor`` and ``model``, and decodes only the
    tokens generated after the prompt.

    Args:
        title: Title of the design; interpolated into the instruction text.
        rendered_text: Text expected on the image; interpolated into the
            instruction text via normal f-string formatting.
        image_base64: Base64-encoded image bytes (no ``data:`` prefix).
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``. Defaults to 128.

    Returns:
        The value returned by ``processor.batch_decode`` for the trimmed
        sequences (a list with one decoded string per generated sequence).

    Notes:
        Relies on the globals ``model``, ``processor`` and ``torch`` being
        defined/imported before the function is called.
    """
    instruction = f'''Title: {title}
Rendered Text: {rendered_text}

Use the image and the title and rendered text to create a prompt that can be used as an instruction for image generation models to generate the provided image. 
Don't add too much detail. One example instruction can be:
"Generate an image with the background as .......... and text '.......' written on it."
Don't make the prompt too long — maximum 30 words only. 
Add quotes around the text to be written on the image; everything else should be outside quotes. Also the text that we want to print will be in top to bottom order from image.
Most important point is that the text that we want to print will be from `Rendered Text` only, if the `Rendered Text` does not contain any text then dont print any text also if all the text presented in the `Rendered Text` should be there in generated instruction. The prompt should sound natural and do not mention the position of the text.'''

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": instruction},
                # The Hugging Face processor only loads visuals from entries
                # typed "image"/"video". An OpenAI-style "image_url" entry is
                # not loaded, so the model would receive no pixels at all.
                {
                    "type": "image",
                    "image": f"data:image/png;base64,{image_base64}",
                },
            ],
        }
    ]

    # Preparation for inference
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    try:
        # Inference: generation of the output
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Drop the prompt tokens so that only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
    finally:
        # Release the input tensors and cached GPU blocks even when generation fails.
        del inputs
        torch.cuda.empty_cache()

    return output_text


In [None]:
# Display a single record (index 20) of the "train" split for manual inspection.
# NOTE(review): `ds` is only created by the `load_dataset` cell further down,
# so that cell has to be executed before this one.
ds['train'][20]

In [11]:
import base64
from io import BytesIO
from openai import OpenAI
from datasets import DatasetDict, load_dataset
import torch
# Load the source dataset by its repository id; every split ends up in `ds`.
ds = load_dataset("Pulkit996/Pulkit_dataset")

In [None]:


# --- Define selecting_working_text ---
def selecting_working_text(example, max_texts=2, max_words=8):
    """Keep only the most prominent text element(s) of an example.

    Text entries are ranked by their font size (largest first, ties keep their
    original order). Up to ``max_texts`` distinct, non-blank texts are kept.
    If the top-ranked text has more than ``max_words`` whitespace-separated
    words, it is kept alone. Every other entry of ``example["text"]`` is
    replaced by an empty string, so the list keeps its length and order.

    Args:
        example: Mapping with parallel sequences under ``"font_size"`` and
            ``"text"``. It is modified in place.
        max_texts: Maximum number of distinct texts to keep. Defaults to 2.
        max_words: Word-count threshold above which only the top-ranked text
            is kept. Defaults to 8.

    Returns:
        The same ``example`` object with ``example["text"]`` rewritten.
    """
    font_sizes = example['font_size']
    texts = example['text']

    def _rank(pair):
        # A missing font size sorts last instead of raising TypeError when
        # compared against numeric sizes.
        size = pair[0]
        return float("-inf") if size is None else size

    # sorted() is stable, so equally sized texts keep their original order.
    ranked = sorted(zip(font_sizes, texts), key=_rank, reverse=True)

    # Pick up to `max_texts` distinct non-blank texts, largest font first.
    kept_texts = []
    for _, text in ranked:
        if len(kept_texts) >= max_texts:
            break
        if text and text.strip() and text not in kept_texts:
            kept_texts.append(text)

    # If the top-ranked text is long (more than `max_words` words), keep only it.
    if kept_texts and len(kept_texts[0].split()) > max_words:
        kept_texts = kept_texts[:1]

    # Blank out all other texts while preserving positions.
    example["text"] = [t if t in kept_texts else "" for t in texts]
    return example


# --- Define main function ---
def fun(example):
    """Build the generation prompt for one dataset example.

    The example first goes through ``selecting_working_text``. Its ``preview``
    is then turned into a base64 string: raw ``bytes`` are used as they are,
    anything else is saved as PNG into an in-memory buffer. Finally
    ``generate_fun`` is called with the title, the filtered text and the
    encoded image.

    Returns:
        ``{"text": <filtered text>, "prompt": <generate_fun result>}`` on
        success. If any step raises, the error is printed and
        ``{"prompt": None}`` is returned instead.
    """
    try:
        example = selecting_working_text(example)
        preview, title, rendered_text = (
            example["preview"],
            example["title"],
            example["text"],
        )

        # Anything that is not already raw bytes is serialised to PNG in memory.
        if not isinstance(preview, bytes):
            png_buffer = BytesIO()
            preview.save(png_buffer, format="PNG")
            preview = png_buffer.getvalue()

        encoded_image = base64.b64encode(preview).decode("utf-8")

        return {
            "text": rendered_text,
            "prompt": generate_fun(title, rendered_text, encoded_image),
        }
    except Exception as err:
        print(f"⚠️ Error processing one example: {err}")
        return {"prompt": None}


def add_new_column_sample(dataset_dict, num_samples=5):
    """Run ``fun`` over each split and collect the results in a new ``DatasetDict``.

    Every processed example is merged with the dict returned by ``fun``
    (``{**example, **result}``), and the merged rows are turned back into a
    dataset column by column.

    Args:
        dataset_dict: Mapping of split name to dataset, iterated via ``.items()``.
        num_samples: Maximum number of leading examples to process per split.
            Pass ``None`` to process every example of every split.

    Returns:
        A ``DatasetDict`` with one entry per input split. A split with nothing
        to process is passed through unchanged.
    """
    updated = DatasetDict()

    for split, dset in dataset_dict.items():
        print(f"\nProcessing split: {split} ({len(dset)} samples)")

        # Honour the cap instead of silently ignoring it; None means "all rows".
        if num_samples is None:
            sample = dset
        else:
            sample = dset.select(range(min(num_samples, len(dset))))

        new_data = []
        for i, example in enumerate(sample):
            print(f"🧠 Processing example {i+1}/{len(sample)}...")
            result = fun(example)  # call your function directly

            # Show every 10th result as a cheap sanity check.
            if i % 10 == 0:
                print(result)
            new_data.append({**example, **result})

            # clean GPU memory between runs
            torch.cuda.empty_cache()

        if new_data:
            # Create a new dataset from processed examples (row dicts -> column lists).
            columns = {key: [row[key] for row in new_data] for key in new_data[0]}
            updated[split] = sample.from_dict(columns)
        else:
            # Nothing was processed; indexing new_data[0] would raise IndexError.
            updated[split] = sample
        print(f"✅ Done ({len(updated[split])} samples)")

    return updated

# Run it on the full dataset (num_samples=None disables the per-split cap).
sampled_dataset = add_new_column_sample(ds, num_samples=None)


In [None]:
# Persist the processed splits under the relative path "dataset".
sampled_dataset.save_to_disk("dataset")