# Data Generation Script for Training CLIP

## Imports

In [None]:
import os
import json
import random
import string
from PIL import Image
import pytesseract
import easyocr
import numpy as np
import torch
from multiprocessing import set_start_method
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from threading import Lock
from functions import *

## Global Variables

In [None]:
# --- Input / output locations -------------------------------------------
# in_dir must contain "result.json" and an "images" sub-directory; both are
# read by later cells.
# out_dir is the root under which one sub-directory per category is created.
in_dir, out_dir = (
    "/mnt/nis_lab_research/data/coco_files/raw/shah_b1_539_21",
    "/mnt/nis_lab_research/data/clip_data/test",
)

# --- Rendering parameters -----------------------------------------------
# Output width/height and background colour are forwarded unchanged to
# process_image_annotation in the main cell.
out_res_w, out_res_h = 224, 224
bg_color = "white"

# NOTE(review): `padding` is not referenced by any cell visible in this
# notebook; presumably it is consumed inside `functions` -- TODO confirm.
padding = 0.05

## Preprocessing

In [None]:
# Load the annotation export. The encoding is given explicitly so that
# decoding does not depend on the platform's locale default (JSON text is
# UTF-8 by specification).
with open(os.path.join(in_dir, "result.json"), encoding="utf-8") as f:
    obj = json.load(f)

In [None]:
# Pull the three top-level collections out of the loaded JSON object in one
# step; each is used by later cells.
img_list, cat_list, ann_list = (
    obj["images"],
    obj["categories"],
    obj["annotations"],
)

In [None]:
# Alphabetically sorted list of category names. Built in a single pass
# instead of appending in a loop and re-sorting afterwards.
cat_map = sorted(category["name"] for category in cat_list)

## Main

In [None]:
# Create the output root and one sub-directory per category.
# exist_ok=True makes both steps idempotent and avoids the race between an
# explicit existence check and the directory creation.
os.makedirs(out_dir, exist_ok=True)
for cat in cat_list:
    os.makedirs(os.path.join(out_dir, cat["name"]), exist_ok=True)

In [None]:
# Sanity check: print whether PyTorch reports a usable CUDA device.
print(torch.cuda.is_available())

In [None]:
# Ask PyTorch to release GPU memory it is holding in its cache.
# NOTE(review): presumably done so the worker processes started below have
# GPU memory available -- TODO confirm this is still needed.
torch.cuda.empty_cache()

In [None]:
# Use the 'spawn' start method for worker processes; force=True overrides a
# start method that may already have been set earlier in this session
# (e.g. when the cell is re-run).
set_start_method('spawn', force=True)

# Kept so that any other cell referencing `print_lock` keeps working. The
# completion loop below runs only in the main thread, so it does not need to
# take the lock.
print_lock = Lock()


def group_annotations_by_image(annotations):
    """Index annotations by the image they belong to.

    Args:
        annotations: Iterable of dicts, each having an "image_id" key.

    Returns:
        A dict mapping each image id to a list of ``(position, annotation)``
        pairs, where ``position`` is the annotation's index in
        ``annotations``. Pairs keep their original relative order.
    """
    grouped = {}
    for position, annotation in enumerate(annotations):
        grouped.setdefault(annotation["image_id"], []).append((position, annotation))
    return grouped


if __name__ == "__main__":

    # One pass over the annotations instead of rescanning the whole list for
    # every image.
    anns_by_image = group_annotations_by_image(ann_list)

    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:

        # Phase 1: submit one task per (image, annotation) pair. Nothing
        # blocks here, so the pool can work on all images concurrently.
        futures = {}
        for i, img in enumerate(img_list):
            print(i)
            img_file = os.path.basename(img["file_name"])
            # splitext handles extensions of any length (".jpeg", ".png", ...)
            # rather than assuming exactly three characters.
            img_bn = os.path.splitext(img_file)[0]
            img_fp = os.path.join(in_dir, "images", img_file)
            for j, ann in anns_by_image.get(img["id"], ()):
                # j is the annotation's index in ann_list, as before.
                future = executor.submit(
                    process_image_annotation,
                    img_fp, out_dir, img_bn, ann, cat_map, ann["category_id"],
                    bg_color, out_res_w, out_res_h, j,
                )
                futures[future] = (i, j)

        # Phase 2: collect results exactly once, after everything has been
        # submitted. Waiting inside the image loop would re-iterate every
        # earlier future for each image and stall submission of later images.
        completed = 0
        for future in as_completed(futures):
            try:
                # The return value is not used; calling result() re-raises
                # any exception that occurred in the worker.
                future.result()
            except Exception as exc:
                # Best-effort batch job: report the failure and keep going.
                print(f"Generated an exception: {exc}")
            else:
                completed += 1
                print(f"Completed images: {completed}, {futures[future]}")
    