In [None]:
import multiprocessing
import os
import time
import urllib
import urllib.request
from functools import partial

import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm

# CSV read by main(); its first column is used as the DataFrame index.
INPUT_CSV = "test.csv"
# Manifest CSV that main() creates or appends to, one row per processed image.
OUTPUT_CSV = "test_data.csv"
# Folder where downloaded images are stored as "<index>.jpg".
IMAGE_DIR = "./test_images/"
# Number of input rows handled per batch in main().
BATCH_SIZE = 100

def create_placeholder_image(image_save_path):
    """Write a 100x100 solid-black RGB image to ``image_save_path``.

    Used as a stand-in when an image could not be downloaded. Any error
    raised while building or saving the image is reported on stdout and
    not propagated, so the function always returns ``None``.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
    except Exception as error:
        print(f"Failed to create placeholder image: {error}")

def download_image(image_link, index, save_folder, retries=3, delay=3):
    """Download one image to ``<save_folder>/<index>.jpg``.

    Args:
        image_link: URL of the image. Non-string values (e.g. NaN from a
            DataFrame) are ignored and nothing is written.
        index: Identifier used to build the target file name.
        save_folder: Existing directory the image is written into.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        None. If the target file already exists nothing is done. If every
        attempt fails, a placeholder image is written to the target path.
    """
    if not isinstance(image_link, str):
        return

    image_save_path = os.path.join(save_folder, f"{index}.jpg")
    if os.path.exists(image_save_path):
        return

    # Download into a scratch file and rename on success, so an interrupted
    # transfer never leaves a truncated "<index>.jpg" that the existence
    # check above would treat as already downloaded on the next run.
    scratch_path = f"{image_save_path}.part"

    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, scratch_path)
            os.replace(scratch_path, image_save_path)
            return
        except Exception:
            # Best-effort download: any failure just triggers a retry, but
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            try:
                os.remove(scratch_path)
            except OSError:
                # No scratch file was created (or it is already gone).
                pass
            # Waiting after the final attempt would only waste time.
            if attempt < retries - 1:
                time.sleep(delay)

    create_placeholder_image(image_save_path)

def _download_image_task(link_and_index, save_folder, retries, delay):
    """Unpack one ``(image_link, index)`` pair and pass it to ``download_image``."""
    image_link, index = link_and_index
    download_image(image_link, index, save_folder=save_folder, retries=retries, delay=delay)


def download_images(image_links, indices, download_folder, allow_multiprocessing=True, num_workers=64):
    """Download every image in ``image_links`` into ``download_folder``.

    Args:
        image_links: Sequence of image URLs.
        indices: Sequence of identifiers, paired element-wise with
            ``image_links``; each one names the saved file.
        download_folder: Target directory; created if it does not exist.
        allow_multiprocessing: When true, downloads run in a process pool;
            otherwise they run sequentially in the calling process.
        num_workers: Upper bound on the number of pool processes.

    Returns:
        None.
    """
    os.makedirs(download_folder, exist_ok=True)

    jobs = list(zip(image_links, indices))
    if not jobs:
        return

    if allow_multiprocessing:
        task = partial(
            _download_image_task, save_folder=download_folder, retries=3, delay=3)

        # Never start more processes than there are downloads.
        worker_count = max(1, min(num_workers, len(jobs)))
        with multiprocessing.Pool(worker_count) as pool:
            # imap_unordered yields as each download finishes, so the bar
            # advances live; starmap would only return once all are done.
            for _ in tqdm(pool.imap_unordered(task, jobs), total=len(jobs)):
                pass
    else:
        for image_link, index in tqdm(jobs, total=len(jobs)):
            download_image(image_link, index, save_folder=download_folder, retries=3, delay=3)

def process_images(df, start_index):
    """Download the images for one batch and build its manifest records.

    Args:
        df: Batch DataFrame with ``image_link`` and ``entity_name`` columns;
            its index values name the downloaded files.
        start_index: Accepted for the caller's convenience; not used here.

    Returns:
        A list with one dict per row of ``df`` holding the keys ``index``,
        ``image_path`` and ``user_input``. A record is produced for every
        row; the image file itself is not checked.
    """
    os.makedirs(IMAGE_DIR, exist_ok=True)

    # Fetch the whole batch first; files are named after the original index.
    download_images(
        df['image_link'].tolist(),
        df.index.tolist(),
        IMAGE_DIR,
        allow_multiprocessing=True,
    )

    rows = tqdm(df.itertuples(), desc='Processing Images', unit='img', ncols=100, leave=False)
    return [
        {
            'index': record.Index,
            'image_path': os.path.join(IMAGE_DIR, f"{record.Index}.jpg"),
            'user_input': f"extract the {record.entity_name} from given image",
        }
        for record in rows
    ]

def main():
    """Process all not-yet-handled rows of ``INPUT_CSV`` in batches.

    Reads ``INPUT_CSV`` (first column as index), skips every row whose index
    is already listed in the ``index`` column of ``OUTPUT_CSV`` (if that file
    exists), and runs the remaining rows through ``process_images`` in chunks
    of ``BATCH_SIZE``. Each batch is appended to ``OUTPUT_CSV`` as soon as it
    finishes, so an interrupted run can be resumed.
    """
    df = pd.read_csv(INPUT_CSV, index_col=0)  # Ensure index column is used

    if os.path.exists(OUTPUT_CSV):
        existing_df = pd.read_csv(OUTPUT_CSV)
        # Select pending rows by index label. Converting the last processed
        # label into a positional offset is only valid for a 0..n-1 index and
        # produces NaN once every row has been processed.
        pending_df = df[~df.index.isin(existing_df['index'])]
    else:
        pending_df = df

    total_images = len(pending_df)
    if total_images == 0:
        print("No new images to process.")
        return

    num_batches = (total_images + BATCH_SIZE - 1) // BATCH_SIZE
    with tqdm(total=num_batches, desc='Processing Batches', ncols=100, unit='batch') as batch_progress:
        for batch_start in range(0, total_images, BATCH_SIZE):
            batch_df = pending_df.iloc[batch_start:batch_start + BATCH_SIZE]

            images_data = process_images(batch_df, batch_start)

            batch_df_output = pd.DataFrame(images_data)
            # Only the write that creates the file emits the header row.
            write_header = not os.path.exists(OUTPUT_CSV)
            batch_df_output.to_csv(OUTPUT_CSV, mode='a', header=write_header, index=False)

            batch_progress.update(1)

# Run the pipeline only when executed as a script. The guard also matters
# because download_images() starts a multiprocessing.Pool: worker processes
# that re-import this module must not call main() again.
if __name__ == "__main__":
    main()