In [1]:
import requests
from PIL import Image
from io import BytesIO
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor 

In [2]:
from concurrent.futures import as_completed
from tqdm import tqdm

In [6]:
# --- Download settings -------------------------------------------------------
# Text files that list the image URLs to fetch (read line by line by the driver).
INPUT_FILES = [
    "image_urls1.txt",
    "image_urls2.txt",
    "image_urls3.txt",
    "dataset4.txt",
    "dataset5.txt",
    "dataset6.txt",
]
# Folder that receives the saved images.
OUTPUT_DIR = "downloaded_images"
# Fallback format: its lowercase name is used as the file extension when the
# URL does not end in a recognised image extension.
IMAGE_FORMAT = "JPEG"
# Value passed as `quality` to Image.save.
QUALITY = 90
# Number of worker threads used for the downloads.
THREADS = 8

# Create the output folder up front; an already-existing folder is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def _extension_from_url(url):
    """Pick the file extension to save under, based on the URL text.

    Two candidates are tried in order and the first recognised one wins:
      1. the text after the last '.' in the whole URL (the original heuristic),
      2. the extension of the URL's path component, which ignores any
         '?query' or '#fragment' suffix (e.g. '.../a.png?w=200' -> 'png').
    If neither is a known image extension, the lowercase IMAGE_FORMAT is used.
    """
    # Local import keeps this block self-contained; only this helper needs it.
    from urllib.parse import urlsplit

    known_extensions = ('jpg', 'jpeg', 'png', 'gif', 'webp')
    candidates = (
        url.split('.')[-1],
        os.path.splitext(urlsplit(url).path)[1].lstrip('.'),
    )
    for candidate in candidates:
        candidate = candidate.lower()
        if candidate in known_extensions:
            return candidate
    return IMAGE_FORMAT.lower()


def process_image(url, idx):
    """Download one image and save it into OUTPUT_DIR.

    Args:
        url: Address of the image to fetch.
        idx: Integer used to build the file name ``image_<idx:04d>.<ext>``.

    Returns:
        A dict with keys ``url``, ``path`` and ``status``. On success ``path``
        is the saved file path and ``status`` is ``"success"``; on any
        ``Exception`` ``path`` is ``None`` and ``status`` is
        ``"failed: <error>"``. Exceptions are never propagated, so one bad URL
        does not abort a batch.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        file_ext = _extension_from_url(url)
        img_name = f"image_{idx:04d}.{file_ext}"
        img_path = os.path.join(OUTPUT_DIR, img_name)

        # The context manager releases the decoder's resources once saved.
        with Image.open(BytesIO(response.content)) as img:
            to_save = img
            # Pillow picks the output format from the file extension. Its JPEG
            # writer rejects modes such as RGBA / P / LA, so convert those to
            # RGB first (any transparency is dropped) instead of failing.
            if file_ext in ('jpg', 'jpeg') and img.mode not in ('L', 'RGB', 'CMYK'):
                to_save = img.convert('RGB')
            to_save.save(img_path, quality=QUALITY)

        return {"url": url, "path": img_path, "status": "success"}
    except Exception as e:
        return {"url": url, "path": None, "status": f"failed: {e}"}

if __name__ == "__main__":
    # Gather every non-blank, whitespace-trimmed line from each URL list file.
    urls = []
    for list_path in INPUT_FILES:
        with open(list_path, "r") as url_file:
            trimmed_lines = (raw_line.strip() for raw_line in url_file)
            urls.extend(entry for entry in trimmed_lines if entry)

    # Fan the downloads out over a thread pool. Each URL's position in `urls`
    # is passed along as its image index.
    metadata = []
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        futures = [
            executor.submit(process_image, url, idx)
            for idx, url in enumerate(urls)
        ]

        # Results are collected as tasks finish, so rows follow completion
        # order rather than input order.
        progress = tqdm(as_completed(futures), total=len(futures), desc="Downloading images")
        for finished in progress:
            metadata.append(finished.result())

    # Persist one row per URL (url, path, status) for later inspection.
    pd.DataFrame(metadata).to_csv("download_metadata.csv", index=False)
    print(f"Done! Downloaded images saved in '{OUTPUT_DIR}/'")
    print(f"Metadata saved to 'download_metadata.csv'")

Downloading images: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 638/638 [05:17<00:00,  2.01it/s]

Done! Downloaded images saved in 'downloaded_images/'
Metadata saved to 'download_metadata.csv'



