<a href="https://colab.research.google.com/github/redrum88/autoevolution/blob/main/step_5_download_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import time
import requests
import urllib.parse
import pandas as pd
from PIL import Image
from requests.exceptions import MissingSchema

# Source dataset. The loop below handles it one row at a time and reads the
# row's 'image urls' column as a comma-separated list of image URLs.
load_df = pd.read_csv("dataset_from_1899_to_2024.csv") # Dataset whose rows list the images to download

# Cap on the number of image files kept per row (per car model variant).
MAX_IMAGES = 35 # Maximum images to download for each car model variant

# Create a directory (and any missing parents) if it does not already exist
def create_directory(path):
    """Ensure that the directory ``path`` exists.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permissions).
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process may create the directory
    # between the check and the makedirs() call.
    os.makedirs(path, exist_ok=True)

# Resize an image file in place to a percentage of its current size
def resize_image(image_path, percent):
    """Scale the image at ``image_path`` by ``percent`` and overwrite the file.

    Both dimensions are multiplied by ``percent / 100`` and truncated to
    whole pixels, e.g. ``percent=50`` halves the width and the height.

    Args:
        image_path: Path of the image file to resize; it is overwritten.
        percent: Scale factor in percent; must be greater than zero.

    Raises:
        ValueError: If ``percent`` is not greater than zero.
    """
    if percent <= 0:
        raise ValueError(f"percent must be greater than zero, got {percent!r}")

    with Image.open(image_path) as image:
        width, height = image.size
        # Clamp to 1 px so a small percentage applied to a small image does
        # not truncate a dimension to zero, which is not a valid image size.
        new_width = max(1, int(width * percent / 100))
        new_height = max(1, int(height * percent / 100))
        resized_image = image.resize((new_width, new_height))

    # Save after the source file handle has been released; the resized copy
    # is an independent in-memory image.
    resized_image.save(image_path)

# Seconds to wait for a server before giving up on one image, so a single
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# (record key, dataset column) pairs copied from each row into its output
# record, in the order the keys appear in the record.
RECORD_COLUMNS = (
    ("brand", "Brand"),
    ("model", "Model"),
    ("production_years", "Production years"),
    ("from_year", "From year"),
    ("to_year", "To year"),
    ("body_style", "Body style"),
    ("segment", "Segment"),
    ("title", "Title"),
    ("description", "Description"),
    ("engine_specs_title", "Engine specs title"),
    ("cylinders", "Cylinders"),
    ("displacement", "Displacement"),
    ("power", "Power"),
    ("torque", "Torque"),
    ("fuel_system", "Fuel System"),
    ("fuel", "Fuel"),
    ("fuel_capacity", "Fuel capacity"),
    ("top_speed", "Top Speed"),
    ("drive_type", "Drive Type"),
    ("gearbox", "Gearbox"),
    ("front", "Front"),
    ("rear", "Rear"),
    ("tire_size", "Tire Size"),
    ("length", "Length"),
    ("width", "Width"),
    ("height", "Height"),
    ("front_rear_track", "Front/rear Track"),
    ("wheelbase", "Wheelbase"),
    ("cargo_volume", "Cargo Volume"),
    ("unladen_weight", "Unladen Weight"),
    ("highway", "Highway"),
    ("combined", "Combined"),
    ("acceleration", "Acceleration 0-62 Mph (0-100 kph)"),
    ("aerodynamics", "Aerodynamics (Cd)"),
    ("city", "City"),
    ("ground_clearance", "Ground Clearance"),
    ("gross_weight_limit", "Gross Weight Limit"),
    ("brand_url", "Brand URL"),
    ("brand_logo_url", "Brand Logo URL"),
    ("model_url", "Model URL"),
    ("image_urls", "image urls"),
)


def _keep_alnum_and_spaces(text):
    """Return ``text`` without characters that are neither alphanumeric nor whitespace."""
    return ''.join(ch for ch in text if ch.isalnum() or ch.isspace())


def _download_image(url, target_path):
    """Download ``url`` into ``target_path``.

    The file is only written after the server answered with a success
    status, so HTTP error pages are never stored as images.

    Returns:
        True if the file was written, False if the request failed (the
        failure is printed and the caller can move on to the next URL).
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except MissingSchema:
        print(f"Error: Invalid URL {url!r}: No scheme supplied. Perhaps you meant https://{url}?")
        return False
    except requests.exceptions.RequestException as error:
        # Timeouts, connection errors and HTTP error statuses: skip this
        # image instead of aborting the whole run.
        print(f"Error: Failed to download {url!r}: {error}")
        return False

    with open(target_path, 'wb') as f:
        f.write(response.content)
    return True


total_images = 0               # Images downloaded during this run, across all rows
downloaded_filenames_all = []  # Per row: file names present on disk for that row
images_dir = []                # Per row: directory the row's images are stored in
list_dict = []                 # Per row: dataset values plus image file names and directory
total_rows = len(load_df)

# Loop over each row in the dataset
for index, row in load_df.iterrows():
    print(f"{index} / {total_rows} | Start downloading {row['Model']} images...")

    # Strip brand, model and title down to characters that are safe in folder names
    brand = _keep_alnum_and_spaces(row['Brand']).replace(' ', '_')
    model = _keep_alnum_and_spaces(row['Model'])
    title = _keep_alnum_and_spaces(row['Title'])

    # Directory layout: images/<brand>/<model>/<last word of the title>
    directory = os.path.join('images', brand, model.replace(' ', '_'), title.split(' ')[-1])
    create_directory(directory)
    downloaded_filenames = []

    # A missing cell is read as NaN; treat it as "no URLs" rather than as the URL 'nan'
    image_urls = row['image urls']
    raw_urls = '' if pd.isna(image_urls) else str(image_urls)

    # Count the directory once and keep the count up to date ourselves
    # instead of listing the directory again for every URL.
    files_on_disk = len(os.listdir(directory))

    for url in raw_urls.split(','):
        url = url.strip()
        if not url:
            continue

        # Get filename from URL; a path ending in '/' has no file name to save under
        filename = os.path.basename(urllib.parse.urlsplit(url).path)
        if not filename:
            continue
        image_path = os.path.join(directory, filename)

        # Record files that are already on disk BEFORE applying the cap, so
        # that re-running over a fully downloaded directory still lists them.
        if os.path.exists(image_path):
            downloaded_filenames.append(filename)
            continue

        # Directory is full: download nothing more, but keep scanning so
        # later URLs whose files already exist are still recorded.
        if files_on_disk >= MAX_IMAGES:
            continue

        if not _download_image(url, image_path):
            continue

        # Optional: resize_image(image_path, 50) here would halve the saved image.
        downloaded_filenames.append(filename)
        files_on_disk += 1
        total_images += 1
        time.sleep(0.5)  # Short pause between downloads to go easy on the server

    images_dir.append(directory)
    downloaded_filenames_all.append(downloaded_filenames)
    print(f"{len(list_dict)} / {total_rows} | Total images: {total_images} | Images downloaded {len(downloaded_filenames)} for model {title}")

    record = {key: row[column] for key, column in RECORD_COLUMNS}
    # Store the sanitized names; updating existing keys keeps their position.
    record.update(brand=brand, model=model, title=title)
    record['image_file_names'] = downloaded_filenames
    record['dir_path'] = directory.replace('\\', '/')
    list_dict.append(record)
    print(len(list_dict))

len(downloaded_filenames_all), len(images_dir),len(list_dict)
