<a href="https://colab.research.google.com/github/redrum88/autoevolution/blob/main/step_5_download_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import os
import time
import requests
import urllib.parse
import pandas as pd
from PIL import Image
from requests.exceptions import MissingSchema

# Dataset whose rows describe the car model variants to download images for.
load_df = pd.read_csv(filepath_or_buffer="filtered2.csv")

In [17]:
# Display the column labels of the loaded dataset; the download cell below
# reads these columns from each row.
load_df.columns

Index(['brand', 'model', 'production_years', 'from_year', 'to_year',
       'body_style', 'segment', 'title', 'description', 'engine_specs_title',
       'cylinders', 'displacement', 'power', 'torque', 'fuel_system', 'fuel',
       'fuel_capacity', 'top_speed', 'drive_type', 'gearbox', 'front', 'rear',
       'tire_size', 'length', 'width', 'height', 'front_rear_track',
       'wheelbase', 'cargo_volume', 'unladen_weight', 'highway', 'combined',
       'acceleration', 'aerodynamics', 'city', 'ground_clearance',
       'gross_weight_limit', 'brand_url', 'brand_logo_url', 'model_url',
       'image_urls', 'dir_path'],
      dtype='object')

In [18]:
MAX_IMAGES = 200  # Maximum number of images to download per car model variant

# Total number of image URLs in the dataset. The download cell decrements this
# value to report how many images remain.
# A missing `image_urls` value (NaN) is treated as "no URLs" instead of raising
# AttributeError on `.split`, and empty entries between commas are not counted
# because they cannot be downloaded.
all_images = int(
    load_df['image_urls']
    .fillna('')
    .astype(str)
    .apply(lambda urls: sum(1 for url in urls.split(',') if url.strip()))
    .sum()
)
# Function to create new directory if not exist
def create_directory(path):
    """Create ``path`` (including missing parent directories) if it is absent.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permissions).
    """
    # `exist_ok=True` makes creation atomic with respect to the existence
    # check, avoiding the race between `os.path.exists` and `os.makedirs`.
    os.makedirs(path, exist_ok=True)

# Resize images function
def resize_image(image_path, percent):
    """Scale the image stored at ``image_path`` and overwrite the file.

    Args:
        image_path: Path of the image file to resize in place.
        percent: Target size as a percentage of the current width and height
            (e.g. ``50`` halves both dimensions). Must be greater than zero.

    Raises:
        ValueError: If ``percent`` is not positive.
    """
    if percent <= 0:
        raise ValueError(f"percent must be positive, got {percent!r}")

    with Image.open(image_path) as image:
        width, height = image.size
        # int() truncates, so a small image or a small percentage could yield
        # a 0-pixel side; clamp each side to at least one pixel.
        new_width = max(1, int(width * percent / 100))
        new_height = max(1, int(height * percent / 100))
        resized_image = image.resize((new_width, new_height))

    # Save only after the source file has been closed so the same path is not
    # open for reading while it is being overwritten.
    resized_image.save(image_path)


# Seconds to wait for a server before giving up on a single image; without a
# timeout one stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT_SECONDS = 30
# Pause after each successful download to avoid hammering the image host.
DOWNLOAD_DELAY_SECONDS = 0.5

# Columns copied from each input row into the output record, in output order.
# 'brand', 'model' and 'title' are overwritten with their sanitized forms.
RECORD_COLUMNS = [
    'brand', 'model', 'production_years', 'from_year', 'to_year', 'body_style',
    'segment', 'title', 'description', 'engine_specs_title', 'cylinders',
    'displacement', 'power', 'torque', 'fuel_system', 'fuel', 'fuel_capacity',
    'top_speed', 'drive_type', 'gearbox', 'front', 'rear', 'tire_size',
    'length', 'width', 'height', 'front_rear_track', 'wheelbase',
    'cargo_volume', 'unladen_weight', 'highway', 'combined', 'acceleration',
    'aerodynamics', 'city', 'ground_clearance', 'gross_weight_limit',
    'brand_url', 'brand_logo_url', 'model_url', 'image_urls',
]


def _keep_alnum_and_spaces(text):
    """Return ``text`` without characters that are neither alphanumeric nor whitespace."""
    return ''.join(ch for ch in text if ch.isalnum() or ch.isspace())


def _download_image(url, target_path):
    """Download ``url`` into ``target_path``.

    Returns:
        True if the image was fetched and written, False if the request failed
        (the error is printed and nothing is written to disk).
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Reject 4xx/5xx answers so error pages are not saved as image files.
        response.raise_for_status()
    except MissingSchema:
        print(f"Error: Invalid URL {url!r}: No scheme supplied. Perhaps you meant https://{url}?")
        return False
    except requests.RequestException as error:
        # Timeouts, connection resets, HTTP errors: skip this image and keep
        # going instead of aborting the whole run.
        print(f"Error: Failed to download {url!r}: {error}")
        return False

    with open(target_path, 'wb') as f:
        f.write(response.content)
    return True


downloaded_filenames_all = []  # per row: list of image file names available on disk
images_dir = []                # per row: directory the images were stored in
list_dict = []                 # per row: output record (input columns + file names + dir_path)

# Loop each row in df
for index, row in load_df.iterrows():
    start_time = time.time()  # start timer

    print(f"🔃 {index} / {len(load_df)} | Start downloading {row['model']} images...")

    # Trim brand, model, and title to create path for new folders
    brand = _keep_alnum_and_spaces(row['brand']).replace(' ', '_')
    model = _keep_alnum_and_spaces(row['model'])
    title = _keep_alnum_and_spaces(row['title'])

    # Create directory structure: images/<brand>/<model>/<last word of title>
    directory = os.path.join('images', brand, model.replace(' ', '_'), title.split(' ')[-1])
    create_directory(directory)

    # A missing `image_urls` cell (NaN) means there is nothing to download;
    # casting it to str would otherwise produce the bogus URL 'nan'.
    image_urls = row['image_urls']
    urls = [] if pd.isna(image_urls) else str(image_urls).split(',')

    downloaded_filenames = []
    # Number of files in the target directory. Every successful download adds
    # exactly one new file, so the count is tracked instead of re-listing the
    # directory for every URL.
    files_in_directory = len(os.listdir(directory))

    # Loop each image URL in the row
    for url in urls:
        url = url.strip()
        if not url:
            continue
        # Stop as soon as the directory holds the maximum number of images.
        if files_in_directory >= MAX_IMAGES:
            break

        # Get filename from URL
        filename = os.path.basename(urllib.parse.urlsplit(url).path)
        if not filename:
            # URL path ends with '/': there is no file name to save under.
            continue
        image_path = os.path.join(directory, filename)

        # If file exists then skip it
        if os.path.exists(image_path):
            downloaded_filenames.append(filename)
            continue

        if _download_image(url, image_path):
            downloaded_filenames.append(filename)
            files_in_directory += 1
            time.sleep(DOWNLOAD_DELAY_SECONDS)

    images_dir.append(directory)
    downloaded_filenames_all.append(downloaded_filenames)
    all_images -= len(downloaded_filenames)
    end_time = time.time()  # end timer
    download_time = end_time - start_time

    print(f"✅ {len(list_dict)} / {len(load_df)} | Remaining to download: {all_images} | Took {download_time:.2f} seconds to download {len(downloaded_filenames)} images for model {title}")

    # Copy the input columns, then override/extend with the values computed above.
    record = {column: row[column] for column in RECORD_COLUMNS}
    record.update({
        'brand': brand,
        'model': model,
        'title': title,
        'image_file_names': downloaded_filenames,
        # Normalize Windows separators so the stored path is portable.
        'dir_path': directory.replace('\\', '/'),
    })
    list_dict.append(record)
    print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ')

len(downloaded_filenames_all), len(images_dir), len(list_dict)


🔃 0 / 3814 | Start downloading AC  428 Convertible images...
✅ 0 / 3814 | Remaining to download: 194924 | Took 18.78 seconds to download 24 images for model AC  428 Convertible 19661971
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
🔃 1 / 3814 | Start downloading ACURA MDX images...
✅ 1 / 3814 | Remaining to download: 194891 | Took 25.90 seconds to download 33 images for model ACURA MDX 2021Present
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
🔃 2 / 3814 | Start downloading ACURA MDX ASpec images...
✅ 2 / 3814 | Remaining to download: 194843 | Took 0.00 seconds to download 48 images for model ACURA MDX ASpec 2018Present
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
🔃 3 / 3814 | Start downloading ACURA MDX images...
✅ 3 / 3814 | Remaining to download: 194806 | Took 0.00 seconds to download 37 images for model ACURA MDX 20132016
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
🔃 4 / 3814 | Start downl

KeyboardInterrupt: ignored