In [1]:
# Pillow provides the PIL.Image module that compress_image (defined further down) relies on.
%pip install pillow

Note: you may need to restart the kernel to use updated packages.


In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests
import os
import uuid


# Browser setup. Both paths are relative to the notebook's working directory.
# Path to the chromedriver executable
chromedriver_path = './chrome-win64/chromedriver.exe'
# Path to the Chrome binary that chromedriver should launch
chrome_binary_path = './chrome-win64/chrome.exe'

service = Service(executable_path=chromedriver_path)
options = Options()
options.binary_location = chrome_binary_path

# Starts a Chrome session that the scraping code below drives.
driver = webdriver.Chrome(service=service, options=options)

# Reusable explicit wait with a 10-second timeout. Nothing is waited on here;
# the timeout applies each time wait.until(...) is called.
wait = WebDriverWait(driver, 10)

def download_image(url, path, timeout=30):
    """Download ``url`` and write the response body to ``path``.

    Args:
        url: Address of the image to fetch.
        path: Destination file path. A missing parent directory is created.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        True when the server answered with HTTP 200 and the file was written,
        False otherwise (nothing is written in that case).

    Raises:
        requests.RequestException: On connection problems or timeouts.
        OSError: If the file cannot be written.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False

    # The destination folder is not guaranteed to exist yet.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'wb') as f:
        f.write(response.content)
    return True

# Crawl the listing pages, open every spot and save the image found on its page.
try:
    # The images folder must exist before the first download is attempted.
    os.makedirs('./images', exist_ok=True)

    for i in range(1, 2070):  # Adjust the range according to the maximum number of pages
        page_url = f"https://www.autogespot.com/spots/Switzerland/{i}"
        driver.get(page_url)
        # Collect plain href strings before navigating away, to avoid stale element references
        car_links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//li/div[contains(@class, 'spot')]/a[contains(@class, 'cover-img-parent')]")))
        urls = [link.get_attribute('href') for link in car_links]
        print(urls)

        for url in urls:
            driver.get(url)
            try:
                img_element = driver.find_element(By.XPATH, "//article/ul/li/img")
                image_url = img_element.get_attribute("data-src")
                if not image_url:
                    # Without a data-src attribute there is nothing to download.
                    print(f"No image URL found: {url}")
                    continue
                random_filename = f"{uuid.uuid4()}.jpg"
                download_image(image_url, os.path.join('./images', random_filename))
            except Exception as exc:
                # Catch Exception (not a bare except) so Ctrl-C still stops the crawl,
                # and report the reason so failures can be diagnosed.
                print(f"Error scraping: {url} ({exc})")

finally:
    # Always close the browser, even when the crawl is interrupted.
    driver.quit()

In [2]:
import requests
from bs4 import BeautifulSoup
import os
import uuid
from concurrent.futures import ThreadPoolExecutor

def _safe_filename_part(text):
    """Replace characters that are not allowed in file names with underscores."""
    forbidden = '<>:"/\\|?*'
    return "".join("_" if ch in forbidden or ord(ch) < 32 else ch for ch in str(text))


def download_image(image_url, image_directory, brand, car_name):
    """Download one image into ``image_directory``.

    The file is named ``<brand>_<car_name>_<uuid4>.jpg``. Brand and car name
    are scraped text, so path separators and other characters that are invalid
    in file names are replaced with underscores first.

    Args:
        image_url: Address of the image to fetch.
        image_directory: Existing directory the file is written to.
        brand: Brand text used as the first part of the file name.
        car_name: Model text used as the second part of the file name.

    Returns:
        None. Success and failure are reported via ``print``; exceptions are
        caught so a single failed download does not stop the caller.
    """
    try:
        # A timeout keeps one stalled connection from blocking the whole run.
        response = requests.get(image_url, timeout=30)
        if response.status_code == 200:
            filename = f"{_safe_filename_part(brand)}_{_safe_filename_part(car_name)}_{uuid.uuid4()}.jpg"
            filepath = os.path.join(image_directory, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded {filepath}")
        else:
            print(f"Failed to download {image_url}: Status code {response.status_code}")
    except Exception as e:
        print(f"Error downloading image {image_url}: {str(e)}")

def process_page(page_url, image_directory):
    """Download every photo of every car spot listed on one overview page.

    Each spot is handled independently: a spot with unexpected markup or a
    failing request is reported and skipped, and the remaining spots on the
    page are still processed.

    Args:
        page_url: URL of the overview page to parse.
        image_directory: Directory passed on to ``download_image``.

    Returns:
        None. Problems are reported via ``print``.
    """
    try:
        response = requests.get(page_url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        car_spots = soup.find_all("div", class_="bg-white p1 sm-p2 spot")
    except Exception as e:
        print(f"Error processing page {page_url}: {str(e)}")
        return

    for spot in car_spots:
        try:
            brand_model_element = spot.find("strong", class_="truncate")
            if not brand_model_element:
                continue

            # Exactly two links are expected here: brand first, then model.
            labels = [item.get_text(strip=True) for item in brand_model_element.find_all('a')]
            if len(labels) != 2:
                print(f"Skipping spot on {page_url}: expected brand and model links, found {len(labels)}")
                continue
            brand, car_name = labels

            spot_link = spot.find("a", href=True)
            if spot_link is None:
                print(f"Skipping spot on {page_url}: no link to the spot page")
                continue

            for img_url in fetch_image_urls_from_spot(spot_link['href']):
                download_image(img_url, image_directory, brand, car_name)
        except Exception as e:
            print(f"Error processing spot on {page_url}: {str(e)}")

def fetch_image_urls_from_spot(spot_url):
    """Return the ``data-src`` URLs of the photos on a spot detail page.

    Only ``<img>`` tags that have class ``photo`` and a ``data-src``
    attribute are considered.

    Args:
        spot_url: URL of the spot detail page.

    Returns:
        A list of URL strings; empty when the page has no matching images or
        when the request fails (the failure is reported via ``print``).
    """
    try:
        # The timeout avoids hanging forever on a stalled connection; an HTTP
        # error status is reported instead of parsing the error page.
        response = requests.get(spot_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        image_elements = soup.find_all("img", attrs={"data-src": True, "class": "photo"})
        return [image_element["data-src"] for image_element in image_elements]
    except Exception as e:
        print(f"Error fetching images from {spot_url}: {str(e)}")
        return []



In [None]:
# Multithread
# Crawl all overview pages through a thread pool. The pool size is max_workers;
# with the current value of 1 the pages are processed one after another.
base_url = "https://www.autogespot.com/spots/Switzerland/"
image_directory = './images'
os.makedirs(image_directory, exist_ok=True)

# Adjust range as necessary
urls = [base_url + str(page_number) for page_number in range(1, 2070)]

with ThreadPoolExecutor(max_workers=1) as executor:
    # map() pairs every URL with the same target directory: process_page(url, image_directory)
    executor.map(process_page, urls, [image_directory] * len(urls))

In [None]:
# Single Thread
# Same crawl as the threaded variant, but as a plain sequential loop.
image_directory = './images'
os.makedirs(image_directory, exist_ok=True)

base_url = "https://www.autogespot.com/spots/Switzerland/"
# Adjust range as necessary
urls = ["{}{}".format(base_url, page_number) for page_number in range(1, 2070)]

for url in urls:
    process_page(url, image_directory)

In [26]:
def get_license_plate_number(image_url, api_token):
    """Read the license plate in a remote image via the Plate Recognizer API.

    The image is not downloaded locally; its URL is sent as ``upload_url``
    together with the region code ``ch``.

    Args:
        image_url: Publicly reachable URL of the image.
        api_token: API token sent in the ``Authorization`` header.

    Returns:
        The ``plate`` value of the first result, or None when the response
        contains no results or the request fails (the failure is printed).
    """
    api_endpoint = 'https://api.platerecognizer.com/v1/plate-reader/'
    headers = {'Authorization': f'Token {api_token}'}
    data = {'upload_url': image_url, 'regions': 'ch'}
    try:
        # The timeout keeps a stalled request from blocking the notebook.
        response = requests.post(api_endpoint, headers=headers, data=data, timeout=60)
        response.raise_for_status()
        print(response.text)
        results = response.json().get('results') or []
        return results[0]['plate'] if results else None
    except Exception as e:
        # Report the reason instead of hiding it; callers still receive None.
        print(f"Error: {e}")
        return None

import os

# Read the token from the environment so it never has to be pasted into the
# notebook; the placeholder is only used when the variable is not set.
api_token = os.environ.get('PLATE_RECOGNIZER_API_TOKEN', '...')  # Replace with your actual API token
image_url = 'https://spots.ag/2024/04/10/ferrari-812-gts-c830110042024004411_1.jpg?1712702690'
plate_number = get_license_plate_number(image_url, api_token)
print(plate_number)

{"processing_time":62.547,"results":[{"box":{"xmin":1624,"ymin":1221,"xmax":1805,"ymax":1274},"plate":"zh917866","region":{"code":"ch","score":0.581},"score":0.9,"candidates":[{"score":0.9,"plate":"zh917866"}],"dscore":0.764,"vehicle":{"score":0.968,"type":"Sedan","box":{"xmin":394,"ymin":584,"xmax":2097,"ymax":1365}}}],"filename":"1845_X2VXj_ferrari-812-gts-c830110042024004411_1.jpg","version":1,"camera_id":null,"timestamp":"2024-04-10T18:45:35.026088Z"}
zh917866


In [2]:
def get_license_plate_number(image_url, api_token):
    """Read the license plate in a remote image via the Plate Recognizer API.

    The image is not downloaded locally; its URL is sent as ``upload_url``
    together with the region code ``ch``.

    Args:
        image_url: Publicly reachable URL of the image.
        api_token: API token sent in the ``Authorization`` header.

    Returns:
        The ``plate`` value of the first result, or None when the response
        contains no results or the request fails (the failure is printed).
    """
    api_endpoint = 'https://api.platerecognizer.com/v1/plate-reader/'
    headers = {'Authorization': f'Token {api_token}'}
    data = {'upload_url': image_url, 'regions': 'ch'}
    try:
        # The timeout keeps a stalled request from blocking the notebook.
        response = requests.post(api_endpoint, headers=headers, data=data, timeout=60)
        response.raise_for_status()
        print(response.text)
        results = response.json().get('results') or []
        return results[0]['plate'] if results else None
    except Exception as e:
        # Report the reason instead of hiding it; callers still receive None.
        print(f"Error: {e}")
        return None

In [3]:
from io import BytesIO
import requests

def download_image(url, timeout=30):
    """Fetch ``url`` and return its body as an in-memory binary stream.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the run indefinitely.

    Returns:
        A ``BytesIO`` holding the response body, positioned at the start.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BytesIO(response.content)

In [4]:
from PIL import Image

def compress_image(image_bytes, max_size_mb=3, step=5, quality=85):
    """Shrink an image until its encoded size is at most ``max_size_mb``.

    While the buffer is larger than the limit, the image is scaled down by
    ``step`` percent per side and re-encoded as JPEG. An image that is already
    small enough is returned unchanged (same buffer, original format).

    Args:
        image_bytes: ``BytesIO`` containing the encoded image.
        max_size_mb: Size limit in mebibytes.
        step: Percentage by which width and height are reduced per round.
        quality: JPEG quality used when re-encoding.

    Returns:
        A ``BytesIO`` positioned at the start, ready to be read or uploaded.
        If the image cannot be shrunk any further (a side would drop below one
        pixel, or ``step`` does not reduce the size) the last encoding is
        returned even though it may still exceed the limit.
    """
    max_bytes = max_size_mb * 1024 * 1024
    scale = (100 - step) / 100
    img = Image.open(image_bytes)

    while image_bytes.getbuffer().nbytes > max_bytes:
        # JPEG cannot store alpha or palette data, so convert before re-encoding.
        if img.mode not in ('RGB', 'L', 'CMYK'):
            img = img.convert('RGB')

        new_size = (int(img.width * scale), int(img.height * scale))
        cannot_shrink = new_size[0] >= img.width and new_size[1] >= img.height
        if min(new_size) < 1 or cannot_shrink:
            # Stop instead of looping forever or asking for a zero-sized image.
            break

        img = img.resize(new_size)
        image_bytes = BytesIO()
        img.save(image_bytes, format='JPEG', quality=quality)

    # Image.open() and save() both move the stream position; rewind so the
    # next reader sees the complete image rather than a truncated or empty one.
    image_bytes.seek(0)
    return image_bytes

In [7]:
def get_license_plate_number_bytes(image_bytes, api_token):
    """Read the license plate in an uploaded image via the Plate Recognizer API.

    Args:
        image_bytes: Image data to upload, typically a ``BytesIO``. It is sent
            as ``image.jpg`` with content type ``image/jpeg``.
        api_token: API token sent in the ``Authorization`` header.

    Returns:
        The ``plate`` value of the first result, or None when the response
        contains no results or the request fails (the failure is printed).
    """
    # Pause for one second before every request (presumably to stay within the
    # API's rate limit - confirm against the account's plan).
    time.sleep(1)
    api_endpoint = 'https://api.platerecognizer.com/v1/plate-reader/'
    headers = {'Authorization': f'Token {api_token}'}
    data = {'regions': 'ch'}
    try:
        # Rewind so the whole image is uploaded even if the stream was read before.
        if hasattr(image_bytes, 'seek'):
            image_bytes.seek(0)
        files = {'upload': ('image.jpg', image_bytes, 'image/jpeg')}  # Assume the image is JPEG
        # The timeout keeps a stalled request from blocking the run.
        response = requests.post(api_endpoint, headers=headers, files=files, data=data, timeout=60)
        response.raise_for_status()
        print(response.text)
        results = response.json().get('results') or []
        return results[0]['plate'] if results else None
    except Exception as e:
        print(f"Error: {e}")
        return None

In [None]:
'''
Create an Excel sheet with the brand, model, and license plate of each spotted car

'''
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def process_page(page_url, car_details, api_token):
    """Collect brand, model and license plate for every spot on one overview page.

    For each spot the preview image is downloaded and sent to the plate
    recognition API. One dict per spot is appended to ``car_details``. Spots
    are handled independently, so a malformed or failing spot is reported and
    skipped without losing the rest of the page.

    The accumulated rows are written to ``car_details.xlsx`` once per page
    (also when the page is interrupted part-way) rather than after every row,
    which would rewrite the whole, ever-growing workbook for each entry.

    Args:
        page_url: URL of the overview page to parse.
        car_details: List that receives the result dicts; mutated in place.
        api_token: Token passed on to ``get_license_plate_number_bytes``.

    Returns:
        None. Problems are reported via ``print``.
    """
    try:
        response = requests.get(page_url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        car_spots = soup.find_all("div", class_="bg-white p1 sm-p2 spot")
    except Exception as e:
        print(f"Error processing page {page_url}: {str(e)}")
        return

    rows_before = len(car_details)
    try:
        for spot in car_spots:
            try:
                brand_model_element = spot.find("strong", class_="truncate")
                if not brand_model_element:
                    continue

                # Exactly two links are expected here: brand first, then model.
                labels = [item.get_text(strip=True) for item in brand_model_element.find_all('a')]
                if len(labels) != 2:
                    print(f"Skipping spot on {page_url}: expected brand and model links, found {len(labels)}")
                    continue
                brand, car_name = labels

                spot_link = spot.find("a", href=True)
                preview = spot.find("img")
                if spot_link is None or preview is None or not preview.get('src'):
                    print(f"Skipping spot on {page_url}: missing spot link or preview image")
                    continue
                spot_url = spot_link['href']
                preview_url = preview['src']

                # The plate is read from the preview image. The full-size photo could be
                # used instead via fetch_image_urls_from_spot(spot_url)[0] + compress_image.
                preview_bytes = download_image(preview_url)
                license_plate = get_license_plate_number_bytes(preview_bytes, api_token)

                car_details.append({
                    "Brand": brand,
                    "Model": car_name,
                    "Image URL": preview_url,
                    "Spot URL": spot_url,
                    "License Plate": license_plate
                })
            except Exception as e:
                print(f"Error processing spot on {page_url}: {str(e)}")
    finally:
        # Checkpoint once per page, and only if this page added something.
        if len(car_details) > rows_before:
            try:
                pd.DataFrame(car_details).to_excel('car_details.xlsx', index=False)
            except Exception as e:
                print(f"Error processing page {page_url}: {str(e)}")

def fetch_image_urls_from_spot(spot_url):
    """Return the ``data-src`` URLs of the photos on a spot detail page.

    Only ``<img>`` tags that have class ``photo`` and a ``data-src``
    attribute are considered.

    Args:
        spot_url: URL of the spot detail page.

    Returns:
        A list of URL strings; empty when the page has no matching images or
        when the request fails (the failure is reported via ``print``).
    """
    try:
        # The timeout avoids hanging forever on a stalled connection; an HTTP
        # error status is reported instead of parsing the error page.
        response = requests.get(spot_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        image_elements = soup.find_all("img", attrs={"data-src": True, "class": "photo"})
        return [image_element["data-src"] for image_element in image_elements]
    except Exception as e:
        print(f"Error fetching images from {spot_url}: {str(e)}")
        return []

import os

# Main processing
base_url = "https://www.autogespot.com/spots/Switzerland/"
car_details = []
# Read the token from the environment so it never has to be pasted into the
# notebook; the placeholder is only used when the variable is not set.
api_token = os.environ.get("PLATE_RECOGNIZER_API_TOKEN", "...")  # Replace with your actual API token

# Process pages in a loop
for i in range(1, 2070):  # Adjust range as necessary
    url = f"{base_url}{i}"
    process_page(url, car_details, api_token)

# Write to Excel after completing all pages as a final save
df = pd.DataFrame(car_details)
df.to_excel('car_details.xlsx', index=False)