In [133]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, WebDriverException
from rapidfuzz import process
from sqlalchemy import create_engine, Column, Integer, String, Float
from sqlalchemy.orm import sessionmaker, declarative_base
import time, re

In [134]:
# Start a Chrome browser session driven by Selenium.
driver = webdriver.Chrome()
# Shared explicit wait: every wait.until(...) call polls for up to 20 seconds.
wait = WebDriverWait(driver, 20)
# Open the AutoTrader home page.
driver.get("https://www.autotrader.ca/")
# Implicit wait: element lookups keep retrying for up to 20 seconds before
# failing (this governs find_element/find_elements, not interactability).
# NOTE(review): Selenium's documentation advises against combining implicit and
# explicit waits because the timeouts can interact unpredictably - confirm that
# using both here is intentional.
driver.implicitly_wait(20)


# Define the database path and create the engine.
# The path is relative, so it is resolved against the current working directory.
db_path = "../database/auto_trader_car_data.db"
engine = create_engine(f'sqlite:///{db_path}')
    
# Create the session used for every database read/write in this notebook.
Session = sessionmaker(bind=engine)
session = Session()



In [135]:
# Build `car_data`: a mapping of make name -> list of model names, harvested
# from the make/model dropdowns on the home page.
car_data = {}

# Locate the 'select' element that hosts all the car brand names
makes_drop_down_element = wait.until(EC.presence_of_element_located((By.ID, "rfMakes")))

# Restrict to the 'optgroup' labelled "All Makes"
all_makes_optgroup = makes_drop_down_element.find_element(By.XPATH, "./optgroup[@label='All Makes']")

# Every 'option' under that group is one make
all_makes_options = all_makes_optgroup.find_elements(By.TAG_NAME, "option")

# Selecting a make repopulates the model dropdown, so click each make in turn
for option in all_makes_options:
    car_make = option.text
    option.click()
    # Fixed pause to give the model dropdown time to refresh for this make
    time.sleep(2)

    # Locate the 'select' element for models
    model_drop_down_element = wait.until(EC.presence_of_element_located((By.ID, "rfModel")))

    # Find all 'option' elements for models
    model_options = model_drop_down_element.find_elements(By.TAG_NAME, "option")

    # Skip the first 'option' and keep the visible text of the rest
    all_models_options = [model_option.text for model_option in model_options[1:]]

    # Store in dictionary
    car_data[car_make] = all_models_options

# Catch-all entry for makes/models that are not listed. The value is a list,
# like every other entry: a bare string would be iterated character by
# character when it is used as the candidate list for fuzzy matching.
car_data['Other'] = ['Other']


In [136]:
# Template record for one listing: every specification starts out as None and
# is filled in while the listing page is scraped.
car_specs = dict.fromkeys(
    (
        'Make',
        'Model',
        'Year Of Manufacturing',
        'Kilometres',
        'Kilometres Condition',
        'Status',
        'Trim',
        'Body Type',
        'Cylinder',
        'Transmission',
        'Drivetrain',
        'Exterior Colour',
        'Interior Colour',
        'Passengers',
        'Doors',
        'Fuel Type',
        'Fuel Consumption',
        'Price',
    ),
    None,
)

In [137]:
# Put the make dropdown back on "Any Make" so the upcoming search covers every
# make (the harvesting loop above clicked through the individual makes).
selector = Select(makes_drop_down_element)
selector.select_by_visible_text('Any Make')


In [138]:
# Type the search postal code and submit the search form.
# The locator is passed straight to element_to_be_clickable, so the element is
# re-located on every poll instead of checking a handle that may have gone stale.
postal_code_input_element = wait.until(EC.element_to_be_clickable((By.ID, "locationAddressV2")))
# Clear first so re-running this cell does not append a second postal code
postal_code_input_element.clear()
postal_code_input_element.send_keys("M5V 3L9")
show_me_cars_btn = wait.until(EC.element_to_be_clickable((By.ID, "SearchButton")))
show_me_cars_btn.click()

In [139]:
def find_closest_words(input_word, list_of_words):
    """Return the entry of ``list_of_words`` that RapidFuzz ranks closest to ``input_word``.

    ``process.extractOne`` returns a result whose first item is the matched
    choice; only that item is returned.
    """
    return process.extractOne(input_word, list_of_words)[0]

In [140]:
### Transformation of the dictionary
def modify_dict(car_dict):
    """Return a copy of ``car_dict`` with the scraped spec strings converted to typed values.

    Conversions, by key:
      * "Year Of Manufacturing", "Cylinder", "Door", "Passengers" -> ``int``
      * "Price" -> ``int`` after removing thousands separators
      * "Kilometres" -> ``int`` after removing separators and the " km" suffix
      * "Trim" -> text before the first punctuation character, stripped
      * "Fuel Consumption" -> ``float``; numeric input is passed through, text
        yields the first number it contains, or ``None`` when it has none
      * "Kilometres Condition" -> lower-cased with the " KM" suffix removed
    Any other key, and any ``None`` value, is copied unchanged.

    Raises:
        ValueError: if a value that must become an ``int`` is not numeric text.
    """
    modified_dict = {}
    for key, value in car_dict.items():
        # Missing values stay missing
        if value is None:
            modified_dict[key] = None
        # NOTE(review): the spec dictionary uses the key "Doors", so "Door"
        # never matches it and door counts are kept as text - confirm intent.
        elif key in ("Year Of Manufacturing", "Cylinder", "Door", "Passengers"):
            modified_dict[key] = int(value)
        elif key == "Price":
            modified_dict[key] = int(value.replace(",", ""))
        elif key == "Kilometres":
            modified_dict[key] = int(value.replace(",", "").replace(" km", ""))
        elif key == "Trim":
            modified_dict[key] = re.split('[!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', value)[0].strip()
        elif key == "Fuel Consumption":
            # Decide from the value being converted, not from a global dictionary
            if isinstance(value, (int, float)):
                modified_dict[key] = float(value)
            else:
                # Prefer a decimal figure; fall back to a whole number so text
                # such as "9L/100km" is parsed instead of raising
                number = re.search(r"\d+\.\d+", value) or re.search(r"\d+", value)
                modified_dict[key] = float(number.group()) if number else None
        elif key == "Kilometres Condition":
            modified_dict[key] = value.replace(" KM", "").lower()
        else:
            modified_dict[key] = value
    return modified_dict

In [141]:
def set_page_filter():
    """Apply the search-result filters used for the crawl.

    In order: opens the location box and sets the radius to "National", clicks
    the damaged-vehicle checkbox and applies it, then opens "Other Options",
    unticks "With photos" if it is ticked, and applies that too.

    Uses the module-level ``driver`` and ``wait``.

    Raises:
        TimeoutException: if a required control does not become clickable in
            time (except the condition "apply" button, which has a fallback).
        NoSuchElementException: if a checkbox or apply button is not found.
    """

    def _clickable(element_id):
        # Passing the locator (not an element) lets the wait re-locate the
        # element on every poll, so a re-rendered panel cannot leave it stale.
        return wait.until(EC.element_to_be_clickable((By.ID, element_id)))

    def _js_click(element):
        # Click through JavaScript, as the original flow does for these controls
        driver.execute_script("arguments[0].click();", element)

    # click on the postal code box on the left side and change the radius to
    # national to get all the cars listing
    _clickable("faceted-Location").click()
    Select(_clickable("proximity")).select_by_visible_text("National")

    # save by clicking on the apply location button
    _clickable("applyLocation").click()

    # NOTE(review): this click toggles the checkbox whatever its current state,
    # so calling the function twice flips it back - confirm that is intended.
    _js_click(driver.find_element(By.ID, "rfDamaged"))

    try:
        # Wait until the apply button becomes clickable, then click it
        _clickable("applyCondition").click()
    except TimeoutException:
        # Fall back to a direct DOM click when it never becomes clickable
        driver.execute_script("document.getElementById('applyCondition').click();")

    # click on "Other Options" menu
    _js_click(_clickable('faceted-parent-Other'))

    # Untick "With photos" only when it is currently ticked
    with_photos_checkbox = driver.find_element(By.ID, "rfPhoto")
    if with_photos_checkbox.is_selected():
        _js_click(with_photos_checkbox)

    # Apply the "Other Options" selection
    _js_click(driver.find_element(By.ID, "applyOthers"))

In [142]:
# Cache for the ORM class so it is built once rather than for every listing.
_CAR_MODEL = None


def _get_car_model():
    """Return the ORM class mapped to the ``cars`` table, building it on first use."""
    global _CAR_MODEL
    if _CAR_MODEL is None:
        # Define the base class for declarative models
        Base = declarative_base()

        class Car(Base):
            __tablename__ = 'cars'

            id = Column(Integer, primary_key=True)
            Make = Column(String)
            Model = Column(String)
            Year_Of_Manufacturing = Column(Integer)
            Kilometres = Column(Integer)
            Kilometres_Condition = Column(String)
            Status = Column(String)
            Trim = Column(String)
            Body_Type = Column(String)
            Cylinder = Column(Integer)
            Transmission = Column(String)
            Drivetrain = Column(String)
            Exterior_Colour = Column(String)
            Interior_Colour = Column(String)
            Passengers = Column(Integer)
            Doors = Column(String)
            Fuel_Type = Column(String)
            Fuel_Consumption = Column(Float)
            Price = Column(Integer)

        _CAR_MODEL = Car
    return _CAR_MODEL


def _first_number(text):
    """Return the first number in ``text`` as a float (decimals preferred), or None."""
    if not text:
        return None
    match = re.search(r"\d+\.\d+", text) or re.search(r"\d+", text)
    return float(match.group()) if match else None


def _scrape_title_and_price():
    """Fill Year, Make, Model and Price in ``car_specs`` from the listing header."""
    try:
        title = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'hero-title'))).text
    except WebDriverException:
        # No readable title: leave year/make/model as None
        title = ""

    # The title is read as "<year> <make> <model> ..."; use whatever words exist
    words = title.split()
    if words:
        car_specs["Year Of Manufacturing"] = words[0]
    if len(words) >= 2 and car_data:
        # Snap the scraped words to the known make/model names
        make = find_closest_words(words[1], list(car_data))
        car_specs["Make"] = make
        all_models = car_data[make]
        if isinstance(all_models, str):
            # Tolerate a bare-string entry by treating it as a one-item list
            all_models = [all_models]
        if len(words) >= 3 and all_models:
            car_specs["Model"] = find_closest_words(words[2], all_models)

    try:
        price_elements = driver.find_elements(By.XPATH, '//p[@class="hero-price"]')
        if price_elements:
            car_specs["Price"] = price_elements[0].text
    except WebDriverException:
        # Price is optional; it stays None when it cannot be read
        pass


def _expand_spec_list():
    """Click the toggle that expands the specification list, if it is present."""
    try:
        all_spec_toggle = wait.until(EC.element_to_be_clickable((By.ID, "btn-vdp-specs-toggle")))
        driver.execute_script("arguments[0].click();", all_spec_toggle)
        # wait for the element to load
        time.sleep(1)
    except WebDriverException:
        # Best effort: without the toggle, whatever specs are visible get scraped
        pass


def _scrape_spec_list():
    """Copy known specs into ``car_specs``; return ``(city_fuel, hwy_fuel)`` (each float or None)."""
    city_fuel = hwy_fuel = None
    car_specs_items = driver.find_elements(By.CSS_SELECTOR, "#sl-card-body li")
    for i in range(len(car_specs_items)):
        try:
            key = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, f"#spec-key-{i}"))).text
            value = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, f"#spec-value-{i}"))).text
        except WebDriverException:
            # Skip a row that cannot be read and carry on with the next one
            continue
        # only keep specs whose name is a key of the dictionary
        if key in car_specs:
            car_specs[key] = value
        elif key == "City Fuel Economy":
            city_fuel = _first_number(value)
        elif key == "Hwy Fuel Economy":
            hwy_fuel = _first_number(value)
    return city_fuel, hwy_fuel


def _scrape_fuel_consumption(city_fuel, hwy_fuel):
    """Return the combined fuel figure, else the mean of the available city/highway figures, else None."""
    try:
        combined = _first_number(driver.find_element(By.ID, "vdp-fv-combined").text)
    except WebDriverException:
        combined = None
    if combined is not None:
        return combined
    # Average whichever of the two figures were found (one figure -> that figure)
    available = [figure for figure in (city_fuel, hwy_fuel) if figure is not None]
    return sum(available) / len(available) if available else None


def _scrape_kilometres_condition():
    """Store the text of the active kilometres-condition indicator, if one is present."""
    try:
        indicator = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "p.ca-indicator-active")))
        car_specs['Kilometres Condition'] = indicator.text
    except WebDriverException:
        # If not found, leave the condition as None
        pass


def _store_car(record):
    """Insert ``record`` into the ``cars`` table unless an identical row already exists."""
    Car = _get_car_model()
    # Create the table if needed (a no-op when it already exists)
    Car.metadata.create_all(engine)

    # Check if the record already exists
    if session.query(Car).filter_by(**record).first() is not None:
        print("Car already in the database, skipping...")
        return

    session.add(Car(**record))
    try:
        session.commit()
    except Exception:
        # Leave the shared session usable before re-raising
        session.rollback()
        raise


def get_and_store_car_data():
    """Scrape the listing page currently open in ``driver``, store it, and navigate back.

    The module-level ``car_specs`` dictionary is cleared and refilled for this
    listing, normalized with ``modify_dict`` and inserted into the ``cars``
    table when no identical row exists. A listing whose values cannot be
    normalized, or for which nothing at all was scraped, is reported and
    skipped. The browser is sent back one page in every one of those cases.

    Relies on the module-level ``driver``, ``wait``, ``engine``, ``session``,
    ``car_data``, ``car_specs``, ``find_closest_words`` and ``modify_dict``.

    Raises:
        Exception: database errors from the insert are re-raised after a rollback.
    """
    # Start from an empty record so values from the previous listing cannot leak in
    car_specs.update(dict.fromkeys(car_specs))

    _scrape_title_and_price()
    _expand_spec_list()
    city_fuel, hwy_fuel = _scrape_spec_list()
    # Stored as a float (or None), which modify_dict passes through unchanged
    car_specs['Fuel Consumption'] = _scrape_fuel_consumption(city_fuel, hwy_fuel)
    _scrape_kilometres_condition()

    try:
        car_specs_mod = modify_dict(car_specs)
    except (ValueError, AttributeError, TypeError) as exc:
        # One malformed listing should not abort the whole crawl
        print(f"Skipping listing, could not normalize the scraped specs: {exc}")
        car_specs_mod = None

    if car_specs_mod is not None:
        if any(value is not None for value in car_specs_mod.values()):
            # Column names use underscores where the spec names use spaces
            car_specs_mod_no_space = {key.replace(" ", "_"): value for key, value in car_specs_mod.items()}
            _store_car(car_specs_mod_no_space)
        else:
            print("Skipping listing, nothing could be scraped from the page.")

    try:
        # Go back to the main car listing with driver
        driver.back()
    except WebDriverException as exc:
        print(f"Could not navigate back to the listing: {exc}")


In [143]:
def car_listing_loop():
    """Scrape every car on the current results page, then refresh the page.

    Applies the page filters, then opens cars one by one using their
    ``data-list-numerical-position`` attribute (starting at 1) and scrapes each
    with ``get_and_store_car_data()``. The loop ends on the first exception,
    which is printed; that includes the timeout raised when no car exists at
    the next position.
    """
    # Reset the page filter
    set_page_filter()

    index = 1  # Start with 1 as the data-list-numerical-position is likely 1-based
    while True:
        try:
            # Find the car element by its "data-list-numerical-position"
            car = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, f"span[data-list-numerical-position='{index}']")
            ))

            # Use JavaScript to click the car element
            driver.execute_script("arguments[0].click();", car)

            # Scrape the data. get_and_store_car_data() navigates back to the
            # listing itself, so no extra driver.back() is issued here (a second
            # one would leave the results page).
            get_and_store_car_data()

            # Increment index for the next iteration
            index += 1
        except Exception as e:
            print(f"An exception occurred: {e}")
            break  # Exit loop on any exception
    # Refresh the current page
    driver.refresh()


In [144]:
# Crawl every results page: apply the filters, then walk the pagination links
# and scrape each car on each page.
set_page_filter()
# Start from page 1
current_page = 1

try:
    while True:
        try:
            # Find the pagination link for the current page using its data-page attribute
            page_element = driver.find_element(By.CSS_SELECTOR, f"li[data-page='{current_page}'] a.page-link-{current_page} p.page-link-text")
        except (NoSuchElementException, TimeoutException):
            print("Reached the end of the pages.")
            # refresh the page to make the pagination appear again
            driver.refresh()
            break

        # Page 1 is already displayed; only click the link from the second page on
        if current_page >= 2:
            driver.execute_script("arguments[0].click();", page_element)

        car_no_in_listing = 1  # Start with 1 as the data-list-numerical-position is 1-based
        while True:
            try:
                # Find the car element by its "data-list-numerical-position"
                car = driver.find_element(By.CSS_SELECTOR, f"span[data-list-numerical-position='{car_no_in_listing}']")
                # click on the car element
                driver.execute_script("arguments[0].click();", car)
            except Exception as e:
                # No car at this position (or it cannot be opened): page is done
                print(f"An exception occurred: {e}")
                break

            # Scrape the data (this also navigates back to the results page)
            get_and_store_car_data()

            print("We are on page: {} car number {}".format(current_page, car_no_in_listing))

            # Increment car_no_in_listing for the next iteration
            car_no_in_listing += 1

        # Increment current_page for the next iteration
        current_page += 1
finally:
    # Close the db session even when the crawl is interrupted by an error
    session.close()

We are on page: 1 car number 1
We are on page: 1 car number 2
Car already in the database, skipping...
We are on page: 1 car number 3
We are on page: 1 car number 4
We are on page: 1 car number 5
We are on page: 1 car number 6
We are on page: 1 car number 7
Car already in the database, skipping...
We are on page: 1 car number 8
We are on page: 1 car number 9
We are on page: 1 car number 10
Car already in the database, skipping...
We are on page: 1 car number 11
Car already in the database, skipping...
We are on page: 1 car number 12
Car already in the database, skipping...
We are on page: 1 car number 13
Car already in the database, skipping...
We are on page: 1 car number 14
Car already in the database, skipping...
We are on page: 1 car number 15
Car already in the database, skipping...
We are on page: 1 car number 16
Car already in the database, skipping...
We are on page: 1 car number 17
Car already in the database, skipping...
We are on page: 1 car number 18
Car already in the data

TimeoutException: Message: timeout: Timed out receiving message from renderer: 300.000
  (Session info: chrome=119.0.6045.105)
Stacktrace:
0   chromedriver                        0x0000000104812004 chromedriver + 4169732
1   chromedriver                        0x0000000104809ff8 chromedriver + 4136952
2   chromedriver                        0x000000010445f500 chromedriver + 292096
3   chromedriver                        0x0000000104447a88 chromedriver + 195208
4   chromedriver                        0x000000010444797c chromedriver + 194940
5   chromedriver                        0x0000000104445ef4 chromedriver + 188148
6   chromedriver                        0x0000000104446b14 chromedriver + 191252
7   chromedriver                        0x0000000104455c80 chromedriver + 253056
8   chromedriver                        0x000000010446aa3c chromedriver + 338492
9   chromedriver                        0x000000010444726c chromedriver + 193132
10  chromedriver                        0x000000010446a858 chromedriver + 338008
11  chromedriver                        0x00000001044df788 chromedriver + 817032
12  chromedriver                        0x00000001044985e8 chromedriver + 525800
13  chromedriver                        0x00000001044994b8 chromedriver + 529592
14  chromedriver                        0x00000001047d8334 chromedriver + 3932980
15  chromedriver                        0x00000001047dc970 chromedriver + 3950960
16  chromedriver                        0x00000001047c0774 chromedriver + 3835764
17  chromedriver                        0x00000001047dd478 chromedriver + 3953784
18  chromedriver                        0x00000001047b2ab4 chromedriver + 3779252
19  chromedriver                        0x00000001047f9914 chromedriver + 4069652
20  chromedriver                        0x00000001047f9a90 chromedriver + 4070032
21  chromedriver                        0x0000000104809c70 chromedriver + 4136048
22  libsystem_pthread.dylib             0x0000000180f7f034 _pthread_start + 136
23  libsystem_pthread.dylib             0x0000000180f79e3c thread_start + 8
