In [74]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from rapidfuzz import process
from sqlalchemy import create_engine, Column, Integer, String, Float, Boolean
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
import time, re

In [75]:
# Timeouts (in seconds) for the shared explicit wait and for the driver's implicit wait.
EXPLICIT_WAIT_SECONDS = 2
IMPLICIT_WAIT_SECONDS = 2

# Start a Chrome session; `driver` and `wait` are shared by the cells below.
driver = webdriver.Chrome()
# Explicit-wait helper: wait.until(condition) polls for up to EXPLICIT_WAIT_SECONDS.
wait = WebDriverWait(driver, EXPLICIT_WAIT_SECONDS)
# Open the AutoTrader landing page.
driver.get("https://www.autotrader.ca/")
# Implicit wait: find_element/find_elements keep polling the DOM for up to
# IMPLICIT_WAIT_SECONDS before giving up. This waits for an element to be *present*;
# it does not wait for the element to become interactable.
driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)


In [76]:
# Maps each car make (key) to the list of its model names (value).
car_data = {}

# The <select> element that hosts all the car makes.
makes_drop_down_element = driver.find_element(By.ID, "rfMakes")

# Only the options under the "All Makes" <optgroup> are read.
all_makes_optgroup = makes_drop_down_element.find_element(By.XPATH, "./optgroup[@label='All Makes']")
all_makes_options = all_makes_optgroup.find_elements(By.TAG_NAME, "option")

# Click each make in turn and read the model dropdown that goes with it.
for option in all_makes_options:
    car_make = option.text
    option.click()
    # Fixed pause before the model dropdown is read (presumably to let the page
    # repopulate it for the newly selected make).
    time.sleep(2)

    # Locate the <select> element for models and collect its options.
    model_drop_down_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "rfModel")))
    model_options = model_drop_down_element.find_elements(By.TAG_NAME, "option")

    # The first <option> is skipped (presumably a placeholder entry rather than a model).
    all_models_options = [model_option.text for model_option in model_options[1:]]

    # Store in dictionary
    car_data[car_make] = all_models_options

# Fallback entry for makes/models that are not specified. The value must be a list like
# every other entry: a bare string would be iterated character by character when it is
# later used as the list of candidate models for fuzzy matching.
car_data['Other'] = ['Other']


In [77]:
# Put the make dropdown back on "Any Make". The previous cell clicked every make option
# in turn, so without this the last clicked make would presumably stay selected for the
# search that follows.
selector = Select(makes_drop_down_element)
selector.select_by_visible_text('Any Make')


In [78]:
# Type the postal code into the location box and launch the search.
postal_code_input_element = driver.find_element(By.ID, "locationAddressV2")
# send_keys appends to whatever the field already contains, so clear it first; this keeps
# the value correct when the field is pre-filled or when the cell is run more than once.
postal_code_input_element.clear()
postal_code_input_element.send_keys("M5V 3L9")
show_me_cars_btn = driver.find_element(By.ID, "SearchButton")
show_me_cars_btn.click()

In [79]:
# Configure the search-results page: widen the radius to "National", toggle the damaged
# checkbox, make sure "With photos" is unchecked and show 100 listings per page.
# The statements below are order-dependent: each panel is opened, changed and applied
# before the next one is touched. `wait` is the WebDriverWait created in the setup cell.

# Open the location panel (postal code box on the left side).
postal_code_element = driver.find_element(By.ID, "faceted-Location")
postal_code_element.click()

# Locate the search-radius dropdown
dropdown_element = driver.find_element(By.ID, "proximity")

# Wrap it in Select so an option can be chosen by its visible text
select = Select(dropdown_element)

# Select the 'National' option to get listings from the whole country
select.select_by_visible_text("National")

# Save by clicking on the apply location button once it is clickable
apply_location_btn = wait.until(EC.element_to_be_clickable((By.ID, "applyLocation")))

apply_location_btn.click()

# Locate the damaged checkbox element
damaged_checkbox_element = driver.find_element(By.ID, "rfDamaged")

# Use JavaScript to click the checkbox. NOTE(review): the click is unconditional, so it
# toggles whatever state the checkbox is in (unlike the "With photos" checkbox below,
# which is only clicked when selected) - confirm the expected initial state.
driver.execute_script("arguments[0].click();", damaged_checkbox_element)

try:
    # Wait until the apply button becomes clickable
    apply_condition_btn = wait.until(EC.element_to_be_clickable((By.ID, "applyCondition")))
    
    # Click the button
    apply_condition_btn.click()
except TimeoutException:
    # The button never became clickable within the wait: fall back to a JavaScript click
    # on the element looked up by its id.
    driver.execute_script("document.getElementById('applyCondition').click();")

# Click on the "Other Options" menu (JavaScript click on the element returned by the wait)
other_option_menu  = wait.until(EC.element_to_be_clickable((By.ID, 'faceted-parent-Other')))
driver.execute_script("arguments[0].click();", other_option_menu)

# Locate the "With photos" checkbox element
with_photos_checkbox = driver.find_element(By.ID, "rfPhoto")

# Uncheck "With photos" via JavaScript, but only when it is currently selected
if with_photos_checkbox.is_selected():
    driver.execute_script("arguments[0].click();", with_photos_checkbox)

# Apply the "Other Options" panel
apply_others_btn = driver.find_element(By.ID, "applyOthers")
driver.execute_script("arguments[0].click();", apply_others_btn)

# Locate the page-size dropdown by its ID (this reuses the `dropdown_element` name, which
# no longer refers to the radius dropdown from here on)
dropdown_element = driver.find_element(By.ID, "pageSize")

# Wrap it in Select (reusing the `select` name) to display 100 cars instead of the default 15
select = Select(dropdown_element)

# Select the option with the text "100"
select.select_by_visible_text("100")

# Dispatch a bubbling 'change' event on the dropdown through JavaScript (presumably so the
# page reacts to the new page size)
driver.execute_script("arguments[0].dispatchEvent(new Event('change', {'bubbles': true}));", dropdown_element)

In [80]:
# Descriptive fields of a listing; each starts as None and is filled in when found.
_SPEC_KEYS = (
    'Make', 'Model', 'Year Of Manufacturing', 'Kilometres', 'Kilometres Condition',
    'Status', 'Trim', 'Body Type', 'Cylinder', 'Transmission', 'Drivetrain',
    'Exterior Colour', 'Interior Colour', 'Passengers', 'Doors', 'Fuel Type',
    'Fuel Consumption',
)

# Yes/no equipment flags; each starts at 0 and is set to 1 when the listing advertises it.
_FEATURE_KEYS = (
    'Air Conditioning', 'Alarm', 'Heated Mirrors', 'Power Seat', 'Heated Seats',
    'Power Windows', 'Alloy Wheels', 'Keyless Entry', 'Stability Control', 'Bluetooth',
    'Memory Seats', 'Sunroof', 'Dual Climate Controls', 'Navigation System',
    'Tow Package', 'Entertainment Package', 'Power Locks', 'Xenon Headlights',
    'Fog Lights', 'Power Mirrors',
)

# Minimum RapidFuzz score (0-100) a highlight must reach to count as one of the tracked
# features. Without a threshold the closest key is always returned, so an unrelated
# highlight would flip whichever feature happens to be nearest. 85 is a starting point;
# tune it against real listings.
_MIN_HIGHLIGHT_MATCH_SCORE = 85

_JS_CLICK = "arguments[0].click();"


def _closest_match(query, choices, score_cutoff=None):
    """Return the entry of `choices` closest to `query` according to RapidFuzz.

    A bare string passed as `choices` is treated as a single candidate instead of being
    iterated character by character. Returns None when `choices` is empty or when no
    candidate reaches `score_cutoff`.
    """
    candidates = [choices] if isinstance(choices, str) else list(choices)
    best = process.extractOne(query, candidates, score_cutoff=score_cutoff)
    return best[0] if best is not None else None


def _first_float(text):
    """Return the first number in `text` (integer or decimal) as a float, or None."""
    match = re.search(r"\d+(?:\.\d+)?", text)
    return float(match.group()) if match else None


def _first_int(text):
    """Return the first run of digits in `text` as an int, or None if there is none.

    Commas or whitespace that sit between two digits are treated as thousands separators.
    """
    compact = re.sub(r"(?<=\d)[,\s](?=\d)", "", str(text))
    match = re.search(r"\d+", compact)
    return int(match.group()) if match else None


def _read_title_and_price(car_specs):
    """Fill in year, make, model and price from the hero section of the detail page."""
    car_header_info = driver.find_element(By.CSS_SELECTOR, "div#heroTitleWrapper > h1").text

    # The title is read as "<year> <make> <model> ..."; missing words become None
    # instead of raising IndexError.
    words = car_header_info.split()
    year_of_manufacturing, make_estimate, model_estimate = (words + [None] * 3)[:3]
    car_specs["Year Of Manufacturing"] = year_of_manufacturing

    # Snap the words taken from the title onto the makes/models collected in car_data.
    make = _closest_match(make_estimate, car_data.keys()) if make_estimate else None
    all_models = car_data.get(make, [])
    model = _closest_match(model_estimate, all_models) if model_estimate else None

    price_elements = driver.find_elements(By.XPATH, '//p[@class="hero-price"]')

    car_specs["Make"] = make
    car_specs["Model"] = model
    # find_elements returns an empty list when the price is absent; keep Price as None then.
    car_specs["Price"] = price_elements[0].text if price_elements else None


def _read_specifications(car_specs):
    """Fill in the specification fields, fuel consumption and kilometres condition."""
    # Expand to see all the specs, then give the list a moment to render.
    all_spec_toggle = driver.find_element(By.ID, "btn-vdp-specs-toggle")
    driver.execute_script(_JS_CLICK, all_spec_toggle)
    time.sleep(1)

    # Initialised up front so the fallback below works when only one of them is listed.
    city_fuel = None
    hwy_fuel = None

    # Spec rows are addressed by index through their #spec-key-i / #spec-value-i ids.
    car_specs_items = driver.find_elements(By.CSS_SELECTOR, "#sl-card-body li")
    for i in range(len(car_specs_items)):
        key = driver.find_element(By.CSS_SELECTOR, f"#spec-key-{i}").text
        value_element = driver.find_element(By.CSS_SELECTOR, f"#spec-value-{i}")

        if key in car_specs:
            # Only specs that have a slot in the dictionary are stored.
            car_specs[key] = value_element.text
        elif key == "City Fuel Economy":
            city_fuel = _first_float(value_element.text)
        elif key == "Hwy Fuel Economy":
            hwy_fuel = _first_float(value_element.text)
        # Any other spec is ignored.

    try:
        # Prefer the combined fuel consumption when the page shows it.
        cbn_fuel_eco = driver.find_element(By.ID, "vdp-fv-combined").text
        car_specs['Fuel Consumption'] = cbn_fuel_eco + 'L/100km'
    except NoSuchElementException:
        # Otherwise average whichever of the city/highway readings are available.
        readings = [reading for reading in (city_fuel, hwy_fuel) if reading is not None]
        car_specs['Fuel Consumption'] = sum(readings) / len(readings) if readings else None

    try:
        # Text of the element with class "ca-indicator-active"
        car_specs['Kilometres Condition'] = driver.find_element(By.CSS_SELECTOR, "p.ca-indicator-active").text
    except NoSuchElementException:
        # Not shown on this listing: leave it as None.
        pass


def _read_features(car_specs):
    """Set to 1 every tracked feature that appears verbatim in the features list."""
    try:
        # Expand the feature section, then give the list a moment to render.
        all_feat_toggle = driver.find_element(By.ID, "vdp-feature-toggle-btn")
        driver.execute_script(_JS_CLICK, all_feat_toggle)
        time.sleep(1)

        ul_car_feat_element = driver.find_element(By.ID, 'fo-card-body')
        for li in ul_car_feat_element.find_elements(By.TAG_NAME, 'li'):
            span_text = li.find_element(By.TAG_NAME, 'span').text
            # Exact matches only; the text is itself the dictionary key, so no fuzzy
            # lookup is needed here.
            if span_text in _FEATURE_KEYS:
                car_specs[span_text] = 1
    except NoSuchElementException:
        # The listing has no feature section (or an item has no span): nothing to record.
        pass


def _read_highlights(car_specs):
    """Set to 1 every tracked feature that closely matches a highlight of the listing."""
    try:
        # Expand all the highlights
        all_highlights = driver.find_element(By.ID, "vdp-highlight-toggle-btn")
        driver.execute_script(_JS_CLICK, all_highlights)
        time.sleep(1)
    except NoSuchElementException:
        # No toggle button: the list below may still be present.
        pass

    try:
        ul_element = driver.find_element(By.ID, 'hl-card-body')
        time.sleep(1)
        for li in ul_element.find_elements(By.TAG_NAME, 'li'):
            feature_text = li.find_element(By.CSS_SELECTOR, 'span.list-text').text

            # Match against every key, then keep the result only if it is a feature flag.
            feature_text_closest = _closest_match(
                feature_text, car_specs.keys(), score_cutoff=_MIN_HIGHLIGHT_MATCH_SCORE
            )
            if feature_text_closest in _FEATURE_KEYS:
                car_specs[feature_text_closest] = 1
    except NoSuchElementException:
        # The listing has no highlight list: nothing to record.
        pass


def _normalise_car_specs(car_specs):
    """Return a copy of `car_specs` with the scraped text converted to storable values."""
    modified_dict = {}
    for key, value in car_specs.items():
        if value is None:
            modified_dict[key] = None
        elif key in ("Year Of Manufacturing", "Cylinder", "Passengers", "Price", "Kilometres"):
            # "Doors" is deliberately left as text: its database column is a String.
            modified_dict[key] = _first_int(value)
        elif key == "Trim":
            # Keep only what precedes the first punctuation character.
            modified_dict[key] = re.split('[!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', value)[0].strip()
        elif key == "Fuel Consumption" and not isinstance(value, float):
            # Text such as "9.5L/100km"; an averaged reading is already a float.
            modified_dict[key] = _first_float(value)
        elif key == "Kilometres Condition":
            modified_dict[key] = value.replace(" KM", "").lower()
        else:
            modified_dict[key] = value
    return modified_dict


def _store_car_record(record, db_path="../database/auto_trader_car_data.db"):
    """Insert `record` into the `cars` table of the SQLite database unless it already exists.

    `record` maps column names to values. The table is created when missing. The session
    and the engine are always released, including when the query or the insert fails.
    """
    engine = create_engine(f'sqlite:///{db_path}')

    # Base class for the declarative model below.
    Base = declarative_base()

    class Car(Base):
        __tablename__ = 'cars'

        id = Column(Integer, primary_key=True)
        Make = Column(String)
        Model = Column(String)
        Year_Of_Manufacturing = Column(Integer)
        Kilometres = Column(Integer)
        Kilometres_Condition = Column(String)
        Status = Column(String)
        Trim = Column(String)
        Body_Type = Column(String)
        Cylinder = Column(Integer)
        Transmission = Column(String)
        Drivetrain = Column(String)
        Exterior_Colour = Column(String)
        Interior_Colour = Column(String)
        Passengers = Column(Integer)
        Doors = Column(String)
        Fuel_Type = Column(String)
        Fuel_Consumption = Column(Float)
        Air_Conditioning = Column(Boolean)
        Alarm = Column(Boolean)
        Heated_Mirrors = Column(Boolean)
        Power_Seat = Column(Boolean)
        Heated_Seats = Column(Boolean)
        Power_Windows = Column(Boolean)
        Alloy_Wheels = Column(Boolean)
        Keyless_Entry = Column(Boolean)
        Stability_Control = Column(Boolean)
        Bluetooth = Column(Boolean)
        Memory_Seats = Column(Boolean)
        Sunroof = Column(Boolean)
        Dual_Climate_Controls = Column(Boolean)
        Navigation_System = Column(Boolean)
        Tow_Package = Column(Boolean)
        Entertainment_Package = Column(Boolean)
        Power_Locks = Column(Boolean)
        Xenon_Headlights = Column(Boolean)
        Fog_Lights = Column(Boolean)
        Power_Mirrors = Column(Boolean)
        Price = Column(Integer)

    # Create the table if it does not exist yet.
    Base.metadata.create_all(engine)

    session = sessionmaker(bind=engine)()
    try:
        # A listing counts as a duplicate when every column matches an existing row.
        existing_record = session.query(Car).filter_by(**record).first()
        if existing_record is None:
            session.add(Car(**record))
            session.commit()
        else:
            print("Car already in the database, skipping...")
    finally:
        session.close()
        engine.dispose()


def get_and_store_car_data():
    """Scrape the vehicle detail page currently open in `driver` and save it to SQLite.

    Reads the module-level `driver` (a Selenium WebDriver showing a detail page) and
    `car_data` (make -> list of models). Collects the title, price, specifications,
    features and highlights, converts them to storable values and inserts one row into
    the `cars` table unless an identical row is already present.

    Raises:
        NoSuchElementException: if the page has no title or no specification toggle.
    """
    # Specs start as None, feature flags as 0, and Price comes last.
    car_specs = {key: None for key in _SPEC_KEYS}
    car_specs.update({key: 0 for key in _FEATURE_KEYS})
    car_specs['Price'] = None

    _read_title_and_price(car_specs)
    _read_specifications(car_specs)
    _read_features(car_specs)
    _read_highlights(car_specs)

    car_specs_mod = _normalise_car_specs(car_specs)
    # Column names use underscores where the scraped keys use spaces.
    car_specs_mod_no_space = {key.replace(" ", "_"): value for key, value in car_specs_mod.items()}
    _store_car_record(car_specs_mod_no_space)


In [81]:
def car_listing_loop(max_stale_retries=3):
    """Open every listing on the current results page and scrape it.

    Listings are addressed by their `data-list-numerical-position` attribute, starting
    at 1. Each one is clicked, scraped with get_and_store_car_data() and the browser is
    sent back to the results page. The loop ends when no listing exists for the next
    position; the page is refreshed before returning.

    A listing that fails to scrape is reported and skipped, so one bad listing does not
    end the page. A stale element is retried at most `max_stale_retries` times.

    Args:
        max_stale_retries: how many times the same listing is retried after a
            StaleElementReferenceException before it is skipped.
    """
    wait = WebDriverWait(driver, 10)
    listings_loaded = EC.presence_of_element_located((By.ID, 'SearchListings'))
    # Wait for the car elements to load
    wait.until(listings_loaded)

    index = 1  # Start with 1 as the data-list-numerical-position is likely 1-based
    stale_retries = 0
    while True:
        # Remembered so we can tell afterwards whether the click navigated away.
        listings_url = driver.current_url

        try:
            car = driver.find_element(By.CSS_SELECTOR, f"span[data-list-numerical-position='{index}']")
        except NoSuchElementException:
            # No listing at this position: the page has been fully processed.
            break

        move_to_next = True
        try:
            # Use JavaScript to click the car element, then scrape the detail page
            driver.execute_script("arguments[0].click();", car)
            get_and_store_car_data()
        except StaleElementReferenceException:
            stale_retries += 1
            if stale_retries <= max_stale_retries:
                print("StaleElementReferenceException caught, retrying...")
                move_to_next = False
            else:
                print(f"Listing {index} is still stale after {max_stale_retries} retries, skipping...")
        except Exception as e:
            print(f"An exception occurred: {e}")

        # Always end the iteration on the results page, also after a failure; otherwise
        # the next lookup would run against the detail page and end the crawl early.
        try:
            if driver.current_url != listings_url:
                driver.back()
            wait.until(listings_loaded)
        except TimeoutException:
            print("Could not return to the search results, stopping this page.")
            break

        if move_to_next:
            index += 1
            stale_retries = 0

    # Refresh the current page
    driver.refresh()

In [82]:
# Walk the pagination bar from page 1 onwards and scrape every listing on each page.
current_page = 1
while True:
    try:
        # Find the li element for the current page using its data-page attribute
        page_element = driver.find_element(By.CSS_SELECTOR, f'li[data-page="{current_page}"]')
    except NoSuchElementException:
        # No pagination entry for this page number: we've reached the end of the pagination.
        # Only this lookup is guarded, so a NoSuchElementException raised while scraping
        # is not mistaken for the end of the pages.
        print("Reached the end of the pages.")
        break

    # Use JavaScript to click the element
    driver.execute_script("arguments[0].click();", page_element)

    # Scrape all the car listings of this page
    car_listing_loop()

    # Move on to the next page
    current_page += 1

# Re-aligned to column 0: the original two-space indent made the whole cell fail with an
# IndentationError. NOTE(review): nothing visible in this notebook reads this module-level
# `Base` (get_and_store_car_data builds its own); consider deleting it.
Base = declarative_base()


In [None]:
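# TODO: Close the driver once scraping has finished.
#driver.quit()
# NOTE: `session` is a local variable of get_and_store_car_data(), so it is not available
# in this cell; it has to be closed inside that function.
#session.close()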