# Notebook Context

This is 1 of 6 Jupyter Notebooks associated with the used car project.

This notebook covers scraping the data used in the project from [autotrader.co.uk](https://www.autotrader.co.uk/).

In [1]:
# Import packages 

import pandas as pd
import glob
import numpy as np
import re
import regex
import time
import random
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import requests
import cloudscraper
from bs4 import BeautifulSoup
import urllib.parse
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# AutoTrader Scraping Functions

In [2]:
# AutoTrader used car scraping function 

def get_used_cars(
    postcode="KT12",
    radius=1500,
    min_year=1995,
    max_year=1995,
    include_writeoff="include",
    max_attempts_per_page=5,
    verbose=False):
    """Scrape used-car listings from autotrader.co.uk and save them to CSV.

    The search is split into every combination of fuel-consumption band,
    drivetrain, make, body type, transmission and sort order, and each
    combination is paged through one registration year at a time, in an aim
    to keep every individual search below the 1000-result cap.

    Args:
        postcode: Postcode sent as the centre of the search.
        radius: Search radius sent with the query.
        min_year: First registration year to search (inclusive).
        max_year: Last registration year to search (inclusive).
        include_writeoff: "include", "exclude" or "writeoff-only"; selects
            which write-off flag is sent with the query. Any other value
            sends no write-off flag.
        max_attempts_per_page: Attempts made at a page (after a non-200
            response or a parsing error) before skipping to the next page.
        verbose: When True, print progress information.

    Returns:
        A tuple ``(results, extra_makes)``: ``results`` is a DataFrame with
        one row per scraped listing and ``extra_makes`` is a de-duplicated
        list of makes for which a single search reached 1000 results.

    Side effects:
        Writes the results to
        ``/raw_data_used/{timestamp}_{min_year}-{max_year}_used.csv``.
        A KeyboardInterrupt stops the scrape early; whatever was collected
        so far is still written and returned.
    """
    # Imported here because the notebook's import cell does not provide them.
    import datetime
    import itertools
    import traceback

    # To bypass Cloudflare protection
    scraper = cloudscraper.create_scraper()

    results = []
    # If a search returns > 1000 results then only the first 1000 are made available.
    # Such searches are recorded in extra_make_list for later attention.
    extra_make_list = []
    url = "https://www.autotrader.co.uk/results-car-search"

    # Keywords commonly used on Autotrader in each of the fields being scraped
    keywords = {
        "mileage": ["miles"],
        "BHP": ["BHP", "PS"],
        "transmission": ["Automatic", "Manual"],
        "fuel": ["Petrol", "Diesel", "Electric", "Hybrid – Diesel/Electric Plug-in",
                 "Hybrid – Petrol/Electric", "Hybrid – Petrol/Electric Plug-in", "Bi Fuel",
                 "Diesel Hybrid", "Diesel Plug-in Hybrid", "Hydrogen", "Natural Gas",
                 "Petrol Hybrid", "Petrol Plug-in Hybrid"],
        "owners": ["owners", "owner", "own"],
        "body": ["Coupe", "Convertible", "Estate", "Hatchback", "MPV", "Pickup", "SUV", "Saloon"],
        "ULEZ": ["ULEZ"],
        "year": [" reg)", "new"],
        "engine": ["engine"],
    }

    # Set up parameters for query to autotrader.co.uk
    mpgall = ['OVER_60','OVER_50','OVER_40','OVER_30']
    drivetrainall = ['Four Wheel Drive','Front Wheel Drive','Rear Wheel Drive']
    makeall = ['AC','Abarth','Aixam','Alfa Romeo','Alpine','Ariel','Aston Martin','Audi','Austin',
        'BAC','BMW','Beauford','Bentley','Bowler','Bugatti','Buick',
        'CUPRA','Cadillac','Carbodies','Caterham','Chesil','Chevrolet','Chrysler','Citroen','Corvette',
        'DAF','DFSK','DS Automobiles','Dacia','Daewoo','Daihatsu','Daimler','Datsun','Delorean','Dodge',
        'Ferrari','Fiat','Ford','GMC','Great Wall',
        'Hillman','Honda','Hummer','Hyundai','Infiniti','Isuzu','Iveco','Jaguar','Jeep','Jensen','KIA',
        'LEVC','Lada','Lamborghini','Lancia','Land Rover','Lexus','Lincoln','London Taxis International','Lotus',
        'MG','MINI','Mahindra','Maserati','Maybach','Mazda','McLaren','Mercedes-Benz','Microcar','Mitsubishi',
        'Mitsuoka','Morgan','Morris','Nissan','Noble','Opel','Packard','Perodua','Peugeot','Pilgrim','Polestar',
        'Pontiac','Porsche','Proton','REO','Radical','Rage','Raptor','Reliant','Renault','Replica','Reva','Riley',
        'Rolls-Royce','Rover','SEAT','SKODA','Saab','Sebring','Singer','Smart','Spyker','Ssangyong','Subaru',
        'Sunbeam','Suzuki','TVR','Tesla','Tiger','Toyota','Triumph','Ultima','Vauxhall','Venturi','Volkswagen',
        'Volvo','Westfield','Wolseley','Yamaha','Zenos']
    body = ["Coupe", "Convertible", "Estate", "Hatchback", "MPV", "Pickup", "SUV", "Saloon"]
    trans = ["Automatic", "Manual"]
    sort = ["price-asc","price-desc"]

    # Iterate over combinations of the search key words. We make such specific searches in an aim
    # to return <1000 results per search. The product is materialised so tqdm knows the total.
    searches = list(itertools.product(mpgall, drivetrainall, makeall, body, trans, sort))

    try:
        for divide in tqdm(searches):
            mpg, drivetrain, make, body_type, transmission, sort_order = divide
            params = {
                "sort": sort_order,
                "postcode": postcode,
                "radius": radius,
                "make": make,
                "search-results-price-type": "total-price",
                "search-results-year": "select-year",
                "fuel-consumption": mpg,
                "drivetrain": drivetrain,
                "body-type": body_type,
                "transmission": transmission,
            }

            # Write-off handling is driven solely by include_writeoff; no flag is
            # hard-coded above, so "include" and "writeoff-only" are not contradicted.
            if include_writeoff == "include":
                params["writeoff-categories"] = "on"
            elif include_writeoff == "exclude":
                params["exclude-writeoff-categories"] = "on"
            elif include_writeoff == "writeoff-only":
                params["only-writeoff-categories"] = "on"

            # Start at min_year and grow to max_year. The counter is reset per
            # search so an abandoned search cannot leak its count into the next one.
            year = min_year
            page = 1
            attempt = 1
            n_this_year_results = 0

            while year <= max_year:
                params["year-from"] = year
                params["year-to"] = year
                # "page" is inserted last so that it ends the request URL (checked below).
                params["page"] = page

                r = scraper.get(url, params=params)

                if verbose:
                    print("Year:     ", year,"\t Page:     ", page,"\t Response: ", r)

                try:
                    if r.status_code == 500:
                        # Internal website error: abandon this search combination.
                        break

                    if r.status_code != 200:
                        # Not successful (e.g. due to bot protection): log as an attempt
                        # and retry the same page until the attempts are exhausted.
                        attempt = attempt + 1
                        if attempt <= max_attempts_per_page:
                            if verbose:
                                print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                        else:
                            page = page + 1
                            attempt = 1
                            if verbose:
                                print("Exception. All attempts exhausted for this page. Skipping to next page #", page)
                            if page > 100: # Autotrader does not return sensible results beyond 100 pages
                                break
                        continue

                    j = r.json()
                    s = BeautifulSoup(j["html"], features="html.parser")

                    # Use beautifulsoup to generate list of cars on each page - called articles
                    articles = s.find_all("article", attrs={"data-standout-type":""})

                    # No results, or the response URL no longer ends in the requested
                    # page number: this year is finished for the current search.
                    if len(articles) == 0 or r.url[r.url.find("page=")+5:] != str(page):
                        # Recorded regardless of verbosity: the search hit the results cap.
                        if n_this_year_results >= 1000:
                            extra_make_list.append(make)
                        if verbose:
                            print("Found total", n_this_year_results, "results for year", year, "and brand", divide, "across", page-1, "pages")
                            if year+1 <= max_year:
                                print("Moving on to year", year + 1)
                                print("---------------------------------")

                        # Increment year and reset relevant variables
                        year = year + 1
                        page = 1
                        attempt = 1
                        n_this_year_results = 0
                        continue

                    # Parse the whole page before storing anything, so that a parsing
                    # error followed by a retry cannot add the same listings twice.
                    page_cars = [
                        _used_car_from_article(article, keywords, mpg, drivetrain, make)
                        for article in articles
                    ]
                    results.extend(page_cars)
                    n_this_year_results = n_this_year_results + len(page_cars)

                    page = page + 1
                    attempt = 1

                    if verbose:
                        print("Car count: ", len(results))
                        print("---------------------------------")

                except Exception:
                    # Unexpected payload or markup: report it and retry the page.
                    traceback.print_exc()
                    attempt = attempt + 1
                    if attempt <= max_attempts_per_page:
                        if verbose:
                            print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                    else:
                        page = page + 1
                        attempt = 1
                        if verbose:
                            print("Exception. All attempts exhausted for this page. Skipping to next page #", page)
                        if page > 100: # Same page limit as above; prevents an endless loop
                            break

    except KeyboardInterrupt:
        # Manual stop: end the whole scrape and fall through to save what was collected.
        pass

    # Make df of results and output to csv
    results = pd.DataFrame(results)
    now = datetime.datetime.now().strftime("%d%B_%I%M%p")
    filepathdf = f'/raw_data_used/{now}_{min_year}-{max_year}_used.csv'
    results.to_csv(filepathdf, index=False, header=results.columns)

    return results, list(set(extra_make_list))


def _used_car_from_article(article, keywords, mpg, drivetrain, make):
    """Build one listing dict from a search-result <article> tag.

    mpg, drivetrain and make are copied from the search parameters; every
    other field is read from the article. Any key of ``keywords`` that is
    not found among the listing's key specs is set to 'NA'.
    """
    title = article.find("h3", {"class": "product-card-details__title"})
    href = article.find("a", {"class": "tracking-standard-link"})["href"]

    car = {}
    car["name"] = title.text.strip()
    car["name_subtitle"] = title.findNext('p').text.strip()
    # Keep only the part before the query string (the whole href if there is none).
    car["link"] = "https://www.autotrader.co.uk" + href.partition("?")[0]
    car["price"] = article.find("div", {"class": "product-card-pricing__price"}).text.strip()
    car["mpg"] = mpg
    car["drivertrain"] = drivetrain
    car["make"] = make

    # Seller spec items are stored in reverse document order as seller0, seller1, ...
    seller_spans = article.find_all("span", {"class": "product-card-seller-info__spec-item-copy"})
    for count, sel in enumerate(reversed(seller_spans)):
        car["seller"+str(count)] = sel.text.strip()

    review_links = article.find_all("a", {"class": "product-card-seller-info__review-count dealer-profile-link", "href":True})
    for count, a in enumerate(review_links):
        car["href"+str(count)] = a["href"]

    # The key specs are in a bulleted list; each item is classified by keyword.
    for key_spec_bs_li in article.find("ul", {"class": "listing-key-specs"}).find_all("li"):
        key_spec_bs = key_spec_bs_li.text

        if any(keyword in key_spec_bs for keyword in keywords["mileage"]):
            car["mileage"] = int(key_spec_bs[:key_spec_bs.find(" miles")].replace(",",""))
        elif any(keyword in key_spec_bs for keyword in keywords["BHP"]):
            car["BHP"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["transmission"]):
            car["transmission"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["fuel"]):
            car["fuel"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["owners"]):
            car["owners"] = int(key_spec_bs[:key_spec_bs.find(" own")])
        elif any(keyword in key_spec_bs for keyword in keywords["body"]):
            car["body"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["ULEZ"]):
            car["ULEZ"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["year"]):
            car["year"] = key_spec_bs
        # Engine size such as "1.6L"; the length guard avoids IndexError on short items.
        elif len(key_spec_bs) > 3 and key_spec_bs[1] == "." and key_spec_bs[3] == "L":
            car["engine"] = key_spec_bs

    # Set any missing dictionary keys to NA for complete car details.
    for key in keywords:
        car.setdefault(key, 'NA')

    return car

In [3]:
# AutoTrader used electric car scraping function 

def get_used_electric_cars(
    postcode="KT12",
    radius=1500,
    min_year=1995,
    max_year=1995,
    include_writeoff="include",
    max_attempts_per_page=5,
    verbose=False):
    """Scrape used electric-car listings from autotrader.co.uk and save them to CSV.

    The search is restricted to fuel type 'Electric' and split into every
    combination of drivetrain, make, body type, transmission and sort order.
    Each combination is paged through one registration year at a time, in an
    aim to keep every individual search below the 1000-result cap.

    Args:
        postcode: Postcode sent as the centre of the search.
        radius: Search radius sent with the query.
        min_year: First registration year to search (inclusive).
        max_year: Last registration year to search (inclusive).
        include_writeoff: "include", "exclude" or "writeoff-only"; selects
            which write-off flag is sent with the query. Any other value
            sends no write-off flag.
        max_attempts_per_page: Attempts made at a page (after a non-200
            response or a parsing error) before skipping to the next page.
        verbose: When True, print progress information.

    Returns:
        A tuple ``(results, extra_makes)``: ``results`` is a DataFrame with
        one row per scraped listing and ``extra_makes`` is a de-duplicated
        list of makes for which a single search reached 1000 results.

    Side effects:
        Writes the results to
        ``/raw_data_used_electric/{timestamp}_{min_year}-{max_year}_used.csv``.
        A KeyboardInterrupt stops the scrape early; whatever was collected
        so far is still written and returned.
    """
    # Imported here because the notebook's import cell does not provide them.
    import datetime
    import itertools
    import traceback

    # To bypass Cloudflare protection
    scraper = cloudscraper.create_scraper()

    results = []
    # If a search returns > 1000 results then only the first 1000 are made available.
    # Such searches are recorded in extra_make_list for later attention.
    extra_make_list = []
    url = "https://www.autotrader.co.uk/results-car-search"

    # Keywords commonly used on Autotrader in each of the fields being scraped
    keywords = {
        "mileage": ["miles"],
        "BHP": ["BHP", "PS"],
        "transmission": ["Automatic", "Manual"],
        "fuel": ["Petrol", "Diesel", "Electric", "Hybrid – Diesel/Electric Plug-in",
                 "Hybrid – Petrol/Electric", "Hybrid – Petrol/Electric Plug-in", "Bi Fuel",
                 "Diesel Hybrid", "Diesel Plug-in Hybrid", "Hydrogen", "Natural Gas",
                 "Petrol Hybrid", "Petrol Plug-in Hybrid"],
        "owners": ["owners", "owner", "own"],
        "body": ["Coupe", "Convertible", "Estate", "Hatchback", "MPV", "Pickup", "SUV", "Saloon"],
        "ULEZ": ["ULEZ"],
        "year": [" reg)", "new"],
        "engine": ["engine"],
    }

    # Set up parameters for query to autotrader.co.uk
    drivetrainall = ['Four Wheel Drive','Front Wheel Drive','Rear Wheel Drive']
    makeall = ['AC','Abarth','Aixam','Alfa Romeo','Alpine','Ariel','Aston Martin','Audi','Austin',
        'BAC','BMW','Beauford','Bentley','Bowler','Bugatti','Buick',
        'CUPRA','Cadillac','Carbodies','Caterham','Chesil','Chevrolet','Chrysler','Citroen','Corvette',
        'DAF','DFSK','DS Automobiles','Dacia','Daewoo','Daihatsu','Daimler','Datsun','Delorean','Dodge',
        'Ferrari','Fiat','Ford','GMC','Great Wall',
        'Hillman','Honda','Hummer','Hyundai','Infiniti','Isuzu','Iveco','Jaguar','Jeep','Jensen','KIA',
        'LEVC','Lada','Lamborghini','Lancia','Land Rover','Lexus','Lincoln','London Taxis International','Lotus',
        'MG','MINI','Mahindra','Maserati','Maybach','Mazda','McLaren','Mercedes-Benz','Microcar','Mitsubishi',
        'Mitsuoka','Morgan','Morris','Nissan','Noble','Opel','Packard','Perodua','Peugeot','Pilgrim','Polestar',
        'Pontiac','Porsche','Proton','REO','Radical','Rage','Raptor','Reliant','Renault','Replica','Reva','Riley',
        'Rolls-Royce','Rover','SEAT','SKODA','Saab','Sebring','Singer','Smart','Spyker','Ssangyong','Subaru',
        'Sunbeam','Suzuki','TVR','Tesla','Tiger','Toyota','Triumph','Ultima','Vauxhall','Venturi','Volkswagen',
        'Volvo','Westfield','Wolseley','Yamaha','Zenos']
    body = ["Coupe", "Convertible", "Estate", "Hatchback", "MPV", "Pickup", "SUV", "Saloon"]
    trans = ["Automatic", "Manual"]
    sort = ["price-asc","price-desc"]

    # Iterate over combinations of the search key words. We make such specific searches in an aim
    # to return <1000 results per search. The product is materialised so tqdm knows the total.
    searches = list(itertools.product(drivetrainall, makeall, body, trans, sort))

    try:
        for divide in tqdm(searches):
            drivetrain, make, body_type, transmission, sort_order = divide
            params = {
                "sort": sort_order,
                "postcode": postcode,
                "radius": radius,
                "make": make,
                "search-results-price-type": "total-price",
                "search-results-year": "select-year",
                "fuel-type": 'Electric',
                "drivetrain": drivetrain,
                "body-type": body_type,
                "transmission": transmission,
            }

            # Write-off handling is driven solely by include_writeoff; no flag is
            # hard-coded above, so "include" and "writeoff-only" are not contradicted.
            if include_writeoff == "include":
                params["writeoff-categories"] = "on"
            elif include_writeoff == "exclude":
                params["exclude-writeoff-categories"] = "on"
            elif include_writeoff == "writeoff-only":
                params["only-writeoff-categories"] = "on"

            # Start at min_year and grow to max_year. The counter is reset per
            # search so an abandoned search cannot leak its count into the next one.
            year = min_year
            page = 1
            attempt = 1
            n_this_year_results = 0

            while year <= max_year:
                params["year-from"] = year
                params["year-to"] = year
                # "page" is inserted last so that it ends the request URL (checked below).
                params["page"] = page

                r = scraper.get(url, params=params)

                if verbose:
                    print("Year:     ", year,"\t Page:     ", page,"\t Response: ", r)

                try:
                    if r.status_code == 500:
                        # Internal website error: abandon this search combination.
                        break

                    if r.status_code != 200:
                        # Not successful (e.g. due to bot protection): log as an attempt
                        # and retry the same page until the attempts are exhausted.
                        attempt = attempt + 1
                        if attempt <= max_attempts_per_page:
                            if verbose:
                                print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                        else:
                            page = page + 1
                            attempt = 1
                            if verbose:
                                print("Exception. All attempts exhausted for this page. Skipping to next page #", page)
                            if page > 100: # Autotrader does not return sensible results beyond 100 pages
                                break
                        continue

                    j = r.json()
                    s = BeautifulSoup(j["html"], features="html.parser")

                    # Use beautifulsoup to generate list of cars on each page - called articles
                    articles = s.find_all("article", attrs={"data-standout-type":""})

                    # No results, or the response URL no longer ends in the requested
                    # page number: this year is finished for the current search.
                    if len(articles) == 0 or r.url[r.url.find("page=")+5:] != str(page):
                        # Recorded regardless of verbosity: the search hit the results cap.
                        if n_this_year_results >= 1000:
                            extra_make_list.append(make)
                        if verbose:
                            print("Found total", n_this_year_results, "results for year", year, "and brand", divide, "across", page-1, "pages")
                            if year+1 <= max_year:
                                print("Moving on to year", year + 1)
                                print("---------------------------------")

                        # Increment year and reset relevant variables
                        year = year + 1
                        page = 1
                        attempt = 1
                        n_this_year_results = 0
                        continue

                    # Parse the whole page before storing anything, so that a parsing
                    # error followed by a retry cannot add the same listings twice.
                    page_cars = [
                        _used_electric_car_from_article(article, keywords, drivetrain, make)
                        for article in articles
                    ]
                    results.extend(page_cars)
                    n_this_year_results = n_this_year_results + len(page_cars)

                    page = page + 1
                    attempt = 1

                    if verbose:
                        print("Car count: ", len(results))
                        print("---------------------------------")

                except Exception:
                    # Unexpected payload or markup: report it and retry the page.
                    traceback.print_exc()
                    attempt = attempt + 1
                    if attempt <= max_attempts_per_page:
                        if verbose:
                            print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                    else:
                        page = page + 1
                        attempt = 1
                        if verbose:
                            print("Exception. All attempts exhausted for this page. Skipping to next page #", page)
                        if page > 100: # Same page limit as above; prevents an endless loop
                            break

    except KeyboardInterrupt:
        # Manual stop: end the whole scrape and fall through to save what was collected.
        pass

    # Make df of results and output to csv
    results = pd.DataFrame(results)
    now = datetime.datetime.now().strftime("%d%B_%I%M%p")
    filepathdf = f'/raw_data_used_electric/{now}_{min_year}-{max_year}_used.csv'
    results.to_csv(filepathdf, index=False, header=results.columns)

    return results, list(set(extra_make_list))


def _used_electric_car_from_article(article, keywords, drivetrain, make):
    """Build one electric-car listing dict from a search-result <article> tag.

    drivetrain and make are copied from the search parameters and "mpg" is
    fixed to 'electric'; every other field is read from the article. Any key
    of ``keywords`` that is not found among the listing's key specs is set
    to 'NA'.
    """
    title = article.find("h3", {"class": "product-card-details__title"})
    href = article.find("a", {"class": "tracking-standard-link"})["href"]

    car = {}
    car["name"] = title.text.strip()
    car["name_subtitle"] = title.findNext('p').text.strip()
    # Keep only the part before the query string (the whole href if there is none).
    car["link"] = "https://www.autotrader.co.uk" + href.partition("?")[0]
    car["price"] = article.find("div", {"class": "product-card-pricing__price"}).text.strip()
    car["mpg"] = 'electric'
    car["drivertrain"] = drivetrain
    car["make"] = make

    # Seller spec items are stored in reverse document order as seller0, seller1, ...
    seller_spans = article.find_all("span", {"class": "product-card-seller-info__spec-item-copy"})
    for count, sel in enumerate(reversed(seller_spans)):
        car["seller"+str(count)] = sel.text.strip()

    review_links = article.find_all("a", {"class": "product-card-seller-info__review-count dealer-profile-link", "href":True})
    for count, a in enumerate(review_links):
        car["href"+str(count)] = a["href"]

    # The key specs are in a bulleted list; each item is classified by keyword.
    for key_spec_bs_li in article.find("ul", {"class": "listing-key-specs"}).find_all("li"):
        key_spec_bs = key_spec_bs_li.text

        if any(keyword in key_spec_bs for keyword in keywords["mileage"]):
            car["mileage"] = int(key_spec_bs[:key_spec_bs.find(" miles")].replace(",",""))
        elif any(keyword in key_spec_bs for keyword in keywords["BHP"]):
            car["BHP"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["transmission"]):
            car["transmission"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["fuel"]):
            car["fuel"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["owners"]):
            car["owners"] = int(key_spec_bs[:key_spec_bs.find(" own")])
        elif any(keyword in key_spec_bs for keyword in keywords["body"]):
            car["body"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["ULEZ"]):
            car["ULEZ"] = key_spec_bs
        elif any(keyword in key_spec_bs for keyword in keywords["year"]):
            car["year"] = key_spec_bs
        # Engine size such as "1.6L"; the length guard avoids IndexError on short items.
        elif len(key_spec_bs) > 3 and key_spec_bs[1] == "." and key_spec_bs[3] == "L":
            car["engine"] = key_spec_bs

    # Set any missing dictionary keys to NA for complete car details.
    for key in keywords:
        car.setdefault(key, 'NA')

    return car

In [4]:
# AutoTrader new car scraping function 

def get_new_cars(
    postcode="KT12",
    radius=1500,
    include_writeoff="include",
    max_attempts_per_page=5,
    verbose=False):
    """Scrape new-car listings from autotrader.co.uk and write them to CSV.

    One search is issued per combination of fuel-consumption band, drivetrain,
    make, body type, transmission and sort order. The searches are kept this
    narrow so that each one stays below the 1000 results Autotrader makes
    available per search.

    Args:
        postcode: Postcode sent as the centre of the search.
        radius: Search radius sent to Autotrader.
        include_writeoff: "include", "exclude" or "writeoff-only"; selects which
            write-off query parameter is added to every search.
        max_attempts_per_page: Attempts made at a page before it is skipped.
        verbose: Print progress information while scraping.

    Returns:
        A tuple ``(results, extra_brands)``. ``results`` is a DataFrame with one
        row per scraped listing; ``extra_brands`` lists the makes for which a
        single search collected 1000 or more listings and therefore needs a
        narrower follow-up search.

    A KeyboardInterrupt stops the scrape; whatever was collected so far is
    still written to CSV and returned.
    """
    # These modules are not part of the notebook's import cell, so import them
    # here to keep the function runnable on its own.
    import datetime
    import itertools
    import traceback

    # To bypass Cloudflare protection
    scraper = cloudscraper.create_scraper()

    # Basic variables
    results = []
    # If a search returns >= 1000 results then only the first 1000 are made available.
    # The makes of such searches are recorded here for later attention.
    extra_brand_list = []
    url = "https://www.autotrader.co.uk/results-car-search"

    # Keywords commonly used on Autotrader in each of the fields being scraped
    keywords = {}
    keywords["mileage"] = ["miles"]
    keywords["BHP"] = ["BHP", "PS"]
    keywords["transmission"] = ["Automatic", "Manual"]
    keywords["fuel"] = ["Petrol", "Diesel", "Electric", "Hybrid – Diesel/Electric Plug-in",
                        "Hybrid – Petrol/Electric", "Hybrid – Petrol/Electric Plug-in", "Bi Fuel",
                        "Diesel Hybrid", "Diesel Plug-in Hybrid", "Hydrogen", "Natural Gas", "Petrol Hybrid",
                        "Petrol Plug-in Hybrid"]
    keywords["owners"] = ["owners", "owner", "own"]
    keywords["body"] = ["Coupe", "Convertible", "Estate", "Hatchback", "MPV", "Pickup", "SUV", "Saloon"]
    keywords["ULEZ"] = ["ULEZ"]
    keywords["year"] = [" reg)", "new"]
    # The engine spec is recognised by its shape (e.g. "1.6L"), not by keyword;
    # the entry exists so that a missing engine is still filled with 'NA'.
    keywords["engine"] = ["engine"]

    # Set up parameters for query to autotrader.co.uk
    mpgall = ['OVER_60','OVER_50','OVER_40','OVER_30']
    drivetrainall = ['Four Wheel Drive','Front Wheel Drive','Rear Wheel Drive']
    makeall = ['AC','Abarth','Aixam','Alfa Romeo','Alpine','Ariel','Aston Martin','Audi','Austin',
        'BAC','BMW','Beauford','Bentley','Bowler','Bugatti','Buick',
        'CUPRA','Cadillac','Carbodies','Caterham','Chesil','Chevrolet','Chrysler','Citroen','Corvette',
        'DAF','DFSK','DS Automobiles','Dacia','Daewoo','Daihatsu','Daimler','Datsun','Delorean','Dodge',
        'Ferrari','Fiat','Ford','GMC','Great Wall',
        'Hillman','Honda','Hummer','Hyundai','Infiniti','Isuzu','Iveco','Jaguar','Jeep','Jensen','KIA',
        'LEVC','Lada','Lamborghini','Lancia','Land Rover','Lexus','Lincoln','London Taxis International','Lotus',
        'MG','MINI','Mahindra','Maserati','Maybach','Mazda','McLaren','Mercedes-Benz','Microcar','Mitsubishi',
        'Mitsuoka','Morgan','Morris','Nissan','Noble','Opel','Packard','Perodua','Peugeot','Pilgrim','Polestar',
        'Pontiac','Porsche','Proton','REO','Radical','Rage','Raptor','Reliant','Renault','Replica','Reva','Riley',
        'Rolls-Royce','Rover','SEAT','SKODA','Saab','Sebring','Singer','Smart','Spyker','Ssangyong','Subaru',
        'Sunbeam','Suzuki','TVR','Tesla','Tiger','Toyota','Triumph','Ultima','Vauxhall','Venturi','Volkswagen',
        'Volvo','Westfield','Wolseley','Yamaha','Zenos']
    body = ["Coupe", "Convertible", "Estate", "Hatchback", "MPV", "Pickup", "SUV", "Saloon"]
    trans = ["Automatic", "Manual"]
    sort = ["price-asc", "price-desc"]

    # Iterate over combinations of the search key words. We make such specific searches in an aim
    # to return <1000 results per search
    for divide in tqdm(list(itertools.product(mpgall, drivetrainall, makeall, body, trans, sort))):
        params = {
            "sort": divide[5],
            "postcode": postcode,
            "radius": radius,
            "make": divide[2],
            "search-results-price-type": "total-price",
            # Search year not used in new car search
            # "search-results-year": "select-year",
            # NOTE(review): this is sent regardless of include_writeoff, so the
            # "include" and "writeoff-only" modes still carry it - confirm intended.
            "exclude-writeoff-categories": "on",
            "fuel-consumption": divide[0],
            "drivetrain": divide[1],
            "body-type": divide[3],
            "transmission": divide[4]
            }

        # Set up writeoff parameters for query. Included by default args.
        if include_writeoff == "include":
            params["writeoff-categories"] = "on"
        elif include_writeoff == "exclude":
            params["exclude-writeoff-categories"] = "on"
        elif include_writeoff == "writeoff-only":
            params["only-writeoff-categories"] = "on"

        # Values copied from the search parameters into every listing of this search
        search_fields = {"mpg": divide[0], "drivertrain": divide[1], "make": divide[2]}

        # Year parameters are hardcoded for new car search. Notice that the 'year-from' parameter is
        # hard coded to be 'new' below, so the search is not limited to new cars from 2022.
        year = 2022
        max_year = 2022
        page = 1
        attempt = 1
        # Reset per search so a search that was abandoned early cannot leak its count
        n_this_year_results = 0

        try:
            # This only runs for a single "year" as year == max_year to start with
            while year <= max_year:

                # Search for new cars only. "page" must stay the last parameter added:
                # the end-of-results check below reads it from the tail of the final URL.
                params["year-from"] = 'new'
                params["page"] = page

                try:
                    # Sleep timer was not required with VPN cloudscraper combination.
                    # time.sleep(random.randint(1,9))
                    # The request is inside the try so a connection error counts as a failed attempt.
                    r = scraper.get(url, params=params)
                    if verbose:
                        print("Year:     ", year)
                        print("Page:     ", page)
                        print("Response: ", r)

                    if r.status_code == 500:  # internal website error: abandon this search
                        break

                    if r.status_code != 200:  # not successful (e.g. due to bot protection): log as an attempt
                        attempt = attempt + 1
                        if attempt <= max_attempts_per_page:
                            if verbose:
                                print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                        else:
                            page = page + 1
                            attempt = 1
                            if verbose:
                                print("Exception. All attempts exhausted for this page. Skipping to next page #", page)
                            if page > 100:  # Autotrader does not return sensible results beyond 100 pages
                                break
                        continue

                    j = r.json()
                    s = BeautifulSoup(j["html"], features="html.parser")

                    # Use beautifulsoup to generate list of cars on each page - called articles
                    articles = s.find_all("article", attrs={"data-standout-type": ""})

                    # If no results or reached end of results, report this then move on to the next search
                    if len(articles) == 0 or r.url[r.url.find("page=")+5:] != str(page):
                        # If the search hit the 1000 car limit, remember the make (whether verbose or not)
                        if n_this_year_results >= 1000:
                            extra_brand_list.append(params["make"])
                        if verbose:
                            print("Found total", n_this_year_results, "results for year", year, "and brand", divide, "across", page-1, "pages")
                            if year+1 <= max_year:
                                print("Moving on to year", year + 1)
                                print("---------------------------------")

                        # Increment year and reset relevant variables
                        year = year + 1
                        page = 1
                        attempt = 1
                        n_this_year_results = 0
                        continue

                    # For each car, build a dictionary. A listing that cannot be parsed is reported and
                    # skipped on its own, so retrying never re-adds the listings already collected.
                    for article in articles:
                        try:
                            car = _parse_new_car_article(article, search_fields, keywords)
                        except Exception:
                            traceback.print_exc()
                            continue
                        results.append(car)
                        n_this_year_results = n_this_year_results + 1

                    page = page + 1
                    attempt = 1

                    if verbose:
                        print("Car count: ", len(results))
                        print("---------------------------------")

                except Exception:
                    traceback.print_exc()
                    attempt = attempt + 1
                    if attempt <= max_attempts_per_page:
                        if verbose:
                            print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                    else:
                        page = page + 1
                        attempt = 1
                        if verbose:
                            print("Exception. All attempts exhausted for this page. Skipping to next page #", page)
                        if page > 100:  # same cap as above, so a permanently failing search terminates
                            break

        except KeyboardInterrupt:
            # Stop scraping altogether and fall through to saving what was collected
            break

    # Make df of results and output to csv
    results = pd.DataFrame(results)
    now = datetime.datetime.now().strftime("%d%B_%I%M%p")
    # NOTE(review): `year` is the loop variable's final value (2023 once the last
    # search ran to completion), not necessarily 2022 - confirm the intended name.
    filepathdf = f'/raw_data_new/{now}_{year}_new.csv'
    results.to_csv(filepathdf, index=False, header=results.columns)

    return results, list(set(extra_brand_list))


def _parse_new_car_article(article, search_fields, keywords):
    """Build the dictionary for one listing from its result-page <article> element."""
    title = article.find("h3", {"class": "product-card-details__title"})
    href = article.find("a", {"class": "tracking-standard-link"})["href"]

    car = {}
    car["name"] = title.text.strip()
    car["name_subtitle"] = title.findNext('p').text.strip()
    # Drop the query string; split() keeps the whole link when there is no "?"
    car["link"] = "https://www.autotrader.co.uk" + href.split("?", 1)[0]
    car["price"] = article.find("div", {"class": "product-card-pricing__price"}).text.strip()
    car.update(search_fields)

    # Seller details are stored in reverse document order as seller0, seller1, ...
    seller_spans = article.find_all("span", {"class": "product-card-seller-info__spec-item-copy"})
    for count, span in enumerate(reversed(seller_spans)):
        car["seller" + str(count)] = span.text.strip()
    profile_links = article.find_all("a", {"class": "product-card-seller-info__review-count dealer-profile-link", "href": True})
    for count, link in enumerate(profile_links):
        car["href" + str(count)] = link["href"]

    # The key specs are in a bulleted list; each bullet is classified by the keywords it contains
    for spec_li in article.find("ul", {"class": "listing-key-specs"}).find_all("li"):
        spec = spec_li.text

        if any(keyword in spec for keyword in keywords["mileage"]):
            car["mileage"] = int(spec[:spec.find(" miles")].replace(",", ""))
        elif any(keyword in spec for keyword in keywords["BHP"]):
            car["BHP"] = spec
        elif any(keyword in spec for keyword in keywords["transmission"]):
            car["transmission"] = spec
        elif any(keyword in spec for keyword in keywords["fuel"]):
            car["fuel"] = spec
        elif any(keyword in spec for keyword in keywords["owners"]):
            car["owners"] = int(spec[:spec.find(" own")])
        elif any(keyword in spec for keyword in keywords["body"]):
            car["body"] = spec
        elif any(keyword in spec for keyword in keywords["ULEZ"]):
            car["ULEZ"] = spec
        elif any(keyword in spec for keyword in keywords["year"]):
            car["year"] = spec
        # Engine size looks like "1.6L"; the length check protects very short bullets
        elif len(spec) > 3 and spec[1] == "." and spec[3] == "L":
            car["engine"] = spec

    # Set any missing dictionary keys to NA for complete car details.
    for key in keywords:
        car.setdefault(key, 'NA')

    return car

In [5]:
# AutoTrader new electric car scraping function 

def get_new_electric_cars(
    postcode="KT12",
    radius=1500,
    include_writeoff="include",
    max_attempts_per_page=5,
    verbose=False):
    """Scrape new electric-car listings from autotrader.co.uk and write them to CSV.

    One search (with fuel type fixed to Electric) is issued per combination of
    drivetrain, make, body type, transmission and sort order. The searches are
    kept this narrow so that each one stays below the 1000 results Autotrader
    makes available per search.

    Args:
        postcode: Postcode sent as the centre of the search.
        radius: Search radius sent to Autotrader.
        include_writeoff: "include", "exclude" or "writeoff-only"; selects which
            write-off query parameter is added to every search.
        max_attempts_per_page: Attempts made at a page before it is skipped.
        verbose: Print progress information while scraping.

    Returns:
        A tuple ``(results, extra_brands)``. ``results`` is a DataFrame with one
        row per scraped listing; ``extra_brands`` lists the makes for which a
        single search collected 1000 or more listings and therefore needs a
        narrower follow-up search.

    A KeyboardInterrupt stops the scrape; whatever was collected so far is
    still written to CSV and returned.
    """
    # These modules are not part of the notebook's import cell, so import them
    # here to keep the function runnable on its own.
    import datetime
    import itertools
    import traceback

    # To bypass Cloudflare protection
    scraper = cloudscraper.create_scraper()

    # Basic variables
    results = []
    # If a search returns >= 1000 results then only the first 1000 are made available.
    # The makes of such searches are recorded here for later attention.
    extra_brand_list = []
    url = "https://www.autotrader.co.uk/results-car-search"

    # Keywords commonly used on Autotrader in each of the fields being scraped
    keywords = {}
    keywords["mileage"] = ["miles"]
    keywords["BHP"] = ["BHP", "PS"]
    keywords["transmission"] = ["Automatic", "Manual"]
    keywords["fuel"] = ["Petrol", "Diesel", "Electric", "Hybrid – Diesel/Electric Plug-in",
                        "Hybrid – Petrol/Electric", "Hybrid – Petrol/Electric Plug-in", "Bi Fuel",
                        "Diesel Hybrid", "Diesel Plug-in Hybrid", "Hydrogen", "Natural Gas", "Petrol Hybrid",
                        "Petrol Plug-in Hybrid"]
    keywords["owners"] = ["owners", "owner", "own"]
    keywords["body"] = ["Coupe", "Convertible", "Estate", "Hatchback", "MPV", "Pickup", "SUV", "Saloon"]
    keywords["ULEZ"] = ["ULEZ"]
    keywords["year"] = [" reg)", "new"]
    # The engine spec is recognised by its shape (e.g. "1.6L"), not by keyword;
    # the entry exists so that a missing engine is still filled with 'NA'.
    keywords["engine"] = ["engine"]

    # Set up parameters for query to autotrader.co.uk
    drivetrainall = ['Four Wheel Drive','Front Wheel Drive','Rear Wheel Drive']
    makeall = ['AC','Abarth','Aixam','Alfa Romeo','Alpine','Ariel','Aston Martin','Audi','Austin',
        'BAC','BMW','Beauford','Bentley','Bowler','Bugatti','Buick',
        'CUPRA','Cadillac','Carbodies','Caterham','Chesil','Chevrolet','Chrysler','Citroen','Corvette',
        'DAF','DFSK','DS Automobiles','Dacia','Daewoo','Daihatsu','Daimler','Datsun','Delorean','Dodge',
        'Ferrari','Fiat','Ford','GMC','Great Wall',
        'Hillman','Honda','Hummer','Hyundai','Infiniti','Isuzu','Iveco','Jaguar','Jeep','Jensen','KIA',
        'LEVC','Lada','Lamborghini','Lancia','Land Rover','Lexus','Lincoln','London Taxis International','Lotus',
        'MG','MINI','Mahindra','Maserati','Maybach','Mazda','McLaren','Mercedes-Benz','Microcar','Mitsubishi',
        'Mitsuoka','Morgan','Morris','Nissan','Noble','Opel','Packard','Perodua','Peugeot','Pilgrim','Polestar',
        'Pontiac','Porsche','Proton','REO','Radical','Rage','Raptor','Reliant','Renault','Replica','Reva','Riley',
        'Rolls-Royce','Rover','SEAT','SKODA','Saab','Sebring','Singer','Smart','Spyker','Ssangyong','Subaru',
        'Sunbeam','Suzuki','TVR','Tesla','Tiger','Toyota','Triumph','Ultima','Vauxhall','Venturi','Volkswagen',
        'Volvo','Westfield','Wolseley','Yamaha','Zenos']
    body = ["Coupe", "Convertible", "Estate", "Hatchback", "MPV", "Pickup", "SUV", "Saloon"]
    trans = ["Automatic", "Manual"]
    sort = ["price-asc", "price-desc"]

    # Iterate over combinations of the search key words. We make such specific searches in an aim
    # to return <1000 results per search
    for divide in tqdm(list(itertools.product(drivetrainall, makeall, body, trans, sort))):
        params = {
            "sort": divide[4],
            "postcode": postcode,
            "radius": radius,
            "make": divide[1],
            "search-results-price-type": "total-price",
            # Search year not used in new car search
            # "search-results-year": "select-year",
            "fuel-type": 'Electric',
            # NOTE(review): this is sent regardless of include_writeoff, so the
            # "include" and "writeoff-only" modes still carry it - confirm intended.
            "exclude-writeoff-categories": "on",
            "drivetrain": divide[0],
            "body-type": divide[2],
            "transmission": divide[3]
            }

        # Set up writeoff parameters for query. Included by default args.
        if include_writeoff == "include":
            params["writeoff-categories"] = "on"
        elif include_writeoff == "exclude":
            params["exclude-writeoff-categories"] = "on"
        elif include_writeoff == "writeoff-only":
            params["only-writeoff-categories"] = "on"

        # Values copied from the search parameters into every listing of this search
        search_fields = {"mpg": 'electric', "drivertrain": divide[0], "make": divide[1]}

        # Year parameters are hardcoded for new car search. Notice that the 'year-from' parameter is
        # hard coded to be 'new' below, so the search is not limited to new cars from 2022.
        year = 2022
        max_year = 2022
        page = 1
        attempt = 1
        # Reset per search so a search that was abandoned early cannot leak its count
        n_this_year_results = 0

        try:
            # This only runs for a single "year" as year == max_year to start with
            while year <= max_year:

                # Search for new cars only. "page" must stay the last parameter added:
                # the end-of-results check below reads it from the tail of the final URL.
                params["year-from"] = 'new'
                params["page"] = page

                try:
                    # Sleep timer was not required with VPN cloudscraper combination.
                    # time.sleep(random.randint(1,9))
                    # The request is inside the try so a connection error counts as a failed attempt.
                    r = scraper.get(url, params=params)
                    if verbose:
                        print("Year:     ", year)
                        print("Page:     ", page)
                        print("Response: ", r)

                    if r.status_code == 500:  # internal website error: abandon this search
                        break

                    if r.status_code != 200:  # not successful (e.g. due to bot protection): log as an attempt
                        attempt = attempt + 1
                        if attempt <= max_attempts_per_page:
                            if verbose:
                                print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                        else:
                            page = page + 1
                            attempt = 1
                            if verbose:
                                print("Exception. All attempts exhausted for this page. Skipping to next page #", page)
                            if page > 100:  # Autotrader does not return sensible results beyond 100 pages
                                break
                        continue

                    j = r.json()
                    s = BeautifulSoup(j["html"], features="html.parser")

                    # Use beautifulsoup to generate list of cars on each page - called articles
                    articles = s.find_all("article", attrs={"data-standout-type": ""})

                    # If no results or reached end of results, report this then move on to the next search
                    if len(articles) == 0 or r.url[r.url.find("page=")+5:] != str(page):
                        # If the search hit the 1000 car limit, remember the make (whether verbose or not)
                        if n_this_year_results >= 1000:
                            extra_brand_list.append(params["make"])
                        if verbose:
                            print("Found total", n_this_year_results, "results for year", year, "and brand", divide, "across", page-1, "pages")
                            if year+1 <= max_year:
                                print("Moving on to year", year + 1)
                                print("---------------------------------")

                        # Increment year and reset relevant variables
                        year = year + 1
                        page = 1
                        attempt = 1
                        n_this_year_results = 0
                        continue

                    # For each car, build a dictionary. A listing that cannot be parsed is reported and
                    # skipped on its own, so retrying never re-adds the listings already collected.
                    for article in articles:
                        try:
                            car = _parse_new_electric_car_article(article, search_fields, keywords)
                        except Exception:
                            traceback.print_exc()
                            continue
                        results.append(car)
                        n_this_year_results = n_this_year_results + 1

                    page = page + 1
                    attempt = 1

                    if verbose:
                        print("Car count: ", len(results))
                        print("---------------------------------")

                except Exception:
                    traceback.print_exc()
                    attempt = attempt + 1
                    if attempt <= max_attempts_per_page:
                        if verbose:
                            print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                    else:
                        page = page + 1
                        attempt = 1
                        if verbose:
                            print("Exception. All attempts exhausted for this page. Skipping to next page #", page)
                        if page > 100:  # same cap as above, so a permanently failing search terminates
                            break

        except KeyboardInterrupt:
            # Stop scraping altogether and fall through to saving what was collected
            break

    # Make df of results and output to csv
    results = pd.DataFrame(results)
    now = datetime.datetime.now().strftime("%d%B_%I%M%p")
    # NOTE(review): `year` is the loop variable's final value (2023 once the last
    # search ran to completion), not necessarily 2022 - confirm the intended name.
    filepathdf = f'/raw_data_new_electric/{now}_{year}_new.csv'
    results.to_csv(filepathdf, index=False, header=results.columns)

    return results, list(set(extra_brand_list))


def _parse_new_electric_car_article(article, search_fields, keywords):
    """Build the dictionary for one listing from its result-page <article> element."""
    title = article.find("h3", {"class": "product-card-details__title"})
    href = article.find("a", {"class": "tracking-standard-link"})["href"]

    car = {}
    car["name"] = title.text.strip()
    car["name_subtitle"] = title.findNext('p').text.strip()
    # Drop the query string; split() keeps the whole link when there is no "?"
    car["link"] = "https://www.autotrader.co.uk" + href.split("?", 1)[0]
    car["price"] = article.find("div", {"class": "product-card-pricing__price"}).text.strip()
    car.update(search_fields)

    # Seller details are stored in reverse document order as seller0, seller1, ...
    seller_spans = article.find_all("span", {"class": "product-card-seller-info__spec-item-copy"})
    for count, span in enumerate(reversed(seller_spans)):
        car["seller" + str(count)] = span.text.strip()
    profile_links = article.find_all("a", {"class": "product-card-seller-info__review-count dealer-profile-link", "href": True})
    for count, link in enumerate(profile_links):
        car["href" + str(count)] = link["href"]

    # The key specs are in a bulleted list; each bullet is classified by the keywords it contains
    for spec_li in article.find("ul", {"class": "listing-key-specs"}).find_all("li"):
        spec = spec_li.text

        if any(keyword in spec for keyword in keywords["mileage"]):
            car["mileage"] = int(spec[:spec.find(" miles")].replace(",", ""))
        elif any(keyword in spec for keyword in keywords["BHP"]):
            car["BHP"] = spec
        elif any(keyword in spec for keyword in keywords["transmission"]):
            car["transmission"] = spec
        elif any(keyword in spec for keyword in keywords["fuel"]):
            car["fuel"] = spec
        elif any(keyword in spec for keyword in keywords["owners"]):
            car["owners"] = int(spec[:spec.find(" own")])
        elif any(keyword in spec for keyword in keywords["body"]):
            car["body"] = spec
        elif any(keyword in spec for keyword in keywords["ULEZ"]):
            car["ULEZ"] = spec
        elif any(keyword in spec for keyword in keywords["year"]):
            car["year"] = spec
        # Engine size looks like "1.6L"; the length check protects very short bullets
        elif len(spec) > 3 and spec[1] == "." and spec[3] == "L":
            car["engine"] = spec

    # Set any missing dictionary keys to NA for complete car details.
    for key in keywords:
        car.setdefault(key, 'NA')

    return car

In [6]:
# Sample function calls. In practice these were looped over years 1990-2022, scraping a single year at a time.

# ucars, extra_used_car_list = get_used_cars(min_year=1995, max_year=2021, verbose=True)
# uecars, extra_used_ecar_list = get_used_electric_cars(min_year=2005, max_year=2021, verbose=True)
# ncars, extra_new_car_list = get_new_cars(verbose=True)
# necars, extra_new_ecar_list = get_new_electric_cars(verbose=True)

No data cleaning is done in this notebook. It is purely here to define the functions which would be used to scrape the data. Note that due to the size of the dataset it will not be possible to include it in the GitHub repo.