In [1]:
from playwright.async_api import async_playwright
import asyncio
from openpyxl import load_workbook

In [2]:
# Landing page of the site being scraped; every scraper in this notebook starts here.
URL = "https://www.iseecars.com/"

### Function to scrape data for one make and model

In [None]:
async def get_data(make, model, zip_code='08550'):
    """Scrape every result page for one make/model into ./CarData.xlsx.

    Launches a headless Chromium, opens ``URL``, fills in the search form,
    widens the radius to "all" and then follows the "Next »" link until it is
    no longer present.  The active sheet of ./CarData.xlsx is emptied and
    rewritten with one header row followed by one row per listing.

    Args:
        make: Option to select in the ``#make`` dropdown.
        model: Option to select in the ``#model`` dropdown.
        zip_code: ZIP code typed into the search form.  The original code
            passed the builtin ``zip`` here, which is not a string; the
            default matches the ZIP used by the other scraper in this notebook.

    Returns:
        None.  Errors are printed, not raised; whatever was collected before
        the error is still saved.
    """
    workbook = None  # stays None when we fail before the sheet is opened
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                # open website
                context = await browser.new_context()
                page = await context.new_page()
                await page.goto(URL, wait_until="networkidle")
                await page.wait_for_load_state("load")
                # enter car information and search
                await page.locator('#make').select_option(make)
                await page.locator('#model').select_option(model)
                zip_input = page.get_by_placeholder("ZIP Code")
                await zip_input.click()
                await zip_input.press_sequentially(zip_code)
                await page.get_by_role("button", name="SEARCH").click()
                await page.wait_for_load_state("load")
                await asyncio.sleep(5)
                await page.get_by_label('Radius').select_option('all')
                # prepare excel sheet: wipe it and write the header row
                workbook = load_workbook('./CarData.xlsx')
                sheet = workbook.active
                sheet.delete_rows(1, sheet.max_row)
                sheet.append(['year_make_model_mileage' + make, 'price' + model])
                # walk the result pages until there is no "Next »" link left
                while True:
                    car_listings = await page.locator('article.article-search-result').all()
                    if not car_listings:
                        print("No Cars Found")
                        break
                    for listing in car_listings:
                        year_make_model_mileage = await listing.locator(
                            'h3 span.detailLink span').text_content(timeout=5000)
                        price = await listing.locator('div.col3 h4').text_content(timeout=5000)
                        # handle when price isnt in its usual location
                        if not price:
                            price = await listing.locator('ul.result-items').inner_text()
                        sheet.append([year_make_model_mileage, price])
                    next_button = page.get_by_text("Next »")
                    if await next_button.count() == 0:
                        break
                    await next_button.click()
                    await asyncio.sleep(5)
            finally:
                # closing the browser also disposes of its contexts and pages
                await browser.close()
    except Exception as e:
        print(e)
    finally:
        # keep partial results, but only if the workbook was actually opened
        if workbook is not None:
            workbook.save('./CarData.xlsx')
        print("done")


### Clear excel file

In [None]:
# Empty the active sheet of ./CarData.xlsx: delete rows 1..max_row
# and save the workbook back to the same path.
WB = load_workbook('./CarData.xlsx')
WS = WB.active
WS.delete_rows(1, WS.max_row)
WB.save('./CarData.xlsx')

### Parallel Processing to scrape all models

In [None]:
# NOTE: get_car_makes_models is defined in a later cell of this notebook
# (under "Iteratively scape all data"); that cell must be run before this one.
makes_models = await get_car_makes_models()

In [None]:
# Dump the make -> models mapping: the make, then one model per line,
# followed by a blank-line separator.
for make, models in makes_models.items():
    print(make, *models, sep='\n')
    print('\n')

In [None]:
async def get_make_model_data(browser, make, model, max_cars=100):
    """Scrape listing details for one make/model and append them to ./CarData.xlsx.

    NOTE: a later cell in this notebook defines a function with the same name;
    whichever cell ran last is the one that gets called.

    Opens a fresh context on ``browser``, searches ``URL`` for ``make``/``model``
    around ZIP 08550 with the radius set to "all", and pages through the results
    until at least ``max_cars`` listings were read or no "Next »" link remains.

    Rows are buffered in memory and written at the very end in a single
    load/append/save step that contains no ``await``.  The original loaded the
    workbook up front and saved it at the end, so tasks run concurrently via
    ``asyncio.gather`` each saved their own stale copy over the others' rows.

    Args:
        browser: A launched Playwright browser to create the context on.
        make: Option to select in the ``#make`` dropdown.
        model: Option to select in the ``#model`` dropdown.
        max_cars: Stop paging once at least this many listings were collected
            (a page is always read in full, so the total may exceed it).

    Returns:
        None.  Scraping errors are printed, not raised; rows collected before
        the error are still written.

    Raises:
        Whatever openpyxl raises if ./CarData.xlsx cannot be loaded or saved.
    """
    print("get_make_model_data")
    rows = []
    context = None  # stays None if the context could not be created
    try:
        # open website
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(URL, wait_until="networkidle")
        await page.wait_for_load_state("load")
        # enter car information and search
        await page.locator('#make').select_option(make)
        await page.locator('#model').select_option(model)
        zip_input = page.get_by_placeholder("ZIP Code")
        await zip_input.click()
        await zip_input.press_sequentially('08550')
        await page.get_by_role("button", name="SEARCH").click()
        await page.wait_for_load_state("load")
        await asyncio.sleep(5)
        await page.get_by_label('Radius').select_option('all')
        print(make, ":", model)
        while True:
            # get the divs of the listings
            car_listings = await page.locator('article.article-search-result').all()
            if not car_listings:
                print("No Cars Found")
                return
            for listing in car_listings:
                full_info = await listing.locator(
                    'div.additional-info-content > div > b').first.inner_text()
                # One spreadsheet row; key order is the column order.
                # "Trim:" and "Year:" both start out as the listing heading and
                # are only replaced if the listing has headers with those labels.
                curr_car = {
                    "Make:": make,
                    "Model:": model,
                    "Trim:": full_info,
                    "Year:": full_info,
                    "Location:": None,
                    "Price:": None,
                    "Mileage:": None,
                    "Transmission:": None,
                    "Exterior Color:": None,
                    "Engine:": None,
                    "Fuel Type:": None,
                    "MPG:": None,
                    "Seats:": None,
                    "Drivetrain:": None,
                    "VIN:": None,
                }
                info_headers = await listing.locator('div.additional-info-content-column > b').all()
                info_values = await listing.locator('div.additional-info-content-column > span').all()
                # skip over dealer rating tag, since it doesnt have body text
                headers = []
                for h in info_headers:
                    header_text = await h.inner_text()
                    if header_text != 'Dealer Rating:':
                        headers.append(header_text)
                # fill in attributes; unknown headers are ignored
                for header, value in zip(headers, info_values):
                    if header in curr_car:
                        curr_car[header] = await value.inner_text()
                rows.append(list(curr_car.values()))
            if len(rows) >= max_cars:
                break
            # if no next button then there are no more listings
            next_button = page.get_by_text("Next »")
            if await next_button.count() == 0:
                break
            await next_button.click()
            await asyncio.sleep(3)
    except Exception as e:
        print(e)
        print("error, done")
    finally:
        try:
            if rows:
                # no await between load and save, so concurrent tasks cannot interleave here
                workbook = load_workbook('./CarData.xlsx')
                sheet = workbook.active
                for row in rows:
                    sheet.append(row)
                workbook.save('./CarData.xlsx')
        finally:
            if context is not None:
                # closing the context also closes the page opened on it
                await context.close()
            print("done")


In [None]:
async def scrape_all_makes_models(makes_models, max_concurrency=None):
    """Scrape every make/model pair concurrently on one shared browser.

    Args:
        makes_models: Mapping of make -> iterable of model names.
        max_concurrency: Optional cap on how many make/model scrapes run at the
            same time.  ``None`` (the default) starts all of them at once, as
            the original did.

    Returns:
        None.  Each pair is handed to ``get_make_model_data``.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            limiter = asyncio.Semaphore(max_concurrency) if max_concurrency else None

            async def scrape_one(make, model):
                # Run one scrape, waiting for a free slot when a cap is set.
                if limiter is None:
                    await get_make_model_data(browser, make, model)
                else:
                    async with limiter:
                        await get_make_model_data(browser, make, model)

            await asyncio.gather(*(
                scrape_one(make, model)
                for make, models in makes_models.items()
                for model in models
            ))
        finally:
            # close the browser even when one of the tasks raised
            await browser.close()

In [None]:
# Run the concurrent scrape over every make/model pair in makes_models.
await scrape_all_makes_models(makes_models)

### Iteratively scrape all data

In [3]:
async def get_car_makes_models():
    """Read every make and its models from the search form's dropdowns.

    Opens ``URL`` in Chromium, reads the option texts of ``#make``, then selects
    each make in turn and reads the option texts of ``#model``.

    Returns:
        dict mapping make name -> list of model names.  ``{}`` if anything
        raised along the way (the error is printed).
    """
    option_texts_js = "options => options.map(option => option.textContent.trim())"
    try:
        print("get_car_makes_models")
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            try:
                page = await browser.new_page()
                await page.goto(URL, wait_until="networkidle")
                await page.wait_for_load_state("load")
                # extract car makes from dropdown
                makes = await page.eval_on_selector_all("#make option", option_texts_js)
                makes_models = {}
                # The first option of each dropdown is dropped (presumably a
                # placeholder entry - confirm against the page).  Slicing is
                # used instead of pop(0) so that an empty dropdown yields an
                # empty list rather than an IndexError that would discard
                # every make collected so far.
                for make in makes[1:]:
                    await page.locator('#make').select_option(make)
                    models_for_make = await page.eval_on_selector_all("#model option", option_texts_js)
                    makes_models[make] = models_for_make[1:]
            finally:
                # closing the browser also closes the page opened on it
                await browser.close()
        print("done")
        return makes_models
    except Exception as e:
        print(e)
        return {}

In [4]:
async def get_make_model_data(browser, make, model, max_cars=75):
    """Scrape listing details for one make/model and append them to ./CarData.xlsx.

    Opens a fresh context on ``browser``, searches ``URL`` for ``make``/``model``
    around ZIP 08550 with the radius set to "all", and pages through the results
    until more than ``max_cars`` listings were read or no "Next »" link remains.

    The original looped on ``while num_cars<=75`` but still carried a stale
    ``if num_cars >= 100`` check, so with 76-99 listings it clicked "Next" and
    slept once more before leaving the loop.  A single check now stops at the
    same point without the wasted navigation; the rows collected are the same.

    Rows are buffered in memory and written at the end in one load/append/save
    step, so cleanup no longer touches names that were never bound when the
    failure happened early (the original ``finally`` raised in that case).

    Args:
        browser: A launched Playwright browser to create the context on.
        make: Option to select in the ``#make`` dropdown.
        model: Option to select in the ``#model`` dropdown.
        max_cars: Stop paging once more than this many listings were collected
            (a page is always read in full).

    Returns:
        None.  Scraping errors are printed, not raised; rows collected before
        the error are still written.

    Raises:
        Whatever openpyxl raises if ./CarData.xlsx cannot be loaded or saved.
    """
    print("get_make_model_data")
    rows = []
    context = None  # stays None if the context could not be created
    try:
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(URL, wait_until="networkidle")
        await page.wait_for_load_state("load")
        # enter car information and search
        await page.locator('#make').select_option(make)
        await page.locator('#model').select_option(model)
        zip_input = page.get_by_placeholder("ZIP Code")
        await zip_input.click()
        await zip_input.press_sequentially('08550')
        await page.get_by_role("button", name="SEARCH").click()
        await page.wait_for_load_state("load")
        await asyncio.sleep(3)
        await page.get_by_label('Radius').select_option('all')
        print(make, ":", model)
        while True:
            # get the divs of the listings
            car_listings = await page.locator('article.article-search-result').all()
            if not car_listings:
                print("No Cars Found")
                return
            for listing in car_listings:
                full_info = await listing.locator(
                    'div.additional-info-content > div > b').first.inner_text()
                # One spreadsheet row; key order is the column order.
                # "Trim:" and "Year:" both start out as the listing heading and
                # are only replaced if the listing has headers with those labels.
                curr_car = {
                    "Make:": make,
                    "Model:": model,
                    "Trim:": full_info,
                    "Year:": full_info,
                    "Location:": None,
                    "Price:": None,
                    "Mileage:": None,
                    "Transmission:": None,
                    "Exterior Color:": None,
                    "Engine:": None,
                    "Fuel Type:": None,
                    "MPG:": None,
                    "Seats:": None,
                    "Drivetrain:": None,
                    "VIN:": None,
                }
                info_headers = await listing.locator('div.additional-info-content-column > b').all()
                info_values = await listing.locator('div.additional-info-content-column > span').all()
                # skip over dealer rating tag, since it doesnt have body text
                headers = []
                for h in info_headers:
                    header_text = await h.inner_text()
                    if header_text != 'Dealer Rating:':
                        headers.append(header_text)
                # fill in attributes; unknown headers are ignored
                for header, value in zip(headers, info_values):
                    if header in curr_car:
                        curr_car[header] = await value.inner_text()
                rows.append(list(curr_car.values()))
            if len(rows) > max_cars:
                break
            # if no next button then there are no more listings
            next_button = page.get_by_text("Next »")
            if await next_button.count() == 0:
                break
            await next_button.click()
            await asyncio.sleep(3)
    except Exception as e:
        print(e)
        print("error, done")
    finally:
        try:
            if rows:
                # load, append and save in one go, with no await in between
                workbook = load_workbook('./CarData.xlsx')
                sheet = workbook.active
                for row in rows:
                    sheet.append(row)
                workbook.save('./CarData.xlsx')
        finally:
            if context is not None:
                # closing the context also closes the page opened on it
                await context.close()
            print("done")


In [5]:
# Fetch the make -> models mapping; an empty dict means the fetch failed
# (get_car_makes_models prints the error and returns {}).
makes_models = await get_car_makes_models()

get_car_makes_models
done


In [None]:
# Show the full make -> models mapping as plain text.
print(makes_models)

In [7]:
# Resume point: the make/model to restart from (inclusive).
last_make = 'Volvo'
last_model = 'S90'

# Positions of the resume point within the full mapping.
make_names = list(makes_models)
start_make_index = make_names.index(last_make)
start_model_index = makes_models[last_make].index(last_model)

# Remaining work: the tail of the resume make's models first,
# then every make that comes after it with its complete model list.
continue_scrape = {last_make: makes_models[last_make][start_model_index:]}
continue_scrape.update(
    (later_make, makes_models[later_make])
    for later_make in make_names[start_make_index + 1:]
)
print(continue_scrape)

{'Volvo': ['S90', 'S90 Recharge', 'V40', 'V50', 'V60', 'V60 Cross Country', 'V60 Recharge', 'V70', 'V70 R', 'V90', 'V90 Cross Country', 'XC', 'XC40', 'XC40 Recharge', 'XC60', 'XC60 Recharge', 'XC70', 'XC90', 'XC90 Recharge'], 'Yugo': ['GV']}


In [None]:
# List what is still to be scraped: each make followed by its models, one per line.
for make, models in continue_scrape.items():
    print(make, *models, sep='\n')

In [8]:
for make,models in continue_scrape.items():
    try:
        async with async_playwright() as p:
            # open website
            browser = await p.chromium.launch(headless=True)
            print("current make: ", make)
            for model in models:
                await get_make_model_data(browser, make,model)  
                #print(model)
    except Exception as e:
        print(e)
        print("error, continue")
        continue
    finally:
        await browser.close()
    
print("finished with all scraping")

current make:  Volvo
get_make_model_data
Volvo : S90
done
get_make_model_data
Volvo : S90 Recharge
done
get_make_model_data
Volvo : V40
done
get_make_model_data
Volvo : V50
done
get_make_model_data
Volvo : V60
done
get_make_model_data
Volvo : V60 Cross Country
done
get_make_model_data
Volvo : V60 Recharge
done
get_make_model_data
Volvo : V70
done
get_make_model_data
Volvo : V70 R
done
get_make_model_data
Volvo : V90
done
get_make_model_data
Volvo : V90 Cross Country
done
get_make_model_data
Volvo : XC
done
get_make_model_data
Volvo : XC40
done
get_make_model_data
Volvo : XC40 Recharge
done
get_make_model_data
Volvo : XC60
done
get_make_model_data
Volvo : XC60 Recharge
done
get_make_model_data
Volvo : XC70
done
get_make_model_data
Volvo : XC90
done
get_make_model_data
Volvo : XC90 Recharge
done
current make:  Yugo
get_make_model_data
Yugo : GV
No Cars Found
done
finished with all scraping
