# Now that we've saved each URL from the sitemap, visit each sitemap URL and collect the product detail page links for up to `MAX_PRODUCTS_PER_PAGE` products per page

In [1]:
pip install chromedriver-py

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from chromedriver_py import binary_path # this will get you the path variable
from selenium.webdriver.common.by import By
import pandas as pd

In [4]:
# Category-page URLs to visit, read from the 'sitemap_urls' CSV file
sitemap_urls = pd.read_csv('sitemap_urls')

# Distinct product links gathered while crawling
all_product_links = set()

# Sitemap URLs for which no product links could be collected.
# Appliance pages are expected to land here: their HTML structure differs.
errors = set()

# Upper bound on the number of product links taken from a single page
MAX_PRODUCTS_PER_PAGE = 12


# Selenium configs: a headless Chrome session with a fixed window size
options = Options()
for chrome_flag in ('--headless=new', '--window-size=1920,1200'):
    options.add_argument(chrome_flag)
service_object = Service(binary_path)
driver = webdriver.Chrome(service=service_object, options=options)

In [5]:
# Visit every sitemap URL and collect up to MAX_PRODUCTS_PER_PAGE product links
# from each page. Pages that fail to load, or that expose no matching links, are
# recorded in `errors` (and printed) instead of aborting the whole crawl.

# XPath matching the anchors that lead to a product detail page
xpath_products_page = "//div[contains(@class, 'browse-search__pod')]/div/a"

sitemap_length = sitemap_urls.shape[0]

try:
    for count, url in enumerate(sitemap_urls['url']):
        if count % 100 == 0:
            # Scale to 0-100 so the printed value really is a percentage
            print('Progress: {:.1f}%'.format(100 * count / sitemap_length))

        # NOTE(review): no explicit wait is performed after driver.get(); if the
        # product grid is rendered late by JavaScript the lookup below may run
        # too early. Consider a WebDriverWait on xpath_products_page - confirm.
        try:
            driver.get(url)
            product_link = driver.find_elements(By.XPATH, xpath_products_page)
        except Exception:
            # A single bad page (timeout, navigation error, ...) must not stop the crawl
            errors.add(url)
            print(url)
            continue

        if not product_link:
            errors.add(url)
            print(url)
            continue

        # Slicing caps the count and is safe when fewer links are present
        for link in product_link[:MAX_PRODUCTS_PER_PAGE]:
            try:
                href = link.get_attribute('href')
            except Exception:
                # e.g. the element went stale; remember the page but keep going.
                # `Exception` (not a bare except) lets KeyboardInterrupt through.
                errors.add(url)
                continue

            # get_attribute returns None when the anchor has no href; skip those
            if href:
                all_product_links.add(href)
finally:
    # Always release the browser, even if the crawl is interrupted
    driver.quit()
        


Progress: 0.0%
https://www.homedepot.com/b/Appliances-Dishwashers/N-5yc1vZc3po
https://www.homedepot.com/b/Appliances-Ranges/N-5yc1vZc3o9
https://www.homedepot.com/b/Appliances-Range-Hoods/N-5yc1vZc3nk
https://www.homedepot.com/b/Appliances-Freezers-Ice-Makers-Ice-Makers/N-5yc1vZc3pw
https://www.homedepot.com/b/Appliances-Mini-Refrigerators/N-5yc1vZc4mo
https://www.homedepot.com/b/Appliances-Refrigerators/N-5yc1vZc3pi
https://www.homedepot.com/b/Appliances-Washers-Dryers/N-5yc1vZc3ol
https://www.homedepot.com/b/Appliances-Wall-Ovens/N-5yc1vZc3nq
https://www.homedepot.com/b/Automotive-Car-Cleaning-Supplies/Car-Cleaning-Kit/N-5yc1vZc8neZ1z1uryc
https://www.homedepot.com/b/Automotive-Truck-Accessories-Tonneau-Cover/N-5yc1vZ2fkok9f
https://www.homedepot.com/b/Bath-Bathroom-Accessories/N-5yc1vZcfvt
https://www.homedepot.com/b/Bath-Bathroom-Faucets/N-5yc1vZbreo
https://www.homedepot.com/b/Bath-Bathroom-Sinks/N-5yc1vZbzb0
https://www.homedepot.com/b/Bath-Bathroom-Vanities/N-5yc1vZcfv3
https:/

https://www.homedepot.com/b/Furniture/N-5yc1vZc7pc
https://www.homedepot.com/b/Furniture-Bedroom-Furniture-Mattresses/N-5yc1vZc7oe
Progress: 0.8810572687224669%
https://www.homedepot.com/b/Home-Decor-Wall-Decor/N-5yc1vZar8x
https://www.homedepot.com/b/Home-Decor-Wall-Decor-Wall-Art/N-5yc1vZbbzh
https://www.homedepot.com/b/Home-Decor-Wallpaper/N-5yc1vZbc0q
https://www.homedepot.com/b/Home-Decor-Home-Accents/N-5yc1vZar58
https://www.homedepot.com/b/Home-Decor-Bedding-Bath/N-5yc1vZci04
https://www.homedepot.com/b/Home-Decor-Bedding-Bath-Mattress-Toppers/N-5yc1vZc1ku
https://www.homedepot.com/b/Home-Decor-Bedding-Bath-Towels/N-5yc1vZcfva
https://www.homedepot.com/b/Appliances-Small-Kitchen-Appliances/N-5yc1vZbv48
https://www.homedepot.com/b/Appliances-Small-Kitchen-Appliances-Blenders/N-5yc1vZbv6y
https://www.homedepot.com/b/Appliances-Small-Kitchen-Appliances-Coffee-Espresso/N-5yc1vZbv2f
https://www.homedepot.com/b/Appliances-Small-Kitchen-Appliances-Cookers/N-5yc1vZ1z18gdp
https://www.ho

In [6]:
# Number of distinct product links collected (all_product_links is a set)
len(all_product_links)

3272

In [12]:
# Number of distinct sitemap URLs recorded in the errors set
len(errors)

178

In [8]:
# Write the collected product links to the 'product_urls' CSV file (single 'url' column).
# Sorting makes the row order reproducible across runs (set iteration order is
# not stable), and falsy entries (e.g. a missing href stored as None) are
# dropped so that every row is an actual URL.
product_urls = pd.DataFrame(
    sorted(link for link in all_product_links if link),
    columns=['url'],
)
product_urls.to_csv('product_urls', index=False)

In [11]:
# Write the failed sitemap URLs to the 'error_urls' CSV file (single 'url' column).
# Sorting makes the row order reproducible across runs (set iteration order is
# not stable); key=str keeps the sort safe if a non-string value slipped in.
error_urls = pd.DataFrame(sorted(errors, key=str), columns=['url'])
error_urls.to_csv('error_urls', index=False)