In [20]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-dev-shm-usage')

# Initialize WebDriver with the specified options using webdriver-manager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Base URL of the Nordstrom women's clothing section
base_url = 'https://www.nordstrom.com/browse/women/clothing?breadcrumb=Home%2FWomen%2FClothing&origin=topnav&page='

# CSV file to store product details
csv_file = '/Users/habibi/Desktop/nordstorm_DS/nordstrom_products.csv'

# Column headers for the CSV file
headers = ["Name", "Brand", "Price", "Image URL", "Product URL", "Star Rating", "Number of Reviews"]

# Write the headers to the CSV file
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()

# Number of pages to scrape
num_pages = 500  # Adjust this to the number of pages you want to scrape

for page in range(1, num_pages + 1):
    # Navigate to the URL
    url = base_url + str(page)
    driver.get(url)

    # Wait until the articles are loaded
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//html/body/div[1]/div[2]/main/div/section/div/div/div[2]/div/section/div/article[1]'))
    )

    # Adding a delay to ensure the page is fully loaded
    time.sleep(5)

    # Get the page source and pass it to BeautifulSoup
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all product articles using BeautifulSoup
    articles = soup.find_all('article', class_='zzWfq RpUx3')

    # Extract and print product data
    for article in articles:
        try:
            name_element = article.find('h3', class_='kKGYj Y9bA4').find('a')
            name = name_element.text.strip()
            product_url = "https://www.nordstrom.com" + name_element['href']

            brand_element = article.find('div', class_='KtWqU jgLpg Y9bA4 Io521')
            brand = brand_element.text.strip()
            
            price_element = article.find('span', class_='qHz0a EhCiu dls-ihm460')
            price = price_element.text.strip()
            
            image_element = article.find('img', {'name': 'product-module-image'})
            image_url = image_element['src']

            star_rating_element = article.find('span', class_='T2Mzf', role='img')
            star_rating = star_rating_element['aria-label'].strip() if star_rating_element else 'No rating'
            
            num_reviews_element = article.find('span', class_='HZv8u')
            num_reviews = num_reviews_element.text.strip() if num_reviews_element else 'No reviews'

            # Product details
            product = {
                "Name": name,
                "Brand": brand,
                "Price": price,
                "Image URL": image_url,
                "Product URL": product_url,
                "Star Rating": star_rating,
                "Number of Reviews": num_reviews
            }

            # Append product details to the CSV file immediately
            with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(file, fieldnames=headers)
                writer.writerow(product)

            # Print the product information
            print(f"Name: {name}")
            print(f"Brand: {brand}")
            print(f"Price: {price}")
            print(f"Image URL: {image_url}")
            print(f"Product URL: {product_url}")
            print(f"Star Rating: {star_rating}")
            print(f"Number of Reviews: {num_reviews}")
            print('-' * 40)
        except Exception as e:
            print(f"Failed to scrape a product entry: {e}")

# Close the Selenium browser
driver.quit()

print(f"Data saved to {csv_file}")


Failed to scrape a product entry: 'NoneType' object has no attribute 'text'
Failed to scrape a product entry: 'NoneType' object has no attribute 'text'
Name: Total Eclipse Oversize T-Shirt
Brand: Free People
Price: $68.00
Image URL: https://n.nordstrommedia.com/id/sr3/a61d5dfe-943f-442e-8e79-30787fd2d780.jpeg?h=365&w=240&dpr=2
Product URL: https://www.nordstrom.com/s/free-people-total-eclipse-oversize-t-shirt/7789715?origin=category-personalizedsort&breadcrumb=Home%2FWomen%2FClothing&color=001
Star Rating: 5 out of 5 stars
Number of Reviews: (1)
----------------------------------------
Name: Tiah Playsuit Lingerie
Brand: Adore Me
Price: $54.95
Image URL: https://n.nordstrommedia.com/id/sr3/266d5ebb-5cdc-4838-a14a-241884ae2230.jpeg?h=365&w=240&dpr=2
Product URL: https://www.nordstrom.com/s/7874625?origin=category-personalizedsort&breadcrumb=Home%2FWomen%2FClothing&color=001
Star Rating: No rating
Number of Reviews: No reviews
----------------------------------------
Failed to scrape a p

TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x0000000102b2b848 chromedriver + 5179464
1   chromedriver                        0x0000000102b2327a chromedriver + 5145210
2   chromedriver                        0x000000010269a2b0 chromedriver + 389808
3   chromedriver                        0x00000001026e63e1 chromedriver + 701409
4   chromedriver                        0x00000001026e6691 chromedriver + 702097
5   chromedriver                        0x0000000102728464 chromedriver + 971876
6   chromedriver                        0x00000001027087dd chromedriver + 841693
7   chromedriver                        0x0000000102725b9b chromedriver + 961435
8   chromedriver                        0x0000000102708553 chromedriver + 841043
9   chromedriver                        0x00000001026d97f6 chromedriver + 649206
10  chromedriver                        0x00000001026da05e chromedriver + 651358
11  chromedriver                        0x0000000102aeeab0 chromedriver + 4930224
12  chromedriver                        0x0000000102af39c6 chromedriver + 4950470
13  chromedriver                        0x0000000102af4095 chromedriver + 4952213
14  chromedriver                        0x0000000102ad0e79 chromedriver + 4808313
15  chromedriver                        0x0000000102af4389 chromedriver + 4952969
16  chromedriver                        0x0000000102ac27d4 chromedriver + 4749268
17  chromedriver                        0x0000000102b13558 chromedriver + 5080408
18  chromedriver                        0x0000000102b13717 chromedriver + 5080855
19  chromedriver                        0x0000000102b22e5e chromedriver + 5144158
20  libsystem_pthread.dylib             0x00007ff8065b1202 _pthread_start + 99
21  libsystem_pthread.dylib             0x00007ff8065acbab thread_start + 15


In [2]:
import time
import csv
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium_stealth import stealth

# List of US proxies extracted from the provided image
proxies = [
    "http://155.94.241.134:3128",
    "http://162.223.90.130:80",
    "http://162.223.89.34:80",
    "http://198.199.83.163:80"
    
    
]

def get_driver(proxy=None, headless=False):
    # Set up Chrome options
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")  # Ensure GUI is off
    # chrome_options.add_argument("--no-sandbox")
    # chrome_options.add_argument('--disable-gpu')
    # chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument('--start-maximized')
    # chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    # chrome_options.add_argument(f'user-agent={UserAgent().random}')
    
    if proxy:
        chrome_options.add_argument(f'--proxy-server={proxy}')
    
    # Initialize WebDriver with the specified options using webdriver-manager
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    # Apply stealth settings
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )

    # Additional settings from Intoli article to avoid headless detection
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            window.navigator.chrome = {
                runtime: {},
                // Add any other properties here as needed
            };
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });
        '''
    })

    return driver

def write_to_csv(csv_file, headers, data):
    with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writerow(data)

def scrape_page(page, csv_file, headers, proxy=None, headless=False):
    driver = get_driver(proxy, headless)
    try:
        # Navigate to the URL
        url = f'https://www.nordstrom.com/browse/women/clothing?breadcrumb=Home%2FWomen%2FClothing&origin=topnav&page={page}'
        driver.get(url)

        # Wait until the articles are loaded
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.XPATH, '//article[contains(@class, "zzWfq RpUx3")]'))
        )

        # Adding a delay to ensure the page is fully loaded
        time.sleep(random.uniform(5, 10))

        # Get the page source and pass it to BeautifulSoup
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all product articles using BeautifulSoup
        articles = soup.find_all('article', class_='zzWfq RpUx3')

        # Extract and append product data to the CSV file one by one
        for article in articles:
            try:
                name_element = article.find('h3', class_='kKGYj Y9bA4').find('a')
                name = name_element.text.strip()
                product_url = "https://www.nordstrom.com" + name_element['href']

                brand_element = article.find('div', class_='KtWqU jgLpg Y9bA4 Io521')
                brand = brand_element.text.strip()
                
                price_element = article.find('span', class_='qHz0a EhCiu dls-ihm460')
                price = price_element.text.strip()
                
                image_element = article.find('img', {'name': 'product-module-image'})
                image_url = image_element['src']

                star_rating_element = article.find('span', class_='T2Mzf', role='img')
                star_rating = star_rating_element['aria-label'].strip() if star_rating_element else 'No rating'
                
                num_reviews_element = article.find('span', class_='HZv8u')
                num_reviews = num_reviews_element.text.strip() if num_reviews_element else 'No reviews'

                # Product details
                product = {
                    "Name": name,
                    "Brand": brand,
                    "Price": price,
                    "Image URL": image_url,
                    "Product URL": product_url,
                    "Star Rating": star_rating,
                    "Number of Reviews": num_reviews
                }

                # Write product details to the CSV file immediately
                write_to_csv(csv_file, headers, product)

                # Print the product information
                print(f"Page {page} - Name: {name}")
            except Exception as e:
                print(f"Failed to scrape a product entry on page {page}: {e}")

    except Exception as e:
        print(f"Failed to load page {page} with proxy {proxy}: {e}")
        driver.save_screenshot(f'error_page_{page}.png')
        return False
    finally:
        driver.quit()
    return True

# CSV file to store product details
csv_file = 'nordstrom_products_fast.csv'

# Column headers for the CSV file
headers = ["Name", "Brand", "Price", "Image URL", "Product URL", "Star Rating", "Number of Reviews"]

# Write the headers to the CSV file
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()

# Number of pages to scrape
num_pages = 500  # Adjust this to the number of pages you want to scrape

def scrape_pages_with_retries(page):
    for _ in range(3):  # Try up to 3 different proxies
        proxy = random.choice(proxies)  # Select a random proxy for each page
        try:
            if scrape_page(page, csv_file, headers, proxy, headless=False):
                return
        except Exception as e:
            print(f"Error scraping page {page} with proxy {proxy}: {e}")
            continue
    print(f"Skipping page {page} after 3 failed attempts.")

# Use ThreadPoolExecutor to scrape pages in parallel
with ThreadPoolExecutor(max_workers=3) as executor:  # Limiting to 3 threads for stability
    future_to_page = {executor.submit(scrape_pages_with_retries, page): page for page in range(1, num_pages + 1)}
    for future in as_completed(future_to_page):
        page = future_to_page[future]
        try:
            future.result()
        except Exception as e:
            print(f"Page {page} generated an exception: {e}")

print(f"Data saved to {csv_file}")


Failed to load page 1 with proxy http://155.94.241.134:3128: Message: unknown error: net::ERR_TUNNEL_CONNECTION_FAILED
  (Session info: chrome=127.0.6533.89)
Stacktrace:
0   chromedriver                        0x0000000103351848 chromedriver + 5179464
1   chromedriver                        0x000000010334927a chromedriver + 5145210
2   chromedriver                        0x0000000102ec02b0 chromedriver + 389808
3   chromedriver                        0x0000000102eb821e chromedriver + 356894
4   chromedriver                        0x0000000102ea8d48 chromedriver + 294216
5   chromedriver                        0x0000000102eaa8a4 chromedriver + 301220
6   chromedriver                        0x0000000102ea9002 chromedriver + 294914
7   chromedriver                        0x0000000102ea8898 chromedriver + 293016
8   chromedriver                        0x0000000102ea881a chromedriver + 292890
9   chromedriver                        0x0000000102ea6d1b chromedriver + 285979
10  chromedriver  

Proxy http://155.94.241.134:3128 is valid.
Proxy http://50.175.212.77:80 is not valid.
Proxy http://50.168.72.116:80 is not valid.
Proxy http://162.223.90.130:80 is valid.
Proxy http://162.223.89.34:80 is valid.
Proxy http://50.145.24.180:80 is not valid.
Proxy http://50.168.72.112:80 is not valid.
Proxy http://50.223.239.175:80 is not valid.
Proxy http://50.223.239.177:80 is not valid.
Proxy http://50.171.187.50:80 is not valid.
Proxy http://50.223.239.168:80 is not valid.
Proxy http://50.223.239.194:80 is not valid.
Proxy http://50.171.187.51:80 is not valid.
Proxy http://50.169.135.10:80 is not valid.
Proxy http://198.199.83.163:80 is valid.
Valid proxies:
http://155.94.241.134:3128
http://162.223.90.130:80
http://162.223.89.34:80
http://198.199.83.163:80


[]