Aim: to scrape all product names and prices for women's fashion from the Farmers.co.nz site

Start with just one page - the first page for women's tops.

In [1]:
import requests 
from bs4 import BeautifulSoup 
import pandas as pd 

# First results page of the women's tops category
url = "https://www.farmers.co.nz/women/fashion/tops"


def _text_or_na(tag):
    """Return the stripped text of *tag*, or "N/A" when the tag was not found."""
    return tag.text.strip() if tag else "N/A"


# Fetch the page
response = requests.get(url)

if response.status_code != 200:
    # Anything other than HTTP 200 is reported as a failure and nothing is written
    print("Failed to retrieve the webpage.")
else:
    # Build a parse tree from the raw response bytes
    soup = BeautifulSoup(response.content, "html.parser")

    # Each product listing is a <div class="product-tile">
    products = soup.find_all("div", class_="product-tile")

    # Parallel lists: one entry per product tile, in page order
    product_names = []
    product_prices = []

    for product in products:
        # Tag and class names below were found by inspecting the page HTML
        name_tag = product.find("span", class_="product-title-span")
        price_tag = product.find("div", class_="current-price")

        # A tile missing either element still yields a row, filled with "N/A"
        name = _text_or_na(name_tag)
        price = _text_or_na(price_tag)

        product_names.append(name)
        product_prices.append(price)

    # Assemble the two columns into a table
    columns = {
        "Product Name": product_names,
        "Product Price": product_prices,
    }
    df = pd.DataFrame(columns)

    # Write the table to disk without the DataFrame index column
    df.to_csv("farmers_women_tops_p1.csv", index=False)

    print("Data has been written to farmers_women_tops_p1.csv")

Data has been written to farmers_women_tops_p1.csv


That got 24 prices (one page). The next step is to extend the scrape across all pages for women's tops.

Call this new file farmers_women_tops.csv

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Category landing page; the per-page URLs are built from it below
base_url = "https://www.farmers.co.nz/women/fashion/tops"

# Accumulators shared across every page (parallel lists, one entry per product)
product_names = []
product_prices = []

# Pages are numbered 0 to 14 for this category, hence range(15)
for page_num in range(15):
    # Page URL pattern specific to the Farmers.co.nz site
    url = f"{base_url}/Page-{page_num}-SortingAttribute-SortBy-asc"

    response = requests.get(url)

    # Report a failed page and move on; the remaining pages are still scraped
    if response.status_code != 200:
        print(f"Failed to retrieve page {page_num}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # Each product listing is a <div class="product-tile"> (found by inspecting html)
    products = soup.find_all("div", class_="product-tile")

    for product in products:
        # Tag and class names found by inspecting html
        name_tag = product.find("span", class_="product-title-span")
        price_tag = product.find("div", class_="current-price")

        # A tile missing either element still yields a row, filled with "N/A"
        name = name_tag.text.strip() if name_tag else "N/A"
        price = price_tag.text.strip() if price_tag else "N/A"

        product_names.append(name)
        product_prices.append(price)

# Assemble the collected columns into a table
columns = {
    "Product Name": product_names,
    "Product Price": product_prices,
}
df = pd.DataFrame(columns)

# Write the table to disk without the DataFrame index column
df.to_csv("farmers_women_tops.csv", index=False)

print("Data has been written to farmers_women_tops.csv")

Data has been written to farmers_women_tops.csv


Successfully scraped all 348 women's tops.

Next, work out how to loop across categories (e.g. 'new arrivals', 'dresses', 'tops', ...). The first step is to get the URL for each category.

In [4]:
# Landing page that links to every women's fashion category
url = "https://www.farmers.co.nz/women/fashion"

# Send a GET request to the URL; the timeout stops the cell from hanging
# indefinitely if the server never responds
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Each category is an <a class="category-list-image"> link
    categories = soup.find_all("a", class_="category-list-image")

    # Collect the link target of every category anchor. Anchors without an
    # href are skipped rather than recorded as a placeholder, so the list only
    # ever holds values that can be requested. A separate loop name is used so
    # that `url` keeps pointing at the landing page.
    category_urls = [
        category["href"] for category in categories if category.get("href")
    ]

    print("category_urls list has been created")
else:
    print("Failed to retrieve the webpage.")

category_urls list has been created


Work out how to get the number of pages as a variable so it doesn't have to be hard-coded as above for tops (which had pages 0-14)

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL of the category whose page count is wanted
base_url = "https://www.farmers.co.nz/women/fashion/dresses"

# Send a GET request to the URL; the timeout stops the cell from hanging
# indefinitely if the server never responds
response = requests.get(base_url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # The pagination labels are <span class="pagination-hide"> elements;
    # the last one is taken as the final page
    pagenum_tag = soup.find_all("span", class_="pagination-hide")

    if not pagenum_tag:
        # No pagination spans on the page: indexing [-1] would raise
        # IndexError, so report zero pages instead
        lastpage_num = 0
    else:
        lastpage = pagenum_tag[-1].text.strip()

        # The first two characters of the label are dropped before converting;
        # presumably a fixed non-numeric prefix on this site - TODO confirm
        lastpage_num = int(lastpage[2:])

    print("retrieved number of pages")
else:
    print("Failed to retrieve the webpage.")

retrieved number of pages


Now combine the category loop above with the code that scrapes every page of a category (after determining the number of pages), so that all categories are scraped.
Also add a variable that records the date and time of the scrape.

In [None]:
#import requests
#from bs4 import BeautifulSoup
#import pandas as pd
from datetime import datetime # need this new module

# Landing page that links to every women's fashion category
url = "https://www.farmers.co.nz/women/fashion"

# Seconds to wait on any single request before giving up, so a stalled
# connection cannot hang the whole scrape
REQUEST_TIMEOUT = 30

# Start from an empty list so that a failed landing-page request cannot leave
# the category loop running on a stale (or undefined) list from an earlier cell
category_urls = []

# Send a GET request to the URL
response = requests.get(url, timeout=REQUEST_TIMEOUT)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Each category is an <a class="category-list-image"> link
    categories = soup.find_all("a", class_="category-list-image")

    # Loop through the categories and extract their URLs. Anchors without an
    # href are skipped: a placeholder value could not be requested later.
    for category in categories:
        href = category.get("href")
        if href:
            category_urls.append(href)

    print("category_urls list has been created")
else:
    print("Failed to retrieve the webpage.")

# now run a loop across the list of categories (dresses, tops etc )

# Lists to store the product data (parallel lists, one entry per product)
product_names = []
product_prices = []
product_urls = []  # get the full url for now can transform at later stage
scrape_times = []

for category_url in category_urls:

    # Base URL for the product category
    base_url = category_url

    # Send a GET request to the category landing page
    response = requests.get(base_url, timeout=REQUEST_TIMEOUT)

    # Skip the category when its landing page cannot be fetched. Falling
    # through would reuse lastpage_num from the previous category (or fail
    # with NameError on the first one).
    if response.status_code != 200:
        print("Failed to retrieve the webpage.")
        continue

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the number of pages to be iterated through
    pagenum_tag = soup.find_all("span", class_="pagination-hide")
    if not pagenum_tag:
        # NOTE(review): with no pagination spans the category is treated as
        # having zero pages, so nothing is scraped for it - confirm whether
        # such categories hold a single page of products that should be kept
        lastpage_num = 0
    else:
        lastpage = pagenum_tag[-1].text.strip()
        # The first two characters of the label are dropped before converting;
        # presumably a fixed non-numeric prefix on this site - TODO confirm
        lastpage_num = int(lastpage[2:])

    print(f"Number of pages is {lastpage_num}")

    for page_num in range(lastpage_num):
        # Modify the URL to include the page number (pattern specific to the
        # Farmers.co.nz site)
        page_url = f"{base_url}/Page-{page_num}-SortingAttribute-SortBy-asc"

        # Send a GET request to the page
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)

        # Report a failed page and move on to the next one
        if response.status_code != 200:
            print(f"Failed to retrieve page {page_num}")
            continue

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Find the product listings
        products = soup.find_all("div", class_="product-tile")

        # Loop through the product listings and extract the data
        for product in products:
            name_tag = product.find("span", class_="product-title-span")
            price_tag = product.find("div", class_="current-price")

            # Extract and clean the product name
            name = name_tag.text.strip() if name_tag else "N/A"

            # Extract and clean the product price
            price = price_tag.text.strip() if price_tag else "N/A"

            # get time of scrape
            scrape_time = datetime.today().strftime('%Y-%m-%d %H:%M:%S')

            # Append the data to the lists
            product_names.append(name)
            product_prices.append(price)
            product_urls.append(base_url)
            scrape_times.append(scrape_time)

    # Create a DataFrame from everything collected so far. This runs once per
    # category, so the CSV on disk is rewritten after each category and holds
    # all categories scraped up to that point.
    df = pd.DataFrame({
        "Product Name": product_names,
        "Product Price": product_prices,
        "Product Url": product_urls,  # note this is actually the category URL (eg 'tops') so will help with categorisation
        "Scrape Time": scrape_times,
    })

    # Save the DataFrame to a CSV file
    df.to_csv("farmers_womens_fashion.csv", index=False)

    print(f"Data has been written to farmers_womens_fashion.csv for page {category_url}")