In [17]:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import os
import sys
from urllib.parse import urljoin

class AdoreBeautyScraper:
    def __init__(self):
        self.base_url = "https://www.adorebeauty.com.au"
        self.skincare_url = "https://www.adorebeauty.com.au/c/skin-care.html"
        # Create a cloudscraper session
        self.scraper = cloudscraper.create_scraper(
            browser={
                'browser': 'chrome',
                'platform': 'windows',
                'mobile': False
            }
        )
        self.product_urls = set()

    def get_product_urls_from_page(self, page_number):
        """Extract product URLs from a single page"""
        url = f"{self.skincare_url}?p={page_number}"
        print(f"Scanning page {page_number}...")
        
        try:
            # Use cloudscraper instead of requests
            response = self.scraper.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find all product containers
            product_containers = soup.find_all('div', class_='relative rounded-md border-[1px] border-[#e1dfdf]')
            
            # Extract URLs from each container
            for container in product_containers:
                product_link = container.find('a', href=True)
                if product_link and '/p/' in product_link['href']:
                    full_url = urljoin(self.base_url, product_link['href'])
                    self.product_urls.add(full_url)
            
            print(f"Found {len(product_containers)} products on page {page_number}")
            return len(product_containers) > 0  # Return True if products were found
            
        except Exception as e:
            print(f"Error processing page {page_number}: {str(e)}")
            return False

    def collect_all_product_urls(self, max_pages=None):
        """Iterate through all pages and collect product URLs"""
        page_number = 1
        
        while True:
            if max_pages and page_number > max_pages:
                print(f"Reached maximum pages limit ({max_pages})")
                break
                
            # Add delay between pages
            if page_number > 1:
                time.sleep(2)
            
            # Get URLs from current page
            found_products = self.get_product_urls_from_page(page_number)
            
            # If no products found or error occurred, stop
            if not found_products:
                print(f"No more products found after page {page_number-1}")
                break
                
            page_number += 1
        
        print(f"\nTotal unique product URLs collected: {len(self.product_urls)}")
        return list(self.product_urls)

    def save_urls_to_file(self, filename="product_urls.txt"):
        """Save collected URLs to a file"""
        filepath = os.path.join("data", "raw", filename)
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        
        with open(filepath, 'w') as f:
            for url in self.product_urls:
                f.write(f"{url}\n")
        
        print(f"Saved {len(self.product_urls)} URLs to {filepath}")

if __name__ == "__main__":
    # Initialize scraper
    scraper = AdoreBeautyScraper()
    
    # Collect URLs (limit to 5 pages for testing)
    scraper.collect_all_product_urls(max_pages=5)
    
    # Save URLs to file
    scraper.save_urls_to_file()

Scanning page 1...
Found 50 products on page 1
Scanning page 2...
Found 50 products on page 2
Scanning page 3...
Found 50 products on page 3
Scanning page 4...
Found 50 products on page 4
Scanning page 5...
Found 50 products on page 5
Reached maximum pages limit (5)

Total unique product URLs collected: 250
Saved 250 URLs to data\raw\product_urls.txt


In [20]:
os.curdir

'.'

In [26]:
target_website = ('https://www.adorebeauty.com.au/c/skin-care.html?p=1')
# target_website = ('https://nudieglow.com/collections')
target_website = 'https://www.adorebeauty.com.au/p/la-roche-posay/la-roche-posay-cicaplast-baume-b5-100ml.html'
request_headers = {
    'referer': 'https://www.scrapingcourse.com/ecommerce/',
    'accept-language': 'en-US,en;q=0.9',
    'content-type': 'application/json',
    'accept-encoding': 'gzip, deflate, br',
    'sec-ch-device-memory': '8',
    'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'sec-ch-ua-platform': "Windows",
    'sec-ch-ua-platform-version': '"10.0.0"',
    'sec-ch-viewport-width': '792',
    'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  }
 
response = requests.get(target_website, headers=request_headers)

In [41]:
target_website = ('https://www.adorebeauty.com.au/c/skin-care.html?p=2')
# target_website = ('https://nudieglow.com/collections')
target_website = 'https://www.adorebeauty.com.au/p/la-roche-posay/la-roche-posay-toleriane-dermo-cleasner.html'
import cloudscraper

scraper = cloudscraper.create_scraper()

soup = BeautifulSoup(scraper.get(target_website).text)

print(soup)

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><meta content="width=device-width,initial-scale=1" name="viewport"/><meta content="px-captcha" name="description"/><title>Access to this page has been denied</title></head><body><script>window._pxVid='';window._pxUuid='cec88c30-0ef2-11f0-a634-b8ea80eb2ea4';window._pxAppId='PX1Dk3430L';window._pxHostUrl='/1Dk3430L/xhr';window._pxCustomLogo='';window._pxJsClientSrc='/1Dk3430L/init.js';window._pxMobile=false;window._pxFirstPartyEnabled=true;var pxCaptchaSrc='/1Dk3430L/captcha/PX1Dk3430L/captcha.js?a=c&u=cec88c30-0ef2-11f0-a634-b8ea80eb2ea4&v=&m=0&b=aHR0cDovL3d3dy5hZG9yZWJlYXV0eS5jb20uYXUvcC9sYS1yb2NoZS1wb3NheS9sYS1yb2NoZS1wb3NheS10b2xlcmlhbmUtZGVybW8tY2xlYXNuZXIuaHRtbA==';var script=document.createElement('script');script.src=pxCaptchaSrc;script.onload=onScriptLoad;script.onerror=onScriptError;var onScriptErrorCalled;document.head.appendChild(script);var timeoutID=setTimeout(onScriptError,5000);function onScriptLoad(){clearTimeo

In [35]:
response

<Response [403]>

In [None]:
# In your notebook or Python console
scraper = AdoreBeautyScraper()
urls = scraper.collect_all_product_urls(max_pages=2)  # Start with 2 pages for testing
print(f"\nFirst few URLs collected:")
for url in list(urls)[:5]:
    print(url)

In [42]:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
import random
from datetime import datetime

class AdoreProductScraper:
    def __init__(self):
        self.scraper = cloudscraper.create_scraper(
        )

    def extract_reviews(self, product_data):
        """Extract reviews from product data into a separate DataFrame"""
        reviews = product_data.get('review', [])
        reviews_data = []
        
        for review in reviews:
            review_data = {
                'review_id': hash(review.get('reviewBody', '')),  # Create unique ID
                'product_sku': product_data.get('sku'),
                'author': review.get('author', {}).get('name'),
                'title': review.get('name'),
                'body': review.get('reviewBody'),
                'rating': review.get('reviewRating', {}).get('ratingValue'),
                'date_published': review.get('datePublished'),
            }
            reviews_data.append(review_data)
            
        return reviews_data

    def get_product_data(self, url):
        """Extract product data from the structured JSON data on the page"""
        try:
            # Add random delay
            time.sleep(random.uniform(1, 3))
            
            # Get the page
            response = self.scraper.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find the script containing product data
            script = soup.find('script', {'id': 'product_structured_data'})
            if not script:
                print(f"No product data found for {url}")
                return None, None
            
            # Parse the JSON data
            product_data = json.loads(script.string)
            
            # Extract product details
            product_details = {
                'url': url,
                'sku': product_data.get('sku'),
                'name': product_data.get('name'),
                'brand': product_data.get('brand', {}).get('name'),
                'description': product_data.get('description'),
                'size': product_data.get('size'),
                'ingredients': product_data.get('material'),
                'image_urls': product_data.get('image', []),
                'scraped_at': datetime.now()
            }
            
            # Extract reviews
            reviews_data = self.extract_reviews(product_data)
            
            return product_details, reviews_data
            
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            return None, None

    def scrape_products_from_file(self, urls_file):
        """Scrape product data for all URLs in a file"""
        # Read URLs from file
        with open(urls_file, 'r') as f:
            urls = [line.strip() for line in f]
        
        # Initialize lists for products and reviews
        products_data = []
        all_reviews = []
        
        # Scrape each product
        for i, url in enumerate(urls, 1):
            print(f"Processing product {i}/{len(urls)}: {url}")
            product_data, reviews_data = self.get_product_data(url)
            
            if product_data:
                products_data.append(product_data)
                if reviews_data:
                    all_reviews.extend(reviews_data)
        
        # Convert to DataFrames
        products_df = pd.DataFrame(products_data)
        reviews_df = pd.DataFrame(all_reviews)
        
        return products_df, reviews_df

    def save_data(self, products_df, reviews_df, timestamp=None):
        """Save products and reviews to separate CSV files"""
        if timestamp is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            
        # Save products
        products_file = f"data/raw/products_{timestamp}.csv"
        products_df.to_csv(products_file, index=False)
        print(f"Saved {len(products_df)} products to {products_file}")
        
        # Save reviews
        reviews_file = f"data/raw/reviews_{timestamp}.csv"
        reviews_df.to_csv(reviews_file, index=False)
        print(f"Saved {len(reviews_df)} reviews to {reviews_file}")

if __name__ == "__main__":
    # Test with a single URL
    scraper = AdoreProductScraper()
    test_url = "https://www.adorebeauty.com.au/p/innisfree/innisfree-energy-mask-centella.html"
    
    # Get data for single product
    product_data, reviews_data = scraper.get_product_data(test_url)
    
    if product_data:
        # Create DataFrames
        products_df = pd.DataFrame([product_data])
        reviews_df = pd.DataFrame(reviews_data)
        
        print("\nProduct Data:")
        print(products_df)
        print("\nReviews Data:")
        print(reviews_df)


Product Data:
                                                 url        sku  \
0  https://www.adorebeauty.com.au/p/innisfree/inn...  131174706   

                                name      brand  \
0  INNISFREE Energy Mask -  Centella  INNISFREE   

                                         description size  \
0  Stressed out skin? Sit back, soothe and hydrat...        

                                         ingredients  \
0  <div>water, dipropylene glycol, butylene glyco...   

                                          image_urls  \
0  [https://www.adorebeauty.com.au/pim_media/000/...   

                  scraped_at  
0 2025-04-01 22:14:52.277515  

Reviews Data:
              review_id product_sku  author                     title  \
0  -4898063170814228968   131174706      RH        Fav Innisfree Mask   
1   3912133291945534699   131174706      TL      Not hydrating enough   
2     54965610252414939   131174706   Cindy   Soft and refreshed skin   
3  -1832191239075145020   131