# Fetch products from Rao's

> This notebook collects and processes listings on the company's product page.

In [1]:
# Python tools and Jupyter config

In [2]:
import json
import os
import time
from pathlib import Path
from random import randint
from time import sleep
from urllib.parse import urljoin

import boto3
import jupyter_black
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange

In [3]:
# Auto-format notebook cells with Black
jupyter_black.load()

# Loosen the pandas display limits so wide/long frames are not truncated
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Fetch date (YYYY-MM-DD) stamped onto every scraped row
today = pd.Timestamp.today().strftime("%Y-%m-%d")

In [4]:
# Headers for requests

In [5]:
# Sent with every request made by the scraping functions.
# The Referer is the collection page this notebook scrapes; it previously
# pointed at an unrelated site (a leftover from another scraper).
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Referer": "https://www.raos.com/collections/all",
}

In [6]:
# Variables

In [7]:
# Display name written to the "brand" column of the output
brand_formal = "Rao's"
# Slug used in output file names and S3 keys
brand = "raos"
# Written to the "parent" column of the output
parent = "Campbell Soup Company"

In [8]:
# Output paths are anchored on the kernel's working directory
# (`__file__` is not available inside a notebook).
BASE = Path.cwd()
CSV_OUT = BASE.joinpath(f"../../../data/brands/{brand}.csv")
JSON_OUT = BASE.joinpath(f"../../../data/brands/{brand}.json")

In [9]:
# Site root; relative product links found on listing pages are resolved against it
base_url = "https://www.raos.com"
# Listing page that is paginated with ?page=N by scrape_all_products
collection_url = f"{base_url}/collections/all"

In [10]:
def get_total_products(url, timeout=30):
    """Return the product count reported by a collection page.

    Fetches ``url`` and parses the first whitespace-separated token of the
    ``#ProductCount`` element's text as an integer.

    Args:
        url: Collection page URL to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        int: The leading number of the counter text (thousands separators
        such as ``1,234`` are accepted).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the counter element is missing, empty, or does not
            start with a number.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    count_tag = soup.select_one("#ProductCount")
    if count_tag is None:
        raise ValueError(f"No #ProductCount element found at {url}")

    tokens = count_tag.get_text(strip=True).split()
    if not tokens:
        raise ValueError(f"#ProductCount element is empty at {url}")

    try:
        # Drop thousands separators before converting, e.g. "1,234" -> 1234
        return int(tokens[0].replace(",", ""))
    except ValueError as exc:
        raise ValueError(
            f"Could not parse a product count from {tokens[0]!r} at {url}"
        ) from exc


def scrape_page(url, timeout=30):
    """Scrape the product cards found on one collection page.

    Every ``.grid__item`` element is inspected for a title link, a regular
    price and an image. For each item with a product link, the product's own
    page is requested as well to collect its description.

    Args:
        url: Collection page URL (including any ``?page=N`` query).
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        list[dict]: One dict per product with the keys ``title``, ``price``,
        ``product_url``, ``image_url`` and ``description``. Fields that could
        not be found hold the string ``"N/A"``. An empty list is returned
        when the page answers with an HTTP error status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if not response.ok:
        # Report the failure instead of silently parsing an error page
        print(f"Error fetching {url}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    products = []

    for product in soup.select(".grid__item"):
        try:
            title_tag = product.select_one("h3.card-information__text.h5 a")
            price_tag = product.select_one(".price__regular .price-item")
            image_tag = product.select_one(".card__media-full-spacer img")

            # An element matching none of the product selectors would only
            # yield an all-"N/A" row, so it is skipped.
            # NOTE(review): presumably ".grid__item" is also used by
            # non-product layout elements on the page; confirm against the
            # live markup.
            if title_tag is None and price_tag is None and image_tag is None:
                continue

            title = title_tag.get_text(strip=True) if title_tag is not None else "N/A"
            price = price_tag.get_text(strip=True) if price_tag is not None else "N/A"

            # urljoin handles both "/products/x" and "products/x" hrefs,
            # and leaves already-absolute links untouched
            href = title_tag.get("href") if title_tag is not None else None
            product_url = urljoin(base_url, href) if href else "N/A"

            # A missing src attribute no longer discards the whole product
            image_url = image_tag.get("src", "N/A") if image_tag is not None else "N/A"

            # Fetch product description if product URL is valid
            description = "N/A"
            if product_url != "N/A":
                description = scrape_product_description(product_url)

            products.append(
                {
                    "title": title,
                    "price": price,
                    "product_url": product_url,
                    "image_url": image_url,
                    "description": description,
                }
            )
        except Exception as e:
            # Best effort: one malformed card must not abort the whole page
            print(f"Error processing product: {e}")

    return products


def scrape_product_description(url, timeout=30):
    """Return the first description paragraph of a product page.

    Args:
        url: Absolute URL of the product page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: Stripped text of the first element matching
        ``.product__description.rte p``, or ``"N/A"`` when the element is
        absent or the page could not be fetched.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # A failed detail request should cost only the description,
        # not the whole product row built by the caller
        print(f"Error fetching description from {url}: {e}")
        return "N/A"

    soup = BeautifulSoup(response.content, "html.parser")
    description_tag = soup.select_one(".product__description.rte p")
    if description_tag is None:
        return "N/A"
    return description_tag.get_text(strip=True)


def scrape_all_products(collection_url, products_per_page=12, delay=1):
    """Scrape every page of a paginated collection.

    The number of pages is derived from the product count reported by the
    collection page, then each ``?page=N`` URL is scraped in order.

    Args:
        collection_url: URL of the collection listing (without a page query).
        products_per_page: Number of products the site shows per listing
            page; used only to compute how many pages to request.
        delay: Seconds to pause between page requests to avoid getting
            blocked.

    Returns:
        list[dict]: The product dicts from all pages, in page order.

    Raises:
        ValueError: If ``products_per_page`` is not a positive integer.
    """
    if products_per_page <= 0:
        raise ValueError("products_per_page must be a positive integer")

    total_products = get_total_products(collection_url)
    # Ceiling division: `n // size + 1` asked for an extra, empty page
    # whenever the total was an exact multiple of the page size
    total_pages = (total_products + products_per_page - 1) // products_per_page

    all_products = []

    for page in range(1, total_pages + 1):
        print(f"Scraping page {page} of {total_pages}")
        url = f"{collection_url}?page={page}"
        all_products.extend(scrape_page(url))
        # No need to wait once the last page has been fetched
        if page < total_pages:
            time.sleep(delay)

    return all_products

In [11]:
# Main script execution
all_products = scrape_all_products(collection_url)

Scraping page 1 of 19


Scraping page 2 of 19


Scraping page 3 of 19


Scraping page 4 of 19


Scraping page 5 of 19


Scraping page 6 of 19


Scraping page 7 of 19


Scraping page 8 of 19


Scraping page 9 of 19


Scraping page 10 of 19


Scraping page 11 of 19


Scraping page 12 of 19


Scraping page 13 of 19


Scraping page 14 of 19


Scraping page 15 of 19


Scraping page 16 of 19


Scraping page 17 of 19


Scraping page 18 of 19


Scraping page 19 of 19


In [12]:
# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(all_products)

In [13]:
# Stamp every row with the fetch date and the brand / parent company labels
df = df.assign(date_fetch=today, parent=parent, brand=brand_formal)

In [14]:
# Number of rows collected
len(df)

243

In [15]:
# Output

In [16]:
# Local files
df.to_csv(CSV_OUT, index=False)
df.to_json(JSON_OUT, indent=4, orient="records")

In [17]:
# Paths for S3 storage
S3_BUCKET = "stilesdata.com"
S3_CSV_KEY = f"products/data/{brand}.csv"
S3_JSON_KEY = f"products/data/{brand}.json"

# Initialize boto3 client with environment variables
s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("MY_AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("MY_AWS_SECRET_ACCESS_KEY"),
    aws_session_token=os.getenv("MY_AWS_SESSION_TOKEN"),
)

# Upload the CSV file to S3
s3_client.upload_file(str(CSV_OUT), S3_BUCKET, S3_CSV_KEY)
print(f"CSV file uploaded to s3://{S3_BUCKET}/{S3_CSV_KEY}")

# Upload the JSON file
s3_client.upload_file(str(JSON_OUT), S3_BUCKET, S3_JSON_KEY)
print(f"JSON file uploaded to s3://{S3_BUCKET}/{S3_JSON_KEY}")

CSV file uploaded to s3://stilesdata.com/products/data/raos.csv


JSON file uploaded to s3://stilesdata.com/products/data/raos.json
