# Fetch products from La Croix

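> This notebook scrapes the flavor listings from La Croix's website, fetches each flavor's description from its detail page, and exports the results as CSV and JSON (locally and to S3).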

In [1]:
# Python tools and Jupyter config

In [2]:
import os
import json
import boto3
import requests
import pandas as pd
import jupyter_black
from time import sleep
from pathlib import Path
from random import randint
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
# Enable automatic Black formatting of notebook cells
jupyter_black.load()

# Relax pandas display limits so wide or long frames are not truncated
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Date of this run, formatted as YYYY-MM-DD
run_timestamp = pd.Timestamp.today()
today = run_timestamp.strftime("%Y-%m-%d")

In [4]:
# Variables

In [5]:
# Identifiers for the brand being collected
brand = "la_croix"  # slug used in the output file names and S3 keys
brand_formal = "La Croix"  # display name written to the "brand" column
parent = "National Beverage Corporation"  # written to the "parent" column

In [6]:
# Output locations, anchored to the current working directory
# (a notebook kernel does not define __file__, so Path(__file__) is not usable here)
BASE = Path.cwd()
CSV_OUT = BASE.joinpath(f"../../../data/brands/{brand}.csv")
JSON_OUT = BASE.joinpath(f"../../../data/brands/{brand}.json")

---

In [7]:
# Headers for the requests
# The Referer matches the site being scraped (it previously pointed at an
# unrelated domain); the User-Agent presents a desktop Chrome browser.
headers = {
    "Referer": "https://www.lacroixwater.com/",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
}

In [8]:
# Base url of the paginated flavor listing; the "{}" placeholder is filled
# with the page number via str.format when each listing page is requested
base_url = "https://www.lacroixwater.com/flavors/page/{}"

In [9]:
# Scrape all flavor items
# Walk the paginated listing until a page stops returning 200 or has no items.
flavors = []

page = 1
while True:
    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(base_url.format(page), headers=headers, timeout=30)
    if response.status_code != 200:
        break
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all("li", class_="flavors-item")

    if not items:
        break

    for item in items:
        heading = item.find("h3")
        link = item.find("a")
        image = item.find("img")
        # The name and link are required (the link is fetched later for the
        # description); skip cards missing either instead of crashing mid-scrape
        if heading is None or link is None or not link.get("href"):
            continue
        flavors.append(
            {
                "brand": None,
                "name": heading.get_text(),
                "description": None,
                "url": link["href"],
                # A card without an image is still kept; its image is left empty
                "image": image.get("src") if image is not None else None,
            }
        )
    page += 1

In [10]:
# Function to get description from each flavor page
def get_flavor_description(url, timeout=30):
    """Fetch a flavor page and return the text of its description paragraph.

    Args:
        url: Address of the flavor detail page.
        timeout: Seconds to wait for the server before `requests` raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The text of the first <p> inside the first <div class="info">, or
        None when the response status is not 200 or that markup is missing.

    Raises:
        requests.exceptions.RequestException: Propagated from `requests.get`
            on connection failures or timeouts.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    info_div = soup.find("div", class_="info")
    if info_div is None:
        return None

    # The info block may have no paragraph; guard so .get_text() is never
    # called on None
    paragraph = info_div.find("p")
    if paragraph is None:
        return None
    return paragraph.get_text()

In [11]:
# Fill in each flavor's description by visiting its detail page
for entry in tqdm(flavors):
    detail_url = entry["url"]
    entry["description"] = get_flavor_description(detail_url)
    # sleep(1)  # Uncomment to pause between requests if needed

  0%|          | 0/25 [00:00<?, ?it/s]

In [12]:
# Convert to DataFrame
# Name the columns explicitly so the frame keeps the same schema even when the
# scrape returned no flavors (an empty list would otherwise yield no columns)
df = pd.DataFrame(flavors, columns=["brand", "name", "description", "url", "image"])

In [13]:
# Output

In [14]:
# Stamp every row with the fetch date and parent company, and fill in the
# formal brand name
df = df.assign(
    date_fetch=today,
    parent=parent,
    brand=brand_formal,
)

In [15]:
# Output local files
# Create the destination folders first so the writes do not fail when the
# data directory does not exist yet
CSV_OUT.parent.mkdir(parents=True, exist_ok=True)
JSON_OUT.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(CSV_OUT, index=False)
df.to_json(JSON_OUT, indent=4, orient="records")

In [16]:
# Destination bucket and object keys on S3
S3_BUCKET = "stilesdata.com"
S3_CSV_KEY = f"products/{brand}/products.csv"
S3_JSON_KEY = f"products/{brand}/products.json"

# Read the AWS credentials from environment variables, then build the client
aws_credentials = {
    "aws_access_key_id": os.getenv("MY_AWS_ACCESS_KEY_ID"),
    "aws_secret_access_key": os.getenv("MY_AWS_SECRET_ACCESS_KEY"),
    "aws_session_token": os.getenv("MY_AWS_SESSION_TOKEN"),
}
s3_client = boto3.client("s3", **aws_credentials)

# Push the local CSV export to the bucket
csv_path = str(CSV_OUT)
s3_client.upload_file(csv_path, S3_BUCKET, S3_CSV_KEY)
print(f"CSV file uploaded to s3://{S3_BUCKET}/{S3_CSV_KEY}")

# Push the local JSON export to the bucket
json_path = str(JSON_OUT)
s3_client.upload_file(json_path, S3_BUCKET, S3_JSON_KEY)
print(f"JSON file uploaded to s3://{S3_BUCKET}/{S3_JSON_KEY}")

CSV file uploaded to s3://stilesdata.com/products/la_croix/products.csv
JSON file uploaded to s3://stilesdata.com/products/la_croix/products.json
