# Fetch products from Heinz

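> This notebook fetches Heinz product listings from the search API behind the company's product page, flattens them into a table, and saves the result as CSV and JSON locally and on S3.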

In [1]:
# Python tools and Jupyter config

In [2]:
import os
import json
import boto3
import requests
import pandas as pd
import jupyter_black
from time import sleep
from pathlib import Path
from random import randint
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange

In [3]:
# Format cells with Black as they run
jupyter_black.load()

# Loosen pandas' display limits so wide frames and long strings are not cut off
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Date stamp (YYYY-MM-DD) for this run; written to the `date_fetch` column later
today = pd.Timestamp.today().strftime("%Y-%m-%d")

In [4]:
# Variables

In [5]:
# Lower-case slug used in the local output file names and the S3 object keys
brand = "heinz"
# Display name written to the `brand` column of the output
brand_formal = "Heinz"
# Corporate parent written to the `parent` column of the output
parent = "The Kraft Heinz Co."

In [23]:
# Anchor output paths on the current working directory;
# `Path(__file__).resolve().parent` would be the script-file equivalent.
BASE = Path.cwd()

# Both outputs go to the shared `data` folder three levels up, named after the brand slug
CSV_OUT = BASE.joinpath("..", "..", "..", "data", f"{brand}.csv")
JSON_OUT = BASE.joinpath("..", "..", "..", "data", f"{brand}.json")

In [24]:
# Display the CSV destination to confirm where the output will be written
CSV_OUT

PosixPath('/Users/mstiles/github/products/notebooks/brands/heinz/../../../data/heinz.csv')

---

In [7]:
# HTTP headers sent with the search request
headers = {
    "Referer": "https://www.heinz.com/",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0.0.0 Safari/537.36"
    ),
}

# Query-string style search parameters: first page, up to 100 hits, all facets
search_params = "&".join(
    [
        "clickAnalytics=true",
        "facets=%5B%22*%22%5D",
        "highlightPostTag=__%2Fais-highlight__",
        "highlightPreTag=__ais-highlight__",
        "hitsPerPage=100",
        "maxValuesPerFacet=100",
        "page=0",
        "tagFilters=",
    ]
)

# Body of the POST request, serialised compactly (no spaces after separators)
data = json.dumps(
    {
        "requests": [
            {"indexName": "heinz-prd-products-en-US", "params": search_params}
        ]
    },
    separators=(",", ":"),
)

In [8]:
# Helpers and function to flatten the product JSON
def _nested(record, *path):
    """Follow `path` through nested dicts and return the final value.

    Returns "" when a key is missing or when an intermediate level is not a
    dict (for example a JSON ``null``), instead of raising AttributeError.
    """
    node = record
    for key in path[:-1]:
        node = node.get(key)
        if not isinstance(node, dict):
            return ""
    return node.get(path[-1], "")


def _joined(record, key):
    """Return the values stored under `key` as one ", "-separated string.

    A missing key, ``None`` or an empty list gives "". A bare string is
    returned unchanged rather than being split into characters, and
    non-string items are converted with ``str``.
    """
    values = record.get(key) or []
    if isinstance(values, str):
        return values
    return ", ".join(str(value) for value in values)


def flatten_product(product):
    """Flatten one product record into a single-level dict of columns.

    Parameters
    ----------
    product : dict
        One product hit from the search response.

    Returns
    -------
    dict
        Scalar fields copied as-is ("" when the key is absent), nested
        image/brand/metadata fields pulled up to top-level keys, and list
        fields joined into ", "-separated strings. Key order is fixed, so
        every record produces the same column order.
    """
    flattened = {
        "entryId": product.get("entryId", ""),
        "name": product.get("name", ""),
        "description": product.get("description", ""),
        "tagLine": product.get("tagLine", ""),
        "eyebrow": product.get("eyebrow", ""),
        "image_url": _nested(product, "image", "url"),
        "image_secure_url": _nested(product, "image", "secure_url"),
        "slug": product.get("slug", ""),
        "gtins": _joined(product, "gtins"),
        "upcCodes": _joined(product, "upcCodes"),
        "brand_key": _nested(product, "brand", "key"),
        "brand_name": _nested(product, "brand", "name"),
        "brand_image_url": _nested(product, "brand", "image", "url"),
        "brand_image_secure_url": _nested(product, "brand", "image", "secure_url"),
        "subheadParts": _joined(product, "subheadParts"),
        "condimentType": _joined(product, "condimentType"),
        "occasion": _joined(product, "occasion"),
        "createdAt": _nested(product, "metadata", "createdAt"),
        "updatedAt": _nested(product, "metadata", "updatedAt"),
    }
    return flattened

In [9]:
# Algolia multi-query search endpoint the product index is read from.
# NOTE(review): the API key in the query string looks like the site's public,
# search-only browser key — confirm before treating it as safe to commit.
SEARCH_URL = "https://ir4a3s8sa0-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.14.3)%3B%20Browser%20(lite)%3B%20instantsearch.js%20(4.56.5)%3B%20react%20(18.2.0)%3B%20react-instantsearch%20(6.38.0)%3B%20react-instantsearch-hooks%20(6.38.0)%3B%20JS%20Helper%20(3.16.3)&x-algolia-api-key=495d968fee9cadc3b9f18e9b837798d5&x-algolia-application-id=IR4A3S8SA0"

# The request body asks for page 0 only; this marker is rewritten to walk
# through any further result pages so a large catalogue is not truncated.
PAGE_MARKER = "&page=0"

# Make the POST request(s) and collect the hits from every page
product_list = []
page, total_pages = 0, 1
while page < total_pages:
    response = requests.post(
        SEARCH_URL,
        headers=headers,
        data=data.replace(PAGE_MARKER, f"&page={page}"),
        timeout=30,  # keep a stalled connection from hanging the kernel
    )
    # Stop on an HTTP error here rather than on a confusing KeyError below
    response.raise_for_status()

    # Extract the list of products from the response
    result = response.json()["results"][0]
    product_list.extend(result["hits"])

    # Only follow the reported page count when the body can be re-paged;
    # otherwise every request would return the same page.
    if PAGE_MARKER in data:
        total_pages = result.get("nbPages", 1)
    page += 1

# Process each product and flatten the JSON
flattened_products = [flatten_product(product) for product in product_list]

# Create a DataFrame from the flattened product data
df = pd.DataFrame(flattened_products)

In [10]:
# Output

In [11]:
# Tag every row with the fetch date, the parent company and the formal brand name
df = df.assign(date_fetch=today, parent=parent, brand=brand_formal)

In [17]:
# Output local files. The target folder is created first so that a checkout
# without the data directory does not fail on a missing folder.
for out_path in (CSV_OUT, JSON_OUT):
    out_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(CSV_OUT, index=False)
df.to_json(JSON_OUT, indent=4, orient="records")

In [13]:
# S3 destination: one bucket, one object key per output format
S3_BUCKET = "stilesdata.com"
S3_CSV_KEY = f"products/data/{brand}.csv"
S3_JSON_KEY = f"products/data/{brand}.json"

# Build the S3 client from credentials held in environment variables
s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.environ.get("MY_AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("MY_AWS_SECRET_ACCESS_KEY"),
    aws_session_token=os.environ.get("MY_AWS_SESSION_TOKEN"),
)

# Upload the CSV, then the JSON, reporting each destination as it completes
for label, local_path, s3_key in (
    ("CSV", CSV_OUT, S3_CSV_KEY),
    ("JSON", JSON_OUT, S3_JSON_KEY),
):
    s3_client.upload_file(str(local_path), S3_BUCKET, s3_key)
    print(f"{label} file uploaded to s3://{S3_BUCKET}/{s3_key}")

CSV file uploaded to s3://stilesdata.com/products/data/heinz.csv
JSON file uploaded to s3://stilesdata.com/products/data/heinz.json
