# Fetch products from Buldak

> This notebook collects and processes listings on the company's product page.

In [1]:
# Python tools and Jupyter config

In [2]:
import json
import os
from pathlib import Path
from random import randint
from time import sleep
from urllib.parse import urljoin

import boto3
import jupyter_black
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
# Turn on the jupyter_black extension for this kernel session
jupyter_black.load()

# Raise pandas' display limits so wide/long frames and long text cells
# are shown without truncation
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Fetch date as YYYY-MM-DD; written to the `date_fetch` column later on
today = pd.Timestamp.today().strftime("%Y-%m-%d")

In [4]:
# Headers for requests

In [5]:
# Browser-like User-Agent sent with every request; the long value is split
# across adjacent string literals purely for readability
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0.0.0 Safari/537.36"
    ),
}

In [6]:
# Variables

In [7]:
# Display names stamped onto every scraped record
brand_formal = "Buldak"
parent = "Samyang America, Inc."

# Lower-case slug used to build the local file names and S3 keys
brand = "buldak"

In [8]:
# Paths are anchored at the current working directory because `__file__`
# is not defined inside a notebook kernel
BASE = Path.cwd()

# Both exports land in the shared brands data folder three levels up
CSV_OUT = BASE.joinpath(f"../../../data/brands/{brand}.csv")
JSON_OUT = BASE.joinpath(f"../../../data/brands/{brand}.json")

In [9]:
# Display the CSV output path to confirm where the file will be written
CSV_OUT

PosixPath('/Users/mstiles/github/products/notebooks/brand_scrapers/buldak/../../../data/brands/buldak.csv')

In [10]:
# Extract product URLs from the grid
url = "https://samyangamerica.com/buldak/products"

# Send the browser-like `headers` defined in the "Headers for requests" cell.
# They are deliberately not redefined here, so the grid request and the
# product-page requests share one configuration.
# The timeout stops a stalled connection from hanging the notebook.
response = requests.get(url, headers=headers, timeout=30)

# Stop on an HTTP error: otherwise an error page parses to zero products
# and the cells below would overwrite the published files with empty data.
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
product_boxes = soup.find_all("div", class_="p-item-box")

# Site root prepended to the relative links found in the grid
base_url = "https://samyangamerica.com"
products = []

In [11]:
# Collect the URLs and names
# Start from an empty list each time so that re-running this cell cannot
# append every product a second time.
products = []
for box in tqdm(product_boxes):
    product_url = base_url + box.find("a", class_="p-item-inner")["href"]
    product_name = box.find("div", class_="p-item-name").get_text(strip=True)
    products.append({"name": product_name, "url": product_url})


# Function to extract detailed information from each product page
def get_product_details(product):
    """Fetch one product page and add its scraped detail fields to ``product``.

    Args:
        product: Dict with at least a ``"url"`` key holding the address of
            the product detail page. The dict is updated in place.

    Returns:
        The same ``product`` dict, now also carrying ``image_url``,
        ``narrative``, ``ingredients`` and ``nutrition_img``. A field whose
        markup is absent from the page is set to ``None`` instead of raising.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the request times out).
    """
    # The timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(product["url"], headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    def _image_url(container_class):
        """Return the absolute URL of the first <img> in the container, or None."""
        try:
            src = soup.find("div", class_=container_class).find("img")["src"]
        except (AttributeError, TypeError, KeyError):
            # AttributeError: the container <div> is missing.
            # TypeError: the container exists but holds no <img>.
            # KeyError: the <img> has no `src` attribute.
            return None
        # urljoin resolves site-relative paths against the page URL and
        # leaves already-absolute URLs untouched.
        return urljoin(product["url"], src)

    def _text(container_class):
        """Return the stripped text of the container <div>, or None if absent."""
        try:
            return soup.find("div", class_=container_class).get_text(strip=True)
        except AttributeError:
            return None

    product.update(
        {
            "image_url": _image_url("item-img"),
            "narrative": _text("i-detail"),
            "ingredients": _text("i-text"),
            "nutrition_img": _image_url("i-nutrition"),
        }
    )
    return product

  0%|          | 0/39 [00:00<?, ?it/s]

In [12]:
# Visit every product page in turn; each call returns the same dict it was
# given, enriched with the scraped detail fields
detailed_products = list(map(get_product_details, products))

In [13]:
# Turn the list of product dicts into a table, one row per product
df = pd.DataFrame(data=detailed_products)

In [14]:
# Number of scraped products (length of the name column)
df["name"].shape[0]

39

In [15]:
# Stamp every row with the fetch date and the brand/parent-company labels
df = df.assign(date_fetch=today, parent=parent, brand=brand_formal)

In [16]:
# Output

In [17]:
# Write both local exports: a CSV without the index column and a
# pretty-printed JSON array with one object per product
df.to_csv(path_or_buf=CSV_OUT, index=False)
df.to_json(path_or_buf=JSON_OUT, orient="records", indent=4)

In [18]:
# S3 destination: one bucket, one object key per export format
S3_BUCKET = "stilesdata.com"
S3_CSV_KEY = f"products/data/{brand}.csv"
S3_JSON_KEY = f"products/data/{brand}.json"

# Credentials are read from environment variables, never stored in the notebook
aws_credentials = {
    "aws_access_key_id": os.environ.get("MY_AWS_ACCESS_KEY_ID"),
    "aws_secret_access_key": os.environ.get("MY_AWS_SECRET_ACCESS_KEY"),
    "aws_session_token": os.environ.get("MY_AWS_SESSION_TOKEN"),
}
s3_client = boto3.client("s3", **aws_credentials)

# Push the CSV export and report where it landed
s3_client.upload_file(Filename=str(CSV_OUT), Bucket=S3_BUCKET, Key=S3_CSV_KEY)
print(f"CSV file uploaded to s3://{S3_BUCKET}/{S3_CSV_KEY}")

# Push the JSON export and report where it landed
s3_client.upload_file(Filename=str(JSON_OUT), Bucket=S3_BUCKET, Key=S3_JSON_KEY)
print(f"JSON file uploaded to s3://{S3_BUCKET}/{S3_JSON_KEY}")

CSV file uploaded to s3://stilesdata.com/products/data/buldak.csv
JSON file uploaded to s3://stilesdata.com/products/data/buldak.json
