# Fetch products from Brandy Melville

> This notebook scrapes every listing from Brandy Melville's [product page](https://us.brandymelville.com/collections/all), parses each product's details, and saves the results as CSV and JSON files locally and on S3.

In [1]:
# Python tools and Jupyter config

In [2]:
import json
import os
import re
import time
from pathlib import Path
from random import randint
from time import sleep
from urllib.parse import urljoin

import boto3
import jupyter_black
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
# Load the jupyter_black extension for this notebook session
jupyter_black.load()

# Raise pandas display limits so wide frames and long text cells are shown in full
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Fetch date as YYYY-MM-DD; stamped onto every row further down
today = pd.Timestamp.today().strftime("%Y-%m-%d")

In [4]:
# Headers for requests

In [5]:
# Headers sent with every page request.
# The Referer must name the storefront being scraped; it previously pointed at
# an unrelated domain.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Referer": "https://us.brandymelville.com/collections/all",
}

In [6]:
# Variables

In [7]:
# Brand identifiers used throughout the notebook
brand_formal = "Brandy Melville"  # written to the `brand` column of the output
brand = "brandy_melville"  # slug used in the output file names and S3 keys
parent = "Marsan Family"  # written to the `parent` column of the output

In [8]:
# Output files are located relative to the directory the notebook is run from
# (the working directory stands in for Path(__file__), which a notebook lacks).
BASE = Path.cwd()
CSV_OUT = BASE / "../../../data/brands" / f"{brand}.csv"
JSON_OUT = BASE / "../../../data/brands" / f"{brand}.json"

In [9]:
# Storefront root, and the "all products" collection that gets paginated
base_url = "https://us.brandymelville.com"
collection_url = base_url + "/collections/all"

In [10]:
def clean_description_text(text):
    """Collapse every run of whitespace (newlines included) to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def parse_description(description_text):
    """Split a product card's description text into its labelled parts.

    The text is whitespace-normalised first, then cut at the ``Fabrics:``,
    ``Measurement(s):`` and ``Made in:`` labels. Every label terminates the
    field before it, so the fields do not bleed into each other when the
    labels appear in an unexpected order.

    Args:
        description_text: Raw description string taken from a product card.

    Returns:
        A ``(description, fabrics, measurements, manufactured_in)`` tuple of
        strings. A part that is missing or empty is returned as ``"N/A"``.
    """
    # Any of these labels marks the end of whichever field precedes it.
    any_label = r"Fabrics:|Measurements?:|Made in:"

    description_text = clean_description_text(description_text)

    def labelled_value(label):
        """Return the text following ``label`` up to the next label or the end."""
        match = re.search(rf"{label}\s*(.*?)(?:{any_label}|$)", description_text)
        value = match.group(1).strip() if match else ""
        # An empty capture (label present, nothing after it) counts as missing.
        return value or "N/A"

    # Free-text description: what follows the "Product Photo - " prefix, up to
    # the first label.
    description_match = re.search(
        rf"Product Photo - (.*?)(?:{any_label})", description_text
    )
    if description_match:
        description = description_match.group(1).strip()
    else:
        # No prefix-then-label pair: take everything before the first label
        # (or the whole text when there is none) and drop the prefix.
        description = (
            re.split(any_label, description_text, maxsplit=1)[0]
            .replace("Product Photo - ", "")
            .strip()
        )

    return (
        description or "N/A",
        labelled_value("Fabrics:"),
        labelled_value("Measurements?:"),
        labelled_value("Made in:"),
    )


def _text_or_na(tag, label):
    """Return the tag's stripped text with ``label`` removed, or "N/A" if the tag is missing."""
    if tag is None:
        return "N/A"
    return tag.get_text(strip=True).replace(label, "").strip()


def _url_or_na(tag, attribute):
    """Resolve the tag's URL attribute against ``base_url``, or "N/A" if absent."""
    raw = tag.get(attribute) if tag is not None else None
    # urljoin handles site-relative ("/products/x"), protocol-relative
    # ("//cdn.../x.jpg") and already-absolute URLs alike.
    return urljoin(base_url, raw) if raw else "N/A"


def _parse_product_card(product):
    """Build the product dict for a single ``.grid__item`` element."""
    description_tag = product.select_one(".visually-hidden")
    if description_tag is not None:
        # The hidden text bundles description, fabrics, measurements and origin.
        description, fabrics, measurements, manufactured_in = parse_description(
            description_tag.get_text(strip=True)
        )
    else:
        description = fabrics = measurements = manufactured_in = "N/A"

    return {
        "title": _text_or_na(
            product.select_one(".card-information__text.h5"), "Product Name:"
        ),
        "price": _text_or_na(
            product.select_one(".price__regular .price-item"), "Product Price:"
        ),
        "product_url": _url_or_na(product.select_one("a.full-unstyled-link"), "href"),
        "image_url": _url_or_na(product.select_one(".media img"), "src"),
        "description": description,
        "fabrics": fabrics,
        "measurements": measurements,
        "manufactured_in": manufactured_in,
    }


# Function to scrape product data from a single page
def scrape_page(url):
    """Fetch one collection page and return its products.

    Args:
        url: Full URL of the collection page to fetch.

    Returns:
        A list of dicts, one per ``.grid__item`` element, with the keys
        ``title``, ``price``, ``product_url``, ``image_url``, ``description``,
        ``fabrics``, ``measurements`` and ``manufactured_in``. Missing values
        are ``"N/A"``. The list is empty when the page has no product cards.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status. Without this, an error page would parse as "no
            products" and silently end pagination early.
    """
    # Bound the wait so a stalled connection cannot hang the notebook.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    products = []

    for product in soup.select(".grid__item"):
        try:
            products.append(_parse_product_card(product))
        except Exception as e:
            # Best effort: one malformed card must not abort the whole page.
            print(f"Error processing product: {e}")

    return products


def scrape_all_products(collection_url, delay=1, max_pages=None):
    """Walk the paginated collection and gather the products from every page.

    Pages are requested as ``?page=1``, ``?page=2``, ... until a page yields
    no products.

    Args:
        collection_url: URL of the collection, without the ``page`` query.
        delay: Seconds to pause between page requests, to avoid getting blocked.
        max_pages: Optional upper bound on the number of pages fetched;
            ``None`` (the default) keeps going until an empty page.

    Returns:
        A single list holding the product dicts of all scraped pages.
    """
    page = 1
    all_products = []

    while max_pages is None or page <= max_pages:
        url = f"{collection_url}?page={page}"
        print(f"Scraping page {page}")
        products = scrape_page(url)
        if not products:
            # An empty page marks the end of the collection.
            break
        all_products.extend(products)
        page += 1
        time.sleep(delay)  # Add delay to avoid getting blocked

    return all_products

In [11]:
# Scrape every page of the collection (one network request per page)
all_products = scrape_all_products(collection_url)

# Stop here if nothing came back, so an empty dataset never overwrites the
# previously saved local and S3 files in the cells below.
if not all_products:
    raise RuntimeError(f"No products scraped from {collection_url}")

Scraping page 1


Scraping page 2


Scraping page 3


Scraping page 4


Scraping page 5


Scraping page 6


Scraping page 7


Scraping page 8


Scraping page 9


Scraping page 10


Scraping page 11


Scraping page 12


Scraping page 13


Scraping page 14


Scraping page 15


Scraping page 16


Scraping page 17


Scraping page 18


Scraping page 19


Scraping page 20


Scraping page 21


Scraping page 22


Scraping page 23


Scraping page 24


Scraping page 25


Scraping page 26


Scraping page 27


Scraping page 28


Scraping page 29


Scraping page 30


Scraping page 31


Scraping page 32


Scraping page 33


Scraping page 34


Scraping page 35


Scraping page 36


Scraping page 37


Scraping page 38


Scraping page 39


Scraping page 40


In [12]:
# Convert the list of product dictionaries to a DataFrame: one row per scraped
# product, with columns taken from the dict keys built in scrape_page
df = pd.DataFrame(all_products)

In [13]:
# Stamp every row with the fetch date plus the parent and brand labels
df = df.assign(date_fetch=today, parent=parent, brand=brand_formal)

In [14]:
# Number of product rows collected
df.shape[0]

780

In [15]:
# Output

In [16]:
# Save locally: a CSV without the index column, and a JSON array of row records
df.to_csv(CSV_OUT, index=False)
df.to_json(JSON_OUT, orient="records", indent=4)

In [17]:
# Destination bucket and object keys
S3_BUCKET = "stilesdata.com"
S3_CSV_KEY = f"products/data/{brand}.csv"
S3_JSON_KEY = f"products/data/{brand}.json"

# S3 client; credentials are read from environment variables, not stored here
s3_client = boto3.client(
    service_name="s3",
    aws_access_key_id=os.getenv("MY_AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("MY_AWS_SECRET_ACCESS_KEY"),
    aws_session_token=os.getenv("MY_AWS_SESSION_TOKEN"),
)

# Push the CSV written above
s3_client.upload_file(Filename=str(CSV_OUT), Bucket=S3_BUCKET, Key=S3_CSV_KEY)
print(f"CSV file uploaded to s3://{S3_BUCKET}/{S3_CSV_KEY}")

# Push the JSON written above
s3_client.upload_file(Filename=str(JSON_OUT), Bucket=S3_BUCKET, Key=S3_JSON_KEY)
print(f"JSON file uploaded to s3://{S3_BUCKET}/{S3_JSON_KEY}")

CSV file uploaded to s3://stilesdata.com/products/data/brandy_melville.csv


JSON file uploaded to s3://stilesdata.com/products/data/brandy_melville.json
