# Fetch products from Ottogi

> This notebook collects and processes listings on the company's product page.

In [1]:
# Python tools and Jupyter config

In [2]:
import os
import json
import boto3
import requests
import pandas as pd
import jupyter_black
from time import sleep
from pathlib import Path
from random import randint
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange

In [3]:
# Auto-format cells with black as they run
jupyter_black.load()

# Widen the pandas display limits so wide/long frames are shown in full
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Date stamp (YYYY-MM-DD) recorded with every fetched row
today = pd.Timestamp.today().strftime("%Y-%m-%d")

In [4]:
# Headers for requests

In [5]:
# Sent with the requests to the product endpoints: an XHR-style Accept header
# plus the search page as Referer.
headers = dict(
    Accept="application/json, text/javascript, */*; q=0.01",
    Referer="https://www.ottogi.co.kr/en/product/product_search",
)

In [6]:
# Variables

In [7]:
# brand: slug used in output file names and S3 keys
# parent / brand_formal: values written to the "parent" and "brand" columns
brand, parent, brand_formal = "ottogi", "Ottogi Corporation", "Ottogi"

In [8]:
# Script-style alternative, kept for reference:
# BASE = Path(__file__).resolve().parent
# Output locations are resolved relative to the current working directory.
BASE = Path.cwd()
CSV_OUT = BASE.joinpath(f"../../../data/brands/{brand}.csv")
JSON_OUT = BASE.joinpath(f"../../../data/brands/{brand}.json")

In [9]:
# Get categories

In [10]:
# Function to extract categories and create a lookup dictionary
def get_category_lookup(timeout=30):
    """Scrape the product category page and return an id -> name lookup.

    Each ``div.prd_item`` on the category page contributes one entry: the key
    is the value following ``idx=`` in the item's link, and the value is the
    text of its ``p.hashtag`` element with ``#`` characters removed.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict[str, str]
        Mapping of category id (as a string) to category name.

    Raises
    ------
    requests.HTTPError
        If the category page responds with an error status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    cat_url = "https://www.ottogi.co.kr/en/product/product_cat"
    cat_response = requests.get(cat_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty dict
    cat_response.raise_for_status()
    cat_soup = BeautifulSoup(cat_response.text, "html.parser")

    categories = {}
    for element in cat_soup.find_all("div", class_="prd_item"):
        hashtag = element.find("p", class_="hashtag")
        link = element.find("a", href=True)
        # Skip tiles that lack a label or a usable link rather than crashing
        if hashtag is None or link is None or "idx=" not in link["href"]:
            continue

        category_name = hashtag.get_text(strip=True).replace("#", "")
        # Keep only the idx value itself, dropping any trailing query parameters
        category_id = link["href"].split("idx=")[-1].split("&")[0]
        categories[category_id] = category_name

    return categories

In [11]:
# Get the category lookup dictionary
# (keys are the id strings taken from the "idx=" part of each category link,
# values are the category names with "#" removed)
category_lookup = get_category_lookup()

In [12]:
# Get product information

In [13]:
# Query string for the product list endpoint; "page" is overwritten on every
# iteration of the pagination loop.
params = dict(sword="", page="1", pageSize="20")

# Site root
base_url = "https://www.ottogi.co.kr"

# Pagination state: first page to request and the accumulator for all results
page = 1
all_products = []

In [14]:
# Seconds to wait on any single request before giving up, so a stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Collect cookies by first visiting the main page
main_page_response = requests.get(
    "https://www.ottogi.co.kr/en/product/product_search",
    timeout=REQUEST_TIMEOUT,
)
main_page_response.raise_for_status()
cookies = main_page_response.cookies

while True:
    # Update the page number in params
    params["page"] = str(page)

    # Request the product list for the current page
    response = requests.get(
        "https://www.ottogi.co.kr/en/product/product_search_list_json",
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    # Surface HTTP errors directly instead of as an opaque JSON decode failure
    response.raise_for_status()

    products = response.json()["data"]

    # If there are no products on the current page, break the loop
    if not products:
        break

    # Append the products to the all_products list
    all_products.extend(products)

    # Increment the page number for the next iteration
    page += 1

In [15]:
# Convert the collected products to a DataFrame
# (all_products holds the items accumulated page by page from the "data" field
# of the product list responses; presumably one dict per product — TODO confirm)
df = pd.DataFrame(all_products)

In [16]:
# Normalise category ids to plain integer-looking strings (e.g. "10.0" -> "10").
# The pattern is an explicit, anchored regex: without `regex=True` the result
# depends on the pandas version, and an unescaped "." would match any character
# (turning "10.0" into "").
df["categorySeqFirst"] = (
    df["categorySeqFirst"].astype(str).str.replace(r"\.0$", "", regex=True)
)

In [17]:
# Attach the readable category name by looking up each id in category_lookup
df = df.assign(categorySeqFirstName=df["categorySeqFirst"].map(category_lookup))

In [18]:
# Stamp every row with the fetch date and the brand / parent company names
df = df.assign(date_fetch=today, parent=parent, brand=brand_formal)

In [19]:
# Output

In [20]:
# Local files: write the same table as CSV (no index column) and as an
# indented JSON array of records
df.to_csv(path_or_buf=CSV_OUT, index=False)
df.to_json(path_or_buf=JSON_OUT, orient="records", indent=4)

In [21]:
# Destination bucket and object keys on S3
S3_BUCKET = "stilesdata.com"
S3_CSV_KEY = f"products/data/{brand}.csv"
S3_JSON_KEY = f"products/data/{brand}.json"

# Build the S3 client from credentials held in environment variables
# (a variable that is not set is passed through as None)
s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.environ.get("MY_AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("MY_AWS_SECRET_ACCESS_KEY"),
    aws_session_token=os.environ.get("MY_AWS_SESSION_TOKEN"),
)

# Push the CSV written locally to the bucket
s3_client.upload_file(Filename=str(CSV_OUT), Bucket=S3_BUCKET, Key=S3_CSV_KEY)
print(f"CSV file uploaded to s3://{S3_BUCKET}/{S3_CSV_KEY}")

# Push the JSON written locally to the bucket
s3_client.upload_file(Filename=str(JSON_OUT), Bucket=S3_BUCKET, Key=S3_JSON_KEY)
print(f"JSON file uploaded to s3://{S3_BUCKET}/{S3_JSON_KEY}")

CSV file uploaded to s3://stilesdata.com/products/data/ottogi.csv


JSON file uploaded to s3://stilesdata.com/products/data/ottogi.json
