# Banchan dishes
> This notebook scrapes banchan dishes from Maangchi's fab [recipes page](https://www.maangchi.com/recipes/banchan). 

---

#### Import Python tools and Jupyter config

In [1]:
import requests
import pandas as pd
import jupyter_black
from bs4 import BeautifulSoup
jupyter_black.load()

---

## Fetch

#### Scrape data

In [2]:
# Base URL for banchan recipes; "{}" is filled in with the page number
base_url = "https://www.maangchi.com/recipes/banchan/page/{}"
page = 1  # Starting page number
banchan_dishes = []


def _text_node(nodes, index):
    """Return the stripped text at nodes[index], or None.

    None is returned when the index is out of range or when the node is a
    tag (e.g. <br>) rather than a text node, so an unexpected card layout
    yields a missing value instead of an exception.
    """
    if index < len(nodes) and isinstance(nodes[index], str):
        return nodes[index].strip()
    return None


while True:
    # Make a request to the current page; the timeout keeps a stalled
    # connection from hanging the notebook indefinitely
    response = requests.get(base_url.format(page), timeout=30)

    # A 404 means we have paged past the last listing page
    if response.status_code == 404:
        break
    # Any other HTTP error should fail loudly instead of being parsed as if
    # it were a listing page (which would silently truncate the results)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    # Find all recipe cards
    recipe_cards = soup.find_all("div", class_="taxonomy-card")

    # If no recipe cards are found, we've reached the last page
    if not recipe_cards:
        break

    # Loop through the recipe cards to extract information
    for card in recipe_cards:
        link_tag = card.find("a")
        post_url = link_tag.get("href") if link_tag else None

        heading_tag = card.find("h3")
        dish_name = heading_tag.get_text() if heading_tag else None

        # The card's <p> is read positionally: node 0 is used as the phonetic
        # name and node 2 as the Korean name. Either may be missing or
        # structured differently, in which case the value is None.
        p_tag = card.find("p")
        p_contents = p_tag.contents if p_tag else []
        phonetic = _text_node(p_contents, 0)
        korean = _text_node(p_contents, 2)

        # Check if the image exists before trying to access its 'src' attribute
        img_tag = card.find("img")
        thumbnail_url = img_tag.get("src") if img_tag else None

        # Add the dish record to the list
        banchan_dishes.append(
            {
                "dish": dish_name,
                "phonetic": phonetic,
                "korean": korean,
                "slug": post_url,
                "thumbnail": thumbnail_url,
            }
        )

    # Move to the next page
    page += 1

In [3]:
# Convert list of dictionaries to DataFrame. The columns are named explicitly
# so the frame keeps its schema even when the scrape returned no dishes;
# otherwise an empty list produces a frame with no columns at all.
df = pd.DataFrame(
    banchan_dishes,
    columns=["dish", "phonetic", "korean", "slug", "thumbnail"],
)

In [4]:
# Remove url prefixes. regex=False makes the match literal on every pandas
# version; when the pattern is treated as a regex, each "." in the URL
# matches any character.
df["slug"] = df["slug"].str.replace(
    "https://www.maangchi.com/recipe/", "", regex=False
)
df["thumbnail"] = df["thumbnail"].str.replace(
    "https://www.maangchi.com/wp-content/uploads/", "", regex=False
)

In [5]:
# Preview the first five rows of the scraped table
df.head(n=5)

Unnamed: 0,dish,phonetic,korean,slug,thumbnail
0,Sautéed cucumbers,Oi-bokkeum,오이볶음,oi-bokkeum,2024/09/oi-bokkeum-150x150.jpg
1,Korean meatballs with sauce,Wanja,완자,wanja,2024/08/wanja-150x150.jpg
2,Shredded cabbage pickles,Yangbaechu-pickle,양배추피클,yangbaechu-pickle,2024/05/ybc-pickle-150x150.jpg
3,Pan-fried seasoned Spanish mackerel,Samchi-yangnyeom-gui,삼치양념구이,samchi-yangnyeom-gui,2024/04/spanishmackerel-150x150.jpg
4,Spicy stir-fried blood sausage,Sundae-bokkeum,순대볶음,sundae-bokkeum,2024/02/sundae-bokkeum-150x150.jpg


---

## Exports

#### CSV

In [6]:
# Write the table to CSV without the positional row index
df.to_csv(
    path_or_buf="data/processed/banchan_dishes.csv",
    index=False,
)

#### JSON

In [7]:
# Write one JSON object per dish, pretty-printed with a 4-space indent.
# `indent` is the formatting option; `index` is a boolean flag that cannot be
# enabled together with orient="records".
df.to_json("data/processed/banchan_dishes.json", orient="records", indent=4)