# WK Kellogg Co. products. They're gr-r-reat!

#### Load Python tools and Jupyter config

In [1]:
import re
import requests
import pandas as pd
import jupyter_black
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange
import altair as alt
import altair_stiles as altstiles

In [2]:
# Load the jupyter_black notebook extension.
jupyter_black.load()

# Raise pandas' display limits so wide or long frames are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 100)

# Register the "stiles" Altair theme and switch it on. The enable() call stays
# the last expression so the cell still echoes its return value.
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('stiles')

In [3]:
# Slug that prefixes every exported file name in the export cells.
brand = "kellogs"
# Bar fill colors: `color` is used by the calories chart, `color2` by the sugar chart.
color = "#c91245"
color2 = "#003da5"

---

## Scrape Kellogg's products

#### Headers for the requests

In [4]:
# Request headers sent with every HTTP call in this notebook; only the
# user-agent string is set.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

---

## Brands

#### Get details about the company's major brand verticals (Frosted Flakes, Special K, Corn Flakes, etc.)

In [5]:
# Fetch and parse the "Our Brands" landing page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page (which would leave every later cell with empty data).
brands_response = requests.get(
    "https://www.wkkellogg.com/en-us/our-foods/our-brands.html",
    headers=headers,
    timeout=30,
)
brands_response.raise_for_status()
brands_content = BeautifulSoup(brands_response.text, "html.parser")

#### Make a list of the company's brand URLs

In [6]:
# Build the list of brand landing-page URLs from the "grid-item" tiles.
brand_urls = []

for b in brands_content.find_all("div", class_="grid-item"):
    link = b.find("a", href=True)
    if link is None:
        # Tile has no anchor carrying an href, so there is no URL to follow.
        continue
    url = f'https://www.wkkellogg.com{link["href"]}'
    # Drop an embedded Canadian-site origin if the href was absolute, leaving
    # the path attached to the .com host prefixed above.
    brand_urls.append(url.replace("http://www.wkkellogg.ca", ""))

---

## Product list

#### Loop through the brands to collect product detail page urls and other info

In [7]:
# For every brand page, record one dictionary per listed product. The brand
# page's headline (first <h2>) and intro paragraph are shared by all products
# found on that page.
prod_list = []

for u in tqdm(brand_urls):
    # The timeout prevents a single stalled request from hanging the loop.
    products_response = requests.get(u, headers=headers, timeout=30)
    products_content = BeautifulSoup(products_response.text, "html.parser")

    # Brand headline: text of the first <h2>, or "" when the page has none.
    product_headline_tag = products_content.find("h2")
    product_tagline = product_headline_tag.text if product_headline_tag else ""

    # Brand intro: first <p> inside the intro block, or None when either is missing.
    product_div = products_content.find(
        "div", class_="titlecontentblock__content__body content_intro_text"
    )
    product_desc_tag = product_div.find("p") if product_div else None
    product_desc = product_desc_tag.text if product_desc_tag else None

    for prod in products_content.find_all("div", class_="products-list-product"):
        product_link = prod.find("a", class_="track", href=True)
        if product_link is None:
            # Without a detail-page link the product cannot be scraped later.
            continue

        # Image src values get an explicit "http:" scheme prefixed; a tile
        # without an image is kept, with the image recorded as None.
        product_img = prod.find("img", src=True)
        product_image = f'http:{product_img["src"]}' if product_img else None

        prod_dict = {
            "product_name": product_link.text.strip(),
            "product_tagline": product_tagline,
            "product_url": product_link["href"],
            "product_image": product_image,
            "product_desc": product_desc,
        }
        prod_list.append(prod_dict)

  0%|          | 0/20 [00:00<?, ?it/s]

#### Into a dataframe

In [8]:
# One row per scraped product tile; columns come from the prod_dict keys.
prod_list_df = pd.DataFrame(prod_list)

---

## Product details

#### Loop through product page urls

In [9]:
# Plain Python list of every product detail-page URL to visit.
prod_pages_urls = prod_list_df["product_url"].tolist()

#### For each url, capture product name, tagline, ingredients, nutritional info and related info

In [10]:
# Visit every product detail page and collect its descriptive fields plus four
# nutrition values read from the linked SmartLabel page. A product is skipped
# (not recorded) when it lacks the second "cta" link, a SmartLabel link, or a
# nutrition block with at least four values.
product_page_list = []

for prod_url in tqdm(prod_pages_urls):
    # Timeouts keep one stalled request from hanging the whole scrape.
    prod_page_response = requests.get(prod_url, headers=headers, timeout=30)
    prod_page_content = BeautifulSoup(prod_page_response.text, "html.parser")

    details = prod_page_content.find("div", class_="product-detail-overview")
    try:
        # The second "cta" link in the overview is stored as the product site.
        product_site = details.find_all("a", class_="cta")[1]["href"]
    except (AttributeError, IndexError, KeyError):
        # No overview block, fewer than two "cta" links, or a link without href.
        continue

    product_description = (
        details.find("p", itemprop="description").text.strip().replace("\n", "")
    )
    product_category = details.find("a", class_="product-category-back").text
    product_headline = details.find("h1", itemprop="name").text
    product_ingredients = (
        prod_page_content.find("div", class_="nutrition__body__ingredients")
        .text.strip()
        .replace("\n", "")
        .replace("IngredientsIngredients: ", "")
    )

    smart_label_link = prod_page_content.find(
        "a", class_="cta cta--light cta--outlined smartlabel-cta"
    )
    try:
        product_smart_label = smart_label_link["href"]
    except (TypeError, KeyError):
        # No SmartLabel link on the page, or the link has no href.
        continue
    if not product_smart_label:
        # Empty href: there is nothing to fetch nutrition data from.
        continue

    # Find the product reviews section and extract the data-bv-product-id attribute
    product_id = None
    product_reviews_div = prod_page_content.find("div", class_="productreviews")
    if product_reviews_div:
        product_id_div = product_reviews_div.find("div", {"data-bv-product-id": True})
        if product_id_div:
            product_id = product_id_div["data-bv-product-id"].replace("Product_", "")

    # Extract nutritional information from the SmartLabel page
    smart_label_response = requests.get(
        product_smart_label, headers=headers, timeout=30
    )
    smart_label_content = BeautifulSoup(smart_label_response.text, "html.parser")

    nutrition_div = smart_label_content.find("div", id="nutrition")
    if nutrition_div is None:
        continue

    nutrition_values = nutrition_div.find_all("span", class_="header2")
    if len(nutrition_values) < 4:
        # Incomplete nutrition block: skip instead of raising IndexError.
        continue

    # The first four "header2" spans are read, in order, as calories,
    # saturated fat, sodium and sugar.
    calories, sat_fat, sodium, sugar = (
        value.text.strip() for value in nutrition_values[:4]
    )

    product_page_dict = {
        "product_id": product_id,
        "product_category": product_category,
        "product_headline": product_headline,
        "product_site": product_site,
        "product_description": product_description,
        "product_ingredients": product_ingredients,
        "product_smart_label": product_smart_label,
        "calories": calories,
        "saturated_fat": sat_fat,
        "sodium": sodium,
        "sugar": sugar,
    }

    product_page_list.append(product_page_dict)

  0%|          | 0/98 [00:00<?, ?it/s]

#### Put list of product dictionaries into a tidy dataframe

In [11]:
# One row per product that yielded nutrition data; columns come from the
# product_page_dict keys.
products_df = pd.DataFrame(product_page_list)

In [12]:
# Working copy of the scraped products with identifiers first, then the
# nutrition values, then the longer text and link fields.
ordered_columns = [
    "product_id",
    "product_category",
    "product_headline",
    "calories",
    "saturated_fat",
    "sodium",
    "sugar",
    "product_site",
    "product_description",
    "product_ingredients",
    "product_smart_label",
]
df = products_df[ordered_columns].copy()

#### Clean up values

In [13]:
# Calories were scraped as text; convert the column to integers.
df["calories"] = df["calories"].astype(int)

In [14]:
# Split "<amount> <unit>" once: keep the unit text in its own column and store
# the amount as an integer.
sodium_parts = df["sodium"].str.split(" ", expand=True)
df["sodium_unit"] = sodium_parts[1]
df["sodium"] = sodium_parts[0].astype(int)

In [15]:
# Split "<amount> <unit>" once: keep the unit text in its own column and store
# the amount as an integer.
sugar_parts = df["sugar"].str.split(" ", expand=True)
df["sugar_unit"] = sugar_parts[1]
df["sugar"] = sugar_parts[0].astype(int)

In [16]:
# Split "<amount> <unit>" once: keep the unit text in its own column and store
# the amount as a float (saturated fat can be fractional).
saturated_fat_parts = df["saturated_fat"].str.split(" ", expand=True)
df["saturated_fat_unit"] = saturated_fat_parts[1]
df["saturated_fat"] = saturated_fat_parts[0].astype(float)

#### The result

In [17]:
# Preview the first two cleaned product rows.
df.head(2)

Unnamed: 0,product_id,product_category,product_headline,calories,saturated_fat,sodium,sugar,product_site,product_description,product_ingredients,product_smart_label,sodium_unit,sugar_unit,saturated_fat_unit
0,4508509,Kellogg's Frosted Flakes®,Kellogg's Frosted Flakes® Chocolate Milkshake,150,1.0,210,13,https://www.frostedflakes.com/en_US/home.html,Rich chocolate with sweet vanilla flavor generously covers each flake and amps up the fun by giv...,"Milled corn, sugar, cocoa processed with alkali, hydrogenated vegetable oil (coconut, soybean an...",https://smartlabel.kelloggs.com/Product/Index/00038000179969,mg,g,g
1,683835,Kellogg's Frosted Flakes®,Kellogg's Frosted Flakes® Strawberry Milkshake,140,0.0,190,11,https://www.frostedflakes.com/en_US/home.html,"With ripe, juicy strawberry flavor coupled with rich, creamy notes, this cereal turns the milk i...","Milled corn, sugar, malt flavor, contains 2% or less of salt, vegetable juice for color, natural...",https://smartlabel.kelloggs.com/Product/Index/00038000273384,mg,g,g


#### Total products

In [18]:
# Number of products that made it through scraping and cleaning.
len(df)

91

---

## Aggregate

#### Sugar, sodium, saturated fat and calories, by brand category

In [19]:
# Average calories, saturated fat, sodium and sugar per brand category,
# rounded to two decimals and ordered from highest to lowest average calories.
nutrition_columns = ["calories", "saturated_fat", "sodium", "sugar"]
category_means = df.groupby("product_category").agg(
    {column: "mean" for column in nutrition_columns}
)
brand_nutrition = (
    category_means.round(2).sort_values("calories", ascending=False).reset_index()
)

#### Sorted by calories, descending

In [20]:
# Display the per-category averages.
brand_nutrition

Unnamed: 0,product_category,calories,saturated_fat,sodium,sugar
0,Mueslix®,250.0,0.0,150.0,17.0
1,Smart Start®,240.0,0.0,260.0,18.0
2,Cracklin’ Oat Bran®,230.0,3.5,65.0,16.0
3,Bear Naked®,212.5,1.83,87.5,9.58
4,Frosted Mini-Wheats®,205.0,0.12,35.0,11.88
5,Kashi®,196.67,0.4,88.0,9.47
6,Kellogg’s Raisin Bran®,182.5,0.0,202.5,16.0
7,Special K®,176.84,0.66,252.11,10.79
8,Insta Bowl,172.5,2.62,187.5,14.5
9,Rice Krispies®,155.0,0.12,146.25,11.0


---

## Charts

In [21]:
# Horizontal bar chart of average calories per brand category, with the value
# printed to the right of each bar.
base = alt.Chart(brand_nutrition).encode(
    x=alt.X("calories:Q", axis=alt.Axis(tickCount=6), title="Calories per serving"),
    y=alt.Y("product_category:O", title="").sort("-x"),
    text="calories",
)
# Parenthesize the layers so width, height and title are set on the combined
# chart rather than on the text layer alone.
chart = (base.mark_bar(color=color) + base.mark_text(align="left", dx=2)).properties(
    width=300, height=400, title="Kellogg's: Average calories by brand category"
)
chart.padding = {"left": -1}
chart

In [22]:
# Horizontal bar chart of average grams of sugar per brand category, with the
# value printed to the right of each bar.
base = alt.Chart(brand_nutrition).encode(
    # The axis shows sugar, so it is labelled as such (it previously read
    # "Calories per serving").
    x=alt.X(
        "sugar:Q", axis=alt.Axis(tickCount=6), title="Grams of sugar per serving"
    ),
    y=alt.Y("product_category:O", title="").sort("-x"),
    text="sugar",
)
# Parenthesize the layers so width, height and title are set on the combined
# chart rather than on the text layer alone.
chart = (base.mark_bar(color=color2) + base.mark_text(align="left", dx=2)).properties(
    width=300, height=400, title="Kellogg's: Average grams of sugar by brand category"
)
chart.padding = {"left": -1}
chart

---

## Exports

#### All products - JSON

In [23]:
# Write every cleaned product as a list of JSON records.
products_json_path = f"data/processed/{brand.lower().replace(' ', '_')}_products.json"
df.to_json(products_json_path, orient="records", indent=4)

#### All products - CSV

In [24]:
# Write every cleaned product to CSV, without the index column.
products_csv_path = f"data/processed/{brand.lower().replace(' ', '_')}_products.csv"
df.to_csv(products_csv_path, index=False)

#### Nutrition by brand - JSON

In [25]:
# Write the per-category nutrition averages as a list of JSON records.
nutrition_json_path = f"data/processed/{brand.lower().replace(' ', '_')}_nutrition.json"
brand_nutrition.to_json(nutrition_json_path, orient="records", indent=4)

#### Nutrition by brand - CSV

In [26]:
# Write the per-category nutrition averages to CSV, without the index column.
nutrition_csv_path = f"data/processed/{brand.lower().replace(' ', '_')}_nutrition.csv"
brand_nutrition.to_csv(nutrition_csv_path, index=False)