# Starbucks menus over time
> This notebook collects Starbucks menus over time by reading historical snapshots of them from the Wayback Machine. The files are nested JSON used to feed the company's [menu page](https://www.starbucks.com/menu). They are then parsed, flattened into a tabular structure and concatenated into a large, time-series dataframe that can be used for analysis.

#### Load Python tools and Jupyter config

In [1]:
import json
import requests
import pandas as pd
import jupyter_black
from time import sleep
from random import randint
from tqdm.notebook import tqdm

In [2]:
# Enable jupyter_black and raise pandas' display limits for wide/long frames
jupyter_black.load()
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 1000),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

In [3]:
# Current local date as a compact YYYYMMDD stamp (used to name the dated CSV export)
today = pd.Timestamp.today().strftime("%Y%m%d")

---

## Metadata

#### Headers for request

In [4]:
# Request headers: ask for JSON and present a desktop-browser user agent
_user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)
headers = {"accept": "application/json", "user-agent": _user_agent}

#### JSON endpoint that points to menu snapshot files via the Internet Archive

In [5]:
# CDX search query for captures of the Starbucks menu endpoint, returned as JSON
url = (
    "https://web.archive.org/cdx/search/cdx"
    "?url=https://www.starbucks.com/bff/ordering/menu"
    "&output=json"
)

#### Request the metadata file in JSON format

In [6]:
# Fetch the capture index. Use the prepared request headers, bound the wait
# with a timeout, and fail loudly on an HTTP error instead of trying to parse
# an error page as JSON.
cdx_response = requests.get(url, headers=headers, timeout=60)
cdx_response.raise_for_status()
r = cdx_response.json()

#### Read successful snapshots into a metadata dataframe

In [7]:
# The first row of the response holds the column names; use it as the header,
# drop it from the data, and keep only captures recorded with status 200.
archive_src = (
    pd.DataFrame(r, columns=r[0])
    .drop(index=0)
    .loc[lambda captures: captures["statuscode"] == "200"]
)

In [8]:
# Preview the first rows of the capture metadata
archive_src.head()

Unnamed: 0,urlkey,timestamp,original,mimetype,statuscode,digest,length
1,"com,starbucks)/bff/ordering/menu",20191001092759,https://www.starbucks.com/bff/ordering/menu,application/json,200,PLSKAKMIVDJ37TMIYACESYUZSQWJFISX,33936
2,"com,starbucks)/bff/ordering/menu",20191119201052,https://www.starbucks.com/bff/ordering/menu,application/json,200,AI2ZRVLDD257QBRYS2CDCWNZVMBAPI36,48381
3,"com,starbucks)/bff/ordering/menu",20191222092754,https://www.starbucks.com/bff/ordering/menu,application/json,200,OYOPJS6VIBB6CR2DGKAPX7USFGTMDOL6,41225
4,"com,starbucks)/bff/ordering/menu",20200220090433,https://www.starbucks.com/bff/ordering/menu,application/json,200,PWEE2752UGGWB3OCXUO2RW4TQMUQW3EP,48545
5,"com,starbucks)/bff/ordering/menu",20200312004801,https://www.starbucks.com/bff/ordering/menu,application/json,200,3VI2GAROJ6YUO7BWLXVXNGF25MDH76J6,41321


---

## Process

#### Clean up dates and url needed to request individual snapshots over time

In [9]:
# The capture timestamps are compact "YYYYMMDDhhmmss" strings. Parse them once
# with an explicit format (no reliance on format inference), then derive both
# the full datetime and the calendar date from that single result.
captured_at = pd.to_datetime(
    archive_src["timestamp"].astype(str), format="%Y%m%d%H%M%S"
)
archive_src["datetime"] = captured_at
archive_src["date"] = captured_at.dt.date

In [10]:
# Snapshot URL = archive prefix + capture timestamp + "if_/" + original URL
capture_stamp = archive_src["timestamp"].astype(str)
original_url = archive_src["original"].astype(str)
archive_src["url"] = (
    "https://web.archive.org/web/" + capture_stamp + "if_/" + original_url
)

In [11]:
# Label every capture with its month and year, formatted as "MM-YYYY"
capture_times = pd.to_datetime(archive_src["timestamp"])
archive_src["month_year"] = capture_times.dt.strftime("%m-%Y")

#### Clean up the dataframe, limiting it to just one snapshot per day (the latest capture of each date)

In [12]:
# Columns that are no longer needed once the snapshot URL has been built
unused_columns = ["mimetype", "urlkey", "digest", "statuscode", "original"]

# Order captures chronologically, keep the last capture of each calendar date,
# then discard the unused metadata and renumber the rows.
archive_df = (
    archive_src.sort_values(by=["datetime", "date"], ascending=True)
    .drop_duplicates(subset="date", keep="last")
    .drop(columns=unused_columns)
    .reset_index(drop=True)
)

---

## Fetch

#### Function to flatten JSON we get from the archived Starbucks menus

In [13]:
def flatten_structure(entries, path=None, level=0):
    """Flatten a nested menu tree into one flat record per product.

    Each entry is a dict that may carry a "name", a list of "products" and a
    list of nested "children" entries. The names met on the way down from the
    root become the category columns of every product found at that point.

    Parameters
    ----------
    entries : list of dict or None
        Menu entries at the current depth. ``None`` is treated as empty.
    path : list of str, optional
        Names of the ancestor entries. Defaults to an empty path at the root.
    level : int, optional
        Current nesting depth. It is passed along on recursion but does not
        influence the output; kept for backward compatibility.

    Returns
    -------
    list of dict
        One dict per product with the keys main_category, sub_category,
        category_detail, product_name, product_form, display_order,
        product_number, product_type, availability, path and sizes.
    """
    parent_path = [] if path is None else path
    flattened_data = []

    for entry in entries or []:
        current_path = parent_path + [entry.get("name", "")]
        # Pad with blanks so all three category slots exist for shallow paths;
        # levels deeper than three are not given columns of their own.
        main_category, sub_category, category_detail = (current_path + ["", "", ""])[:3]

        # `or []` also covers keys that are present but null in the JSON
        for product in entry.get("products") or []:
            flattened_data.append(
                {
                    "main_category": main_category,
                    "sub_category": sub_category,
                    "category_detail": category_detail,
                    "product_name": product.get("name"),
                    "product_form": product.get("formCode"),
                    "display_order": product.get("displayOrder"),
                    "product_number": product.get("productNumber"),
                    "product_type": product.get("productType"),
                    "availability": product.get("availability"),
                    "path": product.get("uri"),
                    "sizes": [
                        size.get("sizeCode") for size in product.get("sizes") or []
                    ],
                }
            )

        # Recurse into nested categories; a missing or null "children" yields nothing
        flattened_data.extend(
            flatten_structure(
                entry.get("children"), path=current_path, level=level + 1
            )
        )

    return flattened_data

#### Create list of archive urls

In [14]:
# Plain Python list of the snapshot URLs to download
archive_urls = archive_df["url"].tolist()

#### Loop through them, requesting data, storing each result in a list of flattened dataframes

In [15]:
menus = []
# (url, error) pairs for snapshots that could not be fetched or parsed
failed_urls = []

# Distinct loop names keep the notebook-level `url` and `df` from being
# overwritten, so earlier cells can be re-run safely after this one.
for snapshot_url in tqdm(archive_urls):
    try:
        response = requests.get(snapshot_url, headers=headers, timeout=60)
        response.raise_for_status()
        json_data = response.json()["menus"]
    except (requests.RequestException, ValueError, KeyError) as error:
        # One broken snapshot should not abort the whole crawl; record it and move on
        failed_urls.append((snapshot_url, repr(error)))
    else:
        menu_df = pd.DataFrame(flatten_structure(json_data))
        # The fifth "/"-separated URL segment is "<timestamp>if_"
        menu_df["timestamp"] = snapshot_url.split("/")[4].replace("if_", "")
        menus.append(menu_df)
    # Random pause between requests
    sleep(randint(2, 5))

  0%|          | 0/148 [00:00<?, ?it/s]

#### Concatenate into a large dataframe

In [16]:
# Stack all per-snapshot frames into one frame with a fresh 0..n-1 index
historial_df = pd.concat(menus, ignore_index=True)

#### Process dates

In [17]:
# Parse the compact "YYYYMMDDhhmmss" capture timestamp once, with an explicit
# format, and derive both the capture date and the capture year from it.
captured_at = pd.to_datetime(
    historial_df["timestamp"].astype(str), format="%Y%m%d%H%M%S"
)
historial_df["date_captured"] = captured_at.dt.date
historial_df["year_captured"] = captured_at.dt.year

#### Just the columns we need

In [97]:
# Columns carried forward into the analysis and export frame
needed_columns = [
    "product_number",
    "main_category",
    "sub_category",
    "category_detail",
    "product_type",
    "product_name",
    "product_form",
    "date_captured",
    "year_captured",
]
# Independent copy, so later edits do not touch the full historical frame
df = historial_df.loc[:, needed_columns].copy()

---

## Exports

In [44]:
# Preview the first rows of the trimmed menu frame
df.head()

Unnamed: 0,product_number,main_category,sub_category,category_detail,product_name,date_captured,year_captured
0,406,Drinks,Hot Coffees,Americanos,Caffè Americano,2019-10-01,2019
1,2122559,Drinks,Hot Coffees,Americanos,Starbucks® Blonde Caffè Americano,2019-10-01,2019
2,873068625,Drinks,Hot Coffees,Brewed Coffees,Blonde Roast,2019-10-01,2019
3,478,Drinks,Hot Coffees,Brewed Coffees,Caffè Misto,2019-10-01,2019
4,479,Drinks,Hot Coffees,Brewed Coffees,Featured Dark Roast,2019-10-01,2019


In [49]:
# Convert the capture date column to pandas datetimes
captured_on = pd.to_datetime(df["date_captured"])
df["date_captured"] = captured_on

  df["date_captured"] = pd.to_datetime(df["date_captured"])


In [131]:
# Narrower copy of the menu frame without the product number and capture year
slim_columns = [
    "main_category",
    "sub_category",
    "category_detail",
    "product_type",
    "product_name",
    "product_form",
    "date_captured",
]
df_slim = df.loc[:, slim_columns].copy()

In [164]:
# Count non-null product names per capture date, main category and sub-category
group_keys = ["date_captured", "main_category", "sub_category"]
product_counts = df.groupby(group_keys)["product_name"].count()
hierarchical = product_counts.reset_index(name="count")

In [165]:
# Make sure the grouped capture date column is datetime-typed
grouped_dates = pd.to_datetime(hierarchical["date_captured"])
hierarchical["date_captured"] = grouped_dates

In [166]:
# NOTE(review): not referenced by any later cell visible here — confirm it is still needed
dates_to_keep = ["2024-03-01"]

In [185]:
# Keep only the counts from the 2024-03-30 capture and renumber the rows
is_target_capture = hierarchical["date_captured"] == "2024-03-30 00:00:00"
df_observable = hierarchical.loc[is_target_capture].reset_index(drop=True)

In [186]:
# Show the per-sub-category counts for the selected capture
df_observable

Unnamed: 0,date_captured,main_category,sub_category,count
0,2024-03-30,At Home Coffee,VIA® Instant,5
1,2024-03-30,At Home Coffee,Whole Bean,18
2,2024-03-30,Drinks,Cold Coffees,27
3,2024-03-30,Drinks,Cold Drinks,30
4,2024-03-30,Drinks,Frappuccino® Blended Beverages,21
5,2024-03-30,Drinks,Hot Coffees,35
6,2024-03-30,Drinks,Hot Drinks,7
7,2024-03-30,Drinks,Hot Teas,10
8,2024-03-30,Drinks,Iced Teas,18
9,2024-03-30,Drinks,Oleato™,6


In [187]:
def create_hierarchical_structure_aggregated(df):
    """Nest aggregated sub-category counts under their main categories.

    Expects a frame with "main_category", "sub_category" and "count" columns.
    Returns a dict shaped as {"name": "root", "children": [...]}: one child
    per main category, each holding {"name": sub_category, "value": count}
    leaves in the order the rows appear within that category.
    """
    category_nodes = []
    for category_name, category_rows in df.groupby("main_category"):
        # Leaves carry the aggregate value directly; there is no deeper level
        leaves = [
            {"name": sub_name, "value": sub_count}
            for sub_name, sub_count in zip(
                category_rows["sub_category"], category_rows["count"]
            )
        ]
        category_nodes.append({"name": category_name, "children": leaves})
    return {"name": "root", "children": category_nodes}

In [188]:
# Build the nested structure with the function this notebook actually defines
# (the previous name, `create_hierarchical_structure`, does not exist and
# raises NameError on a fresh kernel).
hierarchical_data = create_hierarchical_structure_aggregated(df_observable)

# Convert to JSON
hierarchical_json = json.dumps(hierarchical_data, indent=4)

# Save to a JSON file
json_filename = "data/processed/menu_observable_structure.json"
with open(json_filename, "w") as f:
    f.write(hierarchical_json)

#### JSON format

In [20]:
# Write the menu frame as an indented JSON array with one object per row
json_path = "data/processed/starbucks_historical_menus.json"
df.to_json(json_path, orient="records", indent=4)

#### CSV format

In [None]:
# Write the same frame twice: a "latest" file and a copy stamped with today's date
for csv_path in (
    "data/processed/starbucks_historical_menus.csv",
    f"data/processed/starbucks_historical_menus_{today}.csv",
):
    df.to_csv(csv_path, index=False)

In [1]:
# Export this notebook as a plain Python script (01-fetch-menu-archive.py), without cell prompts
!jupyter nbconvert --to script --no-prompt --output 01-fetch-menu-archive 01-fetch-menu-archive.ipynb

[NbConvertApp] Converting notebook 01-fetch-menu-archive.ipynb to script
[NbConvertApp] Writing 6994 bytes to 01-fetch-menu-archive.py
