# Final Project Code

## Outline
1. Getting the best recipe from AllRecipes
2. Getting products from Trader Joe's
3. Getting products from SaveMart
4. Getting products from Safeway
5. Getting products from Target
6. Getting products from Davis Food Co-op

## Getting the best recipe code

note: user has to input desired search url

code function:
1. extracts all recipe links from the search page
2. scrapes title, stars, and number of ratings from each of the links extracted
3. sorts the recipes by number of ratings (descending), then by stars (descending)
4. converts dictionary into data frame for organization

In [None]:
import requests 
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import parse_qs, urlparse

# function that extracts each recipe link from search page (url)
def extractlinks(url):
    """Collect the distinct recipe links found on a search-results page.

    Args:
        url: Address of the search page to download.

    Returns:
        A list of the unique ``href`` values that contain ``/recipe/``.
        The list goes through a set, so its order is not meaningful.
    """
    page = requests.get(url)
    parsed = BeautifulSoup(page.content, 'html.parser')

    # only anchors that actually carry an href are considered
    hrefs = (anchor['href'] for anchor in parsed.find_all('a', href = True))

    # the set drops links that appear more than once on the page
    return list({href for href in hrefs if '/recipe/' in href})

# function that scrapes title, stars, number of ratings    
def scrapedetails(rurl, searchq):
    """Scrape the title, star rating and rating count of one recipe page.

    Args:
        rurl: URL of the recipe page to download.
        searchq: Search phrase; the recipe is kept only when this phrase
            occurs (case-insensitively) in the recipe title.

    Returns:
        A dict with the keys "title", "stars", "ratings" and "url", or
        None when the title does not contain the search phrase. The "url"
        entry is the page address that was passed in, so the best recipe
        can be fetched again after sorting.
    """
    response = requests.get(rurl)
    soup = BeautifulSoup(response.content, 'html.parser')

    #scrape title
    telement = soup.select_one("h1.article-heading")
    title = telement.get_text(strip = True) if telement else "n/a"

    #matching search keywords with title; done first so that recipes
    #that are discarded anyway are not parsed any further
    if searchq.lower() not in title.lower():
        return None

    #scrape stars (0 - 5); 0.0 when the page shows no rating
    selement = soup.select_one("div.mm-recipes-review-bar__rating.mntl-text-block.text-label-300")
    stars = float(selement.get_text(strip = True)) if selement else 0.0

    #scrape number of ratings; 0 when the page shows no count
    relement = soup.select_one("div.mm-recipes-review-bar__rating-count.mntl-text-block.text-label-300")
    if relement:
        #the text is stripped of surrounding parentheses and thousands separators
        ratingtext = relement.get_text(strip = True)
        ratingnum = int(ratingtext.strip("()").replace(",", ""))
    else:
        ratingnum = 0

    #dictionary for details scraped
    return {
        "title": title,
        "stars": stars,
        "ratings": ratingnum,
        "url": rurl
    }
    
# function that fetches recipe from search page, then uses scrapedetails() to scrape specifics
def scrapeall(url, searchq):
    """Scrape every recipe linked from a search page.

    Args:
        url: Address of the search page.
        searchq: Search phrase forwarded to scrapedetails() for title matching.

    Returns:
        A list with the detail dictionaries of all recipes that
        scrapedetails() did not reject (rejected recipes come back as None
        and are left out).
    """
    scraped = (scrapedetails(link, searchq) for link in extractlinks(url))
    return [details for details in scraped if details]

# sort function that sorts by descending number of ratings first, then descending number of stars
def sort(recipes):
    """Return the recipes ordered from most to least rated.

    Recipes with the same number of ratings are ordered by stars, highest
    first. The input list is left untouched; a new list is returned.
    """
    ranked = list(recipes)
    ranked.sort(key = lambda recipe: (recipe['ratings'], recipe['stars']), reverse = True)
    return ranked

# extract search query from url
def extractsearchq(url):
    """Return the value of the ``q`` parameter of a search URL.

    parse_qs() already turns ``+`` into a space and decodes percent
    escapes, so the value is returned as is. Replacing ``+`` again would
    wrongly turn a literal plus sign (sent as ``%2B``) into a space.

    Args:
        url: The search URL, e.g. ``https://host/search?q=chocolate+chip``.

    Returns:
        The decoded search phrase, or "" when the URL has no ``q`` parameter.
    """
    query = parse_qs(urlparse(url).query)
    return query.get("q", [""])[0]

# main function
def main():
    """Ask for a search URL, rank the recipes it lists and return the best one.

    The ranked recipes are printed as a table with a leading "rank" column.

    Returns:
        The "url" entry of the top-ranked recipe.

    Raises:
        ValueError: when no recipe on the search page matches the query.
        KeyError: when the scraped records carry no "url" entry.
    """
    # surrounding whitespace from copy/paste would end up in the request
    url = input("paste search url here:").strip()
    # specifically https://www.allrecipes.com/search?q=chocolate+chip+cookies for our project
    searchq = extractsearchq(url)
    recipes = scrapeall(url, searchq)
    rsorted = sort(recipes)

    # an empty result would otherwise fail further down with an opaque KeyError
    if not rsorted:
        raise ValueError("no recipes matching %r were found at %s" % (searchq, url))

    #convert to data frame for organization purposes
    df = pd.DataFrame(rsorted)
    df.insert(0, "rank", range(1, 1 + len(df)))
    print(df.to_string(index = False))
    return df["url"].iloc[0] # return url of the top recipe

# call main function and keep its return value (the url of the top recipe)
# in top_recipe, which the ingredient-scraping cell below reads
if __name__ == '__main__':
    top_recipe = main()

In [None]:
# extracts the ingredients given an AllRecipes recipe link
def get_ingredients(url:str):
    """Scrape the structured ingredient list of a recipe page.

    Args:
        url: Address of the recipe page.

    Returns:
        A DataFrame with the columns 'quantity', 'unit' and
        'ingredient name', one row per ingredient. The frame is empty
        (same columns) when the page has no structured-ingredients block,
        instead of failing with an AttributeError on the missing element.
    """
    columns = ['quantity', 'unit', 'ingredient name']

    # Send a GET request to fetch the page content
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the <div> with the specific id
    container = soup.find('div', id='mm-recipes-structured-ingredients_1-0')
    if container is None:
        # nothing to parse on this page
        return pd.DataFrame(columns=columns)

    rows = []
    # walk <ul> -> <li> -> <p>, as the ingredient markup is nested that way
    for ul in container.find_all('ul'):
        for li in ul.find_all('li'):
            for p in li.find_all('p'):
                spans = p.find_all('span')
                # We expect 3 spans: quantity, unit, and ingredient name;
                # paragraphs with any other number of spans are skipped
                if len(spans) == 3:
                    rows.append([span.get_text(strip=True) for span in spans])

    # Create a pandas DataFrame from the extracted data
    return pd.DataFrame(rows, columns=columns)

In [None]:
# scrape the ingredient table of the recipe url stored in top_recipe
ingredients_df = get_ingredients(top_recipe)

In [1]:
import requests
import lxml.html as lx
import pandas as pd
import time
import re
import numpy as np
import matplotlib.pyplot as plt

## Initialize list of ingredients

In [None]:
# Every name must be a key of the lookup tables in
# check_sufficient_quantity() and keyword_and_qty(); those tables list
# "dark brown sugar", so plain "brown sugar" would raise a KeyError there.
ingredients_list = ["unsalted butter","all-purpose flour","white sugar",
                    "vanilla extract","dark brown sugar",
                    "large eggs","baking soda","semisweet chocolate chips",
                    "chopped walnuts"]

## Initialize dictionary to store results of each search query

In [3]:
# initialize dictionaries for dataframe results for each store
def initialize_ingredients_dict():
    """Build a fresh mapping with one key per entry of ingredients_list.

    Every value starts out as None.
    """
    return dict.fromkeys(ingredients_list)

# one independent ingredient -> result mapping per store
# (tjs: Trader Joe's, svmrt: SaveMart, sfwy: Safeway)
tjs, svmrt, sfwy = (initialize_ingredients_dict() for _ in range(3))

## Function to filter for sufficient quantity and right product and sort by price

In [4]:
# Function to normalize text: lowercase it, collapse repeated whitespace, and remove punctuation
def preprocess(text:str):
    """Normalize a short text such as a unit of measurement.

    The text is lowercased, punctuation is removed, runs of whitespace are
    collapsed to a single space and surrounding whitespace is stripped, so
    that e.g. " Fl. Oz. " and "fl oz" compare equal.

    Args:
        text: The text to normalize.

    Returns:
        The normalized string, or None when text is not a string.
    """
    if not isinstance(text,str):
        return None
    # punctuation goes first: removing it can leave two spaces side by side
    text = re.sub(r'[^\w\s]', '', text.lower())
    # collapse whitespace and drop it at both ends
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Define the function to filter rows based on keywords
def filter_rows_by_keywords(df:pd.DataFrame,keywords:list,exclude_keywords:list,must_have:list,min_matches=2):
    """Keep the rows whose "Name" looks like the wanted product.

    Matching is a case-insensitive substring test on the "Name" column.
    A row is kept when its name

    * contains at least ``min_matches`` of the nice-to-have ``keywords``,
    * contains none of the ``exclude_keywords``, and
    * contains every one of the ``must_have`` keywords.

    Rows with a missing or non-string name are treated as an empty name
    instead of raising a TypeError.

    Args:
        df: Product table with a "Name" column. It is not modified.
        keywords: Nice-to-have keywords.
        exclude_keywords: Keywords that disqualify a product (may be empty).
        must_have: Keywords that all have to be present.
        min_matches: Minimum number of nice-to-have keywords required.

    Returns:
        A new DataFrame with the matching rows; its "Name" column is
        lowercased.
    """
    wanted = [word.lower() for word in keywords]
    banned = [word.lower() for word in (exclude_keywords or [])]
    required = [word.lower() for word in must_have]

    def is_match(name:str) -> bool:
        if any(word in name for word in banned):
            return False
        if not all(word in name for word in required):
            return False
        return sum(word in name for word in wanted) >= min_matches

    # work on a copy so the caller's table keeps its original names
    result = df.copy()
    result["Name"] = result["Name"].str.lower() # case insensitive matching

    # missing names become "" for the test only; the column keeps them as is
    searchable = result["Name"].fillna("")
    mask = searchable.map(is_match).astype(bool)
    return result[mask]

# Function to check if a product has enough quantity
# Minimum package size accepted for each ingredient, keyed by the
# normalized unit of measurement. Built once instead of on every row.
_MIN_QUANTITIES = {
    "unsalted butter":{"ea":2,"sticks":2,"lb":.5,"oz":8},
    "white sugar":{"oz":7,"lb":.44},
    "all-purpose flour":{"lb":3*0.275578,"oz":3*4.40925},
    "vanilla extract":{"fl oz":.2,'oz':.2},
    "dark brown sugar":{"oz":7,"lb":.44},
    "large eggs":{"doz":2/12,"ct":2,"count":2,"ea":2},
    "baking soda":{"lb":.0125,"oz":.2},
    "semisweet chocolate chips":{"oz":12},
    "chopped walnuts":{"oz":4.4,"lb":.275}
}

# Function to check if a product has enough quantity
def check_sufficient_quantity(ingredient:str,row):
    """Tell whether a product row holds enough of an ingredient.

    Args:
        ingredient: Ingredient name; must be a key of the quantity table.
        row: Mapping-like product row with the entries 'Unit' and 'Quantity'.

    Returns:
        True when the row's unit is one of the units listed for the
        ingredient and its quantity reaches the listed minimum. False when
        the unit is missing or not listed, or when the quantity is missing
        or not numeric.

    Raises:
        KeyError: when no minimum quantities are defined for ingredient.
    """
    # Check if product exists in the quantity table
    if ingredient not in _MIN_QUANTITIES:
        raise KeyError("no minimum quantities defined for %r" % (ingredient,))
    required = _MIN_QUANTITIES[ingredient]

    unit = preprocess(row['Unit']) # standardize unit of measurement syntax
    if unit not in required:
        return False

    # quantities may arrive as None or text; those cannot satisfy the minimum
    try:
        quantity = float(row['Quantity'])
    except (TypeError, ValueError):
        return False

    # a NaN quantity compares False here
    return bool(quantity >= required[unit])

# Combined function to filter based on acceptable, must have, and unacceptable keywords
def keyword_and_qty(df:pd.DataFrame,ingredient:str):
    """Filter a product table down to usable matches for one ingredient.

    The table is first filtered by name with filter_rows_by_keywords(),
    using the three keyword tables below, and then by package size with
    check_sufficient_quantity().

    Args:
        df: Product table passed on to filter_rows_by_keywords().
        ingredient: Ingredient name; must be a key of the tables below.

    Returns:
        The rows that pass both filters. When the name filter leaves
        nothing, that empty frame is returned directly.
    """
    # nice-to-have words (they overlap slightly with the required words)
    nice_to_have = {
        "unsalted butter":["unsalted","butter","quarter","stick"],
        "white sugar":["granulated","white","cane","pure"],
        "all-purpose flour":["purpose","flour","all"],
        "vanilla extract":["vanilla","extract","pure"],
        "dark brown sugar":["brown","sugar","pure"],
        "large eggs":["large","egg"],
        "baking soda":["baking","soda"],
        "semisweet chocolate chips":["chocolate",'chip','semi','sweet'],
        "chopped walnuts":["walnut","chopped"]
    }
    # words that all have to appear in the product name
    required_words = {
        "unsalted butter":["unsalted","butter"],
        "white sugar":["sugar"],
        "all-purpose flour":["purpose","flour","all"],
        "vanilla extract":["vanilla","extract"],
        "dark brown sugar":["brown","sugar","dark"],
        "large eggs":["large","egg"],
        "baking soda":["baking","soda"],
        "semisweet chocolate chips":["chocolate",'chip','semi','sweet'],
        "chopped walnuts":["walnut"]
    }
    # words that rule a product out
    rejected_words = {
        "unsalted butter":["peanut","nut","almond"],
        "white sugar":["chicken","turbinado","coconut","gum","brown","drink",'powdered'],
        "all-purpose flour":["almond",'coconut'],
        "vanilla extract":["almond","mint","paste","bean"],
        "dark brown sugar":["turbinado","coconut",'oatmeal','sauce'],
        "large eggs":["hard","boiled","tuna","chicken","beef"],
        "baking soda":["toothpaste","litter"],
        "semisweet chocolate chips":['white'],
        "chopped walnuts":["almond","pecan","peanut","butter"]
    }

    candidates = filter_rows_by_keywords(
        df,
        nice_to_have[ingredient],
        rejected_words[ingredient],
        required_words[ingredient],
    )
    # nothing matched by name: hand the empty frame back unchanged
    if candidates.empty:
        return candidates

    # row-wise check for a sufficient package size
    big_enough = candidates.apply(lambda row: check_sufficient_quantity(ingredient,row),axis=1)
    return candidates[big_enough]

In [None]:
# functions to filter the dataframe based on ingredient
# mainly checks for sufficient quantity
def fil_res(df:pd.DataFrame,ingredient):
    """Pick the cheapest acceptable product for an ingredient.

    Args:
        df: Product table with at least the columns used by
            keyword_and_qty() plus "Price".
        ingredient: Ingredient name forwarded to keyword_and_qty().

    Returns:
        A DataFrame holding the single lowest-priced row that passed the
        filters, or an empty DataFrame when nothing passed.
    """
    qualifying = keyword_and_qty(df,ingredient)
    cheapest_first = qualifying.sort_values(by="Price",ascending=True)
    return cheapest_first.iloc[:1] # get minimum

## Function to sum prices together

In [None]:
# Function to sum the "Price" column if all dataframes are populated
def sum_prices_if_not_empty(df_dict):
    """Add up the "Price" columns of all results, if every result exists.

    Args:
        df_dict: Mapping whose values are DataFrames with a "Price" column.
            A value may also still be None, i.e. not populated yet.

    Returns:
        The sum over all "Price" columns, or 0 as soon as one value is
        None or an empty DataFrame (an incomplete basket has no total).
    """
    frames = list(df_dict.values())

    # a missing or empty result means the basket is incomplete
    if any(frame is None or frame.empty for frame in frames):
        return 0

    return sum(frame['Price'].sum() for frame in frames)

## Trader Joe's
Getting products from Trader Joe's

In [None]:
# function creates a dataframe w/ all the product information
# based on a search
# arguments: the ingredient name
def tj_info(ingredient:str):
    """Search Trader Joe's for an ingredient and return the best product.

    All result pages of the search are downloaded (the relevant butter
    products only showed up on later pages), the products are collected
    in a DataFrame with the columns "Name", "Price", "Quantity" and "Unit",
    and that table is reduced with fil_res().

    Args:
        ingredient: The ingredient name; used as search text and as the
            key for the filters in fil_res().

    Returns:
        Whatever fil_res() returns for the collected products.

    Raises:
        requests.HTTPError: when a page request returns an error status.
    """
    url = "https://www.traderjoes.com/api/graphql"
    # GraphQL request body; the query text is kept exactly as copied from
    # the site and only split into adjacent literals. Only "currentPage"
    # changes between pages, so the body is built once.
    query = {"operationName":"SearchProducts",
             "variables":{"storeCode":"182","availability":"1","published":"1","search":ingredient,"currentPage":1,"pageSize":15},
             "query":(
                 "query SearchProducts($search: String, $pageSize: Int, $currentPage: Int, $storeCode: String = \"182\", $availability: String = \"1\", $published: String = \"1\") {\n  products(\n    search: $search\n    filter: {store_code: {eq: $storeCode}, published: {eq: $published}, availability: {match: $availability}}\n    pageSize: $pageSize\n    currentPage: $currentPage\n  ) {\n    items {\n      category_hierarchy {\n        id\n        url_key\n        description\n        name\n        position\n        level\n        created_at\n        updated_at\n        product_count\n        __typename\n      }\n      item_story_marketing\n      product_label\n      fun_tags\n      primary_image\n      primary_image_meta {\n        url\n        metadata\n        __typename\n      }\n      other_images\n      other_images_meta {\n        url\n        metadata\n        __typename\n      }\n      context_image\n      context_image_meta {\n        url\n        metadata\n        __typename\n      }\n      published\n      sku\n      url_key\n      name\n      item_description\n      item_title\n      item_characteristics\n      item_story_qil\n      use_and_demo\n      sales_size\n      sales_uom_code\n      sales_uom_description\n      country_of_origin\n      availability\n      new_product\n      promotion\n      price_range {\n        minimum_price {\n          final_price {\n            currency\n            value\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      retail_price\n      nutrition {\n        display_sequence\n        panel_id\n        panel_title\n        serving_size\n        calories_per_serving\n        servings_per_container\n        details {\n          display_seq\n          nutritional_item\n          amount\n          percent_dv\n          __typename\n        }\n        __typename\n      }\n      ingredients {\n        display_sequence\n        ingredient\n        __typename\n      "
                 "}\n      allergens {\n        display_sequence\n        ingredient\n        __typename\n      }\n      created_at\n      first_published_date\n      last_published_date\n      updated_at\n      related_products {\n        sku\n        item_title\n        primary_image\n        primary_image_meta {\n          url\n          metadata\n          __typename\n        }\n        price_range {\n          minimum_price {\n            final_price {\n              currency\n              value\n              __typename\n            }\n            __typename\n          }\n          __typename\n        }\n        retail_price\n        sales_size\n        sales_uom_description\n        category_hierarchy {\n          id\n          name\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    total_count\n    page_info {\n      current_page\n      page_size\n      total_pages\n      __typename\n    }\n    __typename\n  }\n}\n"
             )}
    i = 1 # counting parameter (page number)
    # initialize list to store all results
    results = []
    # run through the pages of the search results and save results to the list
    while True:
        query["variables"]["currentPage"] = i
        time.sleep(.5) # try not to get banned
        # retrieve website information; the timeout keeps a stalled
        # connection from blocking the notebook forever
        response = requests.post(url,json=query,timeout=30)
        # check for error
        response.raise_for_status()
        res = response.json() # convert to json
        # if no products are returned
        # escape the loop
        if not res["data"]["products"]["items"]:
            break
        results.append(res)
        i += 1 # increment counting parameter
        print(i) # diagnostic statement
    # create list to store product information
    prod = [[
        subitem["item_title"],
        subitem["retail_price"],
        subitem["sales_size"],
        subitem["sales_uom_description"]
        ] for item in results for subitem in item["data"]["products"]["items"]]
    # organize results into dataframe
    df = pd.DataFrame(data=prod,columns=[
        "Name","Price","Quantity","Unit"
    ])
    # convert columns to numeric, as svmrt_info() does; a missing or
    # malformed price/size becomes NaN instead of aborting the whole search
    df["Price"] = pd.to_numeric(df["Price"], errors='coerce')
    df["Quantity"] = pd.to_numeric(df["Quantity"], errors='coerce')
    df_fil = fil_res(df,ingredient)
    return df_fil

In [None]:
# get search results from Trader Joe's
# every key of tjs is searched and its value replaced by the tj_info() result
for ingredient in tjs.keys():
    print(ingredient) # diagnostic statement
    ing_df = tj_info(ingredient) # get the search results from Trader Joe's
    tjs[ingredient] = ing_df # update dictionary

In [None]:
# print results: the value stored in tjs for each ingredient
for ingredient in tjs:
    print(tjs[ingredient])

In [None]:
# the total is 0 when any ingredient has an empty result
sum_prices_if_not_empty(tjs) # calculate total price

## SaveMart
Getting products from SaveMart

In [None]:
# function to retrieve the quantity and unit from each product name
def extract_quantity_and_unit(product_name):
    """Find the first "<number> <unit>" size in a product name.

    The name is lowercased before matching. Only the units listed in the
    pattern are recognized, and the unit has to be followed by whitespace
    or the end of the name.

    Args:
        product_name: The product name to search.

    Returns:
        A (quantity, unit) tuple of strings, e.g. ("16", "oz"), or
        (None, None) when no size is found.
    """
    # number (optionally with decimals), optional spaces, then a known unit
    size_pattern = r"(\d+(\.\d+)?)\s*(fl\.?\s?oz\.?|oz\.?|ea|lbs?\.*|g|ml|kg|sticks|box|pound|gr|cups|tbsp|bottle|jar|ct|pk|count)(?=\s|$)"

    found = re.search(size_pattern, product_name.lower())
    if found is None:
        return None, None

    # group 2 is only the decimal part of the number and is not needed
    number, _, measure = found.groups()
    return number, measure

# get product information from each search
# and organize into dataframe
# query: copied from the developer tools tab
# Pull the price out of SaveMart's HTML-formatted "basePrice" snippet.
# Returns the text after "$" for dollar prices, cents converted to a
# dollar amount for prices such as "99¢", and None when nothing usable
# is found.
# NOTE(review): the snippet format is assumed from the two patterns below
# (a "$..." or "...¢" text closed by </span>) - confirm against a live response.
def _savemart_price(price_html):
    if not isinstance(price_html, str):
        return None
    # most prices are in $ format
    dollars = re.search(r'\$(.*?)</span>', price_html)
    if dollars:
        return dollars.group(1)
    # handle the price w/ 99¢
    cents = re.search(r'>(.*?)</span>', price_html)
    if cents is None:
        return None
    digits = cents.group(1).strip().strip("¢").strip()
    try:
        # dividing by 100 keeps single-digit amounts right (5¢ -> 0.05)
        return int(digits) / 100
    except ValueError:
        return None

def svmrt_info(ingredient:str):
    """Search SaveMart for an ingredient and return the best product.

    One page of up to 50 search results is requested. For every product
    with price details, the base price (the price without any sales) and
    the quantity and unit parsed from its title are collected in a
    DataFrame with the columns "Brand", "Price", "Name", "Quantity" and
    "Unit", which is then reduced with fil_res().

    Args:
        ingredient: The ingredient name; used as search keyword and as the
            key for the filters in fil_res().

    Returns:
        Whatever fil_res() returns for the collected products.

    Raises:
        requests.HTTPError: when the request returns an error status.
    """
    # query: copied from the developer tools tab; the text is unchanged and
    # only split into adjacent literals
    query = {"query":(
                 "\n    query MerchandisedCategory($adCapabilityPackId: AdCapabilityPackId!, $networkId: String!, $siteId: String!, $limit: Int!, $offset: Int!, $taxonomyNodeId: String!, $adsScreenName: String!, $cookie: String, $sortBy: ProductSortDimension, $productQueryMatch: ProductMatchInput!, $storeId: String) {\n  heroTopAd: singleAd(\n    adCapabilityPackId: $adCapabilityPackId\n    adParams: [{key: \"position\", value: \"HeroTop\"}, {key: \"asn\", value: $adsScreenName}, {key: \"ca\", value: $taxonomyNodeId}, {key: \"sz\", value: \"375x*\"}]\n    networkId: $networkId\n    siteId: $siteId\n    placementId: \"HeroTop\"\n  ) {\n    ...ad_ref_list\n  }\n  heroBottomAd: singleAd(\n    adCapabilityPackId: $adCapabilityPackId\n    adParams: [{key: \"position\", value: \"HeroBottom\"}, {key: \"asn\", value: $adsScreenName}, {key: \"ca\", value: $taxonomyNodeId}, {key: \"sz\", value: \"375x*\"}]\n    networkId: $networkId\n    siteId: $siteId\n    placementId: \"HeroBottom\"\n  ) {\n    ...ad_ref_list\n  }\n  browseSubcategories(id: $taxonomyNodeId) @skip(if: true)\n  navOptionsCategories(id: $taxonomyNodeId)\n  taxonomy(id: \"Mobile/P+C\") @skip(if: true) {\n    __typename\n    ...taxonomy\n  }\n  merchandisedDisplayableObjects(\n    id: $taxonomyNodeId\n    taxonomyId: \"Mobile/P+C\"\n    positions: [$adsScreenName]\n  ) @skip(if: true) {\n    __typename\n    ... on DisplayableAd {\n      ...displayable_ad\n    }\n    ... on CategoryPreview {\n      __typename\n      ads {\n        ...ad_with_placement\n      }\n      categoryId\n      ... "
                 "on ProductCategoryPreview {\n        items {\n          ...product_summary\n        }\n        prefixItems {\n          ...product_summary_with_ad_telemetry\n        }\n      }\n    }\n  }\n  products(\n    limit: $limit\n    cookie: $cookie\n    offset: $offset\n    taxonomyId: \"Mobile/P+C\"\n    sortBy: $sortBy\n    match: $productQueryMatch\n    storeId: $storeId\n  ) {\n    ...product_result\n  }\n}\n    \n    fragment ad_ref_list on AdRefList {\n  __typename\n  adRef\n  adList {\n    ...ad_with_placement\n  }\n}\n    \n\n    fragment ad_with_placement on AdWithPlacement {\n  __typename\n  position\n  ad {\n    __typename\n    ...ad_with_telemetry\n  }\n  placementId\n}\n    \n\n    fragment ad_with_telemetry on AdDTO {\n  __typename\n  adTelemetry {\n    ...ad_telemetry\n  }\n  aid\n  aiid\n  height\n  pid\n  width\n  version\n  ... on CollectionAdDTO {\n    placements {\n      ...collection_ad_placement\n    }\n    ... on CarouselCollectionDTO {\n      randomizationStyle\n    }\n    title\n    subtitle\n  }\n  ... on ImageAdDTO {\n    image {\n      ...ad_image_asset\n    }\n  }\n  ... on ImageCarouselAdDTO {\n    carouselPages {\n      ...carousel_page\n    }\n  }\n  ... on LayoutAdDTO {\n    rows {\n      __typename\n      height\n      columns {\n        __typename\n        width\n        adRef\n      }\n    }\n  }\n  ... on RoundedImageAdDTO {\n    image {\n      ...ad_image_asset\n    }\n  }\n  ... on RoundedImageCarouselAdDTO {\n    carouselPages {\n      ...carousel_page\n    }\n  }\n}\n    \n\n    fragment ad_telemetry on AdTelemetry {\n  __typename\n  aid\n  aiid\n  dimensions {\n    __typename\n    key\n    value\n  }\n}\n    \n\n    fragment collection_ad_placement on PlacementDTO {\n  __typename\n  height\n  id\n  width\n  ... on LeafPlacementDTO {\n    adRef\n    ... on CarouselPlacementDTO {\n      displayDurationMs\n    }\n  }\n  ... on LayoutRowPlacementDTO {\n    placements {\n      ... on LeafPlacementDTO {\n        adRef\n        ... "
                 "on CarouselPlacementDTO {\n          displayDurationMs\n        }\n      }\n    }\n  }\n}\n    \n\n    fragment ad_image_asset on ImageAsset {\n  __typename\n  id\n  densities {\n    __typename\n    swiftlyDensity\n    swiftlyDeviceClass\n    url\n  }\n  altText\n  revision\n  action {\n    __typename\n    action\n    name\n    target\n  }\n}\n    \n\n    fragment carousel_page on CarouselPage {\n  __typename\n  durationMs\n  imageAsset {\n    __typename\n    ...ad_image_asset\n  }\n}\n    \n\n    fragment taxonomy on Taxonomy {\n  allowedTypes\n  displayName\n  id\n  graph {\n    children\n    displayName\n    id\n    parents\n    renderingTemplate\n    type\n    images {\n      ...image_ref\n    }\n  }\n}\n    \n\n    fragment image_ref on ImageRef {\n  __typename\n  altText\n  caption\n  image(desiredDensity: ThreeX) {\n    ...image_file\n  }\n  images(desiredDensities: [ThreeX]) {\n    ...image_file\n  }\n  type {\n    ...image_type\n  }\n}\n    \n\n    fragment image_file on ImageFile {\n  __typename\n  density\n  device\n  uri\n}\n    \n\n    fragment image_type on ImageType {\n  __typename\n  description\n  height\n  type\n  width\n}\n    \n\n    fragment displayable_ad on DisplayableAd {\n  ad {\n    ...ad_ref_list\n  }\n}\n    \n\n    fragment product_summary on ProductSummary {\n  __typename\n  brand\n  categories\n  eligibilities {\n    __typename\n    eligibilityId\n  }\n  legacyPriceTag: fdPriceTag {\n    ...legacy_price_tag\n  }\n  id\n  images {\n    ...image_ref\n  }\n  ordinal\n  webPrice {\n    failure {\n      type\n      message\n      displayMessage\n    }\n    success {\n      basePrice\n      promoPrice\n      promoText\n      isSale\n    }\n  }\n  price {\n    ...price_result\n  }\n  shortDescription\n  tags {\n    ...product_tag\n  }\n  title\n  type\n}\n    \n\n    fragment legacy_price_tag on FamilyDollarPriceTag {\n  __typename\n  discountType\n  finalPrice {\n    ...legacy_price_detail\n  }\n  finalSavings {\n    "
                 "...legacy_savings_detail\n  }\n  priceUri\n  regularPrice {\n    ...legacy_price_detail\n  }\n  scenario\n  tags {\n    ...product_tag\n  }\n  termsAndConditions\n}\n    \n\n    fragment legacy_price_detail on PriceDetail {\n  __typename\n  buyQuantity\n  displayPrice\n  effectiveDate\n  expirationDate\n  getQuantity\n  mixAndMatchId\n  model\n  price\n  quantityLimit\n  quantityMinimum\n  soldBy\n  type\n}\n    \n\n    fragment legacy_savings_detail on SavingsDetail {\n  __typename\n  mustBuyAtLeast\n  savingsAmount\n  savingsDisplayAmount\n  savingsDisplayPercent\n  savingsPercent\n  savingsQuantity\n}\n    \n\n    fragment product_tag on Tag {\n  __typename\n  code\n  description\n  id\n  images {\n    ...image_ref\n  }\n  title\n  type\n  value\n}\n    \n\n    fragment price_result on PriceResult {\n  __typename\n  failure {\n    __typename\n    message\n    type\n  }\n  success {\n    ...price\n  }\n}\n    \n\n    fragment price on Price {\n  __typename\n  base {\n    ...price_model_properties\n  }\n  cost {\n    ...cost\n  }\n  info {\n    ...price_info\n  }\n  mixAndMatchId\n  promotion {\n    ...price_model_properties\n  }\n  savings {\n    ...savings\n  }\n  scenario\n  soldBy\n  unit\n}\n    \n\n    fragment price_model_properties on PriceModelProperties {\n  __typename\n  activeThrough {\n    ...active_through\n  }\n  amountOff {\n    ...money\n  }\n  buyQuantity\n  getQuantity\n  model\n  percentOff\n  price {\n    ...money\n  }\n  type\n}\n    \n\n    fragment active_through on ActiveThrough {\n  __typename\n  end {\n    ...active_date\n  }\n  start {\n    ...active_date\n  }\n}\n    \n\n    fragment active_date on ActiveDate {\n  __typename\n  date\n  time\n}\n    \n\n    fragment money on Money {\n  __typename\n  currency\n  denomination\n  value\n}\n    \n\n    fragment cost on Cost {\n  __typename\n  purchaseQuantity\n  totalCost {\n    ...money\n  }\n  type\n}\n    \n\n    fragment price_info on PriceInfo {\n  __typename\n  code\n  id\n  type\n  "
                 "value\n}\n    \n\n    fragment savings on Savings {\n  __typename\n  percentSaved\n  savingsQuantity\n  totalSavingsAmount {\n    ...money\n  }\n  type\n}\n    \n\n    fragment product_summary_with_ad_telemetry on ProductSummaryWithAdTelemetry {\n  __typename\n  adTelemetry {\n    ...ad_telemetry\n  }\n  product {\n    ...product_summary\n  }\n}\n    \n\n    fragment product_result on ProductResult {\n  __typename\n  cookie\n  count\n  fieldFacets {\n    ...field_facet\n  }\n  overlapCount\n  prefixCount\n  prefixProducts {\n    ...product_summary_with_ad_telemetry\n  }\n  products {\n    ...product_summary\n  }\n  spelling {\n    __typename\n    corrected\n    original\n  }\n}\n    \n\n    fragment field_facet on FieldFacet {\n  __typename\n  displayName\n  entries {\n    ...field_facet_entry\n  }\n  field\n}\n    \n\n    fragment field_facet_entry on FieldFacetEntry {\n  __typename\n  count\n  entry\n}\n    "
             ),
             "variables":{"adCapabilityPackId":"AD_CARACAL_1","adsScreenName":"ProductSearch","limit":50,"offset":0,"taxonomyNodeId":"Mobile/P+C/Product/RootCategoryId","networkId":"TSMC0.0000000002","siteId":"sm-shopperWeb","cookie":"","sortBy":"Featured","storeId":"604","productQueryMatch":{"search":{"keyword":ingredient}}}}
    url = "https://savemart.com/graphql"
    # get information from savemart; the timeout keeps a stalled
    # connection from blocking the notebook forever
    response = requests.post(url,json=query,timeout=30)
    response.raise_for_status() # check for error
    payload = response.json() # convert to json
    # initialize list to store product details
    prod = []
    # run through each product and add to prod list
    for item in payload["data"]["products"]["products"]:
        # Check if "webPrice" exists and carries a "success" part
        web_price = item.get("webPrice")
        if not (web_price and web_price.get("success")):
            # if no webprice or success, the product is left out
            print("Price details not available")
            continue
        # base price is the price without any sales
        actual_price = _savemart_price(web_price["success"].get("basePrice"))
        # retrieve quantity and unit from the title
        try:
            qty,unit = extract_quantity_and_unit(item["title"])
        except (AttributeError, TypeError): # title is not a string
            qty,unit = None,None
        # update list
        prod.append([item["brand"],actual_price,item["title"],qty,unit])
    # create dataframe w/ results
    products = pd.DataFrame(data=prod,columns=[
        "Brand","Price","Name","Quantity","Unit"
    ])
    # convert columns to numeric; anything unparsable becomes NaN
    products["Price"] = pd.to_numeric(products['Price'], errors='coerce')
    products["Quantity"] = pd.to_numeric(products['Quantity'], errors='coerce')
    svmrt_fil = fil_res(products,ingredient)
    return svmrt_fil # return the dataframe

In [None]:
# run through list of ingredients
# and get results from savemart
# every key of svmrt is searched and its value replaced by the svmrt_info() result
for ingredient in svmrt.keys():
    time.sleep(.2) # try not to get banned
    ing_df = svmrt_info(ingredient) # get information from SaveMart
    svmrt[ingredient] = ing_df # update dictionary

In [None]:
# check results: print the value stored in svmrt for each ingredient
for ingredient in svmrt:
    print(svmrt[ingredient])

In [None]:
# calculate total price; the total is 0 when any ingredient has an empty result
sum_prices_if_not_empty(svmrt)

## Safeway
Get products from Safeway

In [26]:
# function to get the search results from Safeway
def safeway_search(ingredient:str):
    """Download all search-result pages for an ingredient from Safeway.

    Pages of 30 results are requested until a page comes back without any
    products. All requests go through one requests.Session, so cookies set
    by the server are sent again on the following pages.

    Args:
        ingredient: The search text.

    Returns:
        A list with one entry per page; each entry is the list found under
        response -> docs in that page's JSON.

    Raises:
        requests.HTTPError: when a page request returns an error status.
    """
    url = 'https://www.safeway.com/abs/pub/xapi/search/products'
    rows = 30 # results per query
    # NOTE(review): the subscription key is hard-coded in the notebook;
    # consider loading it from configuration instead.
    headers = {'ocp-apim-subscription-key': 'e914eec9448c4d5eb672debf5011cf8f'}
    # search parameters; only 'start' changes from page to page
    params = {
        'q': ingredient,
        'rows': rows,
        'start': 0, # start value
        'search-type': 'keyword',
        'request-id': '3998633988543',
        'storeid': 3132,
        'pagename': 'search',
        'url': 'www.safeway.com',
        'pageurl': 'www.safeway.com',
        'dvid': 'web-autosuggest',
        'facet': 'false',
        'visitorId': ""
    }
    all_res = [] # list to store all results
    # the session keeps cookies between pages and is closed when done
    with requests.Session() as s:
        # loop through the search page results
        # and exit when no results are returned
        while True:
            # get the information from safeway
            r = s.get(url, params=params, headers=headers, timeout=30)
            r.raise_for_status() # check for error
            docs = r.json()["response"]["docs"] # parse the body only once
            # if no results were returned
            # escape the loop
            if not docs:
                break
            # update list w/ all results
            all_res.append(docs)
            params['start'] += rows # move to next page
            print(params['start']) # diagnostic statement
    return all_res

In [32]:
# get the quantity and unit of measurement for each item
# which is after the dash
def extract_after_dash_regex(input_string):
    """Return the part of a product name that follows the " - " separator.

    A dash with whitespace on both sides is preferred, so that hyphens
    inside words ("Semi-Sweet Chocolate Chips - 12 Oz") are not mistaken
    for the separator. When the name has no such dash, the text after the
    first dash of any kind is returned, as a fallback. Both the hyphen and
    the en dash are accepted.

    Args:
        input_string: The product name.

    Returns:
        The text after the dash, or "" when there is no dash or the input
        is not a string.
    """
    if not isinstance(input_string, str):
        return ""
    # dash used as a separator: whitespace on both sides
    match = re.search(r'\s[–-]\s+(.+)', input_string)
    if match is None:
        # fallback: first dash anywhere in the name
        match = re.search(r'[–-]\s*(.+)', input_string)
    if match: # if the dash expression exists, return part after the dash
        return match.group(1)
    return ""  # Return an empty string if no match is found

# extract the quantity and unit of measurement from the string
def extract_quantity_and_unit(input_string):
    """Parse the first "<number> <unit>" occurrence found in ``input_string``.

    The number may have a decimal part; the unit is one word of letters/dots,
    optionally followed by a second such word after a single whitespace
    character (e.g. ``"Fl. Oz."``).

    Returns ``(quantity, unit)`` with ``quantity`` as a float, or
    ``(None, None)`` when nothing matches.
    """
    found = re.search(r'(\d+(\.\d+)?)\s*([A-Za-z\.]+(?:\s[A-Za-z\.]+)?)', input_string)
    if found is None:
        # no number followed by a unit anywhere in the text
        return None, None
    # groups: whole number text, optional decimal part (unused), unit text
    number_text, _decimal_part, unit_text = found.groups()
    return float(number_text), unit_text

# get the search results from safeway
# and organize into a pandas dataframe
def get_saf_prod(ingredient:str):
    """Search Safeway for ``ingredient`` and tabulate the products found.

    Every product of every result page becomes one row with the columns
    Name, Price (the product's "basePrice"), Quantity and Unit, where the
    last two are parsed from the text after the dash in the product name.
    The table is passed through ``fil_res`` together with ``ingredient``
    before it is returned.
    """
    rows = []
    # safeway_search yields one list of products per result page
    for page in safeway_search(ingredient):
        for product in page:
            product_name = product["name"]
            base_price = product["basePrice"] # non-sale price
            # quantity and unit live in the part of the name after the dash
            size_text = extract_after_dash_regex(product_name)
            quantity, unit = extract_quantity_and_unit(size_text)
            rows.append([product_name, base_price, quantity, unit])
    # organize results into dataframe and apply the shared result filter
    table = pd.DataFrame(data=rows,columns=["Name","Price","Quantity","Unit"])
    return fil_res(table,ingredient)

In [None]:
# search safeway for every ingredient key of sfwy
# and store the resulting dataframe back under that key
for ingredient in sfwy:
    # short pause between searches (try not to get banned)
    time.sleep(.2)
    ing_df = get_saf_prod(ingredient)
    sfwy[ingredient] = ing_df

In [None]:
# check results: print the dataframe stored for each ingredient
for ingredient in sfwy:
    print(sfwy[ingredient])

In [None]:
# calculate total price for the safeway results stored in sfwy
# (sum_prices_if_not_empty is not defined in this part of the notebook)
sum_prices_if_not_empty(sfwy)

## Target

In [None]:
import requests
import re
import pandas as pd
from urllib.parse import quote, unquote
from html import unescape
import json

# Endpoint and headers
# Both names are module-level globals read by target_scraper on every request
url = "https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v2?"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}

# List of ingredients to be able to loop
# NOTE(review): the Target scraping loop iterates over filter_terms, not this list
ingredients = ["unsalted butter", "granulated sugar", "brown sugar", "eggs", "vanilla extract", "baking soda", "all purpose flour", "semi-sweet chocolate chip", "chopped walnuts"]

# Parameters for the API
# Base query parameters shared by every request; target_scraper supplies
# "keyword", "page" and "offset" on top of these for each page it fetches.
# "count" (24) equals the amount by which target_scraper advances the offset.
parameters = {
    "key": "9f36aeafbe60771e321a7cc95a78140772ab3e96",
    "channel": "WEB",
    "count": 24,
    "default_purchasability_filter": "true",
    "include_dmc_dmr": "true",
    "include_sponsored": "true",
    "new_search": "true",
    "platform": "desktop",
    "pricing_store_id": 2455,
    "spellcheck": "true",
    "store_ids": [2455, 2268, 2408, 310, 3384],
    "visitor_id": "01939852639E02019551A1FC131A3326",
    "zip": 95616
}

def target_scraper(ingredient, regex_pattern):
    """Collect Target search results for ``ingredient`` whose title matches ``regex_pattern``.

    Pages through the endpoint in ``url`` using the module-level
    ``parameters`` and ``headers``, one page of ``count`` products at a time,
    until a page is empty, a request fails, or a response cannot be parsed.

    Parameters:
        ingredient: search keyword.
        regex_pattern: regular expression applied case-insensitively to each
            decoded product title; only matching products are kept.

    Returns:
        A DataFrame with the columns ``ingredient``, ``product_name`` and
        ``price`` (empty, without columns, when nothing matched).
    """
    all_results = []

    # Work on a per-call copy so the shared module-level dict is never
    # mutated and no state leaks from one search into the next.
    query = dict(parameters)
    query["keyword"] = ingredient
    query["page"] = f"/s/{quote(ingredient)}"
    # Step by the same page size that is requested from the API
    page_size = query.get("count", 24)
    offset = 0

    while True:
        query["offset"] = offset

        # timeout so a stalled connection cannot hang the notebook
        response = requests.get(url, params=query, headers=headers, timeout=30)

        # Check if response is successful
        if response.status_code != 200:
            print(f"Error fetching data for {ingredient}: {response.status_code}")
            break

        # Load response data
        try:
            data = response.json()
            products = data["data"]["search"]["products"]
        except (KeyError, TypeError, ValueError):
            # ValueError covers JSON decode errors; TypeError covers a null
            # value somewhere along the data/search/products path
            print(f"Error parsing data for {ingredient}")
            break

        # If no more products, exit loop
        if not products:
            break

        # Extract product name and price
        for product in products:
            title = product['item']['product_description']['title']
            price = product['price']['current_retail']
            # titles may be URL-quoted and contain HTML entities
            decoded_title = unescape(unquote(title))

            # Filter products based on the regex pattern
            if re.search(regex_pattern, decoded_title, flags=re.IGNORECASE):
                all_results.append({'ingredient': ingredient, 'product_name': decoded_title, 'price': price})

        # Increment offset for next page
        offset += page_size

    return pd.DataFrame(all_results)

# Define ingredient list and their filtering terms
# Keys are the search keywords passed to target_scraper; values are the
# regular expressions (matched case-insensitively) a product title must
# contain to be kept.
filter_terms = {
    "unsalted butter": r"unsalted butter",
    "granulated sugar": r"granulated sugar",
    # no leading space in the pattern: titles that *start* with
    # "Dark Brown Sugar" must match as well
    "brown sugar": r"dark brown sugar",
    "eggs": r"eggs",
    "vanilla extract": r"vanilla extract",
    "baking soda": r"baking soda",
    "all purpose flour": r"all purpose flour|all\-purpose flour",
    "semi-sweet chocolate chip": r"semi sweet chocolate|semi\-sweet chocolate",
    "chopped walnuts": r"chopped walnuts"
}

# Loop through each ingredient and collect data
# The per-ingredient frames are gathered in a list and concatenated once,
# instead of re-concatenating onto a growing (initially empty) frame.
ingredient_frames = []
for ingredient, regex_pattern in filter_terms.items():
    print(f"Scraping for: {ingredient}")
    ingredient_df = target_scraper(ingredient, regex_pattern)
    # an ingredient with no matching products contributes nothing
    if not ingredient_df.empty:
        ingredient_frames.append(ingredient_df)

# ignore_index=True already yields a fresh 0..n-1 index;
# fall back to an empty frame when no ingredient produced any rows
if ingredient_frames:
    main_df = pd.concat(ingredient_frames, ignore_index=True)
else:
    main_df = pd.DataFrame()

def extract_amount(product_name):
    """Extract the package amount from a product title as ``"<number> <unit>"``.

    Recognised units are fl oz, oz, lb, g, kg, ml, l, ct, pcs, pack and case
    (case-insensitive, optional plural "s"). A unit only counts when it ends
    at a word boundary, so "2 Layer" or "1 Gallon" are not read as "2 l" /
    "1 g". When a title contains several amounts the last one is used.

    Returns:
        The amount with a lowercase unit and dots removed (e.g. ``"2 lb"``,
        ``"4 fl oz"``), or ``None`` when the title contains no amount.
    """
    # Extracting the amount by using common units
    matches = re.findall(
        r"(\d+\.?\d*)\s?(fl\.?\s?oz|oz|lb|g|kg|ml|l|ct|pcs|pack|case)s?\b",
        product_name,
        re.IGNORECASE,
    )
    if not matches:
        return None
    # the last amount in the title is taken as the package size
    number, unit = matches[-1]
    return f"{number} {unit.replace('.', '').lower()}"

# Derive an "amount" value from every product name
main_df["amount"] = main_df["product_name"].apply(extract_amount)

# Keep only the products for which an amount could be extracted
main_df = main_df.dropna(subset=["amount"])

def filter_eggs(df):
    """Return ``df`` without the egg rows we are not looking for.

    Rows whose ingredient is not "eggs" are always kept. An "eggs" row is
    kept only when its product name contains "large" and contains neither
    "hard" nor "candy" (all checks are case-insensitive substring tests).
    """
    names = df["product_name"]
    mentions_hard = names.str.contains("hard", case=False)
    mentions_large = names.str.contains("large", case=False)
    mentions_candy = names.str.contains("candy", case=False)

    not_an_egg_row = df["ingredient"] != "eggs"
    wanted_egg = mentions_large & ~mentions_hard & ~mentions_candy
    return df[not_an_egg_row | wanted_egg]

# Remove the unwanted egg products before the minimum-amount check
filtered_df = filter_eggs(main_df)

# Filter for filtering out products that don't meet our minimum required amount per ingredient
# Maps ingredient -> {unit: minimum quantity in that unit}. The units are the
# lowercase unit strings produced by extract_amount; meets_criteria rejects a
# product whose unit is not listed for its ingredient.
FILTER_CRITERIA = {
    "unsalted butter": {"lb": 0.5, "ct": 2},
    "granulated sugar": {"lb": 0.5},
    "brown sugar": {"lb": 0.5, "oz": 7},
    "eggs": {"ct": 2},
    "all purpose flour": {"lb": 1, "g": 120, "oz": 4},
    "semi-sweet chocolate chip": {"oz": 12},
    "chopped walnuts": {"oz": 6},
    # Baking soda and vanilla extract will not need filtering as all products have enough (1-2 tablespoons)
}

def meets_criteria(row):
    """Tell whether a product row offers at least the minimum required amount.

    ``row`` must provide "ingredient" and "amount" (a "<number> <unit>"
    string). Baking soda and vanilla extract always pass. Any other row
    passes only when its unit is listed for its ingredient in
    FILTER_CRITERIA and its quantity reaches the listed minimum.
    """
    ingredient = row["ingredient"]

    # These two are never filtered: every product has enough
    if ingredient in ("baking soda", "vanilla extract"):
        return True

    parts = row["amount"].split()
    # Anything other than exactly "<number> <unit>" cannot be compared
    if len(parts) != 2:
        return False

    quantity = float(parts[0])
    unit = parts[1]

    # Minimums for this ingredient, keyed by unit (nothing listed -> reject)
    minimums = FILTER_CRITERIA.get(ingredient, {})
    if unit not in minimums:
        return False
    return quantity >= minimums[unit]

# Keep only the rows that pass the per-ingredient minimum-amount check
passes_minimum = filtered_df.apply(meets_criteria, axis=1)
filtered_df = filtered_df[passes_minimum].reset_index(drop=True)

# Order by ingredient and then by ascending price, so the cheapest product of
# each ingredient comes first within its group
filtered_df = filtered_df.sort_values(by=['ingredient', 'price'], ascending=[True, True])

# The first row per ingredient is now its cheapest product
cheapest_ingredients = (
    filtered_df
    .drop_duplicates(subset='ingredient', keep='first')
    .reset_index(drop=True)
)

print(cheapest_ingredients[["ingredient","product_name","price","amount"]])

## Davis Co-op

In [None]:
# Endpoint URL and header
# NOTE: url, headers, ingredients and parameters rebind the module-level names
# of the same spelling used by the Target cell; coop_scraper reads them as globals.
url = "https://daviscoop.storebyweb.com/s/1000-1/api/b/"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"}

# NOTE(review): the Co-op scraping loop iterates over filter_terms, not this list
ingredients = ["unsalted butter", "granulated sugar", "dark brown sugar", "eggs", "vanilla extract", "baking soda", "all purpose flour", "semi-sweet chocolate chip", "walnuts"]

# Base JSON body for every search request; coop_scraper supplies "pn" and "q".
# "ps" (32) is the value coop_scraper compares against the number of items
# returned in order to detect the last page.
parameters = {"facets": {}, "ps": 32, "s": "", "g": []}

def coop_scraper(ingredient, regex_pattern):
    """Collect Davis Co-op search results for ``ingredient`` whose name matches ``regex_pattern``.

    POSTs the module-level ``parameters`` (plus the query and page number) as
    JSON to ``url``, page by page, until a page is empty or shorter than the
    page size, a request fails, or a response cannot be parsed.

    Parameters:
        ingredient: search query sent as "q".
        regex_pattern: regular expression applied case-insensitively to each
            decoded product name; only matching products are kept.

    Returns:
        A DataFrame with the columns ``ingredient``, ``product_name``,
        ``price``, ``amount`` and ``brand`` (empty, without columns, when
        nothing matched).
    """
    all_results = []

    # Work on a per-call copy so the shared module-level dict is never
    # mutated and no state leaks from one search into the next.
    payload = dict(parameters)
    payload["q"] = ingredient
    page_size = payload["ps"]
    pn = 1

    while True:
        payload["pn"] = pn

        # timeout so a stalled connection cannot hang the notebook
        response = requests.post(url, json=payload, headers=headers, timeout=30)

        # Check if response is successful
        if response.status_code != 200:
            print(f"Error fetching data for {ingredient}: {response.status_code}")
            break

        # Load response data
        try:
            data = response.json()
            products = data.get("items",[])
        except (AttributeError, KeyError, ValueError):
            # ValueError covers JSON decode errors; AttributeError covers a
            # body that is valid JSON but not an object
            print(f"Error parsing data for {ingredient}")
            break

        # If no more products, exit loop
        if not products:
            break

        # Extract product name, price, size and brand
        for product in products:
            title = product['name']
            price = product['actualPrice']
            amount = product['size']
            brand = product['brand']
            # names may be URL-quoted and contain HTML entities
            decoded_title = unescape(unquote(title))

            # Filter products based on the regex pattern
            if re.search(regex_pattern, decoded_title, flags=re.IGNORECASE):
                all_results.append({'ingredient': ingredient, 'product_name': decoded_title, 'price': price, 'amount': amount, "brand": brand})

        # A short page means this was the last one
        if len(products) < page_size:
            break

        # Move on to the next page number
        pn += 1

    return pd.DataFrame(all_results)

# Define ingredient list and their filtering terms
# Keys are the search queries passed to coop_scraper (sent as "q"); values are
# the regular expressions, matched case-insensitively, that a product name
# must contain to be kept.
# NOTE: this rebinds the filter_terms name used by the Target cell.
filter_terms = {
    "unsalted butter": r"unsalted butter",
    "granulated sugar": r"granulated sugar",
    "brown sugar": r"dark brown sugar",
    "eggs": r"eggs",
    "vanilla extract": r"vanilla extract",
    "baking soda": r"baking soda",
    "all purpose flour": r"all purpose flour|all\-purpose flour",
    "semi-sweet chocolate chip": r"semi sweet chocolate|semi\-sweet chocolate",
    "walnuts": r"walnuts"
}

# Scrape every ingredient and collect the per-ingredient frames in a list,
# then concatenate once instead of re-concatenating onto a growing
# (initially empty) frame.
ingredient_frames = []
for ingredient, regex_pattern in filter_terms.items():
    print(f"Scraping for: {ingredient}")
    ingredient_df = coop_scraper(ingredient, regex_pattern)
    # an ingredient with no matching products contributes nothing
    if not ingredient_df.empty:
        ingredient_frames.append(ingredient_df)

# ignore_index=True already yields a fresh 0..n-1 index;
# fall back to an empty frame when no ingredient produced any rows
if ingredient_frames:
    main2_df = pd.concat(ingredient_frames, ignore_index=True)
else:
    main2_df = pd.DataFrame()

def filter_eggs(df):
    """Return ``df`` without the egg rows we are not looking for.

    Rows whose ingredient is not "eggs" are always kept. An "eggs" row is
    kept only when its product name contains "large" and contains neither
    "hard" nor "candy" (all checks are case-insensitive substring tests).
    """
    names = df["product_name"]
    mentions_hard = names.str.contains("hard", case=False)
    mentions_large = names.str.contains("large", case=False)
    mentions_candy = names.str.contains("candy", case=False)

    not_an_egg_row = df["ingredient"] != "eggs"
    wanted_egg = mentions_large & ~mentions_hard & ~mentions_candy
    return df[not_an_egg_row | wanted_egg]

def filter_brown_sugar(df):
    """Return ``df`` without brown-sugar rows that are really other products.

    A row with ingredient "brown sugar" is dropped when its product name
    contains any of the unwanted keywords (creamer, maple, replacement,
    wafels, ice, milk, latte). The checks are case-insensitive substring
    tests, so "ice" also hits names such as "spice". Rows for other
    ingredients are never touched.
    """
    names = df["product_name"]
    # Start with the first keyword and OR the remaining ones onto the mask
    unwanted = names.str.contains("creamer", case=False)
    for keyword in ("maple", "replacement", "wafels", "ice", "milk", "latte"):
        unwanted = unwanted | names.str.contains(keyword, case=False)

    is_brown_sugar_row = df["ingredient"] == "brown sugar"
    return df[~(is_brown_sugar_row & unwanted)]

def filter_walnuts(df):
    """Return ``df`` without walnut rows whose product name contains "sprouted".

    The check is a case-insensitive substring test; rows for other
    ingredients are never touched.
    """
    is_walnut_row = df["ingredient"] == "walnuts"
    mentions_sprouted = df["product_name"].str.contains("sprouted", case=False)
    sprouted_walnut = is_walnut_row & mentions_sprouted
    return df[~sprouted_walnut]

# Apply the ingredient-specific filters in order: eggs, brown sugar, walnuts
filtered_df2 = filter_walnuts(filter_brown_sugar(filter_eggs(main2_df)))

# Filter for filtering out products that don't meet our minimum required amount per ingredient
# Maps ingredient -> {unit: minimum quantity in that unit}. The unit keys here
# carry a trailing period; meets_criteria compares them verbatim with the
# second token of each row's "amount" string and rejects units that are not listed.
# NOTE: this rebinds the FILTER_CRITERIA name used by the Target cell.
FILTER_CRITERIA = {
    "unsalted butter": {"oz.": 8},
    "granulated sugar": {"oz.": 8},
    "brown sugar": {"lb.": 0.5, "oz.": 7},
    "eggs": {"ct.": 2},
    "all purpose flour": {"lb.": 1, "g.": 120, "oz.": 4},
    "semi-sweet chocolate chip": {"oz.": 12},
    "walnuts": {"oz.": 6},
    # Baking soda and vanilla extract will not need filtering as all products have enough (1-2 tablespoons)
}

def meets_criteria(row):
    """Tell whether a product row offers at least the minimum required amount.

    ``row`` must provide "ingredient" and "amount" (a "<number> <unit>"
    string). Baking soda and vanilla extract always pass. Any other row
    passes only when its unit is listed for its ingredient in
    FILTER_CRITERIA and its quantity reaches the listed minimum.
    """
    ingredient = row["ingredient"]

    # These two are never filtered: every product has enough
    if ingredient in ("baking soda", "vanilla extract"):
        return True

    parts = row["amount"].split()
    # Anything other than exactly "<number> <unit>" cannot be compared
    if len(parts) != 2:
        return False

    quantity = float(parts[0])
    unit = parts[1]

    # Minimums for this ingredient, keyed by unit (nothing listed -> reject)
    minimums = FILTER_CRITERIA.get(ingredient, {})
    if unit not in minimums:
        return False
    return quantity >= minimums[unit]

def preprocess_amount_column(df):
    """Keep only the rows whose "amount" has the form "<number> <unit>".

    A row is valid when its amount is a string that splits on whitespace
    into exactly two parts and the first part converts to ``float``.
    Missing or non-string amounts (e.g. ``None``) are treated as invalid
    instead of raising.

    Returns:
        A new DataFrame with the valid rows, keeping the original columns
        and index labels, so an empty result still has every column the
        later filtering and sorting steps rely on.
    """
    # Nothing to check; also covers an empty frame without an "amount" column
    if df.empty:
        return df.copy()

    def _is_valid_amount(amount):
        # Only strings of the form "<number> <unit>" qualify
        if not isinstance(amount, str):
            return False
        parts = amount.split()
        if len(parts) != 2:
            return False
        try:
            float(parts[0])
        except ValueError:
            return False
        return True

    valid = df["amount"].map(_is_valid_amount).astype(bool)
    return df[valid].copy()

# Drop rows whose 'amount' is not in "<number> <unit>" form
filtered_df2 = preprocess_amount_column(filtered_df2)

# Keep only the rows that pass the per-ingredient minimum-amount check
passes_minimum = filtered_df2.apply(meets_criteria, axis=1)
filtered_df2 = filtered_df2[passes_minimum].reset_index(drop=True)

# Order by ingredient and then by ascending price, so the cheapest product of
# each ingredient comes first within its group
filtered_df2 = filtered_df2.sort_values(by=['ingredient', 'price'], ascending=[True, True])

# The first row per ingredient is now its cheapest product
cheapest_ingredients = (
    filtered_df2
    .drop_duplicates(subset='ingredient', keep='first')
    .reset_index(drop=True)
)

print(cheapest_ingredients)