In [1]:
import requests
from bs4 import BeautifulSoup
from time import sleep

In [42]:
def scrape_stilltasty(index, category, timeout=30):
    """Scrape one StillTasty food-item page into a plain dictionary.

    Parameters
    ----------
    index : str or int
        Identifier interpolated into
        ``https://www.stilltasty.com/Fooditems/index/{index}``.
    category : str
        Label copied unchanged into the result; it is not read from the page.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` so a stalled
        server cannot block the notebook indefinitely (default 30).

    Returns
    -------
    dict or None
        On HTTP 200, a dict with the keys ``item_name``, ``category``, ``url``,
        ``shelf_life`` (storage location -> shelf-life text) and ``food_tips``.
        ``item_name`` is ``None`` and ``food_tips`` is ``''`` when the
        corresponding element is missing from the page.
        On any other status code an error is printed and ``None`` is returned.
    """
    url = f'https://www.stilltasty.com/Fooditems/index/{index}'
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: Failed to retrieve data from {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # The first <h2> holds the item name; tolerate pages that lack one.
    heading = soup.find('h2')
    item_name = heading.text.strip() if heading is not None else None

    # Each storage-location <div> is followed by a sibling <div> with its shelf life.
    shelf_life = {}
    for location_element in soup.find_all('div', class_='food-storage-left'):
        duration_element = location_element.find_next_sibling('div')
        if duration_element is None:
            # No shelf-life text next to this location: nothing to record.
            continue
        shelf_life[location_element.text.strip()] = duration_element.text.strip()

    tips_element = soup.find('div', class_='food-tips')
    tips = tips_element.text.strip() if tips_element is not None else ''
    # Drop the author blurb. partition() leaves the text untouched when the
    # marker is absent, whereas slicing with find()'s -1 would cut off the
    # final character.
    tips = tips.partition('About Our Author')[0]

    return {'item_name': item_name, 'category': category, 'url': url, 'shelf_life': shelf_life, 'food_tips': tips}

In [3]:
def get_index_numbers(url, target_string, get_names=False, timeout=30):
    """Collect the trailing path segment of every matching text link on a page.

    Parameters
    ----------
    url : str
        Page to download.
    target_string : str
        Substring an ``<a>`` tag's ``href`` must contain to be considered.
    get_names : bool, optional
        When true, the stripped link text is collected alongside each index.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` so a stalled
        server cannot block the notebook indefinitely (default 30).

    Returns
    -------
    tuple[list[str], list[str]]
        ``(index_numbers, object_names)``. ``object_names`` stays empty unless
        ``get_names`` is true. When the response status is not 200 an error is
        printed and two empty lists are returned, so callers can always unpack
        or index the result.
    """
    index_numbers = []
    object_names = []

    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: Failed to retrieve data from {url}. Status code: {response.status_code}")
        return index_numbers, object_names

    soup = BeautifulSoup(response.content, 'html.parser')

    # Anchors without an href are passed to the filter as None, hence the guard.
    links = soup.find_all('a', href=lambda href: href and target_string in href)
    for link in links:
        # Links that wrap an image are skipped; only text links are collected.
        if link.find('img'):
            continue
        # The index is whatever follows the last '/' of the href.
        index_numbers.append(link['href'].split('/')[-1])
        if get_names:
            object_names.append(link.text.strip())

    return index_numbers, object_names

In [4]:
# Read the category ids and their display names from the links on the
# StillTasty index page.
url = 'https://www.stilltasty.com/Fooditems/index'
target_string = "/searchitems/index/"
category_index_numbers, categories = get_index_numbers(
    url,
    target_string,
    get_names=True,
)
print(category_index_numbers)
print(categories)

['26', '25', '9', '27', '7', '28', '6', '31', '30', '5']
['Fruits', 'Vegetables', 'Dairy & Eggs', 'Meat & Poultry', 'Fish & Shellfish', 'Nuts, Grains & Pasta', 'Condiments & Oils', 'Snacks & Baked Goods', 'Herbs & Spices', 'Beverages']


In [11]:
def _page_order(page_ref):
    """Sort key that orders results pages numerically ('26' < '26?page=2' < '26?page=10')."""
    base, _, page_number = page_ref.partition('?page=')
    # A reference without a page suffix is the category's first page.
    return (base, int(page_number) if page_number.isdecimal() else 1, page_ref)


def _fetch_index_numbers(url, target_string):
    """Return the index numbers linked from ``url``, or ``[]`` when the request failed."""
    result = get_index_numbers(url, target_string)
    # get_index_numbers can hand back None after printing an HTTP error;
    # treating that as "no links" keeps one bad page from aborting the crawl.
    return list(result[0]) if result else []


# Walk every results page of every category and record each food item's index
# number together with the category it was listed under.
all_index_numbers = []
all_categories = []
for category_index, category_name in zip(category_index_numbers, categories):
    category_url = f'https://www.stilltasty.com/searchitems/index/{category_index}'
    # Seed with the category's first page plus the pagination links shown on it.
    new_pages = sorted(
        {f'{category_index}', *_fetch_index_numbers(category_url, "/searchitems/index/")},
        key=_page_order,
    )
    seen_pages = set(new_pages)
    while new_pages:
        print(new_pages)
        for page in new_pages:
            print(f"Now scraping index {page}")
            page_url = f'https://www.stilltasty.com/searchitems/index/{page}'
            index_numbers = _fetch_index_numbers(page_url, "/Fooditems/index/")
            all_index_numbers.extend(index_numbers)
            all_categories.extend([category_name] * len(index_numbers))
            print(f'Completed index {page}')
            # Pause between requests to avoid hammering the site.
            sleep(2)
        # The highest page of this batch may link to pages not seen yet.
        print('Getting new pages')
        last_page_url = f'https://www.stilltasty.com/searchitems/index/{new_pages[-1]}'
        new_pages = sorted(
            set(_fetch_index_numbers(last_page_url, "/searchitems/index/")) - seen_pages,
            key=_page_order,
        )
        seen_pages.update(new_pages)

['26', '26?page=2', '26?page=3', '26?page=4', '26?page=5', '26?page=6', '26?page=7', '26?page=8', '26?page=9']
Now scraping index 26
Completed index 26
Now scraping index 26?page=2
Completed index 26?page=2
Now scraping index 26?page=3
Completed index 26?page=3
Now scraping index 26?page=4
Completed index 26?page=4
Now scraping index 26?page=5
Completed index 26?page=5
Now scraping index 26?page=6
Completed index 26?page=6
Now scraping index 26?page=7
Completed index 26?page=7
Now scraping index 26?page=8
Completed index 26?page=8
Now scraping index 26?page=9
Completed index 26?page=9
Getting new pages
['25', '25?page=2', '25?page=3', '25?page=4', '25?page=5', '25?page=6', '25?page=7', '25?page=8', '25?page=9']
Now scraping index 25
Completed index 25
Now scraping index 25?page=2
Completed index 25?page=2
Now scraping index 25?page=3
Completed index 25?page=3
Now scraping index 25?page=4
Completed index 25?page=4
Now scraping index 25?page=5
Completed index 25?page=5
Now scraping index

In [12]:
# Total number of food-item index numbers gathered by the crawl (duplicates included).
len(all_index_numbers)

2356

In [44]:
# One category label is appended per collected index number, so this should
# equal len(all_index_numbers).
len(all_categories)

2356

In [13]:
# Number of distinct index numbers; if it equals len(all_index_numbers), no
# item was collected twice.
len(set(all_index_numbers))

2356

In [None]:
# Trial run: scrape only the first three collected items, keyed by index number,
# before attempting the full list.
food_dict = {}
for i, category in list(zip(all_index_numbers, all_categories))[:3]:
    item = scrape_stilltasty(i, category)
    # scrape_stilltasty returns None when the page could not be retrieved.
    if item is not None:
        food_dict[i] = item
    # Same pause between requests as the crawl loop uses.
    sleep(2)

In [14]:
# Tally how many collected items were listed under each category label.
from collections import Counter
Counter(all_categories)

Counter({'Fruits': 223,
         'Vegetables': 344,
         'Dairy & Eggs': 197,
         'Meat & Poultry': 350,
         'Fish & Shellfish': 196,
         'Nuts, Grains & Pasta': 229,
         'Condiments & Oils': 182,
         'Snacks & Baked Goods': 357,
         'Herbs & Spices': 134,
         'Beverages': 144})

In [43]:
# Spot-check the scraper on a single item page, supplying its category label by hand.
scrape_stilltasty('18595','Vegetables')

{'item_name': 'TURNIPS - FRESH, RAW',
 'category': 'Vegetables',
 'url': 'https://www.stilltasty.com/Fooditems/index/18595',
 'shelf_life': {'Refrigerator': '2-3 weeks', 'Freezer': '8-10 months'},
 'food_tips': 'Shelf Life Tips\nHow long do raw turnips last? The precise answer to that question depends to a large extent on storage conditions - keep raw turnips refrigerated. \r\nTo maximize the shelf life of raw turnips, refrigerate in plastic bag.\r\nHow long do raw turnips last in the fridge? Properly stored, raw turnips will typically last for 2 to 3 weeks in the refrigerator. \r\nCan you freeze turnips? Yes, to freeze: (1) Wash, peel and cut into 1/2-inch cubes; (2) Blanch (plunge into boiling water) for two minutes and chill quickly in ice cold water; (3) Drain off excess moisture, package in airtight containers or freezer bags and freeze immediately. \r\nHow long do turnips last in the freezer? Properly stored, turnips will maintain best quality in the freezer for about 10 months, 