In [1]:
from collections import Counter
from time import sleep

import requests
from bs4 import BeautifulSoup

In [2]:
def scrape_stilltasty(index):
    """Scrape the storage-location / shelf-life pairs for one food item.

    Args:
        index: Identifier appended to the ``/Fooditems/index/`` URL.

    Returns:
        A dict mapping the stripped text of each ``div.food-storage-left``
        element to the stripped text of the ``div`` sibling that follows it.
        Returns ``None`` (after printing an error message) when the response
        status code is not 200.
    """
    # URL of the page to scrape
    url = f'https://www.stilltasty.com/Fooditems/index/{index}'

    # Send a GET request to the URL
    response = requests.get(url)

    # Bail out early if the request was not successful
    if response.status_code != 200:
        print(f"Error: Failed to retrieve data from {url}. Status code: {response.status_code}")
        return None

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    shelf_life = {}
    for location_element in soup.find_all('div', class_='food-storage-left'):
        info_element = location_element.find_next_sibling('div')
        if info_element is None:
            # find_next_sibling returns None when no later sibling <div>
            # exists; skip the entry instead of failing on `.text`.
            continue
        shelf_life[location_element.text.strip()] = info_element.text.strip()

    return shelf_life

In [13]:
def get_index_numbers(url, result_type, get_names=False):
    """Collect the trailing path segment of matching text links on a page.

    Args:
        url: Page to download.
        result_type: Selects which links are kept.
            ``'Food Items Indices'`` keeps hrefs containing
            ``/Fooditems/index/``; ``'Search Results Indices'`` keeps hrefs
            containing ``/searchitems/index/``. Any other string is used
            directly as the substring to look for, so callers may pass the
            path fragment itself. An empty string (or a non-string value)
            keeps every link that has an href.
        get_names: When true, also collect the stripped text of each link.

    Returns:
        A tuple ``(index_numbers, object_names)``. ``index_numbers`` holds the
        text after the last ``/`` of each kept href; ``object_names`` holds
        the link texts when ``get_names`` is true and is empty otherwise.
        Both lists are empty (and an error message is printed) when the
        response status code is not 200, so callers can always unpack or
        index the result.
    """
    index_numbers = []
    object_names = []

    # The branches must be mutually exclusive: with two independent `if`
    # statements the trailing `else` reset the 'Food Items Indices' target
    # to '' and every link on the page matched.
    if result_type == 'Food Items Indices':
        target_string = "/Fooditems/index/"
    elif result_type == 'Search Results Indices':
        target_string = "/searchitems/index/"
    elif isinstance(result_type, str):
        # Accept a raw href fragment as the filter.
        target_string = result_type
    else:
        target_string = ''

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Error: Failed to retrieve data from {url}. Status code: {response.status_code}")
        return index_numbers, object_names

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all links with href attribute containing target_string
    links = soup.find_all('a', href=lambda href: href and target_string in href)

    for link in links:
        # Remove image links
        if link.find('img'):
            continue
        # Extract the index number from the href attribute
        index_numbers.append(link['href'].split('/')[-1])
        if get_names:
            object_names.append(link.text.strip())

    return index_numbers, object_names

In [40]:
# URL of the StillTasty index page
url = 'https://www.stilltasty.com/Fooditems/index'
# get_index_numbers selects its href filter from a result-type label.
# Passing the raw "/searchitems/index/" fragment matched neither label, so the
# filter fell back to '' and every link on the page was returned.
result_type = 'Search Results Indices'
category_index_numbers, categories = get_index_numbers(url, result_type, get_names=True)
print(category_index_numbers)
print(categories)

['26', '25', '9', '27', '7', '28', '6', '31', '30', '5']
['Fruits', 'Vegetables', 'Dairy & Eggs', 'Meat & Poultry', 'Fish & Shellfish', 'Nuts, Grains & Pasta', 'Condiments & Oils', 'Snacks & Baked Goods', 'Herbs & Spices', 'Beverages']


In [15]:
# Print the number of category ids and the number of category names, one per line.
print(len(category_index_numbers), len(categories), sep='\n')

10
10


In [67]:
# Seconds to pause after scraping each results page.
REQUEST_DELAY_SECONDS = 3


def _page_sort_key(page_id):
    """Order ids such as '25', '25?page=2', '25?page=10' by numeric page.

    Plain string sorting puts '25?page=10' before '25?page=2'. Ids without a
    numeric '?page=' suffix are treated as page 1.
    """
    _, _, page = page_id.partition('?page=')
    return (int(page) if page.isdigit() else 1, page_id)


def _linked_ids(url, result_type):
    """Return the ids get_index_numbers finds at url, or [] if it returned nothing."""
    result = get_index_numbers(url, result_type)
    return list(result[0]) if result else []


all_index_numbers = []
all_categories = []
# Pair every category id with its own name *before* slicing: enumerating the
# slice restarts at 0 and would label category_index_numbers[1] with
# categories[0]. The [1:2] slice restricts the crawl to the second category.
for category_id, category_name in list(zip(category_index_numbers, categories))[1:2]:
    url = f'https://www.stilltasty.com/searchitems/index/{category_id}'
    #Initial results pages
    results_pages = sorted(
        {f'{category_id}', *_linked_ids(url, 'Search Results Indices')},
        key=_page_sort_key,
    )
    new_pages = results_pages
    # Every page ever queued. Remembering only the first and the current batch
    # lets later pages link back to middle ones, so the loop never ends.
    seen_pages = set()
    while new_pages:
        print(new_pages)
        seen_pages.update(new_pages)
        for page_id in new_pages:
            print(f"Now scraping index {page_id}")
            url = f'https://www.stilltasty.com/searchitems/index/{page_id}'
            index_numbers = _linked_ids(url, 'Food Items Indices')
            all_index_numbers.extend(index_numbers)
            all_categories.extend([category_name] * len(index_numbers))
            print(f'Completed index {page_id}')
            sleep(REQUEST_DELAY_SECONDS)
        #Check for additional pages
        print('Getting new pages')
        # new_pages is sorted numerically, so the last entry is the highest page.
        url = f'https://www.stilltasty.com/searchitems/index/{new_pages[-1]}'
        new_pages = sorted(
            set(_linked_ids(url, 'Search Results Indices')) - seen_pages,
            key=_page_sort_key,
        )

print(all_index_numbers)
print(all_categories)

['25', '25?page=2', '25?page=3', '25?page=4', '25?page=5', '25?page=6', '25?page=7', '25?page=8', '25?page=9']
Now scraping index 25
Completed index 25
Now scraping index 25?page=2
Completed index 25?page=2
Now scraping index 25?page=3
Completed index 25?page=3
Now scraping index 25?page=4
Completed index 25?page=4
Now scraping index 25?page=5
Completed index 25?page=5
Now scraping index 25?page=6
Completed index 25?page=6
Now scraping index 25?page=7
Completed index 25?page=7
Now scraping index 25?page=8
Completed index 25?page=8
Now scraping index 25?page=9
Completed index 25?page=9
Getting new pages
['25?page=10', '25?page=11', '25?page=12', '25?page=13']
Now scraping index 25?page=10
Completed index 25?page=10
Now scraping index 25?page=11
Completed index 25?page=11
Now scraping index 25?page=12
Completed index 25?page=12
Now scraping index 25?page=13
Completed index 25?page=13
Getting new pages
['25?page=14']
Now scraping index 25?page=14
Completed index 25?page=14
Getting new pag

KeyboardInterrupt: 

In [64]:
# Re-run the pagination lookup by hand: fetch the links on the last page of
# the current batch and keep only ids that are not already known.
url = f'https://www.stilltasty.com/searchitems/index/{new_pages[-1]}'
candidate_pages = get_index_numbers(url,"/searchitems/index/")[0]
known_pages = new_pages+results_pages
new_pages = [page for page in candidate_pages if page not in known_pages]

In [65]:
# Inspect the current value of `url`.
url

'https://www.stilltasty.com/searchitems/index/25?page=9'

In [66]:
# Inspect the current value of `new_pages`.
new_pages

['25?page=10', '25?page=11', '25?page=12', '25?page=13', '25?page=10']

In [35]:
# Count the distinct index numbers collected so far.
len(frozenset(all_index_numbers))

182

In [36]:
# Display the complete list of collected index numbers.
# NOTE(review): this renders every element; a slice would keep the output short.
all_index_numbers

['18767',
 '16353',
 '16354',
 '16367',
 '16368',
 '18799',
 '18800',
 '16454',
 '16455',
 '16539',
 '16540',
 '18801',
 '18802',
 '16596',
 '18961',
 '18966',
 '18803',
 '18804',
 '16679',
 '16680',
 '18768',
 '16819',
 '16820',
 '16841',
 '16842',
 '16875',
 '16876',
 '16906',
 '16907',
 '18632',
 '17980',
 '17981',
 '18789',
 '17193',
 '18805',
 '18806',
 '17239',
 '17240',
 '18962',
 '18967',
 '17261',
 '17262',
 '17316',
 '17317',
 '17318',
 '17319',
 '17320',
 '17321',
 '17322',
 '17323',
 '17324',
 '17325',
 '17326',
 '17327',
 '17328',
 '17329',
 '17330',
 '17331',
 '17408',
 '17409',
 '18763',
 '18807',
 '18808',
 '17417',
 '17418',
 '17422',
 '17423',
 '18810',
 '18809',
 '18540',
 '18541',
 '17472',
 '17473',
 '17515',
 '17647',
 '17654',
 '17655',
 '17668',
 '17669',
 '17675',
 '18733',
 '18960',
 '18965',
 '17765',
 '17766',
 '17791',
 '17792',
 '17793',
 '17794',
 '17795',
 '17796',
 '17797',
 '17798',
 '17799',
 '17800',
 '18846',
 '18847',
 '17803',
 '17804',
 '17805',


In [None]:
# Scraping every collected item is currently disabled. A loop whose body is
# only a comment is a syntax error, so `pass` keeps the cell runnable.
for i in all_index_numbers:
    # scrape_stilltasty(i)
    pass

In [26]:
# Tally how many collected index numbers carry each category label.
# Counter is imported in the notebook's top import cell.
Counter(all_categories)

Counter({'Fruits': 250,
         'Vegetables': 250,
         'Dairy & Eggs': 225,
         'Meat & Poultry': 250,
         'Fish & Shellfish': 225,
         'Nuts, Grains & Pasta': 250,
         'Condiments & Oils': 225,
         'Snacks & Baked Goods': 250,
         'Herbs & Spices': 175,
         'Beverages': 175})

In [None]:
# Run the scraper on a single food item and display the result.
scrape_stilltasty(index='17634')