In [1]:
import json
import os
import time
import warnings

import pandas as pd
import requests

# Pagination bounds for the species-list scrape.
# FIRST_PAGE is the page scraping starts from when no saved plant list exists.
FIRST_PAGE = 1
# LAST_PAGE is the upper page bound checked by the resume logic and by the scraping loop.
# NOTE(review): 13 is presumably the page count the API reported for the indoor=1 query -- confirm before re-running.
LAST_PAGE = 13

In [2]:
# Load the previously scraped plant list from CSV (if present) and work out
# which API page has to be requested next. Defines: CUR_DIR, DB_LIST_PATH,
# DB_LIST_ENTRIES, ENTRIES_PER_PAGE, plant_list_df and current_page.
CUR_DIR = os.getcwd()
DB_LIST_PATH = os.path.join(CUR_DIR, 'plant_list.csv')
DB_LIST_ENTRIES = [
    'id', 'common_name', 'scientific_name', 'other_name', 'family', 'hybrid',
    'authority', 'subspecies', 'cultivar', 'variety', 'species_epithet', 'genus',
    'regular_url', 'medium_url', 'small_url', 'thumbnail',
]
# Number of entries the API returns per page (the recorded run added 30 rows for page 1).
ENTRIES_PER_PAGE = 30

# Check if the file exists and load it, else create a new dataframe.
if os.path.exists(DB_LIST_PATH):
    plant_list_df = pd.read_csv(DB_LIST_PATH)
    print(plant_list_df.head())
    # Resume from the number of stored rows, not from the largest id: ids are
    # sparse in the filtered listing (page 1 already reaches id 747), so
    # `id.max() / 30` overshoots the real page, and it is NaN for an empty file.
    # A partially stored page is requested again; duplicates are skipped later.
    current_page = FIRST_PAGE + len(plant_list_df) // ENTRIES_PER_PAGE
    if current_page > LAST_PAGE:
        print("WARNING: Current page exceeds the last page. Plant list should be complete!")
    print(f"Next page to scrape [current page]: {current_page}")
else:
    plant_list_df = pd.DataFrame(columns=DB_LIST_ENTRIES)
    print("No plant list found. Creating a new one.")
    current_page = FIRST_PAGE

No plant list found. Creating a new one.


In [3]:
# Perenual species-list endpoint used by the scraping cell.
PERENUAL_API_URL = "https://perenual.com/api/v2/species-list"

# The API key is a secret and must not live in the notebook: read it from the
# environment instead. NOTE(review): the key that was previously committed in
# this cell (and in the saved outputs) should be treated as leaked and revoked.
PERENUAL_API_KEY = os.environ.get("PERENUAL_API_KEY")
if not PERENUAL_API_KEY:
    # Fail fast: without a key every request is rejected and the scraping
    # loop would only sleep and retry.
    raise RuntimeError(
        "PERENUAL_API_KEY is not set. Export it in the environment before running this notebook."
    )

In [None]:
# Scrape the remaining pages of the indoor species list into plant_list_df and
# persist the result to DB_LIST_PATH.
#
# Reads:  current_page, LAST_PAGE, PERENUAL_API_URL, PERENUAL_API_KEY,
#         plant_list_df and DB_LIST_PATH (defined in earlier cells).
# Writes: plant_list_df (new rows), current_page (advanced once per stored
#         page) and the CSV file at DB_LIST_PATH.
INITIAL_WAIT_MINUTES = 5       # first back-off delay after an unexpected error
MAX_WAIT_MINUTES = 60          # the back-off stops growing once it reaches this
REQUEST_TIMEOUT_SECONDS = 30   # do not let a stalled connection block the cell forever

# Top-level fields copied verbatim from each API entry ("null" when the key is absent).
_TEXT_FIELDS = (
    'common_name', 'scientific_name', 'other_name', 'family', 'hybrid', 'authority',
    'subspecies', 'cultivar', 'variety', 'species_epithet', 'genus',
)
# Fields taken from the nested 'default_image' object ('null' when unavailable).
_IMAGE_FIELDS = ('regular_url', 'medium_url', 'small_url', 'thumbnail')


def _entry_to_record(entry):
    """Flatten one API entry (a dict) into a row dict for plant_list_df.

    'id' is required and raises KeyError when missing. Every other top-level
    field falls back to "null" when its key is absent, and the image fields
    fall back to 'null' when 'default_image' is missing, None or lacks the key.
    """
    image = entry.get('default_image') or {}
    record = {'id': entry['id']}
    record.update((field, entry.get(field, "null")) for field in _TEXT_FIELDS)
    record.update((field, image.get(field, 'null')) for field in _IMAGE_FIELDS)
    return record


wait_time = INITIAL_WAIT_MINUTES  # minutes
# Set of ids already stored, so the duplicate check does not rescan the frame per entry.
known_ids = set(plant_list_df['id'].tolist())

try:
    # `<=` so that LAST_PAGE itself is fetched; the resume check only treats
    # current_page > LAST_PAGE as "complete".
    while current_page <= LAST_PAGE:
        # Log the request without the key so the secret never ends up in saved output.
        print(f"requesting {PERENUAL_API_URL}?page={current_page}&indoor=1")

        try:
            response = requests.get(
                PERENUAL_API_URL,
                params={'key': PERENUAL_API_KEY, 'page': current_page, 'indoor': 1},
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            status = response.status_code
        except requests.RequestException as exc:
            # Network failures are retried like any other non-200 answer. Only
            # the exception type is reported because its message can embed the
            # full URL, key included.
            response = None
            status = type(exc).__name__

        if status == 200:
            new_records = []
            for entry in response.json().get('data') or []:
                common_name = entry.get('common_name')
                if entry['id'] in known_ids:
                    print(f"\t{entry['id']}: {common_name} already exists in the dataframe. Skipping...")
                    continue
                new_records.append(_entry_to_record(entry))
                known_ids.add(entry['id'])
                print(f"\tadded {entry['id']}: {common_name}")

            if new_records:
                # One concat per page instead of one per entry.
                with warnings.catch_warnings():
                    # presumably silences pandas' deprecation warning about
                    # concatenating empty / all-NA frames -- TODO confirm
                    warnings.simplefilter("ignore")
                    plant_list_df = pd.concat(
                        [plant_list_df, pd.DataFrame(new_records)], ignore_index=True
                    )

            current_page += 1
            wait_time = INITIAL_WAIT_MINUTES  # reset the back-off after a success

        elif status == 429:
            print("Max API requests reached. Try again tomorrow.")
            print(f"Set current_page manually to {current_page} to continue scraping.")
            break
        else:
            print(f"Error: {status}")
            print(f"Wait for {wait_time} minutes...")

            time.sleep(int(wait_time) * 60)  # wait for the specified time
            if wait_time < MAX_WAIT_MINUTES:
                wait_time += 5
finally:
    # Always persist what was collected, even when the cell is interrupted
    # during a long sleep or an unexpected exception escapes the loop.
    print("Saving plant list to CSV...")
    plant_list_df.to_csv(DB_LIST_PATH, index=False)
    print("Plant list saved.")

requesting https://perenual.com/api/v2/species-list?key=[REDACTED]&page=1&indoor=1
	added 425: flowering-maple
	added 426: flowering-maple
	added 427: flowering-maple
	added 428: flowering-maple
	added 434: Jacob's coat
	added 502: hot water plant
	added 540: desert rose
	added 543: maidenhair fern
	added 546: delta maidenhair fern
	added 549: urn plant
	added 551: stalked aeonium
	added 552: lipstick plant
	added 625: Philippine evergreen
	added 626: Chinese evegreen
	added 627: Chinese evergreen
	added 628: Chinese evergreen
	added 667: onion
	added 710: elephant's ear
	added 711: Amazonian elephant's ear
	added 712: elephant's ear
	added 713: elephant's ear
	added 714: elephant's ear
	added 715: giant elephant's ear
	added 716: elephant's ear
	added 717: elephant's ear
	added 718: elephant's ear
	added 721: aloe
	added 727: aloe
	added 728: aloe
	added 747: Joseph's coat
requesting https://perenual.com/api/v2/species-list?key=[REDACTED]&page=2&indoor=1
	a