In [4]:
"""Production script to extract data for each of the 50 plants.
The extracted data is a list of dictionaries, ready for transformation at the next stage."""
import logging
from requests import get, exceptions

from transform import convert_to_dataframe, parse_botanist_data, parse_origin_location, clean_image_data, clean_scientific_name, format_watered_column, format_recording_taken, capitalise_plant_name, process_temperature_column


logger = logging.getLogger(__name__)  # Module-level logger, named after this module
# Configure logging at INFO level; each record is rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
# NOTE: basicConfig is called here for demonstration only. In production the
# logging configuration may be set elsewhere.


def get_max_plant_id(base_url: str) -> int:
    """
    Returns the number of plants the API reports as being on display.

    The value is read from the "plants_on_display" field of the JSON document
    served at `base_url`.

    Args:
        base_url: URL of the API root that serves the status document.

    Returns:
        The "plants_on_display" count as a non-negative integer.

    Raises:
        ValueError: If the response is not a JSON object containing a
            non-negative integer "plants_on_display" value.
    """
    response = get(base_url, timeout=10)
    data = response.json()
    # Only a JSON object can carry the field. On a string or list, `in` would
    # mean substring/element membership and the lookup below would misbehave.
    if not isinstance(data, dict) or "plants_on_display" not in data:
        raise ValueError("Failed to fetch status information. Status code: %s" %
                         response.status_code)

    plant_count = data["plants_on_display"]
    # bool is a subclass of int, so it has to be excluded explicitly.
    is_valid_count = (isinstance(plant_count, int)
                      and not isinstance(plant_count, bool)
                      and plant_count >= 0)
    if not is_valid_count:
        raise ValueError(
            "Unexpected 'plants_on_display' value %r. Status code: %s" %
            (plant_count, response.status_code))
    return plant_count


def extract_plant_batch(
        base_url: str = "https://data-eng-plants-api.herokuapp.com/") -> list[dict]:
    """
    Returns list of dictionaries for all successful plant get requests.

    Args:
        base_url: Root URL of the plants API. Defaults to the production API;
            a trailing slash is added if it is missing.

    Returns:
        One dictionary per plant whose request succeeded, in ascending plant
        ID order. Plants whose request raised ValueError are logged and skipped.
    """
    # get_plant_data appends "plants/<id>" straight onto the base URL, so make
    # sure it ends with exactly one slash.
    base_url = base_url.rstrip("/") + "/"

    max_plant_id = get_max_plant_id(base_url)
    logger.info("Data for %d plants is available...", max_plant_id)
    # Adds 10% leeway to account for missing plants
    max_plant_id += max_plant_id // 10

    plant_data_list = []
    for plant_id in range(1, max_plant_id + 1):
        try:
            plant_data_list.append(get_plant_data(base_url, plant_id))
        except ValueError as e:
            # A single failing plant must not abort the whole batch.
            logger.error(
                "Error fetching data for plant ID %d: %s", plant_id, e)

    logger.info("Retrieved data for %d plants.", len(plant_data_list))
    return plant_data_list


def get_plant_data(base_url: str, plant_id: int) -> dict:
    """
    Returns data for a specific plant based on its ID number.

    Args:
        base_url: Root URL of the plants API; "plants/<plant_id>" is appended
            to it directly, so it is expected to end with "/".
        plant_id: ID of the plant to request.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        ValueError: If the data for the plant cannot be fetched, i.e. the
            request timed out, failed at the transport level, or returned a
            status code other than 200.
    """
    url = f"{base_url}plants/{plant_id}"
    try:
        response = get(url, timeout=7)
    except exceptions.Timeout as e:
        raise ValueError("Request timed out for plant ID %d: %s" %
                         (plant_id, e)) from e
    except exceptions.RequestException as e:
        # Connection errors and other request failures are reported through
        # the same ValueError contract, so one unreachable plant cannot crash
        # a caller that only handles ValueError.
        raise ValueError("Request failed for plant ID %d: %s" %
                         (plant_id, e)) from e

    if response.status_code != 200:
        raise ValueError("Failed to fetch data for plant ID %d. Status code: %d" %
                         (plant_id, response.status_code))

    plant_data = response.json()
    logger.info("Successfully fetched data for Plant ID %d", plant_id)
    return plant_data


def transform_and_clean_data(raw_data: list[dict]):
    """Builds a dataframe from the raw plant records and returns it after every cleaning step."""
    frame = convert_to_dataframe(raw_data)

    # Cleaning stages, applied strictly in this order; each one takes the
    # frame produced by the previous stage and returns the next one.
    cleaning_steps = (
        clean_image_data,
        clean_scientific_name,
        format_recording_taken,
        parse_botanist_data,
        format_watered_column,
        parse_origin_location,
        capitalise_plant_name,
        process_temperature_column,
    )
    for apply_step in cleaning_steps:
        frame = apply_step(frame)
    return frame

if __name__ == "__main__":
    # Extract the raw records, then build the cleaned dataframe.
    main_data = extract_plant_batch()
    dataframe_sample = transform_and_clean_data(main_data)

    # DataFrame.sample() returns a new frame and displays nothing by itself,
    # so its result has to be used explicitly. It is skipped when there are no
    # rows, because sampling from an empty frame raises.
    if not dataframe_sample.empty:
        logger.info("Sample row:\n%s", dataframe_sample.sample())

    # The complete raw payload is verbose, so it is only emitted at DEBUG level.
    logger.debug("%s", main_data)


2025-02-06 15:51:13,865 - INFO - Data for 49 plants is available...
2025-02-06 15:51:14,182 - INFO - Successfully fetched data for Plant ID 1
2025-02-06 15:51:19,464 - INFO - Successfully fetched data for Plant ID 2
2025-02-06 15:51:26,545 - ERROR - Error fetching data for plant ID 3: Request timed out for plant ID 3: HTTPSConnectionPool(host='data-eng-plants-api.herokuapp.com', port=443): Read timed out. (read timeout=7)
2025-02-06 15:51:33,621 - ERROR - Error fetching data for plant ID 4: Request timed out for plant ID 4: HTTPSConnectionPool(host='data-eng-plants-api.herokuapp.com', port=443): Read timed out. (read timeout=7)
2025-02-06 15:51:38,800 - INFO - Successfully fetched data for Plant ID 5
2025-02-06 15:51:45,868 - ERROR - Error fetching data for plant ID 6: Request timed out for plant ID 6: HTTPSConnectionPool(host='data-eng-plants-api.herokuapp.com', port=443): Read timed out. (read timeout=7)
2025-02-06 15:51:52,942 - ERROR - Error fetching data for plant ID 7: Request ti

In [5]:
# Show a bounded preview rather than rendering the whole dataframe.
dataframe_sample.head(10)

Unnamed: 0,last_watered,name,plant_id,recording_taken,soil_moisture,temperature,scientific_name,image_license,image_license_name,image_license_url,image_original_url,botanist_email,botanist_name,botanist_phone,region,country
0,2025-02-06 13:54:32,Venus Flytrap,1,2025-02-06 15:51:14,93.108081,12.064644,,,,,,gertrude.jekyll@lnhm.co.uk,Gertrude Jekyll,001-481-273-3691x127,South Whittier,US
1,2025-02-06 14:10:54,Corpse Flower,2,2025-02-06 15:51:19,94.517316,9.218329,,,,,,carl.linnaeus@lnhm.co.uk,Carl Linnaeus,(146)994-1635x35992,Efon-Alaaye,NG
2,2025-02-06 14:56:18,Pitcher Plant,5,2025-02-06 15:51:38,96.858272,11.248668,Sarracenia Catesbaei,451.0,CC0 1.0 Universal (CC0 1.0) Public Domain Dedi...,https://creativecommons.org/publicdomain/zero/...,https://perenual.com/storage/image/upgrade_acc...,carl.linnaeus@lnhm.co.uk,Carl Linnaeus,(146)994-1635x35992,Jashpurnagar,IN
3,2025-02-06 13:23:01,Bird Of Paradise,8,2025-02-06 15:51:53,91.120703,11.607922,Heliconia Schiedeana 'Fire And Ice',451.0,CC0 1.0 Universal (CC0 1.0) Public Domain Dedi...,https://creativecommons.org/publicdomain/zero/...,https://perenual.com/storage/image/upgrade_acc...,eliza.andrews@lnhm.co.uk,Eliza Andrews,(846)669-6651x75948,Bonoua,CI
4,2025-02-06 14:12:43,Cactus,9,2025-02-06 15:51:55,94.248445,10.641553,Pereskia Grandifolia,451.0,CC0 1.0 Universal (CC0 1.0) Public Domain Dedi...,https://creativecommons.org/publicdomain/zero/...,https://perenual.com/storage/image/upgrade_acc...,gertrude.jekyll@lnhm.co.uk,Gertrude Jekyll,001-481-273-3691x127,Weimar,DE
5,2025-02-06 13:04:57,Dragon Tree,10,2025-02-06 15:51:57,90.33749,13.986311,,,,,,gertrude.jekyll@lnhm.co.uk,Gertrude Jekyll,001-481-273-3691x127,Split,HR
6,2025-02-06 13:37:24,Asclepias Curassavica,11,2025-02-06 15:51:58,92.439652,9.521462,Asclepias Curassavica,4.0,Attribution License,https://creativecommons.org/licenses/by/2.0/,https://perenual.com/storage/species_image/100...,gertrude.jekyll@lnhm.co.uk,Gertrude Jekyll,001-481-273-3691x127,Kahului,US
7,2025-02-06 13:13:51,Brugmansia X Candida,12,2025-02-06 15:51:59,90.679374,12.897716,,,,,,eliza.andrews@lnhm.co.uk,Eliza Andrews,(846)669-6651x75948,Longview,US
8,2025-02-06 13:36:47,Canna Striata,13,2025-02-06 15:52:00,92.562791,9.615672,,,,,,eliza.andrews@lnhm.co.uk,Eliza Andrews,(846)669-6651x75948,Bensheim,DE
9,2025-02-06 14:17:54,Colocasia Esculenta,14,2025-02-06 15:52:01,94.687506,13.229965,Colocasia Esculenta,4.0,Attribution License,https://creativecommons.org/licenses/by/2.0/,https://perenual.com/storage/species_image/201...,gertrude.jekyll@lnhm.co.uk,Gertrude Jekyll,001-481-273-3691x127,Gainesville,US
