# Demonstrate GPT-vision

### This is a demo notebook meant to illustrate how easy it is to get flower colors from photos, using the flower color + phenology project data as an example.

In [1]:
import pandas as pd
import numpy as np
from openai import OpenAI
import time

# load GPT client
client = OpenAI(
    api_key='your-key-goes-here' # fill in your api key here
)

# 0) Load and prepare data

### This is a big CSV of inaturalist observations

In [2]:
# load in the data
### these are raw observations ###
inat_data = pd.read_csv('../data/combined_raw_inaturalist_export.csv')
inat_data.head()

  inat_data = pd.read_csv('../data/combined_raw_inaturalist_export.csv')


Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,user_name,created_at,updated_at,...,geoprivacy,taxon_geoprivacy,coordinates_obscured,positioning_method,positioning_device,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id
0,47,"March 21, 2008",2008-03-21,,Central Time (US & Canada),7,lisa_and_robb,,2008-03-25 11:21:54 UTC,2023-03-12 05:50:05 UTC,...,,open,False,,,Texas Bluebonnet,Lupinus texensis,Texas bluebonnet,Plantae,49564.0
1,8009,2010-07-20,2010-07-20,,Mountain Time (US & Canada),453,leighannemcc,,2010-08-06 19:30:15 UTC,2020-06-04 22:16:05 UTC,...,,,False,,,Solanum rostratum,Solanum rostratum,buffalo-bur,Plantae,62642.0
2,32167,"September 22, 2011 10:58",2011-09-22,2011-09-22 14:58:00 UTC,Eastern Time (US & Canada),2370,ctracey,Christopher Tracey,2011-09-23 03:14:29 UTC,2022-05-17 18:47:54 UTC,...,,,False,,,Fendler's sandmat,Euphorbia fendleri,Fendler's Sandmat,Plantae,148407.0
3,35715,"October 14, 2011 2:17:52 PM CDT",2011-10-14,2011-10-14 19:17:52 UTC,Central Time (US & Canada),2281,cullen,Cullen Hanks,2011-10-15 04:38:10 UTC,2022-05-12 15:24:40 UTC,...,,,False,,,Spigelia texana,Spigelia texana,Texas pinkroot,Plantae,520535.0
4,38015,"March 13, 2011 15:36",2011-03-13,2011-03-13 20:36:00 UTC,Central Time (US & Canada),2670,atxnaturalist,Kari Gaukler,2011-11-03 03:55:14 UTC,2020-11-22 14:54:48 UTC,...,,,False,,,Texas baby blue eyes,Nemophila phacelioides,Texas baby blue eyes,Plantae,120550.0


## Filtering the dataset

### Hybrids

In [3]:
# screen out all hybrid names (with the 'x' character)
hybrid_mask = ~np.array(['x' in str(i).split() for i in inat_data.scientific_name])
print(np.sum(~hybrid_mask))
inat_data = inat_data[hybrid_mask]

40


In [4]:
# there is a special character for x that we also have to screen out!
hybrid_mask = ~np.array(['×' in str(i).split() for i in inat_data.scientific_name])
print(np.sum(~hybrid_mask))
inat_data = inat_data[hybrid_mask]

2992


### Single words

In [5]:
# screen out all scientific names that are one word
single_names_mask = ~np.array([len(str(i).split())==1 for i in inat_data.scientific_name])
print(np.sum(~single_names_mask))
inat_data = inat_data[single_names_mask]

2805


### Add binomial name column to ignore subspecific ID

In [6]:
# make a column that just keeps the binomial nomenclature
inat_data['binomial'] = [' '.join(str(i).split()[:2]) for i in inat_data.scientific_name]

# Goal:

## For this project I'm interested in **FOR EACH SPECIES** in the dataset, getting the color of that species. To do this I will make a list of all of the unique species, and I will access a single "default" photo for each species from iNaturalist. This is the photo from which I will extract flower color.

# 1) Get the unique species from the DataFrame based on the `binomial` column

In [7]:
# what are the unique species in the dataset
unique_species = np.unique(inat_data.binomial)
len(unique_species)

13378

In [8]:
unique_species[:10]

array(['Abdra brachycarpa', 'Abelmoschus esculentus', 'Abronia ameliae',
       'Abronia ammophila', 'Abronia angustifolia', 'Abronia elliptica',
       'Abronia fragrans', 'Abronia glabrifolia', 'Abronia gracilis',
       'Abronia latifolia'], dtype=object)

### Sort by frequency (arbitrary choice on my part)

In [9]:
unique_species, counts = np.unique(inat_data.binomial, return_counts=True)

In [10]:
sorted_unique_species = unique_species[np.argsort(counts)[::-1]]

In [11]:
sorted_unique_species[:10]

array(['Trillium grandiflorum', 'Dipterostemon capitatus',
       'Trillium erectum', 'Sanguinaria canadensis', 'Trillium ovatum',
       'Ficaria verna', 'Claytonia virginica', 'Erodium cicutarium',
       'Cypripedium acaule', 'Lamium purpureum'], dtype=object)

## Example of getting a default photo for an arbitrary taxon

In [12]:
import pyinaturalist

In [13]:
res = pyinaturalist.get_taxa('Monarda fistulosa')

In [14]:
total_results = res['total_results']
page = res['page']
per_page = res['per_page']
results = res['results']

photo = results[0]['default_photo']['medium_url']
photo

[32m'https://inaturalist-open-data.s3.amazonaws.com/photos/47763/medium.jpg'[0m

# 2) Get the list of urls to photos (in my case, one for each species)

In [15]:
# change the range here
startidx = 0
stopidx = 15

# get the taxa
urls = []
for taxon in sorted_unique_species[startidx:stopidx]: # for each taxon name in the range of idxs...
    res = pyinaturalist.get_taxa(taxon) # pull up the taxon
    total_results = res['total_results'] # extract the results
    page = res['page'] # dig down through the layers...
    per_page = res['per_page']
    results = res['results']

    if not len(results):
        photo=np.nan
    else:
        if results[0]['default_photo']:
            photo = results[0]['default_photo']['medium_url'] # get the url of the representative photo
        else:
            photo = np.nan
    urls.append(photo)
    time.sleep(0.5)

### now match the url up to the species name in a dataframe

In [16]:
inat_taxon_df = pd.DataFrame([sorted_unique_species[startidx:stopidx],urls],index=['binomial','photo_url']).T
inat_taxon_df

Unnamed: 0,binomial,photo_url
0,Trillium grandiflorum,https://inaturalist-open-data.s3.amazonaws.com...
1,Dipterostemon capitatus,https://inaturalist-open-data.s3.amazonaws.com...
2,Trillium erectum,https://inaturalist-open-data.s3.amazonaws.com...
3,Sanguinaria canadensis,https://inaturalist-open-data.s3.amazonaws.com...
4,Trillium ovatum,https://inaturalist-open-data.s3.amazonaws.com...
5,Ficaria verna,https://inaturalist-open-data.s3.amazonaws.com...
6,Claytonia virginica,https://static.inaturalist.org/photos/18391000...
7,Erodium cicutarium,https://inaturalist-open-data.s3.amazonaws.com...
8,Cypripedium acaule,https://inaturalist-open-data.s3.amazonaws.com...
9,Lamium purpureum,https://inaturalist-open-data.s3.amazonaws.com...


In [17]:
# get rid of the one that has no photo url (if any)
inat_taxon_df = inat_taxon_df[inat_taxon_df['photo_url'].notna()]
inat_taxon_df

Unnamed: 0,binomial,photo_url
0,Trillium grandiflorum,https://inaturalist-open-data.s3.amazonaws.com...
1,Dipterostemon capitatus,https://inaturalist-open-data.s3.amazonaws.com...
2,Trillium erectum,https://inaturalist-open-data.s3.amazonaws.com...
3,Sanguinaria canadensis,https://inaturalist-open-data.s3.amazonaws.com...
4,Trillium ovatum,https://inaturalist-open-data.s3.amazonaws.com...
5,Ficaria verna,https://inaturalist-open-data.s3.amazonaws.com...
6,Claytonia virginica,https://static.inaturalist.org/photos/18391000...
7,Erodium cicutarium,https://inaturalist-open-data.s3.amazonaws.com...
8,Cypripedium acaule,https://inaturalist-open-data.s3.amazonaws.com...
9,Lamium purpureum,https://inaturalist-open-data.s3.amazonaws.com...


# 3) Feed each picture link to GPT and get the flower color results

In [18]:
flower_present_list = []
color_list = []
conf_list = []
for idx in range(len(inat_taxon_df.photo_url)):
    # asking GPT the question
    response=client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please adhere to very specific formatting in your response: \
                    three words separated onto three lines (one word per line). The first line should indicate \
                    'YES' or 'NO' to answer whether there is a flower present. The second line should be one \
                    word from the following list, to best describe the flower color in the photo: ['BLUE', 'BROWN', \
                    'GREEN', 'ORANGE', 'PINK', 'PURPLE', 'RED', 'MAROON', 'WHITE', 'YELLOW','UNKNOWN','NAN']. The \
                    flowers might not match these categories perfectly. Do the best you can. If in doubt, please \
                    be conservative and choose 'unknown'. The third line should indicate your assessment of the \
                    subjectivity of the answer -- it should either be LOW, MEDIUM, or HIGH, where HIGH means that \
                    the choice of color assignment seems highly subjective."},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": inat_taxon_df.photo_url.iloc[idx],
                        },
                    },
                ],
            }
        ],
        max_tokens=300,
    )
    
    # parsing the response
    flower_present, color, conf = response.choices[0].message.content.split()
    
    # saving the results
    flower_present_list.append(flower_present)
    color_list.append(color)
    conf_list.append(conf)
    
    # wait a little while between requests (probably not necessary)
    time.sleep(1.0)
    
    # print out progress (every 25 queries)
    if not idx%25:
        print(idx)

0


### Add the columns to our dataframe

In [19]:
inat_taxon_df['flower_present'] = flower_present_list
inat_taxon_df['subjectivity'] = conf_list
inat_taxon_df['gpt_color'] = color_list

In [20]:
inat_taxon_df

Unnamed: 0,binomial,photo_url,flower_present,subjectivity,gpt_color
0,Trillium grandiflorum,https://inaturalist-open-data.s3.amazonaws.com...,YES,LOW,WHITE
1,Dipterostemon capitatus,https://inaturalist-open-data.s3.amazonaws.com...,YES,LOW,PURPLE
2,Trillium erectum,https://inaturalist-open-data.s3.amazonaws.com...,YES,LOW,WHITE
3,Sanguinaria canadensis,https://inaturalist-open-data.s3.amazonaws.com...,YES,LOW,WHITE
4,Trillium ovatum,https://inaturalist-open-data.s3.amazonaws.com...,YES,LOW,WHITE
5,Ficaria verna,https://inaturalist-open-data.s3.amazonaws.com...,YES,LOW,YELLOW
6,Claytonia virginica,https://static.inaturalist.org/photos/18391000...,YES,LOW,PINK
7,Erodium cicutarium,https://inaturalist-open-data.s3.amazonaws.com...,YES,LOW,PINK
8,Cypripedium acaule,https://inaturalist-open-data.s3.amazonaws.com...,YES,LOW,PINK
9,Lamium purpureum,https://inaturalist-open-data.s3.amazonaws.com...,YES,LOW,PINK


### Un-comment these lines to save the CSV

In [21]:
# filename = f'../data/gpt_labeled_taxon_photos_{startidx}_to_{stopidx}.csv'
# inat_taxon_df.to_csv(filename,index=False)