# Merging GPT-labeled taxon-specific colors with the iNaturalist occurrence data

## README:
This notebook reads in the products of the previous two notebooks -- the iNaturalist occurrence data for plants that are in flower, and the taxon-specific color labels from GPT -- and combines them into one large dataset of iNaturalist occurrences with their associated color labels: `./data/fulldata_cleaned_matched_GPT_colors.csv`

In [86]:
import numpy as np
import seaborn as sns
import os
import toyplot
import matplotlib.pyplot as plt
import pandas as pd
from dateutil import parser
from shapely.geometry import Point, Polygon
import geopandas as gpd
from geopandas import GeoDataFrame
import imageio
from IPython.display import Image
import rasterio

In [87]:
# Show the kernel's current working directory; the relative ./data paths used
# by later cells are resolved against it.
%pwd

'/Users/patrickmckenzie/googledrive/projects/flower_color_phenology'

In [89]:
# Move into the project directory so the relative ./data paths in later cells resolve.
# NOTE(review): this is a relative path, so the cell is not idempotent -- if the
# kernel is already inside flower_color_phenology/, a second run will look for a
# nested directory of the same name. Confirm the intended starting directory.
%cd flower_color_phenology/

/Users/patrickmckenzie/googledrive/projects/flower_color_phenology


In [11]:
# Seed for the per-observation color draw below, so that a fresh
# Restart & Run All reproduces the same labels (the draw used to be unseeded).
COLOR_SEED = 42
color_rng = np.random.default_rng(COLOR_SEED)

coldf = pd.read_csv('./data/FULL_gpt_labeled_taxon.csv')
inatdata = pd.read_csv('./data/combined_raw_inaturalist_export.csv')
# let's make a column for the first two words of every scientific name in the data
inatdata['binomial'] = [' '.join(str(i).split()[:2]) for i in inatdata.scientific_name]

# Index the GPT labels by binomial ONCE, instead of re-scanning all of coldf with
# a boolean mask for every observation. Labels are coerced with str() before
# lower-casing, so a missing label becomes the string 'nan' rather than raising.
labels_by_binomial = (
    coldf.assign(gpt_color=coldf.gpt_color.astype(str).str.lower())
    .groupby('binomial')['gpt_color']
    .apply(list)
    .to_dict()
)

# For every observation, draw one label uniformly at random from the labels
# recorded for its binomial; observations whose binomial has no label get NaN.
color_list = []
for obs_sp in inatdata.binomial:
    labels = labels_by_binomial.get(obs_sp)
    if labels is None:
        color = np.nan
    else:
        color = labels[color_rng.integers(len(labels))]
    color_list.append(color)

inatdata['color'] = color_list

# keep only the observations that were matched to a color label
inatdata_plus_color = inatdata[~inatdata.color.isna()]

len(inatdata_plus_color.index)

  inatdata = pd.read_csv('./data/combined_raw_inaturalist_export.csv')


1758405

In [10]:
# total number of rows in the raw iNaturalist table
inatdata.shape[0]

1763883

In [18]:
# drop rows whose color label is the literal string 'nan'
inatdata_plus_color = inatdata_plus_color[inatdata_plus_color.color.ne('nan')]

In [19]:
# rows remaining after the 'nan' labels were removed
inatdata_plus_color.shape[0]

1675600

In [20]:
# drop rows whose color label is 'unknown'
inatdata_plus_color = inatdata_plus_color[inatdata_plus_color.color.ne('unknown')]

In [21]:
# rows remaining after the 'unknown' labels were removed
inatdata_plus_color.shape[0]

1675263

### Remove the hybrid or single-name taxa

In [22]:
# screen out all hybrid names (with the 'x' character)
hybrid_mask = ~np.array(['x' in str(i).split() for i in inatdata_plus_color.scientific_name])
print(np.sum(~hybrid_mask))
inatdata_plus_color = inatdata_plus_color[hybrid_mask]

1


In [23]:
# rows remaining after the ASCII-'x' hybrids were removed
inatdata_plus_color.shape[0]

1675262

In [24]:
# there is a special character for x that we also have to screen out!
hybrid_mask = ~np.array(['×' in str(i).split() for i in inatdata_plus_color.scientific_name])
print(np.sum(~hybrid_mask))
inatdata_plus_color = inatdata_plus_color[hybrid_mask]

354


In [25]:
# screen out all scientific names that are one word
single_names_mask = ~np.array([len(str(i).split())==1 for i in inatdata_plus_color.scientific_name])
print(np.sum(~single_names_mask))
inatdata_plus_color = inatdata_plus_color[single_names_mask]

0


In [27]:
# rows remaining after the hybrid and single-word-name filters
inatdata_plus_color.shape[0]

1674908

### Add the day of year column

In [29]:
# Parse all observation dates in one vectorized call instead of running dateutil
# once per row, then take the 1-based day of the year.
# The cast to int64 keeps the column integer-typed and raises if any date is
# missing, rather than silently writing NaN.
days_list = (
    pd.to_datetime(inatdata_plus_color.observed_on)
    .dt.dayofyear
    .astype('int64')
    .tolist()
)

# .assign returns a new frame, so the column is not written in place onto a
# frame that was produced by boolean filtering.
inatdata_plus_color = inatdata_plus_color.assign(day_of_year=days_list)
inatdata_plus_color

Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,user_name,created_at,updated_at,...,positioning_method,positioning_device,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id,binomial,color,day_of_year
0,47,"March 21, 2008",2008-03-21,,Central Time (US & Canada),7,lisa_and_robb,,2008-03-25 11:21:54 UTC,2023-03-12 05:50:05 UTC,...,,,Texas Bluebonnet,Lupinus texensis,Texas bluebonnet,Plantae,49564.0,Lupinus texensis,blue,81
1,8009,2010-07-20,2010-07-20,,Mountain Time (US & Canada),453,leighannemcc,,2010-08-06 19:30:15 UTC,2020-06-04 22:16:05 UTC,...,,,Solanum rostratum,Solanum rostratum,buffalo-bur,Plantae,62642.0,Solanum rostratum,yellow,201
2,32167,"September 22, 2011 10:58",2011-09-22,2011-09-22 14:58:00 UTC,Eastern Time (US & Canada),2370,ctracey,Christopher Tracey,2011-09-23 03:14:29 UTC,2022-05-17 18:47:54 UTC,...,,,Fendler's sandmat,Euphorbia fendleri,Fendler's Sandmat,Plantae,148407.0,Euphorbia fendleri,green,265
3,35715,"October 14, 2011 2:17:52 PM CDT",2011-10-14,2011-10-14 19:17:52 UTC,Central Time (US & Canada),2281,cullen,Cullen Hanks,2011-10-15 04:38:10 UTC,2022-05-12 15:24:40 UTC,...,,,Spigelia texana,Spigelia texana,Texas pinkroot,Plantae,520535.0,Spigelia texana,white,287
4,38015,"March 13, 2011 15:36",2011-03-13,2011-03-13 20:36:00 UTC,Central Time (US & Canada),2670,atxnaturalist,Kari Gaukler,2011-11-03 03:55:14 UTC,2020-11-22 14:54:48 UTC,...,,,Texas baby blue eyes,Nemophila phacelioides,Texas baby blue eyes,Plantae,120550.0,Nemophila phacelioides,blue,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1763878,187151184,2023-10-11 10:18:05-07:00,2023-10-11,2023-10-11 17:18:05 UTC,Pacific Time (US & Canada),573539,bmacm,Barbara Millett,2023-10-11 18:55:36 UTC,2023-10-11 22:10:28 UTC,...,,,greenspot nightshade,Solanum douglasii,greenspot nightshade,Plantae,64105.0,Solanum douglasii,purple,284
1763879,187152623,2023-10-11 10:08:36-07:00,2023-10-11,2023-10-11 17:08:36 UTC,Pacific Time (US & Canada),573539,bmacm,Barbara Millett,2023-10-11 19:08:46 UTC,2023-10-11 22:46:44 UTC,...,,,scarlet monkeyflower,Erythranthe cardinalis,scarlet monkeyflower,Plantae,319974.0,Erythranthe cardinalis,orange,284
1763880,187155066,2023-10-11 12:30:00-07:00,2023-10-11,2023-10-11 19:30:00 UTC,Pacific Time (US & Canada),7396362,noahhaas,,2023-10-11 19:30:21 UTC,2023-10-12 03:10:47 UTC,...,,,mule fat,Baccharis salicifolia,mule fat,Plantae,57913.0,Baccharis salicifolia,pink,284
1763881,187157269,2023-10-11 10:15:01-07:00,2023-10-11,2023-10-11 17:15:01 UTC,Pacific Time (US & Canada),573539,bmacm,Barbara Millett,2023-10-11 19:48:42 UTC,2023-10-11 20:27:06 UTC,...,,,White Sweetclover,Melilotus albus,White Sweetclover,Plantae,58907.0,Melilotus albus,white,284


### Write out the cleaned, GPT-color-labeled dataset

In [30]:
# Persist the cleaned, color-labeled observations; the DataFrame index is not written.
inatdata_plus_color.to_csv(
    path_or_buf='./data/fulldata_cleaned_matched_GPT_colors.csv',
    index=False,
)