In [65]:
import pandas as pd
import numpy as np
import toyplot
import time

In [2]:
%pwd

'/Users/patrickmckenzie/googledrive/projects/flower_color_phenology/notebooks'

In [3]:
%cd ..

/Users/patrickmckenzie/googledrive/projects/flower_color_phenology


# Open iNaturalist observations csv

In [4]:
inat_data = pd.read_csv('./data/combined_raw_inaturalist_export.csv')

  inat_data = pd.read_csv('./data/combined_raw_inaturalist_export.csv')


In [5]:
inat_data.columns

Index(['id', 'observed_on_string', 'observed_on', 'time_observed_at',
       'time_zone', 'user_id', 'user_login', 'user_name', 'created_at',
       'updated_at', 'quality_grade', 'license', 'url', 'image_url',
       'sound_url', 'tag_list', 'description', 'num_identification_agreements',
       'num_identification_disagreements', 'captive_cultivated',
       'oauth_application_id', 'place_guess', 'latitude', 'longitude',
       'positional_accuracy', 'private_place_guess', 'private_latitude',
       'private_longitude', 'public_positional_accuracy', 'geoprivacy',
       'taxon_geoprivacy', 'coordinates_obscured', 'positioning_method',
       'positioning_device', 'species_guess', 'scientific_name', 'common_name',
       'iconic_taxon_name', 'taxon_id'],
      dtype='object')

In [41]:
len(inat_data.index)

1763883

In [118]:
# let's (way later) make a column for the first two words of every scientific name in the data
inat_data['binomial'] = [' '.join(str(i).split( )[:2]) for i in inat_data.scientific_name]

# What are the most frequent species?

In [8]:
species, counts = np.unique([' '.join(str(i).split( )[:2]) for i in inat_data.scientific_name],return_counts=True)

In [11]:
species_num_obs_df = pd.DataFrame([species,counts],index=['species','num_obs']).T

In [16]:
species_num_obs_df.sort_values('num_obs',ascending=False)[:20]

Unnamed: 0,species,num_obs
13095,Trillium grandiflorum,12838
4142,Dipterostemon capitatus,12324
13092,Trillium erectum,10808
11480,Sanguinaria canadensis,10059
13102,Trillium ovatum,9984
5389,Ficaria verna,9840
3007,Claytonia virginica,9176
4873,Erodium cicutarium,8690
3729,Cypripedium acaule,8048
7097,Lamium purpureum,7742


Something interesting jumps out here. The most-observed species in our dataset of flowering-labeled North American iNat observatinos is Trillium grandiflorum. Overall in North America on iNat, there are 35,497 observations of this species. So **12,838 / 35,497 = 0.36 of these are labeled as flowering.**

In contrast, there 7,742 observations of Lamium purpureum labeled as flowering in this dataset, in contrast to 50,333 total observations of this species in North America that I see on the iNaturalist website. So **only 0.15 of the Lamium purpureum observations have phenology labels.**

Finally, let's look at white clover (Trifolium repens). Only 5,504 of those observations have flowering phenology labels, despite there being 79,678 observations. That amounts to only 0.07 of the observations having flowering phenology labels!

# What's the distribution of species abundance?

In [35]:
toyplot.bars(np.histogram(list(species_num_obs_df.num_obs),bins=100));

# The VAST majority of species labeled as "flowering" are represented by very few records. Does this match iNaturalist observations of flowering plants overall?

# Another way to look at this:

In [36]:
species_num_obs_df

Unnamed: 0,species,num_obs
0,Abdra brachycarpa,13
1,Abelia ×,22
2,Abelmoschus,1
3,Abelmoschus esculentus,5
4,Abronia,2
...,...,...
13842,Zornia reticulata,2
13843,Zostera marina,2
13844,Zoysia matrella,1
13845,Zoysia pacifica,1


# 99,509 of the 1,763,883 observations come from the most common 10 species 

In [47]:
species_num_obs_df.sort_values('num_obs',ascending=False)[:10].num_obs.sum()

99509

# 298,820 of the 1,763,883 observations come from the most common 50 species 

In [44]:
species_num_obs_df.sort_values('num_obs',ascending=False)[:50].num_obs.sum()

298820

# 729,842 of the 1,763,883 observations come from the most common 250 species 

In [43]:
species_num_obs_df.sort_values('num_obs',ascending=False)[:250].num_obs.sum()

729842

# Remember: There are 13,846 total species in the dataset!

# Use pyinaturalist to get raw occurrence counts in inat

### First, subset our data:

In [70]:
subdf = species_num_obs_df.sample(1000)

In [71]:
import pyinaturalist

In [72]:
nelat = 54

nelng = -59

swlat = 24

swlng = -130


In [None]:
trillium
counts_list = []
for species_idx in range(len(subdf.index)):
    taxon_name = subdf.species.iloc[species_idx]
    counts = pyinaturalist.get_observation_species_counts(quality_grade='research',
                                                          nelat=nelat,
                                                          nelng=nelng,
                                                          swlat=swlat,
                                                          swlng=swlng,
                                                          taxon_id=47125, # angiosperms
                                                          taxon_name=taxon_name
                                                         )
    counts_list.append(counts['total_results'])
    time.sleep(1)
    if not species_idx%50:
        print(species_idx)

In [81]:
taxon_name='Trillium grandiflorum'
counts = pyinaturalist.get_observation_species_counts(quality_grade='research',
                                                          nelat=nelat,
                                                          nelng=nelng,
                                                          swlat=swlat,
                                                          swlng=swlng,
                                                          taxon_id=47125, # angiosperms
                                                          taxon_name=taxon_name
                                                         )

In [85]:
counts['results'][0]['count']

[1;36m34842[0m

In [88]:
counts_list = []
for species_idx in range(len(subdf.index)):
    taxon_name = subdf.species.iloc[species_idx]
    counts = pyinaturalist.get_observation_species_counts(quality_grade='research',
                                                          nelat=nelat,
                                                          nelng=nelng,
                                                          swlat=swlat,
                                                          swlng=swlng,
                                                          taxon_id=47125, # angiosperms
                                                          taxon_name=taxon_name
                                                         )
    counts_list.append(counts['results'][0]['count'])
    time.sleep(1)
    if not species_idx%50:
        print(species_idx)

0
50
100
150
200
250
300
350
400
450
500


IndexError: list index out of range

In [90]:
species_idx

[1;36m532[0m

In [91]:
#counts_list = []
for species_idx in range(532,len(subdf.index)):
    taxon_name = subdf.species.iloc[species_idx]
    counts = pyinaturalist.get_observation_species_counts(quality_grade='research',
                                                          nelat=nelat,
                                                          nelng=nelng,
                                                          swlat=swlat,
                                                          swlng=swlng,
                                                          taxon_id=47125,  # angiosperms
                                                          taxon_name=taxon_name
                                                         )
    # Check if counts['results'] is not empty
    if counts['results']:
        counts_list.append(counts['results'][0]['count'])
    else:
        # Append nan if counts['results'] is empty
        counts_list.append(np.nan)

    time.sleep(1)
    if not species_idx % 50:
        print(species_idx)


550
600
650
700
750
800
850
900
950


In [96]:
toyplot.bars(np.histogram(np.array(counts_list)[~np.isnan(counts_list)],bins=20));

Doesn't look that much different from the distribution of occurrences labeled as flowering.

# Is there a correlation between species that have many total observations and those that have many flowering-labeled observations?

In [98]:
subdf['total_counts_list'] = counts_list
subdf

Unnamed: 0,species,num_obs,total_counts_list
10307,Polygala verticillata,64,480.0
5039,Escobaria duncanii,1,33.0
9993,Physaria ovalifolia,6,115.0
2579,Ceanothus pauciflorus,120,709.0
6754,Ipomopsis congesta,37,713.0
...,...,...,...
2501,Castilleja salaisolaveae,11,11.0
9197,Paronychia albomarginata,4,4.0
7030,Koenigia phytolaccifolia,10,154.0
1963,Calochortus ownbeyi,4,4.0


In [102]:
toyplot.scatterplot(list(subdf.total_counts_list),list(subdf.num_obs),size=10);

In [106]:
from scipy.stats import linregress

# Fitting a line
slope, intercept, r_value, p_value, std_err = linregress(list(subdf.dropna().total_counts_list),list(subdf.dropna().num_obs))

# Generate values for the best fit line
x_fit = np.linspace(subdf.dropna().total_counts_list.min(), subdf.dropna().total_counts_list.max(), 100)
y_fit = slope * x_fit + intercept

In [109]:
canvas = toyplot.Canvas(width=600, height=600)
axes = canvas.cartesian()
mark1 = axes.scatterplot(list(subdf.total_counts_list),list(subdf.num_obs),size=10)
mark2 = axes.plot(x_fit,y_fit)

# Now let's introduce color to the equation...

In [112]:
color_matching = pd.read_csv('./data/cleaned_matched_colors.csv',index_col=0)

In [120]:
num_occs = []
for row in list(color_matching.species_name):
    num_occs.append(len(inat_data[inat_data.binomial.eq(row.capitalize())].index))

In [123]:
color_matching['num_occs_flowering'] = num_occs

In [125]:
red_df = color_matching[color_matching.color.eq('red')]

In [126]:
red_df

Unnamed: 0,species_name,color,num_occs_flowering
0,abelmoschus esculentus,red,5
34,achillea millefolium,red,5680
40,achyranthes aspera,red,2
49,adonis aestivalis,red,1
52,adonis aestivalis,red,1
...,...,...,...
3184,trifolium pratense,red,6053
3203,tropaeolum majus,red,486
3242,vaccinium angustifolium,red,373
3361,watsonia meriana,red,1


In [127]:
red_counts_list = []
for species_idx in range(len(red_df.index)):
    taxon_name = red_df.species_name.iloc[species_idx].capitalize()
    counts = pyinaturalist.get_observation_species_counts(quality_grade='research',
                                                          nelat=nelat,
                                                          nelng=nelng,
                                                          swlat=swlat,
                                                          swlng=swlng,
                                                          taxon_id=47125,  # angiosperms
                                                          taxon_name=taxon_name
                                                         )
    # Check if counts['results'] is not empty
    if counts['results']:
        red_counts_list.append(counts['results'][0]['count'])
    else:
        # Append nan if counts['results'] is empty
        red_counts_list.append(np.nan)

    time.sleep(1)
    if not species_idx % 50:
        print(species_idx)


0
50
100


IndexError: single positional indexer is out-of-bounds

In [130]:
red_df['num_occs_total'] = red_counts_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  red_df['num_occs_total'] = red_counts_list


In [131]:
red_df

Unnamed: 0,species_name,color,num_occs_flowering,num_occs_total
0,abelmoschus esculentus,red,5,190
34,achillea millefolium,red,5680,90092
40,achyranthes aspera,red,2,59
49,adonis aestivalis,red,1,6
52,adonis aestivalis,red,1,6
...,...,...,...,...
3184,trifolium pratense,red,6053,61194
3203,tropaeolum majus,red,486,9536
3242,vaccinium angustifolium,red,373,6554
3361,watsonia meriana,red,1,29


In [132]:
canvas = toyplot.Canvas(width=600, height=600)
axes = canvas.cartesian()
mark1 = axes.scatterplot(list(red_df.num_occs_total),list(red_df.num_occs_flowering),size=10)
#mark2 = axes.plot(x_fit,y_fit)

In [162]:
from scipy.stats import linregress

# Fitting a line
slope, intercept, r_value, p_value, std_err = linregress(list(red_df.num_occs_total),list(red_df.num_occs_flowering))

# Generate values for the best fit line
red_x_fit = np.linspace(red_df.num_occs_total.min(), red_df.num_occs_total.max(), 100)
red_y_fit = slope * red_x_fit + intercept

In [163]:
std_err

[1;36m0.0025243055287638098[0m

In [138]:
white_df = color_matching[color_matching.color.eq('white')].drop_duplicates()
len(white_df.index)

[1;36m464[0m

In [139]:
white_counts_list = []
for species_idx in range(len(white_df.index)):
    taxon_name = white_df.species_name.iloc[species_idx].capitalize()
    counts = pyinaturalist.get_observation_species_counts(quality_grade='research',
                                                          nelat=nelat,
                                                          nelng=nelng,
                                                          swlat=swlat,
                                                          swlng=swlng,
                                                          taxon_id=47125,  # angiosperms
                                                          taxon_name=taxon_name
                                                         )
    # Check if counts['results'] is not empty
    if counts['results']:
        white_counts_list.append(counts['results'][0]['count'])
    else:
        # Append nan if counts['results'] is empty
        white_counts_list.append(np.nan)

    time.sleep(1)
    if not species_idx % 50:
        print(species_idx)


0
50
100
150
200
250
300
350
400
450


In [140]:
white_df['num_occs_total'] = white_counts_list

In [154]:
white_df = white_df.dropna()

In [155]:
from scipy.stats import linregress

# Fitting a line
slope, intercept, r_value, p_value, std_err = linregress(list(white_df.num_occs_total),list(white_df.num_occs_flowering),)

# Generate values for the best fit line
white_x_fit = np.linspace(white_df.num_occs_total.min(), white_df.num_occs_total.max(), 100)
white_y_fit = slope * white_x_fit + intercept

In [161]:
std_err

[1;36m0.002034308211526399[0m

In [166]:
# show the relationship 
canvas = toyplot.Canvas(width=600, height=600)
axes = canvas.cartesian(xlabel = 'num_total_occurrences',ylabel='num_labeled_flowering')
mark3 = axes.scatterplot(list(white_df.num_occs_total),list(white_df.num_occs_flowering),size=10,color='black')
mark4 = axes.plot(white_x_fit,white_y_fit,color='black')
mark1 = axes.scatterplot(list(red_df.num_occs_total),list(red_df.num_occs_flowering),size=10,color='red')
mark2 = axes.plot(red_x_fit,red_y_fit,color='red')