In [275]:
# Imports
import pandas as pd
import numpy as np

In [282]:
# Load data
df = pd.read_csv("Allspecies_GBIFrecords_fieldnotes_complete.csv")

  df = pd.read_csv("Allspecies_GBIFrecords_fieldnotes_complete.csv")


In [283]:
# Check columns 
df.columns

Index(['gbifID', 'institutionCode', 'collectionCode', 'occurrenceID',
       'catalogNumber', 'recordNumber', 'recordedBy', 'fieldNumber',
       'eventDate', 'year', 'month', 'day', 'eventRemarks', 'countryCode',
       'stateProvince', 'locality', 'decimalLongitude', 'decimalLatitude',
       'elevation', 'habitat', 'references', 'basisOfRecord', 'taxonID',
       'scientificNameID', 'family', 'genus', 'species', 'scientificName',
       'taxonomicStatus', 'acceptedScientificName', 'datasetKey', 'mediaType',
       'occurrenceRemarks', 'dynamicProperties', 'fieldNotes',
       'reproductiveCondition', 'hasGeospatialIssues', 'hasCoordinate',
       'coordinateUncertaintyInMeters', 'issue', 'image_url',
       'acceptedScientificName_corr', 'Country_name', 'Binned_latitude'],
      dtype='object')

In [284]:
# Select the most important columns 
cols = ["gbifID", "institutionCode", "recordedBy", "eventDate", "year", 
       "month", "day", "Country_name", "decimalLongitude", "decimalLatitude", 
       "Binned_latitude", "elevation", "family", "genus", "species", 
       "acceptedScientificName_corr", "occurrenceRemarks", "dynamicProperties", 
       "fieldNotes", "image_url", "reproductiveCondition"]
df_filt = df[cols]

In [285]:
# Change acceptedScientificName_corr column name
df_filt = df_filt.rename(columns={"acceptedScientificName_corr":"acceptedScientificName"})

In [286]:
# Merge the three columns with fieldnotes information into a single one
fn_cols = ['occurrenceRemarks', 'dynamicProperties', 'fieldNotes']
df_filt['FieldNotes'] = df_filt[fn_cols].apply(lambda row: '; '.join(row.values.astype(str)), axis=1).copy()

In [287]:
df_filt['FieldNotes']

0        [Invalid Lat.: 04Â° 49' 509"S. Invalid Long.: ...
1        Cestrum megalophyllum voucher BioBot11285 matu...
2        Cestrum megalophyllum voucher BioBot10266 ribu...
3        Erythroxylum macrophyllum voucher BioBot10378 ...
4        Casearia sylvestris isolate PECB029 ribulose-1...
                               ...                        
68109    nan; "{""determinationfiledas"": ""Yes"", ""co...
68110    nan; "{""determinationfiledas"": ""Yes"", ""co...
68111    nan; "{""habit"":""Tree (10m). Light colored b...
68112    nan; "{""habit"":""Arbol de ca. 30 m de alto, ...
68113    nan; "{""habit"":""shrub 4 4 tall; fruit black...
Name: FieldNotes, Length: 68114, dtype: object

In [248]:
# Delete independent columns for fieldnotes
#df_filt.drop(['occurrenceRemarks', 'dynamicProperties', 'fieldNotes'], axis=1, inplace=True)

In [288]:
# Add a column with a filter per year: before 1970, between 1970 and 2010, and since 2011
# Define a function for classifying records by year
def add_year_col(row):
    if row['year'] < 1970:
        val = '<1970'
    elif (row['year'] >= 1970) and (row['year'] <= 2010):
        val = '1970-2010'
    else:
        val = '>2011'
    return val

#create new column using the function above
df_filt['Year_interval'] = df_filt.apply(add_year_col, axis=1)

In [289]:
# Count the number of values of the new column
df_filt['Year_interval'].value_counts()

1970-2010    51572
>2011        11838
<1970         4704
Name: Year_interval, dtype: int64

In [290]:
# Add a column with scientific names at the species level without authorities and subspecies
df_filt['acceptedScientificName_splevel'] = df_filt['acceptedScientificName'].str.extract('(^(?:\S+\s+){1,2})', expand=False)
df_filt['acceptedScientificName_splevel'] = df_filt['acceptedScientificName_splevel'].str.rstrip()

In [291]:
df_filt['acceptedScientificName_splevel']

0            Cavendishia bracteata
1            Cestrum megalophyllum
2            Cestrum megalophyllum
3        Erythroxylum macrophyllum
4              Casearia sylvestris
                   ...            
68109         Vernonanthura patens
68110         Vernonanthura patens
68111              Trema micrantha
68112              Trema micrantha
68113             Myrcia splendens
Name: acceptedScientificName_splevel, Length: 68114, dtype: object

In [215]:
# Export column at species level to compare it with the original list
sp_list_GBIF = pd.DataFrame(df_filt['acceptedScientificName_splevel'].unique())
sp_list_GBIF.to_csv("sp_list_GBIF.csv")

In [292]:
# Add a column with the number of records by species
df_filt["n_records_GBIF"] = pd.Series()

for i, row in df_filt.iterrows():   
    count_temp = len(df_filt[df_filt["acceptedScientificName"] == row["acceptedScientificName"]])
    df_filt.at[i,'n_records_GBIF']=count_temp

  df_filt["n_records_GBIF"] = pd.Series()


In [293]:
# Verify the correct total number of records 
test = df_filt.groupby(['acceptedScientificName', 'n_records_GBIF']).size().reset_index(name='count')
test["count"].sum()

68114

In [294]:
# Add a column with the interval of the number of records: 0-10, 11-100, 101-500, >500
# Define a function for classifying records by ther numbers
def add_nrecords_col(row):
    if (row['n_records_GBIF'] >= 0) and (row['n_records_GBIF'] <=10):
        val = '0-10'
    elif (row['n_records_GBIF'] >= 11) and (row['n_records_GBIF'] <=100):
        val = '11-100'
    elif (row['n_records_GBIF'] >= 101) and (row['n_records_GBIF'] <=500):
        val = '101-500'
    else:
        val = '>500'
    return val

#create new column using the function above
df_filt['Nrecords_interval'] = df_filt.apply(add_nrecords_col, axis=1)

In [295]:
# Verify the number of intervas by species
test1 = df_filt.groupby(['acceptedScientificName', 'Nrecords_interval']).size().reset_index(name='count')

In [296]:
# Count the number of values of the new column
test1["Nrecords_interval"].value_counts()

11-100     188
0-10       127
101-500     72
>500        36
Name: Nrecords_interval, dtype: int64

In [297]:
# Add a column with the names from the original database and the ones added by GBIF
# Load dataframe with the original names
orig_names = pd.read_csv("../../List_species/List_all_species_withauth_phenology_proj_UDLA2022_def_tnrs.csv")

#Convert df into a list
orig_names = orig_names["Scientific_name_auth"].values.tolist()

In [260]:
# Export list of accepted names to compare them to the initial db
test3 = pd.DataFrame(df_filt['acceptedScientificName'].unique())
test3.to_csv("acc_names_final.csv", index=False)

In [298]:
# Add a column to verify if the name was in the original database
df_filt["OrigDB_vs_GBIF"] = pd.Series()

for i in df_filt.index:
    if df_filt['acceptedScientificName'][i] in orig_names:
        df_filt.at[i,'OrigDB_vs_GBIF'] = "OriginalDB"
    else : #if both conditions not verified
        df_filt.at[i,'OrigDB_vs_GBIF'] = "GBIF"

  df_filt["OrigDB_vs_GBIF"] = pd.Series()


In [299]:
# Count the number of values of the new column
df_filt["OrigDB_vs_GBIF"].value_counts()

OriginalDB    65096
GBIF           3018
Name: OrigDB_vs_GBIF, dtype: int64

In [300]:
# Export final dataset as csv
df_filt.to_csv("Allspecies_GBIFrecords_fieldnotes_filtered.csv", index=False)

In [301]:
# Export final dataset as parquet
df_filt.to_parquet("Allspecies_GBIFrecords_fieldnotes_filtered.parquet", index=False)