In [2]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [239]:
# Load data
df = pd.read_csv("Allspecies_GBIFrecords_fieldnotes_filtered.csv")

In [240]:
# Check columns 
df.columns

Index(['gbifID', 'institutionCode', 'recordedBy', 'eventDate', 'year', 'month',
       'day', 'Country_name', 'decimalLongitude', 'decimalLatitude',
       'Binned_latitude', 'elevation', 'family', 'genus', 'species',
       'acceptedScientificName', 'occurrenceRemarks', 'dynamicProperties',
       'fieldNotes', 'image_url', 'reproductiveCondition', 'FieldNotes',
       'Year_interval', 'acceptedScientificName_splevel', 'n_records_GBIF',
       'Nrecords_interval', 'OrigDB_vs_GBIF'],
      dtype='object')

In [241]:
# Delete binned latitude and longitude
df.drop(['Binned_latitude'], axis=1, inplace=True)

## Remove true duplicates from the dataset

In [242]:
# Size of df before deleting true duplicates
df.shape

(68114, 26)

In [243]:
#  Drop true duplicates 
dup_cols = ['acceptedScientificName', 'decimalLongitude', 'decimalLatitude','year', 'recordedBy']
df = df.drop(df.loc[:, dup_cols].dropna().duplicated().loc[lambda x: x].index)

In [244]:
# Size of df after deleteing true duplicates
df.shape

(54818, 26)

In [245]:
# Check if true duplicates were deleted 
(df.loc[:, dup_cols].dropna().duplicated().loc[lambda x: x].index).shape

(0,)

In [None]:
df = df.drop(df.loc[:, dup_cols].dropna().duplicated().loc[lambda x: x].index)

In [247]:
df = df.drop_duplicates()

In [249]:
# Reset indices of the dataframe
df = df.reset_index(drop=True)

In [250]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54759 entries, 0 to 54758
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gbifID                          54759 non-null  int64  
 1   institutionCode                 54568 non-null  object 
 2   recordedBy                      53248 non-null  object 
 3   eventDate                       54759 non-null  object 
 4   year                            54759 non-null  float64
 5   month                           54759 non-null  float64
 6   day                             54107 non-null  float64
 7   Country_name                    54759 non-null  object 
 8   decimalLongitude                54759 non-null  float64
 9   decimalLatitude                 54759 non-null  float64
 10  elevation                       41330 non-null  float64
 11  family                          54759 non-null  object 
 12  genus                           

## Obtain original names changed by GBIF 

In [12]:
# Export column at species level to compare it with the original list - obtain scientifc names of the original list
# to be changed
sp_list_GBIF = pd.DataFrame(df['acceptedScientificName_splevel'].unique())
sp_list_GBIF.to_csv("sp_list_GBIF.csv", index=False)

## Rename and delete names added by GBIF (not present in the search list)

In [251]:
# Select and export records with GBIF species (not present in the original DB)
sp_list_onlyGBIF = pd.DataFrame(df[df["OrigDB_vs_GBIF"] == "GBIF"]["acceptedScientificName"].unique())
sp_list_onlyGBIF.to_csv("sp_list_only_GBIF.csv", index=False)

In [252]:
# Rename scientific names with subspecies that were added by GBIF and not present on the original list
# These subspecies had the species name as synonym in TROPICOS
df['acceptedScientificName'] = df['acceptedScientificName'].replace(
                                            ['Alchornea glandulosa subsp. glandulosa',
                                             'Brosimum utile subsp. allenii (Woodson) C.C.Berg',
                                             'Geonoma undata subsp. edulis (H.Wendl. ex Spruce) A.J.Hend.',
                                             'Geonoma undata subsp. undata',
                                             'Guarea glabra subsp. excelsa (Kunth) T.D.Penn.',
                                             'Guarea glabra subsp. glabrescens (Hook. & Arn.) T.D.Penn.',
                                             'Guarea glabra subsp. microcarpa (C.DC.) T.D.Penn.',
                                             'Guarea macrophylla subsp. pendulispica (C.DC.) Penn.',
                                             'Handroanthus chrysanthus subsp. chrysanthus',
                                             'Handroanthus chrysanthus subsp. pluvicola (A.H.Gentry) S.O.Grose',
                                             'Inga nobilis subsp. nobilis',
                                             'Pourouma bicolor subsp. bicolor',
                                             'Pourouma bicolor subsp. chocoana (Standl.) C.C. Berg & Heusden',
                                             'Prestoea acuminata var. acuminata',
                                             'Prestoea acuminata var. dasystachys (Burret) A.J. Hend. & Galeano',
                                             'Prestoea acuminata var. montana (Graham) A.J. Hend. & Galeano',
                                             'Roupala montana var. impressiuscula (Mez) K.S. Edwards',
                                             'Salacia cordata subsp. cordata',
                                             'Saurauia tomentosa var. sprucei (Sprague) Soejarto',
                                             'Brosimum utile subsp. occidentale C.C.Berg',
                                             'Ficus americana subsp. guianensis (Desv.) C.C.Berg',
                                             'Inga nobilis subsp. quaternata (Poepp.) T.D.Penn.'
                                            ],
                                            ['Alchornea glandulosa Poepp.',
                                             'Brosimum utile (Kunth) Oken',
                                             'Geonoma undata Klotzsch',
                                             'Geonoma undata Klotzsch',
                                             'Guarea glabra Vahl',
                                             'Guarea glabra Vahl',
                                             'Guarea glabra Vahl',
                                             'Guarea macrophylla Vahl',
                                             'Handroanthus chrysanthus (Jacq.) S.O. Grose',
                                             'Handroanthus chrysanthus (Jacq.) S.O. Grose',
                                             'Inga nobilis Willd.',
                                             'Pourouma bicolor Mart.',
                                             'Pourouma bicolor Mart.',
                                             'Prestoea acuminata (Willd.) H.E. Moore',
                                             'Prestoea acuminata (Willd.) H.E. Moore',
                                             'Prestoea acuminata (Willd.) H.E. Moore',
                                             'Roupala montana Aubl.',
                                             'Salacia cordata (Miers) Mennega',
                                             'Saurauia tomentosa (Kunth) Spreng.',
                                             'Brosimum utile subsp. occidentale C.C. Berg',
                                             'Ficus americana subsp. guianensis (Desv. ex Ham.) C.C. Berg',
                                             'Inga nobilis subsp. quaternata (Poepp.) T.D. Penn.'
                                             ])

In [253]:
# Delete records with subspecies that were added by GBIF and not present on the original list
# These subspecies had not the species name as synonym in TROPICOS
def filter_rows_by_values(df, col, values):
    return df[~df[col].isin(values)].copy()

df_filt = filter_rows_by_values(df, 'acceptedScientificName', ['Alchornea glandulosa subsp. iricurana (Casar.) Secco',
                                         'Ficus americana subsp. americana',
                                         'Geonoma undata subsp. appuniana (Spruce) A.J.Hend.',
                                         'Geonoma undata subsp. pulcherrima (Burret) A.J.Hend.',
                                         'Geonoma undata subsp. stenothyrsa (Burret) A.J.Hend.',
                                         'Guarea macrophylla subsp. pachycarpa (C.DC.) Penn.',
                                         'Guarea macrophylla subsp. spicaeflora (A.Juss.) Penn.',
                                         'Guarea macrophylla subsp. tuberculata (Vell.) Penn.',
                                         'Guarea macrophylla subsp. spiciflora (A.Juss.) T.D.Penn.',
                                         'Handroanthus chrysanthus subsp. meridionalis (A.H.Gentry) S.O.Grose',
                                         'Pourouma bicolor subsp. digitata (Trécul) Berg & Heusden',
                                         'Pourouma bicolor subsp. tessmannii (Mildbr.) C.C. Berg & Heusden',
                                         'Roupala montana var. brasiliensis (Klotzsch) K.S. Edwards',
                                         'Roupala montana var. montana',
                                         'Roupala montana var. paraensis (Sleumer) K.S. Edwards',
                                         'Salacia cordata subsp. petenensis (Lundell) Lombardi'])

In [255]:
# Delete column of original db vs GBIF
df_filt.drop(['OrigDB_vs_GBIF'], axis=1, inplace=True)

In [256]:
# Add a column with the names from the original database and the ones added by GBIF
# Load dataframe with the original names
orig_names = pd.read_csv("../../List_species/List_all_species_withauth_phenology_proj_UDLA2022_def_tnrs.csv")

#Convert df into a list
orig_names = orig_names["Scientific_name_auth"].values.tolist()

In [257]:
# Add a column to verify if the name was in the original database
df_filt["OrigDB_vs_GBIF"] = pd.Series()

for i in df_filt.index:
    if df_filt['acceptedScientificName'][i] in orig_names:
        df_filt.at[i,'OrigDB_vs_GBIF'] = "OriginalDB"
    else : #if both conditions not verified
        df_filt.at[i,'OrigDB_vs_GBIF'] = "GBIF"

  df_filt["OrigDB_vs_GBIF"] = pd.Series()


In [258]:
# Count the number of values of the new column
df_filt["OrigDB_vs_GBIF"].value_counts()

OriginalDB    53744
GBIF            405
Name: OrigDB_vs_GBIF, dtype: int64

In [259]:
# Select and export records with GBIF species (not present in the original DB)
sp_list_onlyGBIF_process = pd.DataFrame(df_filt[df_filt["OrigDB_vs_GBIF"] == "GBIF"]["acceptedScientificName"].unique())
sp_list_onlyGBIF_process.to_csv("sp_list_only_GBIF_afterprocessing.csv", index=False)

In [260]:
df_filt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54149 entries, 0 to 54758
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gbifID                          54149 non-null  int64  
 1   institutionCode                 53962 non-null  object 
 2   recordedBy                      52645 non-null  object 
 3   eventDate                       54149 non-null  object 
 4   year                            54149 non-null  float64
 5   month                           54149 non-null  float64
 6   day                             53510 non-null  float64
 7   Country_name                    54149 non-null  object 
 8   decimalLongitude                54149 non-null  float64
 9   decimalLatitude                 54149 non-null  float64
 10  elevation                       41005 non-null  float64
 11  family                          54149 non-null  object 
 12  genus                           

## FieldNotes single column (without dyamic properties)

In [261]:
# Delete column of merged fieldnotes
df_filt.drop(['FieldNotes'], axis=1, inplace=True)

In [262]:
# Merge two columns (occurrenceRemarks and fieldNotes) with fieldnotes information into a single one
fn_cols = ['occurrenceRemarks', 'fieldNotes']
df_filt['FieldNotes'] = df_filt[fn_cols].apply(lambda row: '; '.join(row.values.astype(str)), axis=1).copy()

In [263]:
# Check the new column
df_filt['FieldNotes']

0        [Invalid Lat.: 04Â° 49' 509"S. Invalid Long.: ...
1        Cestrum megalophyllum voucher BioBot11285 matu...
2        Cestrum megalophyllum voucher BioBot10266 ribu...
3        Erythroxylum macrophyllum voucher BioBot10378 ...
4        Casearia sylvestris isolate PECB029 ribulose-1...
                               ...                        
54754                                             nan; nan
54755                                             nan; nan
54756                                             nan; nan
54757                                             nan; nan
54758                                             nan; nan
Name: FieldNotes, Length: 54149, dtype: object

In [264]:
# Check the dynamic prop column
df_filt[df_filt['dynamicProperties'].notnull()]['dynamicProperties'].iloc[500]

'Muestra de tejido: si'

In [265]:
# Calculate the % of records that only have dynamic prop 
print(f"{df_filt[['fieldNotes', 'occurrenceRemarks']].apply(lambda row: row.notnull().any(), axis=1).mean()*100:.2f} % or records have at least one of them")

86.35 % or records have at least one of them


In [266]:
df_filt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54149 entries, 0 to 54758
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gbifID                          54149 non-null  int64  
 1   institutionCode                 53962 non-null  object 
 2   recordedBy                      52645 non-null  object 
 3   eventDate                       54149 non-null  object 
 4   year                            54149 non-null  float64
 5   month                           54149 non-null  float64
 6   day                             53510 non-null  float64
 7   Country_name                    54149 non-null  object 
 8   decimalLongitude                54149 non-null  float64
 9   decimalLatitude                 54149 non-null  float64
 10  elevation                       41005 non-null  float64
 11  family                          54149 non-null  object 
 12  genus                           

## Obtain binned latitude and longitue columns 

In [268]:
# Obtain the max latitude
max(df_filt["decimalLatitude"])

47.625

In [269]:
# Obtain the min latitude
min(df_filt["decimalLatitude"])

-52.0225

In [270]:
# Create a list with the bins for the binned latitude column
start_lat = -60
n_lat = 70
interval_lat = 20
bins_lat = np.arange(start_lat, n_lat, interval_lat)

In [271]:
bins_lat

array([-60, -40, -20,   0,  20,  40,  60])

In [272]:
# Create a column with the binned latitude values
df_filt['Binned_latitude'] = pd.cut(df_filt['decimalLatitude'], bins_lat)

In [273]:
# Convert column type into object
df_filt["Binned_latitude"] = df_filt["Binned_latitude"].astype('string')

In [274]:
# Check binned latitude counts
df_filt['Binned_latitude'].value_counts()

(0, 20]       27077
(-20, 0]      15368
(-40, -20]     9361
(20, 40]       2341
(-60, -40]        1
(40, 60]          1
Name: Binned_latitude, dtype: Int64

In [275]:
# Delete records with extreme latitude values
df_filt = filter_rows_by_values(df_filt, 'Binned_latitude', ['(-60, -40]', '(40, 60]'])

In [276]:
# Check binned latitude counts
df_filt['Binned_latitude'].value_counts()

(0, 20]       27077
(-20, 0]      15368
(-40, -20]     9361
(20, 40]       2341
Name: Binned_latitude, dtype: Int64

In [277]:
# Obtain the max longitude
max(df_filt["decimalLongitude"])

-26.26

In [278]:
# Obtain the min longitude
min(df_filt["decimalLongitude"])

-177.433

In [279]:
# Create a list with the bins for the binned longitude column
start_long = -180
n_long = -10
interval_long = 20
bins_long = np.arange(start_long, n_long, interval_long)

In [280]:
# Create a column with the binned longitude values
df_filt['Binned_longitude'] = pd.cut(df_filt['decimalLongitude'], bins_long)

In [281]:
# Convert column type into object
df_filt["Binned_longitude"] = df_filt["Binned_longitude"].astype("string")

In [282]:
# Check binned longitude counts
df_filt['Binned_longitude'].value_counts()

(-60, -40]      17527
(-80, -60]      17429
(-100, -80]     14269
(-40, -20]       3640
(-120, -100]     1242
(-160, -140]       36
(-140, -120]        3
(-180, -160]        1
Name: Binned_longitude, dtype: Int64

In [283]:
# Delete records with extreme longitude values
df_filt = filter_rows_by_values(df_filt, 'Binned_longitude', ['(-180, -160]'])

In [284]:
# Check binned longitude counts
df_filt['Binned_longitude'].value_counts()

(-60, -40]      17527
(-80, -60]      17429
(-100, -80]     14269
(-40, -20]       3640
(-120, -100]     1242
(-160, -140]       36
(-140, -120]        3
Name: Binned_longitude, dtype: Int64

## Export final dataset

In [287]:
# Export final dataset as csv
df_filt.to_csv("Allspecies_GBIFrecords_fieldnotes_filtered_def.csv", index=False)

In [291]:
# Export final dataset as parquet
df_filt.to_parquet("Allspecies_GBIFrecords_fieldnotes_filtered_def.parquet", index=False)

## Obtain stratified validation dataset

In [None]:
# This sklearn function did not work
#df_train, df_test = train_test_split(df_filt, test_size=3000, random_state=42, stratify=df_filt[["Year_interval", 
 #                                                                                                "Nrecords_interval",
  #                                                                                               "Binned_latitude"]])
                                                                                                 #"Binned_longitude"]])

In [293]:
# Load final dataset
df_filt = pd.read_csv("Allspecies_GBIFrecords_fieldnotes_filtered_def.csv")

In [295]:
# Functions to create stratified proportions and samples
def stratified_sample(df, strata, size=None, seed=None, keep_index= True):
    '''
    It samples data from a pandas dataframe using strata. These functions use
    proportionate stratification:
    n1 = (N1/N) * n
    where:
        - n1 is the sample size of stratum 1
        - N1 is the population size of stratum 1
        - N is the total population size
        - n is the sampling size
    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    :seed: sampling seed
    :keep_index: if True, it keeps a column with the original population index indicator
    
    Returns
    -------
    A sampled pandas dataframe based in a set of strata.
    Examples
    --------
    >> df.head()
    	id  sex age city 
    0	123 M   20  XYZ
    1	456 M   25  XYZ
    2	789 M   21  YZX
    3	987 F   40  ZXY
    4	654 M   45  ZXY
    ...
    # This returns a sample stratified by sex and city containing 30% of the size of
    # the original data
    >> stratified = stratified_sample(df=df, strata=['sex', 'city'], size=0.3)
    Requirements
    ------------
    - pandas
    - numpy
    '''
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)

    # controlling variable to create the dataframe or append to it
    first = True 
    for i in range(len(tmp_grpd)):
        # query generator for each iteration
        qry=''
        for s in range(len(strata)):
            stratum = strata[s]
            value = tmp_grpd.iloc[i][stratum]
            n = tmp_grpd.iloc[i]['samp_size']

            if type(value) == str:
                value = "'" + str(value) + "'"
            
            if s != len(strata)-1:
                qry = qry + stratum + ' == ' + str(value) +' & '
            else:
                qry = qry + stratum + ' == ' + str(value)
        
        # final dataframe
        if first:
            stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            first = False
        else:
            tmp_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            stratified_df = stratified_df.append(tmp_df, ignore_index=True)
    
    return stratified_df



def stratified_sample_report(df, strata, size=None):
    '''
    Generates a dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.
    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    Returns
    -------
    A dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.
    '''
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)
    return tmp_grpd


def __smpl_size(population, size):
    '''
    A function to compute the sample size. If not informed, a sampling 
    size will be calculated using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    Parameters
    ----------
        :population: population size
        :size: sample size (default = None)
    Returns
    -------
    Calculated sample size to be used in the functions:
        - stratified_sample
        - stratified_sample_report
    '''
    if size is None:
        cochran_n = round(((1.96)**2 * 0.5 * 0.5)/ 0.02**2)
        n = round(cochran_n/(1+((cochran_n -1) /population)))
    elif size >= 0 and size < 1:
        n = round(population * size)
    elif size < 0:
        raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
    elif size >= 1:
        n = size
    return n

In [296]:
df_filt.shape

(54146, 28)

In [297]:
# Obtain the stratified validation dataset proportions
stratif_df_all = stratified_sample_report(df_filt, ["Year_interval", "Nrecords_interval", "Binned_latitude", "Binned_longitude"], 3000)

# Export the result as csv
stratif_df_all.to_csv("Stratified_val_df_prop_all.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['size'] = 1


In [298]:
# Dataset with only records that have link to images
df_img = df_filt[df_filt['image_url'].notnull()].reset_index(drop=True)

In [299]:
df_img.shape

(4870, 28)

In [300]:
# Obtain the stratified validation dataset proportions that have images with nrecords
stratif_df_img_nrec = stratified_sample(df_img, ["Year_interval", "Nrecords_interval", "Binned_latitude", "Binned_longitude"], 3000, keep_index= False, seed=100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['size'] = 1
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index

In [301]:
# Obtain the stratified validation dataset proportions that have images without nrecords
stratif_df_img_no_nrec = stratified_sample(df_img, ["Year_interval", "Binned_latitude", "Binned_longitude"], 3000, keep_index= False, seed=100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['size'] = 1
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index

In [303]:
stratif_df_img_no_nrec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3001 entries, 0 to 3000
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gbifID                          3001 non-null   int64  
 1   institutionCode                 3001 non-null   object 
 2   recordedBy                      2493 non-null   object 
 3   eventDate                       3001 non-null   object 
 4   year                            3001 non-null   float64
 5   month                           3001 non-null   float64
 6   day                             2938 non-null   float64
 7   Country_name                    3001 non-null   object 
 8   decimalLongitude                3001 non-null   float64
 9   decimalLatitude                 3001 non-null   float64
 10  elevation                       2086 non-null   float64
 11  family                          3001 non-null   object 
 12  genus                           30

In [122]:
# Export stratif dataset with number of records
stratif_df_img_nrec.to_csv("Stratified_val_df_img_with_nrecords.csv", index=False)

In [123]:
# Export stratif dataset without number of records
stratif_df_img_no_nrec.to_csv("Stratified_val_df_img_without_nrecords.csv", index=False)

## Check proportions of dataset with and without number of records

In [2]:
# Load data
stratif_df_img_nrec = pd.read_csv("Stratified_val_df_img_with_nrecords.csv")
stratif_df_img_no_nrec = pd.read_csv("Stratified_val_df_img_without_nrecords.csv")

In [3]:
stratif_df_img_nrec.value_counts("Nrecords_interval")

Nrecords_interval
>500       1737
101-500     720
11-100      455
0-10         91
dtype: int64

In [7]:
stratif_df_img_no_nrec.value_counts("Binned_longitude")

Binned_longitude
(-80, -60]      1440
(-60, -40]       865
(-100, -80]      480
(-40, -20]       179
(-120, -100]      32
(-140, -120]       2
(-160, -140]       2
dtype: int64

## Join labeled data with non-duplicated dataset

In [306]:
# Load labeled data
stratif_labeled = pd.read_csv("Stratified_val_df_img_def.csv")

In [307]:
stratif_labeled.columns

Index(['gbifID', 'institutionCode', 'recordedBy', 'eventDate', 'year', 'month',
       'day', 'Country_name', 'decimalLongitude', 'decimalLatitude',
       'elevation', 'family', 'genus', 'species', 'acceptedScientificName',
       'occurrenceRemarks', 'dynamicProperties', 'fieldNotes', 'image_url',
       'reproductiveCondition', 'Year_interval',
       'acceptedScientificName_splevel', 'n_records_GBIF', 'Nrecords_interval',
       'OrigDB_vs_GBIF', 'FieldNotes', 'Binned_latitude', 'Binned_longitude',
       'Label_flowering', 'Image_flowering', 'Label_fruiting',
       'Image_fruiting'],
      dtype='object')

In [310]:
# Select column with labels
columns_lab=['gbifID', 'Label_flowering', 'Image_flowering', 'Label_fruiting', 'Image_fruiting']
df_columns_lab = stratif_labeled[columns_lab]

In [311]:
df_columns_lab

Unnamed: 0,gbifID,Label_flowering,Image_flowering,Label_fruiting,Image_fruiting
0,2284494170,Yes,Yes,No,No
1,2270557443,Yes,Yes,No,No
2,1836962001,No,No,No,No
3,1803779575,Yes,Yes,No,No
4,2013260553,No,No,No,Yes
...,...,...,...,...,...
2995,3336611460,No,Yes,No,No
2996,2005421586,No,No,No,No
2997,3336608371,No,Yes,No,No
2998,1499613344,Yes,NAN,No,NAN


In [329]:
final_labeled_df = stratif_df_img_no_nrec.merge(df_columns_lab, how='left', on='gbifID')

In [330]:
final_labeled_df = test.drop_duplicates()

In [332]:
final_labeled_df.to_csv('Stratified_val_df_img_nondup.csv', index=False)

## Final verification validation non duplicated dataset

In [19]:
# Load labeled data
stratif_labeled_nondup = pd.read_csv("Stratified_val_df_img_nondup_def.csv")

In [20]:
stratif_labeled_nondup.columns

Index(['gbifID', 'institutionCode', 'recordedBy', 'eventDate', 'year', 'month',
       'day', 'Country_name', 'decimalLongitude', 'decimalLatitude',
       'elevation', 'family', 'genus', 'species', 'acceptedScientificName',
       'occurrenceRemarks', 'dynamicProperties', 'fieldNotes', 'image_url',
       'reproductiveCondition', 'Year_interval',
       'acceptedScientificName_splevel', 'n_records_GBIF', 'Nrecords_interval',
       'OrigDB_vs_GBIF', 'FieldNotes', 'Binned_latitude', 'Binned_longitude',
       'Label_flowering', 'Image_flowering', 'Label_fruiting',
       'Image_fruiting'],
      dtype='object')

In [24]:
stratif_labeled_nondup.value_counts("Label_fruiting")

Label_fruiting
No     2305
Yes     695
dtype: int64