In [1]:
# Imports
import pandas as pd
import numpy as np

In [2]:
# Read the filtered GBIF records (with their field-note columns) into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="Allspecies_GBIFrecords_fieldnotes_filtered_def.csv"
)

In [3]:
# Check columns: display the column labels of the loaded table
df.columns

Index(['gbifID', 'institutionCode', 'recordedBy', 'eventDate', 'year', 'month',
       'day', 'Country_name', 'decimalLongitude', 'decimalLatitude',
       'elevation', 'family', 'genus', 'species', 'acceptedScientificName',
       'occurrenceRemarks', 'dynamicProperties', 'fieldNotes', 'image_url',
       'reproductiveCondition', 'Year_interval',
       'acceptedScientificName_splevel', 'n_records_GBIF', 'Nrecords_interval',
       'OrigDB_vs_GBIF', 'FieldNotes', 'Binned_latitude', 'Binned_longitude'],
      dtype='object')

In [4]:
# Remove records whose only field-note source is dynamicProperties, i.e. rows
# where fieldNotes and occurrenceRemarks are both missing while
# dynamicProperties is present.

# Index labels of the rows to remove
indices_del = df.index[
    df["fieldNotes"].isna()
    & df["occurrenceRemarks"].isna()
    & df["dynamicProperties"].notna()
]

# Drop those rows and renumber the remaining ones from 0
df = df.drop(index=indices_del).reset_index(drop=True)

In [5]:
# Sanity check: the same filter should now return an empty frame
df.loc[
    df["fieldNotes"].isna()
    & df["occurrenceRemarks"].isna()
    & df["dynamicProperties"].notna()
]

Unnamed: 0,gbifID,institutionCode,recordedBy,eventDate,year,month,day,Country_name,decimalLongitude,decimalLatitude,...,image_url,reproductiveCondition,Year_interval,acceptedScientificName_splevel,n_records_GBIF,Nrecords_interval,OrigDB_vs_GBIF,FieldNotes,Binned_latitude,Binned_longitude


In [6]:
# Summarise dtypes and non-null counts after removing the dynamicProperties-only records
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46757 entries, 0 to 46756
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gbifID                          46757 non-null  int64  
 1   institutionCode                 46570 non-null  object 
 2   recordedBy                      45255 non-null  object 
 3   eventDate                       46757 non-null  object 
 4   year                            46757 non-null  float64
 5   month                           46757 non-null  float64
 6   day                             46250 non-null  float64
 7   Country_name                    46757 non-null  object 
 8   decimalLongitude                46757 non-null  float64
 9   decimalLatitude                 46757 non-null  float64
 10  elevation                       34599 non-null  float64
 11  family                          46757 non-null  object 
 12  genus                           

In [7]:
# Delete unnecessary columns (interval/record-count/binning helper fields)
df = df.drop(
    columns=[
        'OrigDB_vs_GBIF',
        'Year_interval',
        'n_records_GBIF',
        'Nrecords_interval',
        'Binned_latitude',
        'Binned_longitude',
    ]
)

In [10]:
# Inspect the frame after dropping the helper columns.
# NOTE(review): the recorded output below lists 23 columns including hcnqID,
# which is only added by the later "Add a column for HCNQ ids" cell (In [9]).
# On a top-to-bottom run this cell executes before that insert, so its output
# will differ - consider moving this cell below the insert.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46757 entries, 0 to 46756
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gbifID                          46757 non-null  int64  
 1   hcnqID                          0 non-null      float64
 2   institutionCode                 46570 non-null  object 
 3   recordedBy                      45255 non-null  object 
 4   eventDate                       46757 non-null  object 
 5   year                            46757 non-null  float64
 6   month                           46757 non-null  float64
 7   day                             46250 non-null  float64
 8   Country_name                    46757 non-null  object 
 9   decimalLongitude                46757 non-null  float64
 10  decimalLatitude                 46757 non-null  float64
 11  elevation                       34599 non-null  float64
 12  family                          

In [9]:
# Add an empty (all-NaN) hcnqID column as the second column, right after gbifID
df.insert(loc=1, column='hcnqID', value=np.nan)

In [11]:
# Put the columns into their final order: identifiers, collection event,
# location, taxonomy, media/phenology, then the free-text note fields
df = df.loc[:, [
    'gbifID',
    'hcnqID',
    'institutionCode',
    'recordedBy',
    'eventDate',
    'year',
    'month',
    'day',
    'Country_name',
    'decimalLongitude',
    'decimalLatitude',
    'elevation',
    'family',
    'genus',
    'species',
    'acceptedScientificName_splevel',
    'acceptedScientificName',
    'image_url',
    'reproductiveCondition',
    'occurrenceRemarks',
    'dynamicProperties',
    'fieldNotes',
    'FieldNotes',
]]

In [13]:
# Rename columns to their final names:
#   Country_name                   -> country
#   acceptedScientificName         -> scientificNameAuthorship
#   acceptedScientificName_splevel -> acceptedScientificName
# Each existing label is looked up in the mapping once, so the two
# acceptedScientificName* renames do not chain into each other.
# errors='raise' makes a missing source column (for example when this cell is
# run a second time) fail with a KeyError instead of silently renaming the new
# acceptedScientificName column and producing duplicate column names.
df = df.rename(
    columns={
        'Country_name': 'country',
        'acceptedScientificName_splevel': 'acceptedScientificName',
        'acceptedScientificName': 'scientificNameAuthorship',
    },
    errors='raise',
)

In [14]:
# Confirm the renamed columns (e.g. 'country') and the final column order
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46757 entries, 0 to 46756
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   gbifID                    46757 non-null  int64  
 1   hcnqID                    0 non-null      float64
 2   institutionCode           46570 non-null  object 
 3   recordedBy                45255 non-null  object 
 4   eventDate                 46757 non-null  object 
 5   year                      46757 non-null  float64
 6   month                     46757 non-null  float64
 7   day                       46250 non-null  float64
 8   country                   46757 non-null  object 
 9   decimalLongitude          46757 non-null  float64
 10  decimalLatitude           46757 non-null  float64
 11  elevation                 34599 non-null  float64
 12  family                    46757 non-null  object 
 13  genus                     46757 non-null  object 
 14  specie

In [16]:
# Write the cleaned table to the final dataset file, without the row index
df.to_csv(
    path_or_buf='Final_datasets/Allspecies_GBIFrecords_fieldnotes_filtered_final.csv',
    index=False,
)