In [1]:
# Imports
import pandas as pd
import numpy as np

In [5]:
# Load the raw HCNQ export. With low_memory=False pandas infers each column's
# dtype from the whole file instead of chunk by chunk.
df = pd.read_csv(
    "HCNQ_all.csv",
    encoding="latin-1",
    low_memory=False,
)

In [16]:
# Columns kept for the rest of the notebook, one per line so additions and
# removals show up clearly in diffs.
cols = [
    "id",
    "institutionCode",
    "recordedBy",
    "eventDate",
    "year",
    "month",
    "day",
    "country",
    "decimalLongitude",
    "decimalLatitude",
    "verbatimElevation",
    "family",
    "genus",
    "specificEpithet",
    "scientificName",
    "scientificNameAuthorship",
    "occurrenceRemarks",
    "dynamicProperties",
    "verbatimAttributes",
    "references",
    "reproductiveCondition",
]
# Column-only selection; every row of the raw table is retained.
df_filt = df.loc[:, cols]

## Keep only records with field notes

In [26]:
# General stats about field notes: share of records with a non-null value in
# each free-text column, then the share with a value in at least one of them.
note_labels = {
    "verbatimAttributes": "field notes",
    "occurrenceRemarks": "occurrence remarks",
    "dynamicProperties": "dynamic properties",
}
for column, label in note_labels.items():
    # The mean of a boolean mask is the fraction of True values.
    share = df_filt[column].notna().mean() * 100
    print(f"{share:.2f} % of records have {label}")

# Vectorized row-wise "any" over the three columns; this replaces a per-row
# Python lambda and yields the same fraction.
any_share = df_filt[list(note_labels)].notna().any(axis=1).mean() * 100
print(f"{any_share:.2f} % of records have at least one of them")

95.00 % of records have field notes
0.03 % of records have occurrence remarks
9.02 % of records have dynamic properties
95.25 % or records have at least one of them


In [33]:
# Subset of df_filt (all columns) whose occurrenceRemarks value is present
has_remarks = df_filt["occurrenceRemarks"].notna()
fn_df = df_filt.loc[has_remarks]
len(fn_df)

3

In [34]:
# Subset of df_filt (all columns) whose verbatimAttributes value is present
has_attributes = df_filt["verbatimAttributes"].notna()
fn_df2 = df_filt.loc[has_attributes]
len(fn_df2)

11211

In [35]:
# Subset of df_filt (all columns) whose dynamicProperties value is present
has_properties = df_filt["dynamicProperties"].notna()
fn_df3 = df_filt.loc[has_properties]
len(fn_df3)

1064

In [36]:
# Stack the three per-column subsets in order. A record with a value in more
# than one note column appears in several subsets, so rows that are identical
# across all columns are then collapsed, keeping the first occurrence.
final_df = pd.concat([fn_df, fn_df2, fn_df3]).drop_duplicates()

In [38]:
# Reset indices of the dataframe: after the concat/drop_duplicates step the rows
# still carry their original labels, so renumber them 0..n-1. drop=True
# discards the old labels instead of keeping them as a new column.
final_df = final_df.reset_index(drop=True)

In [40]:
# Column dtypes and non-null counts of the merged field-note table
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10881 entries, 0 to 10880
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        10881 non-null  int64  
 1   institutionCode           10881 non-null  object 
 2   recordedBy                10760 non-null  object 
 3   eventDate                 10727 non-null  object 
 4   year                      10834 non-null  float64
 5   month                     10739 non-null  float64
 6   day                       10740 non-null  float64
 7   country                   10881 non-null  object 
 8   decimalLongitude          8940 non-null   float64
 9   decimalLatitude           8914 non-null   float64
 10  verbatimElevation         10450 non-null  object 
 11  family                    10881 non-null  object 
 12  genus                     10822 non-null  object 
 13  specificEpithet           10822 non-null  object 
 14  scient

## Delete rows whose date does not include at least a month

In [42]:
# Keep only the rows whose month value is present
final_df = final_df.dropna(subset=["month"])

In [44]:
# (rows, columns) remaining after removing records without a month
final_df.shape

(10739, 21)

## Delete rows missing latitude or longitude

In [54]:
# Keep only the rows whose decimalLatitude value is present
final_df = final_df.dropna(subset=["decimalLatitude"])

In [55]:
# (rows, columns) remaining after removing records without a latitude
final_df.shape

(8882, 21)

In [52]:
# Keep only the rows whose decimalLongitude value is present
final_df = final_df.dropna(subset=["decimalLongitude"])

In [53]:
# (rows, columns) remaining after removing records without a longitude
final_df.shape

(8882, 21)

## Duplicates processing

In [56]:
# Drop true duplicates: records that repeat an earlier record on every key
# column below. Rows with a missing value in any key column are left out of
# the comparison and are therefore never dropped here.
dup_cols = [
    "scientificName",
    "decimalLongitude",
    "decimalLatitude",
    "year",
    "recordedBy",
]
complete_keys = final_df[dup_cols].dropna()
# duplicated() flags every repeat after the first occurrence of a key.
repeat_flags = complete_keys.duplicated()
final_df = final_df.drop(index=repeat_flags[repeat_flags].index)

In [57]:
# Sanity check: count the key duplicates that remain; (0,) means none are left
leftover_flags = final_df[dup_cols].dropna().duplicated()
leftover_flags[leftover_flags].index.shape

(0,)

In [58]:
# (rows, columns) remaining after removing the key duplicates
final_df.shape

(7091, 21)

In [61]:
# Reset indices of the dataframe: the row drops above leave gaps in the labels,
# so renumber the rows 0..n-1. drop=True discards the old labels instead of
# keeping them as a new column.
final_df = final_df.reset_index(drop=True)

In [62]:
# Column dtypes and non-null counts of the filtered, de-duplicated table
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7091 entries, 0 to 7090
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        7091 non-null   int64  
 1   institutionCode           7091 non-null   object 
 2   recordedBy                7049 non-null   object 
 3   eventDate                 7090 non-null   object 
 4   year                      7090 non-null   float64
 5   month                     7091 non-null   float64
 6   day                       7091 non-null   float64
 7   country                   7091 non-null   object 
 8   decimalLongitude          7091 non-null   float64
 9   decimalLatitude           7091 non-null   float64
 10  verbatimElevation         6917 non-null   object 
 11  family                    7091 non-null   object 
 12  genus                     7043 non-null   object 
 13  specificEpithet           7043 non-null   object 
 14  scientif

In [64]:
# Number of remaining records per country value
final_df['country'].value_counts()

Ecuador    7088
Perú          2
Bolivia       1
Name: country, dtype: int64

In [65]:
# Check scientific names: write the distinct scientificName values to a CSV
# for review. The frame is built from a bare array, so its single column has
# the default label 0, which becomes the CSV header.
distinct_names = final_df["scientificName"].unique()
unique_names_comp_df = pd.DataFrame(distinct_names)
unique_names_comp_df.to_csv("unique_names_hcnq_df.csv", index=False)

In [68]:
# Export dataset: write the filtered table without the row index column.
# No encoding is passed, so pandas' default (UTF-8) is used; the input file
# was read as latin-1.
final_df.to_csv('HCNQ_dataset_filtered.csv', index=False)