# Summary of the GBIF results for all 438 species
## by [Sebastián Ayala-Ruano](https://sayalaruano.github.io/)

In [7]:
# Imports
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from millify import prettify

# Load data
# Read the filtered GBIF records table from a Parquet file. The path is
# relative, so it is resolved against the kernel's current working directory.
df_GBIF = pd.read_parquet("Allspecies_GBIFrecords_fieldnotes_filtered.parquet")

## General information about the records of the dataset

In [5]:
# Total number of records (rows) in the dataset. Kept as a plain int so it
# can still be used in arithmetic; it is formatted only when it goes into
# the summary table below.
n_total = len(df_GBIF)

# Number of records holding a non-null value in each column of interest,
# formatted as strings with millify's `prettify`.
n_fn = prettify(df_GBIF["fieldNotes"].notnull().sum())
n_or = prettify(df_GBIF["occurrenceRemarks"].notnull().sum())
n_dp = prettify(df_GBIF["dynamicProperties"].notnull().sum())
n_im = prettify(df_GBIF["image_url"].notnull().sum())
n_rc = prettify(df_GBIF["reproductiveCondition"].notnull().sum())

# Format the total like the other counts so the "Values" column holds
# consistently formatted strings instead of a mix of int and str.
metrics = [prettify(n_total), n_fn, n_or, n_dp, n_im, n_rc]

# Row labels, in the same order as `metrics`.
names = [
    "Total",
    "With fieldNotes data",
    "With occurrenceRemarks data",
    "With dynamicProperties data",
    "With links to images",
    "With reproductiveCondition data",
]

# One-column summary table indexed by the metric name.
df_metrics = pd.DataFrame(metrics, index=names, columns=["Values"])

In [6]:
# Show the summary table of record counts as the cell output.
df_metrics

Unnamed: 0,Values
Total,68114
With fieldNotes data,12418
With occurrenceRemarks data,49516
With dynamicProperties data,22139
With links to images,5828
With reproductiveCondition data,8990


## Number of records by species

In [10]:
# Count the rows of df_GBIF for every (species name, n_records_GBIF) pair;
# the result has one row per pair with the tally in a "count" column.
df_records = (
    df_GBIF
    .groupby(["acceptedScientificName", "n_records_GBIF"])
    .size()
    .reset_index(name="count")
)

# Dot (scatter) plot with one point per row of df_records:
# species name on the x-axis, number of records on the y-axis.
dotplot = px.scatter(
    df_records,
    x="acceptedScientificName",
    y="count",
    labels={
        "count": "Number of records",
        "acceptedScientificName": "Scientific names",
    },
)

In [11]:
# Hide the x-axis tick labels (presumably because the species names are too
# numerous to be legible) and show the figure as the cell output.
dotplot.update_xaxes(showticklabels=False)

## Number of records and of species by number-of-records interval

In [19]:
# Tally the rows of df_GBIF that fall in each value of "Nrecords_interval".
df_rec_intervals = (
    df_GBIF
    .groupby("Nrecords_interval")
    .size()
    .reset_index(name="count")
)

# Bar chart of records per interval; text_auto=".2s" prints each bar's value
# on the bar using that number format.
barplot1 = px.bar(
    df_rec_intervals,
    x="Nrecords_interval",
    y="count",
    text_auto=".2s",
    labels={
        "count": "Number of records",
        "Nrecords_interval": "Interval",
    },
)

In [20]:
# Style the bar value labels (font size 12, horizontal, placed outside the
# bars) and hide the legend; the figure is shown as the cell output.
barplot1.update_traces(textfont_size=12, textangle=0, textposition="outside", showlegend=False) 

In [68]:
# Step 1: one row per (species, interval) pair that occurs in df_GBIF.
# observed=True only has an effect if a grouping column is categorical; in
# that case it keeps just the pairs that actually occur, so combinations with
# zero records are not counted as species in step 2.
species_interval_pairs = (
    df_GBIF
    .groupby(["acceptedScientificName", "Nrecords_interval"], observed=True)
    .size()
    .reset_index(name="count")
)

# Step 2: number of species per interval, i.e. how many of the pairs above
# fall in each interval (sorted by descending count, as value_counts does).
# rename_axis / reset_index(name=...) set both column names explicitly, so the
# result does not depend on how value_counts() names its output (that naming
# changed in pandas 2.0).
df_rec_intervals2 = (
    species_interval_pairs["Nrecords_interval"]
    .value_counts()
    .rename_axis("intervals")
    .reset_index(name="count")
)

# Bar chart of species per interval; text_auto=".2s" prints each bar's value
# on the bar using that number format.
barplot2 = px.bar(
    df_rec_intervals2,
    x="intervals",
    y="count",
    text_auto=".2s",
    labels={
        "count": "Number of species",
        "intervals": "Interval",
    },
)

In [69]:
# Style the bar value labels (font size 12, horizontal, placed outside the
# bars) and hide the legend; the figure is shown as the cell output.
barplot2.update_traces(textfont_size=12, textangle=0, textposition="outside", showlegend=False)

## Number of records and of species by year interval

In [70]:
# Tally the rows of df_GBIF that fall in each value of "Year_interval".
df_dates_interv = (
    df_GBIF
    .groupby("Year_interval")
    .size()
    .reset_index(name="count")
)

# Bar chart of records per year interval; text_auto=".2s" prints each bar's
# value on the bar using that number format.
barplot3 = px.bar(
    df_dates_interv,
    x="Year_interval",
    y="count",
    text_auto=".2s",
    labels={
        "count": "Number of records",
        "Year_interval": "Year Interval",
    },
)

In [71]:
# Style the bar value labels (font size 12, horizontal, placed outside the
# bars) and hide the legend; the figure is shown as the cell output.
barplot3.update_traces(textfont_size=12, textangle=0, textposition="outside", showlegend=False)

In [72]:
# Step 1: one row per (species, year interval) pair that occurs in df_GBIF.
# observed=True only has an effect if a grouping column is categorical; in
# that case it keeps just the pairs that actually occur, so combinations with
# zero records are not counted as species in step 2.
species_year_pairs = (
    df_GBIF
    .groupby(["acceptedScientificName", "Year_interval"], observed=True)
    .size()
    .reset_index(name="count")
)

# Step 2: number of species per year interval, i.e. how many of the pairs
# above fall in each interval (sorted by descending count, as value_counts
# does). rename_axis / reset_index(name=...) set both column names explicitly,
# so the result does not depend on how value_counts() names its output (that
# naming changed in pandas 2.0).
df_date_interv2 = (
    species_year_pairs["Year_interval"]
    .value_counts()
    .rename_axis("intervals")
    .reset_index(name="count")
)

# Bar chart of species per year interval; text_auto=".2s" prints each bar's
# value on the bar using that number format.
barplot4 = px.bar(
    df_date_interv2,
    x="intervals",
    y="count",
    text_auto=".2s",
    labels={
        "count": "Number of species",
        "intervals": "Year Interval",
    },
)

In [73]:
# Style the bar value labels (font size 12, horizontal, placed outside the
# bars) and hide the legend; the figure is shown as the cell output.
barplot4.update_traces(textfont_size=12, textangle=0, textposition="outside", showlegend=False)

## Number of records by country

In [77]:
# Tally the rows of df_GBIF per value of "Country_name".
df_countries = (
    df_GBIF
    .groupby("Country_name")
    .size()
    .reset_index(name="count")
)

# Donut chart (pie with a 0.4 hole): one slice per country, sized by its
# record count, with the slice text drawn inside the slices.
country_pie = go.Pie(
    labels=df_countries["Country_name"],
    values=df_countries["count"],
    hole=0.4,
    textposition="inside",
)
pieplot = go.Figure(data=[country_pie])

In [78]:
# Show the records-per-country chart as the cell output.
pieplot

## Map of the distribution of records

In [80]:
# Geographic scatter plot with one marker per row of df_GBIF, positioned by
# its decimal latitude/longitude; hovering a marker shows the country name.
map_plot = px.scatter_geo(
    df_GBIF,
    lat="decimalLatitude",
    lon="decimalLongitude",
    hover_name="Country_name",
)

In [81]:
# Show the map of record locations as the cell output.
map_plot

## Binned latitude histogram

In [82]:
# Latitude values with the missing entries dropped (a Series that keeps the
# name "decimalLatitude" and the original row index).
lat = df_GBIF["decimalLatitude"].dropna()

# Histogram of the latitudes, requesting at most 15 bins, drawn with
# slightly transparent bars.
hist_plot = px.histogram(
    lat,
    x="decimalLatitude",
    nbins=15,
    opacity=0.8,
    labels={"decimalLatitude": "Latitude"},
)

In [83]:
# Show the latitude histogram as the cell output.
hist_plot

## Number of species from the original database and GBIF

In [84]:
# Tally the rows of df_GBIF per value of "OrigDB_vs_GBIF".
# NOTE(review): this counts records (rows), while the section heading refers
# to the number of species — confirm which of the two is intended.
df_orig_gbif = (
    df_GBIF
    .groupby("OrigDB_vs_GBIF")
    .size()
    .reset_index(name="count")
)

# Bar chart of the tallies; text_auto=".2s" prints each bar's value on the
# bar using that number format.
barplot5 = px.bar(
    df_orig_gbif,
    x="OrigDB_vs_GBIF",
    y="count",
    text_auto=".2s",
    labels={
        "count": "Number of records",
        "OrigDB_vs_GBIF": "Origin of the species names",
    },
)

In [85]:
# Style the bar value labels (font size 12, horizontal, placed outside the
# bars) and hide the legend; the figure is shown as the cell output.
barplot5.update_traces(textfont_size=12, textangle=0, textposition="outside", showlegend=False)