# EDA for BirdCLEF 2022

# Load Packages

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from termcolor import colored
# Use seaborn's "darkgrid" style for the figures drawn below.
sns.set_style("darkgrid")

In [None]:
# Root folder of the BirdCLEF 2022 dataset under Kaggle's input directory;
# data files are opened relative to this path.
DATA_DIR = '/kaggle/input/birdclef-2022'

# Basic Statistics

In [None]:
# Load the eBird taxonomy table and print its basic shape / cardinality statistics.
eBird_tax_df = pd.read_csv(os.path.join(DATA_DIR, 'eBird_Taxonomy_v2021.csv'))
# A bare `.head()` is only rendered when it is the LAST expression of a cell;
# in the middle of a cell its result is thrown away. `display` (injected into
# the namespace by IPython/Jupyter) renders the preview explicitly.
display(eBird_tax_df.head())

print(colored('Check number of rows and columns', 'red'))
n_rows, n_cols = eBird_tax_df.shape
print('Number of rows = {}'.format(n_rows))
print('Number of columns = {}'.format(n_cols))
print('List of columns:', eBird_tax_df.columns.values)

print(colored('\nCheck unique numbers of entities in each parameters', 'red'))
# `nunique(dropna=False)` matches `len(col.unique())`: a missing value, if
# present, is counted as one distinct entry.
print('Number of TAXON_ORDER: {}'.format(eBird_tax_df['TAXON_ORDER'].nunique(dropna=False)))
for column in (
    'CATEGORY',
    'SPECIES_CODE',
    'PRIMARY_COM_NAME',
    'SCI_NAME',
    'ORDER1',
    'FAMILY',
    'REPORT_AS',
    'SPECIES_GROUP',
):
    print('Number of {} = {}'.format(column, eBird_tax_df[column].nunique(dropna=False)))

<p style="font-size:16px">
    From the above stats, we can see that <em>TAXON_ORDER, SPECIES_CODE, PRIMARY_COM_NAME and SCI_NAME</em> are unique parameters. The rest, <em>CATEGORY, ORDER1, FAMILY, REPORT_AS and SPECIES_GROUP</em>, are categorical parameters. We will check these parameters one by one. Let's start with <em>CATEGORY</em>.
</p>

In [None]:
# Pie chart of how the taxonomy entries are split across CATEGORY values.
print('Number of Categories = {}'.format(len(eBird_tax_df['CATEGORY'].unique())))

# Non-null TAXON_ORDER entries per CATEGORY, smallest group first.
pie_plot_data = (
    eBird_tax_df
    .groupby('CATEGORY')['TAXON_ORDER']
    .count()
    .sort_values()
)
labels = pie_plot_data.index
explode = [0.015 for _ in labels]  # same small offset for every wedge

pie, ax = plt.subplots(figsize=[10, 6])
ax.pie(
    pie_plot_data,
    labels=labels,
    explode=explode,
    autopct="%.1f%%",
    pctdistance=0.5,
)
plt.show()

<p style="font-size:16px">
    The details of taxonomy categories are as follows:
    <ul>
        <li><b>Species:</b> e.g., Tundra Swan Cygnus columbianus </li>
        <li><b>ISSF or Identifiable Sub-specific Group:</b> Identifiable subspecies or group of subspecies, e.g., Tundra Swan (Bewick’s) Cygnus columbianus bewickii or Tundra Swan (Whistling) Cygnus columbianus columbianus</li>
        <li><b>Slash:</b> Identification to Species-pair, e.g., Tundra/Trumpeter Swan Cygnus columbianus/buccinator</li>
        <li><b>Spuh:</b> Genus or identification at broad level, e.g., swan sp. Cygnus sp.</li>
        <li><b>Hybrid:</b> Hybrid between two species, e.g., Tundra x Trumpeter Swan (hybrid)</li>
        <li><b>Intergrade:</b> Hybrid between two ISSF (subspecies or subspecies groups), e.g., Tundra Swan (Whistling x Bewick’s) Cygnus columbianus columbianus x bewickii</li>
        <li><b>Domestic:</b> Distinctly-plumaged domesticated varieties that may be free-flying (these do not count on personal lists) e.g., Mallard (Domestic type)</li>
        <li><b>Form:</b> Miscellaneous other taxa, including recently-described species yet to be accepted or distinctive forms that are not universally accepted, e.g., Red-tailed Hawk (abieticola), Upland Goose (Bar-breasted)</li>
    </ul> 
<a href="https://ebird.org/science/use-ebird-data/the-ebird-taxonomy">source</a>
</p>

<p style="font-size:16px">
    <em>Species</em> is the predominant category, followed by <em>ISSF</em>, <em>Slash</em>, <em>Spuh</em>, etc. Now let's check the <em>ORDER1</em> parameter.
</p>

In [None]:
# Horizontal bar chart of the number of taxonomy entries per ORDER1, with the
# largest orders drawn in a contrasting colour.
fig, ax = plt.subplots(figsize = (10,7), dpi = 120)
colors = sns.color_palette()

# Non-null TAXON_ORDER entries per ORDER1 (index = ORDER1 label).
order_count = pd.DataFrame(eBird_tax_df.groupby('ORDER1').TAXON_ORDER.count())
# Highlight threshold = the smallest of the (up to) five largest counts.
# `.iloc[-1]` is an explicit positional lookup: the Series is indexed by ORDER1
# strings, so `[4]` relied on the deprecated integer-key positional fallback
# and would also fail when fewer than five orders are present.
fifth_largest_value = order_count['TAXON_ORDER'].nlargest(5).iloc[-1]
# One colour per bar, in the same order as the rows of `order_count`; counts
# tied with the threshold are highlighted too.
color_plt = [colors[0] if value < fifth_largest_value else colors[3] for value in order_count.TAXON_ORDER]
bar = sns.barplot(data = order_count, y = order_count.index, x = 'TAXON_ORDER',  ax = ax, palette = color_plt)
plt.xlabel('# of Order')
plt.ylabel('')
plt.show()

The five largest orders (by number of taxonomy entries) are highlighted in red.

In [None]:
## WORK IN PROGRESS... STAY TUNED