In [2]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# Spectrogram directories of the rest_25 dataset: one for the training split and
# one for the validation split. Both keep their trailing slash because later cells
# build per-bird paths by plain string concatenation (dir + bird name).
train_spec_dir = '/Netdata/2020/ziang/data/guangdong194/dataset/rest_25/specs/train_specs/'
val_spec_dir = '/Netdata/2020/ziang/data/guangdong194/dataset/rest_25/specs/val_specs/'

In [4]:
# Bird names are the entries of the training spectrogram directory.
all_birds = os.listdir(train_spec_dir)
# Count the entries in each bird's folder, first for training and then for validation.
train_counts = [(name, len(os.listdir(train_spec_dir + name))) for name in all_birds]
val_counts = [(name, len(os.listdir(val_spec_dir + name))) for name in all_birds]
# Rebuild both tables ordered by count, largest first (dicts keep insertion order).
train_stats = dict(sorted(train_counts, key=lambda pair: pair[1], reverse=True))
val_stats = dict(sorted(val_counts, key=lambda pair: pair[1], reverse=True))
# Display the training table as the cell output.
train_stats

{'charadrius_dubius': 2000,
 'gallinago_gallinago': 2000,
 'hirundo_rustica': 2000,
 'motacilla_alba': 2000,
 'orthotomus_sutorius': 2000,
 'parus_minor': 2000,
 'prinia_inornata': 2000,
 'tringa_glareola': 2000,
 'cecropis_daurica': 1853,
 'motacilla_tschutschensis': 1749,
 'spilopelia_chinensis': 1692,
 'egretta_garzetta': 1645,
 'ardea_alba': 1496,
 'pycnonotus_jocosus': 1092,
 'prinia_flaviventris': 990,
 'himantopus_himantopus': 980,
 'pycnonotus_sinensis': 977,
 'lonchura_punctulata': 869,
 'zosterops_japonicus': 813,
 'tringa_stagnatilis': 726,
 'tringa_erythropus': 718,
 'apus_nipalensis': 609,
 'acridotheres_cristatellus': 543,
 'tachybaptus_ruficollis': 345,
 'gracupica_nigricollis': 253}

**Note:** I noticed that this is not the most balanced dataset across bird species. There are two motivations for using TensorFlow:
   1. check Data Augmentation methods on images (spectrograms)
   2. find a more balanced set
   
So now I will build a new dataset that has more data and is better balanced across birds.

I just recalled that I had already sorted the full set of segmented audio (noise included) into one folder per species, based on the species each clip comes from.

In [28]:
# Directory holding all of the segmented audio, with one sub-folder per bird.
data_dir_all = '/Netdata/2020/ziang/data/guangdong194/dataset/splited_data/train_dirs/'
# Note: this rebinds all_birds, which previously held the spectrogram folder names.
all_birds = os.listdir(data_dir_all)
# Number of entries in every bird folder (93 birds according to the original note).
bird_all_stats = {}
for species in all_birds:
    bird_all_stats[species] = len(os.listdir(data_dir_all + species))
# Rank by count, largest first, and keep only the birds with more than 2500 entries.
ranked_all = sorted(bird_all_stats.items(), key=lambda pair: pair[1], reverse=True)
all_train_stats = dict((species, n_clips) for species, n_clips in ranked_all if n_clips > 2500)
all_train_stats

{'phylloscopus_fuscatus': 24602,
 'cyanoptila_cyanomelana': 18242,
 'motacilla_alba': 16096,
 'turdus_merula': 11427,
 'ficedula_narcissina': 10377,
 'riparia_riparia': 9701,
 'limosa_limosa': 8761,
 'hirundo_rustica': 8332,
 'ardea_cinerea': 8180,
 'EurasianHoopoe': 7794,
 'cuculus_canorus': 7428,
 'pandion_haliaetus': 7301,
 'tringa_glareola': 6727,
 'muscicapa_griseisticta': 6579,
 'acrocephalus_orientalis': 6081,
 'chlidonias_hybrida': 5785,
 'phoenicurus_auroreus': 5636,
 'arenaria_interpres': 5560,
 'fulica_atra': 4977,
 'falco_peregrinus': 4933,
 'falco_subbuteo': 4924,
 'charadrius_hiaticula': 4923,
 'cisticola_juncidis': 4862,
 'actitis_hypoleucos': 4733,
 'tringa_nebularia': 4699,
 'spilornis_cheela': 4574,
 'chroicocephalus_ridibundus': 4478,
 'gallinula_chloropus': 4460,
 'emberiza_pusilla': 4061,
 'corvus_macrorhynchos': 3911,
 'dicrurus_macrocercus': 3904,
 'turdus_cardis': 3607,
 'calidris_alba': 3413,
 'hypothymis_azurea': 3120,
 'Brown-flankedBushWarbler': 3055,
 'phyl

In [29]:
# How many birds are left after the "more than 2500 entries" filter.
len(all_train_stats)

43

In [30]:
# Spreadsheet describing bird presence; only its first rows are used below.
# NOTE(review): presumably the sheet is ordered by how often each bird is present,
# so the head is the "most present" set — confirm against the file.
presence_excel = '/Netdata/2020/ziang/data/guangdong194/guangdong194_updated.xlsx'
n_presence_rows = 50
presence_sheet = pd.read_excel(presence_excel)
presence_all = presence_sheet.head(n_presence_rows)

In [35]:
# find intersections with both most presence and data
def find_intersection(stats, excel):
    count = []
    for bird in excel:
        if '_'.join(bird.split()).lower() in stats:
            count.append('_'.join(bird.split()).lower())
#         else:
#             print('... %s not satisfied...'%bird)
    print('... %d birds in common ...'%len(count))
    return count

In [37]:
# Birds with the most data, in descending order of their counts.
birds_most_data = [species for species in all_train_stats]
# Names from the presence sheet; '拉丁学名' is the column header
# (Chinese for "Latin scientific name").
birds_most_pres = [latin_name for latin_name in presence_all['拉丁学名']]
# Birds that are both well represented in the data and listed in the presence sheet.
best_birds = find_intersection(birds_most_data, birds_most_pres)

... 12 birds in common ...


In [38]:
# Add the 30 birds with the most data to the selection, skipping any bird
# that is already part of it.
already_selected = set(best_birds)
for candidate in birds_most_data[:30]:
    if candidate in already_selected:
        continue
    best_birds.append(candidate)
    already_selected.add(candidate)
len(best_birds)

32

### Best 32 Birds
Now we have fixed the best bucket: the birds with the most data and the highest presence. Of the original 32 birds, we remove the last two, which have relatively little data, leaving **30** birds in total.

In [53]:
# Number of entries in the full training directory for every selected bird.
best_32_stats_train = dict(
    (bird, len(os.listdir(data_dir_all + bird))) for bird in best_birds
)
# Rank by count, largest first, and drop every bird with 3900 entries or fewer.
ranked_train = sorted(best_32_stats_train.items(), key=lambda pair: pair[1], reverse=True)
best_30_stats_train = {bird: n_clips for bird, n_clips in ranked_train if n_clips > 3900}

In [54]:
# Show the retained birds with their training counts, largest first.
best_30_stats_train

{'phylloscopus_fuscatus': 24602,
 'cyanoptila_cyanomelana': 18242,
 'motacilla_alba': 16096,
 'turdus_merula': 11427,
 'ficedula_narcissina': 10377,
 'riparia_riparia': 9701,
 'limosa_limosa': 8761,
 'hirundo_rustica': 8332,
 'ardea_cinerea': 8180,
 'EurasianHoopoe': 7794,
 'cuculus_canorus': 7428,
 'pandion_haliaetus': 7301,
 'tringa_glareola': 6727,
 'muscicapa_griseisticta': 6579,
 'acrocephalus_orientalis': 6081,
 'chlidonias_hybrida': 5785,
 'phoenicurus_auroreus': 5636,
 'arenaria_interpres': 5560,
 'fulica_atra': 4977,
 'falco_peregrinus': 4933,
 'falco_subbuteo': 4924,
 'charadrius_hiaticula': 4923,
 'cisticola_juncidis': 4862,
 'actitis_hypoleucos': 4733,
 'tringa_nebularia': 4699,
 'spilornis_cheela': 4574,
 'chroicocephalus_ridibundus': 4478,
 'gallinula_chloropus': 4460,
 'emberiza_pusilla': 4061,
 'corvus_macrorhynchos': 3911}

In [57]:
# Validation data, expected to be laid out as one sub-directory per bird.
# NOTE(review): the traceback recorded under this cell mentions '.../splited_data/val/'
# while this path ends in 'val_/' — confirm the real directory name before re-running.
data_dir_val = '/Netdata/2020/ziang/data/guangdong194/dataset/splited_data/val_/'
# Number of entries in the validation directory for every selected bird.
best_32_stats_val = {
    bird: len(os.listdir(os.path.join(data_dir_val, bird))) for bird in best_birds
}
# Keep exactly the birds that were retained for training, ordered by validation
# count (largest first). Selection is by membership in the training selection
# rather than by a count threshold: the 3900 cut-off used for the training
# counts is tied to training-set sizes and does not carry over to this split.
best_30_stats_val = {
    bird: count
    for bird, count in sorted(best_32_stats_val.items(), key=lambda item: item[1], reverse=True)
    if bird in best_30_stats_train
}

FileNotFoundError: [Errno 2] No such file or directory: '/Netdata/2020/ziang/data/guangdong194/dataset/splited_data/val/motacilla_alba'

In [56]:
# NOTE(review): the output saved below is identical to the training counts and carries
# an earlier execution number (In [56]) than the cell that builds this dict (In [57]),
# so it is stale — re-run this cell once the validation directory can be read.
best_30_stats_val

{'phylloscopus_fuscatus': 24602,
 'cyanoptila_cyanomelana': 18242,
 'motacilla_alba': 16096,
 'turdus_merula': 11427,
 'ficedula_narcissina': 10377,
 'riparia_riparia': 9701,
 'limosa_limosa': 8761,
 'hirundo_rustica': 8332,
 'ardea_cinerea': 8180,
 'EurasianHoopoe': 7794,
 'cuculus_canorus': 7428,
 'pandion_haliaetus': 7301,
 'tringa_glareola': 6727,
 'muscicapa_griseisticta': 6579,
 'acrocephalus_orientalis': 6081,
 'chlidonias_hybrida': 5785,
 'phoenicurus_auroreus': 5636,
 'arenaria_interpres': 5560,
 'fulica_atra': 4977,
 'falco_peregrinus': 4933,
 'falco_subbuteo': 4924,
 'charadrius_hiaticula': 4923,
 'cisticola_juncidis': 4862,
 'actitis_hypoleucos': 4733,
 'tringa_nebularia': 4699,
 'spilornis_cheela': 4574,
 'chroicocephalus_ridibundus': 4478,
 'gallinula_chloropus': 4460,
 'emberiza_pusilla': 4061,
 'corvus_macrorhynchos': 3911}

Next, we train the model with approximately 4,000 clips per bird.