In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from random import shuffle

In [2]:
# specs directory
train_spec_dir = '/Netdata/2020/ziang/data/guangdong194/dataset/rest_25/specs/train_specs/'
val_spec_dir = '/Netdata/2020/ziang/data/guangdong194/dataset/rest_25/specs/val_specs/'

In [3]:
all_birds = os.listdir(train_spec_dir)
# do a statistics
train_stats = {bird:len(os.listdir(train_spec_dir+bird)) for bird in all_birds}
val_stats = {bird:len(os.listdir(val_spec_dir+bird)) for bird in all_birds}
# sort descending
train_stats = {k: v for k, v in sorted(train_stats.items(), key=lambda item: item[1], reverse=True)}
val_stats = {k: v for k, v in sorted(val_stats.items(), key=lambda item: item[1], reverse=True)}
# show result
train_stats

{'charadrius_dubius': 2000,
 'gallinago_gallinago': 2000,
 'hirundo_rustica': 2000,
 'motacilla_alba': 2000,
 'orthotomus_sutorius': 2000,
 'parus_minor': 2000,
 'prinia_inornata': 2000,
 'tringa_glareola': 2000,
 'cecropis_daurica': 1853,
 'motacilla_tschutschensis': 1749,
 'spilopelia_chinensis': 1692,
 'egretta_garzetta': 1645,
 'ardea_alba': 1496,
 'pycnonotus_jocosus': 1092,
 'prinia_flaviventris': 990,
 'himantopus_himantopus': 980,
 'pycnonotus_sinensis': 977,
 'lonchura_punctulata': 869,
 'zosterops_japonicus': 813,
 'tringa_stagnatilis': 726,
 'tringa_erythropus': 718,
 'apus_nipalensis': 609,
 'acridotheres_cristatellus': 543,
 'tachybaptus_ruficollis': 345,
 'gracupica_nigricollis': 253}

**Note:** I noticed that this is not the dataset with the most balanced bird. There are two motivations to use tensorflow.
   1. check Data Augmentation methods on images (spectrograms)
   2. find a more balanced set
   
So now I will make up a new dataset with more and more balanced data.

I just recalled that I sorted the entire file with all noises into files based on the species they are from.

In [4]:
# datadir with all segmented audio
data_dir_all = '/Netdata/2020/ziang/data/guangdong194/dataset/splited_data/train_dirs/'
all_birds = os.listdir(data_dir_all)
# all 93 birds stats
bird_all_stats = {bird:len(os.listdir(data_dir_all+bird)) for bird in all_birds}
all_train_stats = {k: v for k, v in sorted(bird_all_stats.items(), key=lambda item: item[1], reverse=True) if v > 2500}
all_train_stats

{'phylloscopus_fuscatus': 24602,
 'cyanoptila_cyanomelana': 18242,
 'motacilla_alba': 16096,
 'turdus_merula': 11427,
 'ficedula_narcissina': 10377,
 'riparia_riparia': 9701,
 'limosa_limosa': 8761,
 'hirundo_rustica': 8332,
 'ardea_cinerea': 8180,
 'upupa_epops': 7794,
 'cuculus_canorus': 7428,
 'pandion_haliaetus': 7301,
 'tringa_glareola': 6727,
 'muscicapa_griseisticta': 6579,
 'acrocephalus_orientalis': 6081,
 'chlidonias_hybrida': 5785,
 'phoenicurus_auroreus': 5636,
 'arenaria_interpres': 5560,
 'fulica_atra': 4977,
 'falco_peregrinus': 4933,
 'falco_subbuteo': 4924,
 'charadrius_hiaticula': 4923,
 'cisticola_juncidis': 4862,
 'actitis_hypoleucos': 4733,
 'tringa_nebularia': 4699,
 'spilornis_cheela': 4574,
 'chroicocephalus_ridibundus': 4478,
 'gallinula_chloropus': 4460,
 'emberiza_pusilla': 4061,
 'corvus_macrorhynchos': 3911,
 'dicrurus_macrocercus': 3904,
 'turdus_cardis': 3607,
 'calidris_alba': 3413,
 'hypothymis_azurea': 3120,
 'horornis_fortipes': 3055,
 'phylloscopus_t

In [5]:
len(all_train_stats)

43

In [6]:
# take a union of the most presented birds
presence_excel = '/Netdata/2020/ziang/data/guangdong194/guangdong194_updated.xlsx'
presence_all = pd.read_excel(presence_excel).head(50)
# presence_all.head(50)

In [7]:
# find intersections with both most presence and data
def find_intersection(stats, excel):
    count = []
    for bird in excel:
        if '_'.join(bird.split()).lower() in stats:
            count.append('_'.join(bird.split()).lower())
#         else:
#             print('... %s not satisfied...'%bird)
    print('... %d birds in common ...'%len(count))
    return count

In [8]:
birds_most_data = list(all_train_stats.keys())
birds_most_pres = list(presence_all['拉丁学名'])
best_birds = find_intersection(birds_most_data, birds_most_pres)

... 12 birds in common ...


In [9]:
best_birds

['motacilla_alba',
 'hirundo_rustica',
 'tringa_glareola',
 'charadrius_dubius',
 'phylloscopus_fuscatus',
 'corvus_macrorhynchos',
 'phoenicurus_auroreus',
 'gallinula_chloropus',
 'ardea_cinerea',
 'tringa_nebularia',
 'actitis_hypoleucos',
 'eudynamys_scolopaceus']

In [10]:
for bird in birds_most_data[:30]:
    if bird not in best_birds:
        best_birds.append(bird)
len(best_birds)

32

### Best 32 Birds
Now we have fixed the best bucket containing the best birds with best data and most presence. Originally we have 32 birds, we remove the last two, who has relatively less data. Now we have data of **30** birds in total.

In [11]:
best_32_stats_train = {bird:len(os.listdir(data_dir_all+bird)) for bird in best_birds}
best_30_stats_train = {k: v for k, v in sorted(best_32_stats_train.items(), key=lambda item: item[1], reverse=True) if v > 3900}

In [12]:
best_30_stats_train

{'phylloscopus_fuscatus': 24602,
 'cyanoptila_cyanomelana': 18242,
 'motacilla_alba': 16096,
 'turdus_merula': 11427,
 'ficedula_narcissina': 10377,
 'riparia_riparia': 9701,
 'limosa_limosa': 8761,
 'hirundo_rustica': 8332,
 'ardea_cinerea': 8180,
 'upupa_epops': 7794,
 'cuculus_canorus': 7428,
 'pandion_haliaetus': 7301,
 'tringa_glareola': 6727,
 'muscicapa_griseisticta': 6579,
 'acrocephalus_orientalis': 6081,
 'chlidonias_hybrida': 5785,
 'phoenicurus_auroreus': 5636,
 'arenaria_interpres': 5560,
 'fulica_atra': 4977,
 'falco_peregrinus': 4933,
 'falco_subbuteo': 4924,
 'charadrius_hiaticula': 4923,
 'cisticola_juncidis': 4862,
 'actitis_hypoleucos': 4733,
 'tringa_nebularia': 4699,
 'spilornis_cheela': 4574,
 'chroicocephalus_ridibundus': 4478,
 'gallinula_chloropus': 4460,
 'emberiza_pusilla': 4061,
 'corvus_macrorhynchos': 3911}

In [13]:
data_dir_val = '/Netdata/2020/ziang/data/guangdong194/dataset/splited_data/val_dirs/'
best_32_stats_val = {bird:len(os.listdir(data_dir_val+bird)) for bird in best_birds}
best_32_stats_val = {k: v for k, v in sorted(best_32_stats_val.items(), key=lambda item: item[1], reverse=True)}

In [14]:
best_30_stats_val = best_32_stats_val
del best_30_stats_val['eudynamys_scolopaceus']
del best_30_stats_val['charadrius_dubius']
best_30_stats_val

{'phylloscopus_fuscatus': 5142,
 'cyanoptila_cyanomelana': 5128,
 'arenaria_interpres': 2631,
 'motacilla_alba': 2536,
 'cuculus_canorus': 2178,
 'turdus_merula': 1995,
 'riparia_riparia': 1814,
 'ficedula_narcissina': 1760,
 'acrocephalus_orientalis': 1624,
 'hirundo_rustica': 1590,
 'chlidonias_hybrida': 1426,
 'pandion_haliaetus': 1369,
 'muscicapa_griseisticta': 1149,
 'tringa_glareola': 1136,
 'upupa_epops': 1095,
 'limosa_limosa': 1069,
 'fulica_atra': 1064,
 'phoenicurus_auroreus': 995,
 'ardea_cinerea': 991,
 'cisticola_juncidis': 989,
 'spilornis_cheela': 963,
 'corvus_macrorhynchos': 767,
 'actitis_hypoleucos': 764,
 'falco_subbuteo': 753,
 'tringa_nebularia': 677,
 'charadrius_hiaticula': 611,
 'chroicocephalus_ridibundus': 562,
 'emberiza_pusilla': 547,
 'falco_peregrinus': 528,
 'gallinula_chloropus': 490}

Now we set train the model with each bird approximately 4000 clips, correspondingly 400 clips for validation each bird

In [15]:
train_h5 = '/DATA1/ziang/data/guangdong194/train_h5/'
val_h5 = '/DATA1/ziang/data/guangdong194/val_h5/'

In [16]:
all_train_h5 = [train_h5+x for x in os.listdir(train_h5)]
all_train_h5[1]

'/DATA1/ziang/data/guangdong194/train_h5/LittleEgret_448972_seg_52.wav.h5'

In [17]:
all_train_segs = []
for bird in all_birds:
    all_train_segs += [data_dir_all+bird+'/'+x for x in os.listdir(data_dir_all+bird)]
all_train_segs[1]

'/Netdata/2020/ziang/data/guangdong194/dataset/splited_data/train_dirs/motacilla_alba/370003_seg_1.wav'

#### Create latin2eng dict

In [18]:
excel_dir = '/Netdata/2020/ziang/data/guangdong194/guangdong194_updated.xlsx'
bird_194 = pd.read_excel(excel_dir)
# create latin2eng dict
latin = list(bird_194['拉丁学名'])
eng = list(bird_194['英文名称'])
latin2eng = {'_'.join(latin[i].split()).lower():''.join(eng[i].split()) for i in range(len(latin))}

In [19]:
sample_h5 = all_train_h5[1]
sample_seg = all_train_segs[1]
latin2eng[sample_seg.split('/')[-2]]

'WhiteWagtail'

In [20]:
latin2eng['upupa_epops']

'EurasianHoopoe'

In [3]:
train_max = 4000
val_max = 400
index_dir = '/DATA1/ziang/index/best_30/'
best_30_birds = list(best_30_stats_train.keys())

NameError: name 'best_30_stats_train' is not defined

In [22]:
# select 4000 each bird for training set and create train index
utt2wav_text = ""
utt2label_text = ""
for bird in tqdm(best_30_birds):
    bird_segs = [data_dir_all+bird+'/'+x for x in os.listdir(data_dir_all+bird)]
    shuffle(bird_segs)
    if len(bird_segs) > train_max:
        tmp_segs = bird_segs[:train_max]
    else:
        tmp_segs = bird_segs
    u2w_text_segs = ['_'.join(x.split('/')[-2:])[:-4] + ' ' + train_h5+latin2eng[x.split('/')[-2]]+'_'+x.split('/')[-1]+'.h5' for x in tmp_segs]
    u2l_text_segs = ['_'.join(x.split('/')[-2:])[:-4] + ' ' + x.split('/')[-2] for x in tmp_segs]
    u2w_demo_text = '\n'.join(u2w_text_segs)
    u2l_demo_text = '\n'.join(u2l_text_segs)
    utt2wav_text += u2w_demo_text
    utt2label_text += u2l_demo_text
with open(index_dir+'train_utt2wav','w') as f:
    f.write(utt2wav_text)
with open(index_dir+'train_utt2label','w') as f:
    f.write(utt2label_text)

100%|██████████| 30/30 [00:00<00:00, 30.24it/s]


In [24]:
os.path.isfile('/DATA1/ziang/data/guangdong194/train_h5/EurasianHoopoe_484332_seg_4.wav.h5')

True

In [4]:
# select 4000 each bird for valid set and create valid index
utt2wav_text = ""
utt2label_text = ""
for bird in tqdm(best_30_birds):
    bird_segs = [data_dir_val+bird+'/'+x for x in os.listdir(data_dir_val+bird)]
    shuffle(bird_segs)
    if len(bird_segs) > val_max:
        tmp_segs = bird_segs[:val_max]
    else:
        tmp_segs = bird_segs
    u2w_text_segs = ['_'.join(x.split('/')[-2:])[:-4] + ' ' + val_h5+latin2eng[x.split('/')[-2]]+'_'+x.split('/')[-1]+'.h5' for x in tmp_segs]
    u2l_text_segs = ['_'.join(x.split('/')[-2:])[:-4] + ' ' + x.split('/')[-2] for x in tmp_segs]
    u2w_demo_text = '\n'.join(u2w_text_segs)
    u2l_demo_text = '\n'.join(u2l_text_segs)
    utt2wav_text += u2w_demo_text
    utt2label_text += u2l_demo_text
with open(index_dir+'val_utt2wav','w') as f:
    f.write(utt2wav_text)
with open(index_dir+'val_utt2label','w') as f:
    f.write(utt2label_text)

NameError: name 'best_30_birds' is not defined

#### Check validity of paths

In [6]:
index_dir = '/DATA1/ziang/index/best_30/'
new_utt2wav_text = ""
new_utt2label_text = ""
with open('/DATA1/ziang/index/best_30/val_utt2wav','r') as f:
    for line in f:
        path = line.split()[1]
        if not os.path.isfile(path):
            print("... %s does not exist ..."%path)
        else:
            new_utt2wav_text += line.split()[0]+' '+'_'.join(line.split()[0].split('_')[:2])+'\n'
with open(index_dir+'new_val_utt2label', 'w') as f:
    f.write(new_utt2wav_text)

... /DATA1/ziang/data/guangdong194/val_h5/DuskyWarbler_458710_seg_238.wav.h5cyanoptila_cyanomelana_177234_seg_108 does not exist ...
... /DATA1/ziang/data/guangdong194/val_h5/Blue-and-whiteFlycatcher_429814_seg_17.wav.h5motacilla_alba_278653_seg_31 does not exist ...
... /DATA1/ziang/data/guangdong194/val_h5/WhiteWagtail_576653_seg_3.wav.h5turdus_merula_583789_seg_194 does not exist ...
... /DATA1/ziang/data/guangdong194/val_h5/CommonBlackbird_562366_seg_290.wav.h5ficedula_narcissina_156008_seg_23 does not exist ...
... /DATA1/ziang/data/guangdong194/val_h5/NarcissusFlycatcher_268043_seg_0.wav.h5riparia_riparia_123944_seg_9 does not exist ...
... /DATA1/ziang/data/guangdong194/val_h5/SandMartin_426147_seg_26.wav.h5limosa_limosa_318214_seg_1 does not exist ...
... /DATA1/ziang/data/guangdong194/val_h5/Black-tailedGodwit_281105_seg_26.wav.h5hirundo_rustica_378927_seg_478 does not exist ...
... /DATA1/ziang/data/guangdong194/val_h5/BarnSwallow_575747_seg_24.wav.h5ardea_cinerea_280972_seg_