## This file ensures we have a few representatives for each Order

In [None]:
# Notebook parameters. They default to None here; presumably an external
# runner overrides them before the cells below execute -- TODO confirm.
# Path to the JSON parameters file (its 'taxonomy' section is read below).
params_file = None
# Path to the metadata CSV that is loaded into `meta`.
input_file = None
# Path of the text file written in the last cell, one species label per line.
output_file = None
# Path to a text file of species labels, loaded with np.loadtxt as `top_n`.
common_file = None
# Path to a text file of species labels, loaded with np.loadtxt as `geo`.
geography_file = None

In [None]:
import pandas as pd
import numpy as np
# Switch off pandas' chained-assignment check so that no warning or error is
# emitted when a value is assigned on a derived frame.
pd.options.mode.chained_assignment = None

In [None]:
import json

# Read the run configuration. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open(params_file, encoding='utf-8') as json_file:
    data = json.load(json_file)

# A species needs strictly more than `min_count` rows in the metadata to be
# eligible for addition.
min_count = int(data['taxonomy']['min_count_species'])
# Upper bound on the number of species added per order; an order with fewer
# selected species than this is also treated as under-represented.
max_add = int(data['taxonomy']['max_add_species'])
# Path to the taxonomy CSV.
taxo_file = data['taxonomy']['file']


In [None]:
# Load the taxonomy table from the path given in the parameters file.
table = pd.read_csv(taxo_file)
# Display the available column names as the cell output.
table.columns

Index(['TAXON_ORDER', 'CATEGORY', 'SPECIES_CODE', 'PRIMARY_COM_NAME',
       'SCI_NAME', 'ORDER1', 'FAMILY', 'SPECIES_GROUP', 'REPORT_AS'],
      dtype='object')

In [None]:
# Load the metadata table that is keyed by `primary_label`.
meta = pd.read_csv(input_file)

# Build a species-code -> (order, family) lookup from the taxonomy table,
# renaming the key column so it lines up with `meta`.
order_table = (
    table.loc[:, ['SPECIES_CODE', 'ORDER1', 'FAMILY']]
    .drop_duplicates()
    .rename(columns={
        'SPECIES_CODE': 'primary_label',
        'ORDER1': 'order',
        'FAMILY': 'family',
    })
)

# Left join: every metadata row is kept, and rows whose label is missing from
# the taxonomy get NaN for order/family.
meta = pd.merge(meta, order_table, how='left', on='primary_label')
meta.columns

Index(['primary_label', 'secondary_labels', 'type', 'latitude', 'longitude',
       'scientific_name', 'common_name', 'author', 'license', 'rating', 'time',
       'url', 'filename', 'hour_str', 'hour_int', 'minute_str', 'minute_int',
       'order', 'family'],
      dtype='object')

In [None]:
# Species labels that are already selected, read as strings from two text
# files. `ndmin=1` guarantees a 1-D array even when a file holds a single
# label; without it np.loadtxt returns a 0-d array, which np.concatenate
# (used in later cells) refuses to join.
geo = np.loadtxt(geography_file, dtype=str, ndmin=1)
top_n = np.loadtxt(common_file, dtype=str, ndmin=1)

array(['commyn', 'rocpig', 'sander', 'norpin', 'nutman', 'merlin',
       'yefcan', 'warwhe1', 'parjae', 'chukar', 'wetshe'], dtype='<U7')

In [None]:
# Count the selected species per order and note the sparsely covered orders.
# `already` holds the metadata rows whose label is in either selection list.
already = meta.loc[meta['primary_label'].isin(np.concatenate([geo, top_n]))]
# Orders present in the selection, most rows first.
al = list(already['order'].value_counts().index)
few_order = []
for order_name in al:
    labels = already.loc[already['order'] == order_name, 'primary_label']
    print(order_name, labels.unique())
    # An order with fewer than `max_add` distinct selected species is
    # considered under-represented and will be topped up later.
    if labels.nunique() < max_add:
        few_order.append(order_name)

Passeriformes ['apapan' 'commyn' 'comwax' 'houfin' 'houspa' 'jabwar' 'norcar' 'normoc'
 'nutman' 'omao' 'saffin' 'skylar' 'towsol' 'warwhe1' 'wesmea' 'yefcan']
Charadriiformes ['arcter' 'bkbplo' 'bknsti' 'bongul' 'caster1' 'comsan' 'dunlin' 'glwgul'
 'laugul' 'leasan' 'leater1' 'lesyel' 'lotjae' 'parjae' 'redpha1' 'rudtur'
 'sander' 'wessan']
Anseriformes ['brant' 'buwtea' 'cangoo' 'eurwig' 'gadwal' 'gnwtea' 'gwfgoo' 'mallar3'
 'norpin']
Galliformes ['calqua' 'chukar' 'compea' 'gamqua' 'rinphe']
Strigiformes ['brnowl' 'sheowl']
Columbiformes ['moudov' 'rocpig' 'spodov' 'zebdov']
Pelecaniformes ['bcnher']
Gruiformes ['comgal1' 'sora']
Psittaciformes ['rempar' 'rorpar']
Accipitriformes ['osprey']
Coraciiformes ['belkin1']
Falconiformes ['merlin']
Suliformes ['grefri' 'masboo']
Procellariiformes ['wetshe']


In [None]:
# Find the orders that occur in the metadata but have no selected species.
order_counts = meta['order'].value_counts()
the_orders = list(order_counts.index)
the_not_orders = [name for name in the_orders if name not in al]
the_not_orders

['Podicipediformes', 'Phaethontiformes', 'Pterocliformes']

In [81]:

# Contribute some species from orders with no representatives.
# For each such order, keep the species with strictly more than `min_count`
# rows in `meta` and take at most `max_add` of them.
add_orders = []
for o in the_not_orders:
    sub = meta[meta['order'] == o]
    # value_counts() sorts by descending count, so the slice below keeps the
    # most frequently recorded eligible species.
    counts = sub['primary_label'].value_counts()
    arr = counts[counts > min_count].index.to_list()
    # Slicing past the end of a list is safe (the whole list is returned), so
    # no exception guard is needed here.
    add_orders += arr[:max_add]
add_orders


['pibgre', 'rettro', 'whttro', 'chbsan']

In [None]:
# Contribute additional species from orders with few representatives.
# Only species that are not already selected are considered; each must have
# strictly more than `min_count` rows, and at most `max_add` are taken per order.
supp_orders = []
not_already = meta[~meta['primary_label'].isin(np.concatenate([geo, top_n]))]
for o in few_order:
    sub = not_already[not_already['order'] == o]
    # value_counts() sorts by descending count, so candidates are listed
    # most frequently recorded first.
    counts = sub['primary_label'].value_counts()
    arr = counts[counts > min_count].index.to_list()
    # Report every eligible candidate, including those cut off by the cap.
    print(o,arr)
    # Slicing past the end of a list is safe (the whole list is returned), so
    # no exception guard is needed here.
    supp_orders += arr[:max_add]

Strigiformes []
Pelecaniformes ['categr', 'grbher3', 'whfibi']
Gruiformes ['hawcoo']
Psittaciformes ['mitpar', 'peflov', 'burpar']
Accipitriformes ['norhar2']
Coraciiformes []
Falconiformes ['perfal']
Suliformes ['refboo', 'brnboo']
Procellariiformes ['madpet', 'magpet1']


In [None]:
# Write the extra species, one label per line: first those from orders that had
# no representative, then those topping up under-represented orders.
with open(output_file, 'w') as out:
    out.writelines(label + '\n' for label in add_orders + supp_orders)