In [2]:
from __future__ import print_function
from ortools.linear_solver import pywraplp

from scipy.stats import entropy as kl_div
from numpy.linalg import norm
import numpy as np
import math

In [3]:
import csv
from collections import Counter


# sheetOrg="/Users/berk/Desktop/NNA/downloads/Sheet1.csv"
# sheetMine="/Users/berk/Desktop/NNA/downloads/Sheet1(1).csv"

resources_folder = ('/scratch/enis/archive/' +
                    'forks/cramer2020icassp/resources/')
src_path = '/scratch/enis/data/nna/labeling/megan/AudioSamplesPerSite/'
megan_labeled_files_info_path = src_path + 'meganLabeledFiles_wlenV1.txt'

# csv4megan_excell = (resources_folder + 'Sheet1.csv')
csv4megan_excell_clenaed = (resources_folder + 'Sheet1(1).csv')
csv4megan_excell = (resources_folder + 'Sheet1.csv')


with open(csv4megan_excell_clenaed) as csvfile:
    reader = csv.DictReader(csvfile)
    reader= list(reader)


In [4]:
# reader[0]

from typing import Dict, Union, Optional, Type
from nna import dataimport



In [5]:
def add_taxo_code2dataset(megan_data_sheet, audio_dataset):
    '''Go through rows of the excell and store taxonomy info into audio_dataset
    '''
    codest_dict = {}
    
    for row in megan_data_sheet:
        try:
            taxonomy_code = dataimport.megan_excell_row2yaml_code(row, None)
            site_id=row['Site ID'].strip()
            codest_dict.setdefault(taxonomy_code,Counter({}))

            codest_dict[taxonomy_code]=codest_dict[taxonomy_code]+Counter({site_id:1})
        except:
            print(row)
    return codest_dict



In [23]:
# [0.6,0.2,0.2]
# [0.7,0.15,0.15]
# [0.8,0.1,0.1]
def getCombinations(total):
    combinations=set()
    for i in range(100,201,1):
        test_val_dist=i/1000
        train_dist=1-(test_val_dist*2)
        dist=np.array([train_dist,test_val_dist,test_val_dist])
        bin_capacities=tuple(np.ceil(total*dist).astype("int"))
        combinations.add(bin_capacities)

    #add some combinations that are test and valid are bigger so that small number of elements can be handled
    combinations2=combinations.copy()
    for comb in combinations2:
        for rate in [1.2,1.4,1.6,1.8,2.0]:
            newComb=(comb[0],np.ceil(comb[1]*rate),np.ceil(comb[1]*rate))
            combinations.add(newComb)

    return combinations

# test
total=110
# dist = np.array([0.6,0.2,0.2])
print(getCombinations(total))

{(70, 42.0, 42.0), (71, 20, 20), (88, 15.0, 15.0), (79, 23.0, 23.0), (78, 34.0, 34.0), (75, 29.0, 29.0), (76, 26.0, 26.0), (82, 27.0, 27.0), (69, 34.0, 34.0), (68, 44.0, 44.0), (78, 24.0, 24.0), (70, 38.0, 38.0), (87, 24.0, 24.0), (84, 20.0, 20.0), (74, 38.0, 38.0), (68, 22, 22), (74, 23.0, 23.0), (71, 36.0, 36.0), (82, 30.0, 30.0), (75, 36.0, 36.0), (75, 26.0, 26.0), (73, 23.0, 23.0), (88, 20.0, 20.0), (72, 24.0, 24.0), (75, 33.0, 33.0), (72, 40.0, 40.0), (79, 20.0, 20.0), (69, 30.0, 30.0), (67, 27.0, 27.0), (68, 40.0, 40.0), (80, 29.0, 29.0), (67, 36.0, 36.0), (84, 26.0, 26.0), (77, 24.0, 24.0), (88, 14.0, 14.0), (88, 16.0, 16.0), (66, 44.0, 44.0), (86, 24.0, 24.0), (74, 35.0, 35.0), (84, 23.0, 23.0), (83, 23.0, 23.0), (76, 22.0, 22.0), (85, 13, 13), (83, 26.0, 26.0), (72, 28.0, 28.0), (78, 28.0, 28.0), (86, 13, 13), (68, 36.0, 36.0), (80, 23.0, 23.0), (73, 19, 19), (87, 20.0, 20.0), (85, 19.0, 19.0), (74, 19, 19), (71, 32.0, 32.0), (85, 24.0, 24.0), (84, 28.0, 28.0), (68, 31.0, 31.0

In [7]:


def JSD(P, Q):
    _P = P / norm(P, ord=1)
    _Q = Q / norm(Q, ord=1)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (kl_div(_P, _M) + kl_div(_Q, _M))

def create_data_model(weights,values,bin_capacities):
    """Create the data for the example."""
    data = {}
    data['weights'] = weights
    data['values'] = values
    data['items'] = list(range(len(weights)))
    data['num_items'] = len(weights)
    num_bins = 5
    data['bins'] = list(range(3))
    total=sum(weights)
#     print(total)
#     bin_capacities = np.ceil(total*dist).astype("int")
    data['bin_capacities'] = bin_capacities
    
    return data



In [8]:

# [(k,v) for (k,v) in codesDict.items() if sum(v.values())<25]

def main(codesDict,dist):
    total=0
    solutionPerTaxonomy={}
    for k in codesDict.keys():
        
        weights=list(codesDict[k].values())
        values = list(codesDict[k].values())
        if sum(weights)<10:
            print(k)
            print("Error, too small", weights)
            continue
        if len(weights)<3:
            print(k)
            print("Error, number of elements less than 3", weights)
            continue
        
        total=sum(weights)
        

        combinations=getCombinations(total,dist)
        
#         solutionPerCombination=[]
        solutionPerTaxonomy.setdefault(k,[])
        for bin_capacities in combinations:
            
            data = create_data_model(weights,values,bin_capacities)
            # old version of ortools
            # Create the mip solver with the CBC backend.
#             solver = pywraplp.Solver.CreateSolver('multiple_knapsack_mip', 'CBC')
            # new version of ortools=>8.1
            solver = pywraplp.Solver.CreateSolver('SCIP')

            # Variables
            # x[i, j] = 1 if item i is packed in bin j.
            x = {}
            for i in data['items']:
                for j in data['bins']:
                    x[(i, j)] = solver.IntVar(0, 1, 'x_%i_%i' % (i, j))

            # Constraints
            # Each item can be in at most one bin.
            for i in data['items']:
                solver.Add(sum(x[i, j] for j in data['bins']) <= 1)
            # The amount packed in each bin cannot exceed its capacity.
            for j in data['bins']:
                solver.Add(
                    sum(x[(i, j)] * data['weights'][i]
                        for i in data['items']) <= data['bin_capacities'][j])

            # Objective
            objective = solver.Objective()

            for i in data['items']:
                for j in data['bins']:
                    objective.SetCoefficient(x[(i, j)], data['values'][i])
            objective.SetMaximization()

            status = solver.Solve()

            if status == pywraplp.Solver.OPTIMAL:
#                 if objective.Value()/sum(data['bin_capacities'])>0.90:
#                     continue

                total+=sum(data['weights'])

#                 print(codesDict[k])
#                 print("------------",k,"--------------")
#                 print('Total packed value:', objective.Value(),"/",sum(data['bin_capacities']))

    #             print()
                total_weight = 0
                solution=[list() for i in range(len(data['bins']))]
                for binIndex,j in enumerate(data['bins']):
                    bin_weight = 0
                    bin_value = 0
#                     print('Bin ', j, '\n')
                    for i in data['items']:
                        if x[i, j].solution_value() > 0:
                            solution[j].append(data['weights'][i])
#                             print('Item', i, '- weight:', data['weights'][i], ' value:',
#                                   data['values'][i])
                            bin_weight += data['weights'][i]
                            bin_value += data['values'][i]
                            
#                     print('Packed bin weight:', bin_weight,"/",data['bin_capacities'][binIndex])
    #                 print('bin capacity:',)
    #                 print('Packed bin value:', bin_value)
                    total_weight += bin_weight
#                 print('Total packed weight:', total_weight)
                solutionPerTaxonomy[k].append((bin_capacities,solution[:]))
            else:
                print('The problem does not have an optimal solution.')
#             print("total",total)
    return solutionPerTaxonomy


In [9]:
def func(codesDict):
    expectedDist=[0.6,0.2,0.2]
    results=[]
    BestSolutionPerTaxonomy={}

    for taxoKey in solutionPerTaxonomy:
        found=False
        total=sum(codesDict[taxoKey].values())

        for i in solutionPerTaxonomy[taxoKey]:
            if total==sum([sum(m) for m in i[1]]):
                found=True
        if found is False:
            print(codesDict[taxoKey].values())
            print(total)

    #         for i in solutionPerTaxonomy[taxoKey]:
    #             print(i[0],,sum([sum(m) for m in i[1]]))

        costPerDist=[]
        smallest=999999
        bestComp=None
        bestDist=None
        for i in solutionPerTaxonomy[taxoKey]:
    #         print(dist=[sum(m) for m in i[1]])
            dist=[sum(m) for m in i[1]]
            cost=JSD(expectedDist,dist)
            if cost<smallest and total-sum(dist)==0:
                smallest=cost
                bestComp=i[1]
                bestDist=[sum(m) for m in i[1]]

        combinedSorted=sorted(list(zip(bestDist,bestComp)),reverse=True)
        a,b=[],[]
        for m in combinedSorted:
            a.append(m[0])
            b.append(m[1])
        bestDist,bestComp=a,b

        results.append([cost,bestDist,bestComp])

        BestSolutionPerTaxonomy[taxoKey] = [cost,bestDist,bestComp]
    return results,BestSolutionPerTaxonomy

    #     if total-sum(bestDist)!=0:
    #         print("BAD")
    #     print(taxoKey)
    #     print(codesDict[taxoKey].values())
    #     print("total",total,sum(bestDist),bestDist)

    #     print(bestComp)
    # #         costPerDist.append()
    #     for i in solutionPerTaxonomy[taxoKey]:
    #         dist=[sum(m) for m in i[1]]
    #         print(dist,sum(dist))


In [10]:
# codesDict

In [11]:
# BestSolutionPerTaxonomy

In [12]:
def func2(codesDict,BestSolutionPerTaxonomy):
    solReverse={i:{} for i in codesDict.keys()}
    for taxo, counter in codesDict.items():
        counter=dict(counter)
        for x,y in counter.items():
            solReverse[taxo].setdefault(y, []).append(x)

    BestSolutionPerTaxonomyLocation={i:None for i in BestSolutionPerTaxonomy.keys()}
    for taxo, data in BestSolutionPerTaxonomy.items():
    #     print(taxo,data)
        comb=data[2]
        train,test,val=comb[:]
        combLocation=[[] for i in range(len(comb))]
        for i,dataSet in enumerate(comb):
            for v in dataSet:
                location=solReverse[taxo][v].pop()
                combLocation[i].append(location)
        BestSolutionPerTaxonomyLocation[taxo]=combLocation
    return BestSolutionPerTaxonomyLocation
    #     print(combLocation)
    #     break

    


In [13]:
# excellNames2code

In [18]:

def func3(BestSolutionPerTaxonomyLocation,excellNames2code=None,):
    # train test valid
    # BestSolutionPerTaxonomyLocation
    if excellNames2code is None:
        excell_names2code = {
                'anth': '0.0.0',
                'auto': '0.1.0',
                'bio': '1.0.0',
                'bird': '1.1.0',
                'bug': '1.3.0',
                'dgs': '1.1.7',
                'flare': '0.4.0',
                'fox': '1.2.4',
                'geo': '2.0.0',
                'grouse': '1.1.8',
                'loon': '1.1.3',
                'mam': '1.2.0',
                'plane': '0.2.0',
                'ptarm': '1.1.8',
                'rain': '2.1.0',
                'seab': '1.1.5',
                'silence': '3.0.0',
                'songbird': '1.1.10',
                'unknown': 'X.X.X',
                'water': '2.2.0',
                'x': 'X.X.X',
            }
    for yamlCode, data in BestSolutionPerTaxonomyLocation.items():
    #     print(yamlCode)
        fileCode=yamlCode.replace(".","-")
        for dataSet in data:
            for loc in dataSet:
                fileName=("_".join(["site-"+str(loc),fileCode,"original.h5"]))
                pathFile="./resources/myDatasets/megan/"+fileName
#                 print(pathFile)



In [15]:
# birdvox-cls-test
# birdvox-cls-train
# birdvox-cls-valid
# load files with librosa, sample to 


In [17]:

codest_dict2 = add_taxo_code2dataset(reader,[])
total2=110
dist2 = np.array([0.6,0.2,0.2])
#test
# np.ceil(total*dist).astype("int")
solutionPerTaxonomy=main(codest_dict2,dist2)
results2,BestSolutionPerTaxonomy2 =func(codest_dict2)



OrderedDict([('Site ID', '12'), ('Anthro/Bio', 'Bio/Anth'), ('Category', 'Bird/Plane'), ('Specific Category', ''), ('Comments', 'raven, songbirds, long tailed ducks with aircraft in background'), ('', ''), ('File Name', 'S4A10268_20190610_103000_bio_anth.wav'), ('Could not decice', 'FALSE'), ('Songbird', 'FALSE'), ('Water Bird', 'FALSE'), ('Insect', 'FALSE'), ('Running Water', 'FALSE'), ('Rain', 'FALSE'), ('Cable', 'FALSE'), ('Wind', 'FALSE'), ('Vehicle', 'FALSE'), ('Aircraft', 'FALSE'), ('Date dd/mm/yy)', '6/6/2019'), ('Start Time', '1:05:22'), ('End Time', '1:05:53'), ('#VALUE!', '0:00:31')])
1.2.4
Error, too small [2]
1.1.5
Error, too small [1, 2]
1.2.0
Error, too small [1]
0.4.0
Error, too small [5, 3]


In [19]:

results2=sorted(results2,reverse=True)
[i[1] for i in results2]
# len(results),len(codesDict.keys())

BestSolutionPerTaxonomyLocation2 = func2(codest_dict2,BestSolutionPerTaxonomy2)
# BestSolutionPerTaxonomyLocation
func3(BestSolutionPerTaxonomyLocation2)

In [28]:
BestSolutionPerTaxonomyLocation2

{'1.0.0': [['44', '46', '17', '14'],
  ['11', '34', '27'],
  ['31', '50', '18', '12', '30', '39', '48', '45']],
 '3.0.0': [['40', '20', '14', '17', '13', '36', '25', '33'],
  ['18', '38', '39'],
  ['32', '45']],
 'X.X.X': [['45', '14', '27', '25', '34', '46', '29', '18', '38'],
  ['36'],
  ['32', '20', '21']],
 '1.1.10': [['49', '48', '19', '16', '22', '37', '29', '25', '31', '27'],
  ['46', '20', '11', '33', '24'],
  ['17', '21', '39', '30', '38', '18', '47', '50', '14']],
 '1.1.0': [['12',
   '37',
   '11',
   '22',
   '18',
   '44',
   '29',
   '46',
   '13',
   '34',
   '25',
   '24',
   '17',
   '40',
   '31',
   '27',
   '14'],
  ['19', '16', '39', '30', '38', '41'],
  ['50', '20', '47', '49', '48', '21', '15', '36']],
 '1.3.0': [['21', '40', '32', '39', '41', '44'],
  ['50', '20', '38', '11'],
  ['24', '19', '27', '14']],
 '1.1.8': [['20', '16', '11', '37', '15', '25', '31'],
  ['21', '49', '38'],
  ['22', '40']],
 '1.1.7': [['16', '30', '15', '46', '27'],
  ['49', '25', '24'],


OrderedDict([('Site ID', '12'), ('Anthro/Bio', 'Bio/Anth'), ('Category', 'Bird/Plane'), ('Specific Category', ''), ('Comments', 'raven, songbirds, long tailed ducks with aircraft in background'), ('', ''), ('File Name', 'S4A10268_20190610_103000_bio_anth.wav'), ('Could not decice', 'FALSE'), ('Songbird', 'FALSE'), ('Water Bird', 'FALSE'), ('Insect', 'FALSE'), ('Running Water', 'FALSE'), ('Rain', 'FALSE'), ('Cable', 'FALSE'), ('Wind', 'FALSE'), ('Vehicle', 'FALSE'), ('Aircraft', 'FALSE'), ('Date dd/mm/yy)', '6/6/2019'), ('Start Time', '1:05:22'), ('End Time', '1:05:53'), ('#VALUE!', '0:00:31')])
1.2.4
Error, too small [2]
1.1.5
Error, too small [1, 2]
1.2.0
Error, too small [1]
0.4.0
Error, too small [5, 3]
