# Objective

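This notebook counts the unique entities (concerts/releases, works, raagas, taalas, forms, layas and artists) linked to the recordings in the Saraga dataset.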

In [1]:
import codecs
import compmusic
from compmusic import dunya as dn
from compmusic.dunya import hindustani as hi
from compmusic.dunya import carnatic as ca
from compmusic.dunya import docserver as ds
from compmusic import musicbrainz
import numpy as np

In [2]:
# Dunya API token; get yours from https://dunya.compmusic.upf.edu/user/profile/
token = ""

# Ids of the CC collections for the Carnatic and Hindustani music traditions
carnatic_cc = {
    'id': 'a163c8f2-b75f-4655-86be-1504ea2944c2',
    'name': 'carnatic',
}
hindustani_cc = {
    'id': '6adc54c6-6605-4e57-8230-b85f1de5be2b',
    'name': 'hindustani',
}
# Every per-collection step in the notebook iterates over this list
collections = [carnatic_cc, hindustani_cc]

In [3]:
# Entity types in Carnatic and Hindustani have different names. Each row below
# lists the tradition-specific keys in one fixed order:
# (concert, work, raga, tala, form, laya, lead_artists, artists)
entity_keys = {
    'hindustani': ('release', 'works', 'raags', 'taals', 'forms', 'layas',
                   'album_artists', 'artists'),
    # in carnatic album level items are referred to as 'concert'
    'carnatic': ('concert', 'work', 'raaga', 'taala', 'form', 'laya',
                 'album_artists', 'artists'),
}

mapp = {}
for collection in collections:
    tradition_name = collection['name']
    if tradition_name in entity_keys:
        (concert, work, raga, tala, form, laya,
         lead_artists, artists) = entity_keys[tradition_name]
    # 'length' is carried in the same list as the entity keys so that it is
    # collected in the same pass; it has no entry in id_mapping.
    entities = [concert, work, raga, tala, form, laya, artists, lead_artists, 'length']
    # For every entity key: the field of a linked item that identifies it
    id_mapping = {
        concert: 'mbid',
        work: 'mbid',
        raga: 'uuid',
        tala: 'uuid',
        form: 'name',
        laya: 'uuid',
        lead_artists: 'mbid',
        artists: 'mbid',
    }
    mapp[tradition_name] = dict(entities=entities, id_mapping=id_mapping)

# Metadata related functions

In [4]:
def get_mbids_in_collection(collection_id, music_tradition, token):
    """
    Fetches the mbids of all recordings in a collection.

    Parameters
    ----------
    collection_id : id of the Dunya collection to query
    music_tradition : 'hindustani' or 'carnatic'; selects which Dunya client
        module is used for the query
    token : Dunya API token

    Returns
    -------
    list with the 'mbid' field of every recording returned by the API

    Raises
    ------
    ValueError : if music_tradition is neither 'hindustani' nor 'carnatic'
    """
    if music_tradition == 'hindustani':
        tradition = hi
    elif music_tradition == 'carnatic':
        tradition = ca
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # on `tradition` further down.
        raise ValueError(
            "Unknown music tradition %r (expected 'hindustani' or 'carnatic')"
            % (music_tradition,))

    dn.set_token(token)
    tradition.set_collections([collection_id])
    recs = tradition.get_recordings()
    return [r['mbid'] for r in recs]

def get_collection_stats(music_tradition, token, mbids):
    """
    Collects the identifiers of all entities linked to the given recordings.

    For every mbid in `mbids` the recording metadata is fetched, and for each
    entity key listed in mapp[music_tradition]['entities'] that is present in
    that metadata, the identifiers of the linked items are appended to a list.
    Identifiers are NOT de-duplicated here; callers take set() of each list to
    count unique entities.

    Parameters
    ----------
    music_tradition : 'hindustani' or 'carnatic'
    token : Dunya API token
    mbids : iterable of recording mbids to inspect

    Returns
    -------
    dict mapping each entity key to a list of identifiers (with repeats).
    The 'length' key instead maps to a list holding the 'length' value of each
    recording (presumably milliseconds -- TODO confirm against the Dunya API).

    Raises
    ------
    ValueError : if music_tradition is neither 'hindustani' nor 'carnatic'
    """
    if music_tradition == 'hindustani':
        tradition = hi
    elif music_tradition == 'carnatic':
        tradition = ca
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # on `tradition` further down.
        raise ValueError(
            "Unknown music tradition %r (expected 'hindustani' or 'carnatic')"
            % (music_tradition,))

    dn.set_token(token)
    entities = mapp[music_tradition]['entities']
    id_mapping = mapp[music_tradition]['id_mapping']
    stats = {e: [] for e in entities}

    # Iterate over the `mbids` argument. The previous version read the notebook
    # global `mbids_collection`, so the parameter was ignored and the function
    # only worked by relying on hidden kernel state.
    for mbid in mbids:
        rec_info = tradition.get_recording(mbid)
        for e in entities:
            if e not in rec_info:
                continue
            if e == 'length':
                # A plain value per recording, not a list of linked items
                stats[e].append(rec_info[e])
            elif e == 'artists':
                # Each item wraps the artist record under the 'artist' key
                stats[e].extend(x['artist'][id_mapping[e]] for x in rec_info[e])
            else:
                stats[e].extend(x[id_mapping[e]] for x in rec_info[e])
    return stats

# Get overall statistics about the Saraga datasets

* Number of recordings, related works, artists, etc.

In [5]:
# Gather the statistics of every collection, keyed by collection name
stats = {}
for collection in collections:
    # Fetch the recording ids first so the count can be printed before the
    # per-recording metadata requests start
    mbids_collection = get_mbids_in_collection(collection['id'], collection['name'], token)

    print('------------------------------------------------')
    print("Statistis for %s collection"%collection['name'])
    print('------------------------------------------------')
    print("Number of recordings: %d"%len(mbids_collection))

    collection_stats = get_collection_stats(collection['name'], token, mbids_collection)
    # Keep the number of recordings next to the entity lists
    collection_stats['num_mbids'] = len(mbids_collection)
    stats[collection['name']] = collection_stats

------------------------------------------------
Statistis for carnatic collection
------------------------------------------------
Number of recordings: 197
------------------------------------------------
Statistis for hindustani collection
------------------------------------------------
Number of recordings: 108


In [6]:
# Report, per collection, the number of unique entities of each type,
# the summed recording length and the number of recordings
for tradition_name, collected in stats.items():
    print('----------------------------------------')
    print("Stats for %s:"%tradition_name)
    for ent, nums in collected.items():
        if ent == 'length':
            # list of per-recording lengths: report their sum
            print("Total length of the recordings: %d"%np.sum(nums))
        elif ent == 'num_mbids':
            # already a plain count, not a list
            print("Total number of recordings %d"%nums)
        else:
            # identifiers contain repeats: count distinct values only
            unique_count = len(set(nums))
            print("Total number of unique %s are:%d"%(ent, unique_count))

----------------------------------------
Stats for carnatic:
Total number of unique concert are:19
Total number of unique work are:175
Total number of unique raaga are:96
Total number of unique taala are:10
Total number of unique form are:12
Total number of unique laya are:0
Total number of unique artists are:57
Total number of unique album_artists are:17
Total length of the recordings: 148046611
Total number of recordings 197
----------------------------------------
Stats for hindustani:
Total number of unique release are:36
Total number of unique works are:113
Total number of unique raags are:61
Total number of unique taals are:9
Total number of unique forms are:5
Total number of unique layas are:2
Total number of unique artists are:36
Total number of unique album_artists are:11
Total length of the recordings: 156929783
Total number of recordings 108
