# Objective

This script get number of entities in saraga dataset

In [2]:
import codecs
import compmusic
from compmusic import dunya as dn
from compmusic.dunya import hindustani as hi
from compmusic.dunya import carnatic as ca
from compmusic.dunya import docserver as ds
from compmusic import musicbrainz
import numpy as np
import pandas as pd
from compmusic.dunya import conn
import requests
import json

In [106]:
class SaragaStatistics():
    
    def __init__(self, token, collection=None):
        """
        Initializing the class...
        
        collection: str
            Name of the collection (carnatic, hindustani)
        """
        self.token = token
        self.config = self.__get_configs()
        
        if collection is not None:
            if collection not in self.collections_df.name.unique():
                raise IOError("Collection name not in repo, choose either 'carnatic' or 'hindustani'")
            else:
                self.collections_df = self.collections_df[self.collections_df.name==collection]
        
    
    def __get_configs(self):
        """
        There are many sort of mappings and collection of ids, its good to keep them in one place
        """
        ## Collection ids etc
        carnatic_cc = dict(id='a163c8f2-b75f-4655-86be-1504ea2944c2',
                  name='carnatic')
        hindustani_cc = dict(id='6adc54c6-6605-4e57-8230-b85f1de5be2b',
                  name='hindustani')
        collections = [carnatic_cc, hindustani_cc]
        self.collections_df = pd.DataFrame(collections)
        
        
        # Entity types in Carnatic and Hindustani have different names, mapping them to one variable for easy processing
        self.mapp={}
        for collection in collections:
            if collection['name']=='hindustani':
                concert = 'release'  
                work = 'works'
                raga = 'raags'
                tala = 'taals'
                form = 'forms'
                laya = 'layas'
                lead_artists = 'album_artists'
                artists = 'artists'
            elif collection['name']=='carnatic':
                concert = 'concert'  # in carnatic album level items are referred by 'concerts'
                work = 'work'
                raga = 'raaga'
                tala = 'taala'
                form = 'form'
                laya = 'laya'        
                lead_artists = 'album_artists'
                artists = 'artists'
            entities = [concert, work, raga, tala, form, laya, artists, lead_artists, 'length']
            id_mapping = {concert: 'mbid', work: 'mbid', raga: 'uuid', tala: 'uuid', form: 'name', laya: 'uuid', lead_artists: 'mbid', artists: 'mbid'}
            self.mapp[collection['name']] = dict(entities=entities,
                                     id_mapping=id_mapping)
            
            self.slug_info = pd.DataFrame([dict(thetype='mp3', subtype='source', file_type='mp3', name='audio'),
             dict(thetype='pitch', subtype='pitch', file_type='txt', name='pitch'),
            dict(thetype='pitch-vocal', subtype='source', file_type='txt', name='pitch_vocal'),
             dict(thetype='ctonic', subtype='tonic', file_type='txt', name='tonic'),
             dict(thetype='sama-manual', subtype='source', file_type='txt', name='sama'),
             dict(thetype='bpm-manual', subtype='source', file_type='txt', name='bpm'),
             dict(thetype='tempo-manual', subtype='source', file_type='txt', name='tempo'),
             dict(thetype='sections-manual-p', subtype='source', file_type='txt', name='sections'),
             dict(thetype='mphrases-manual', subtype='source', file_type='txt', name='phrases'),
            dict(thetype='multitrack-vocal', subtype='source', file_type='mp3', name='vocal'),
            dict(thetype='multitrack-vocal-s', subtype='source', file_type='mp3', name='vocal_s'),
            dict(thetype='multitrack-violin', subtype='source', file_type='mp3', name='violin'),
            dict(thetype='multitrack-ghatam', subtype='source', file_type='mp3', name='ghatam'),
            dict(thetype='multitrack-mridangam-left', subtype='source', file_type='mp3', name='mridangam_left'),
            dict(thetype='multitrack-mridangam-right', subtype='source', file_type='mp3', name='mridangam_right')                      
     ])
    
    def get_mbids_in_collection(self, collection_id, music_tradition, token):
        """
        fetches mbids in a collection
        """
        tradition = None
        if music_tradition == 'hindustani':
            tradition = hi
        elif music_tradition == 'carnatic':
            tradition = ca        

        if tradition: 
            dn.set_token(token)
            tradition.set_collections([collection_id])
            recs = tradition.get_recordings()
            return [r['mbid'] for r in recs]
        else:
            raise IOError("wrong music tradition string")

    def get_collection_stats(self, music_tradition, token,  mbids):
        """
        Fetches number of different "unique" entities link to all the recordings of a collection
        This function also returns total length of the recordings in a collection
        """

        if music_tradition == 'hindustani':
            tradition = hi
        elif music_tradition == 'carnatic':
            tradition = ca        

        dn.set_token(token)    
        entities = self.mapp[music_tradition]['entities']
        stats = dict(zip(entities, [[] for e in entities]))
        for mbid in mbids:
            rec_info = tradition.get_recording(mbid)
            for e in entities:
                if e in rec_info:
                    if e == 'artists':
                        temp = [x['artist'] for x in rec_info[e]]
                        stats[e].extend([x[self.mapp[music_tradition]['id_mapping'][e]] for x in temp])
                    elif e=='length':
                        stats[e].append(rec_info[e])
                    else:
                        stats[e].extend([x[self.mapp[music_tradition]['id_mapping'][e]] for x in rec_info[e]])
        return stats
    
    def get_metadata_stats(self):
        self.stats = {}
        for index, row in self.collections_df.iterrows():
            print()
            mbids_collection = self.get_mbids_in_collection(row['id'], row['name'], self.token)
#             print('------------------------------------------------')
#             print("Statistis for %s collection"%collection['name'])
#             print('------------------------------------------------')
#             print("Number of recordings: %d"%len(mbids_collection))
            self.stats[row['name']] = self.get_collection_stats(row['name'], self.token, mbids_collection)
            self.stats[row['name']].update(dict(num_mbids=len(mbids_collection)))
        
    def get_file_stats(self):
        output = []
        headers = {"Authorization": "Token %s" % self.token}
        for index, row in self.collections_df.iterrows():
            mbids = self.get_mbids_in_collection(row['id'], row['name'], self.token)
            for mbid in mbids:
                try:
                    path = "document/by-id/%s" % (mbid)        
                    url = conn._make_url(path)
                    out = requests.get(url, headers=headers)
                    content = json.loads(out.content)
                    mapp = dict(zip(content['sourcefiles'], np.ones(len(content['sourcefiles']))))
                    mapp.update(dict(mbid=mbid, collection=row['name']))
                except:
                    print("Issue with: %s, %s"%(row['id'], mbid))
                output.append(mapp)
        self.file_stats = pd.DataFrame(output)
    def print_pretty(self, type_='metadata_stats'):
        if type_=='metadata_stats':
            stats = self.stats
            for tradition, trad_items in stats.items():
                print('----------------------------------------')
                print("Stats for %s tradition:"%tradition)
                for ent, nums in trad_items.items():
                    if ent == 'num_mbids':
                        print("Total number of recordings %d"%nums)
                    elif ent != 'length':
                        print("Total number of unique %s are:%d"%(ent, len(set(nums))))
                    else:
                        print("Total length of the recordings: %0.2f hrs"%(np.sum(nums)/3600000.))

In [107]:
obj = SaragaStatistics('60312f59428916bb854adaa208f55eb35c3f2f07')

In [108]:
obj.get_file_stats()

Issue with: a163c8f2-b75f-4655-86be-1504ea2944c2, c79c3d8f-29a2-40cd-85f7-cc98da4b4532
Issue with: a163c8f2-b75f-4655-86be-1504ea2944c2, 14c18b73-88a7-4092-9317-13ff31254b35


In [110]:
obj.file_stats.sum()

tempo-manual                                                                208
sections-manual                                                             194
pitch-vocal                                                                  56
sections-manual-p                                                           194
bpm-manual                                                                  186
mphrases-manual                                                             170
sama-manual                                                                 216
mp3                                                                         359
multitrack-mridangam-right                                                  170
multitrack-vocal                                                            170
multitrack-mridangam-left                                                   170
multitrack-ghatam                                                            46
multitrack-violin                       

In [5]:
# token = "" #<get your api token form: https://dunya.compmusic.upf.edu/user/profile/>

# # Ids of CC collection for both Carnatic and Hindustani music tradition
# carnatic_cc = dict(id='a163c8f2-b75f-4655-86be-1504ea2944c2',
#                   name='carnatic')
# hindustani_cc = dict(id='6adc54c6-6605-4e57-8230-b85f1de5be2b',
#                   name='hindustani')
# collections = [carnatic_cc, hindustani_cc]
# collections_df = pd.DataFrame(collections)


In [3]:
# # Entity types in Carnatic and Hindustani have different names, mapping them to one variable for easy processing
# mapp={}
# for collection in collections:
#     if collection['name']=='hindustani':
#         concert = 'release'  
#         work = 'works'
#         raga = 'raags'
#         tala = 'taals'
#         form = 'forms'
#         laya = 'layas'
#         lead_artists = 'album_artists'
#         artists = 'artists'
#     elif collection['name']=='carnatic':
#         concert = 'concert'  # in carnatic album level items are referred by 'concerts'
#         work = 'work'
#         raga = 'raaga'
#         tala = 'taala'
#         form = 'form'
#         laya = 'laya'        
#         lead_artists = 'album_artists'
#         artists = 'artists'
#     entities = [concert, work, raga, tala, form, laya, artists, lead_artists, 'length']
#     id_mapping = {concert: 'mbid', work: 'mbid', raga: 'uuid', tala: 'uuid', form: 'name', laya: 'uuid', lead_artists: 'mbid', artists: 'mbid'}
#     mapp[collection['name']] = dict(entities=entities,
#                              id_mapping=id_mapping)

# Metadata related functions

In [12]:
# def get_mbids_in_collection(collection_id, music_tradition, token):
#     """
#     fetches mbids in a collection
#     """
#     if music_tradition == 'hindustani':
#         tradition = hi
#     elif music_tradition == 'carnatic':
#         tradition = ca        
    
#     dn.set_token(token)
#     tradition.set_collections([collection_id])
#     recs = tradition.get_recordings()
#     return [r['mbid'] for r in recs]

# def get_collection_stats(music_tradition, token,  mbids):
#     """
#     Fetches number of different "unique" entities link to all the recordings of a collection
#     This function also returns total length of the recordings in a collection
#     """
    
#     if music_tradition == 'hindustani':
#         tradition = hi
#     elif music_tradition == 'carnatic':
#         tradition = ca        
    
#     dn.set_token(token)    
#     entities = mapp[music_tradition]['entities']
#     stats = dict(zip(entities, [[] for e in entities]))
#     for mbid in mbids_collection:
#         rec_info = tradition.get_recording(mbid)
#         for e in entities:
#             if e in rec_info:
#                 if e == 'artists':
#                     temp = [x['artist'] for x in rec_info[e]]
#                     stats[e].extend([x[mapp[music_tradition]['id_mapping'][e]] for x in temp])
#                 elif e=='length':
#                     stats[e].append(rec_info[e])
#                 else:
#                     stats[e].extend([x[mapp[music_tradition]['id_mapping'][e]] for x in rec_info[e]])
#     return stats

# Get overall statistics about the Saraga datsets

* Number of recordings, related works, artists etc

In [13]:
# stats = {}
# for collection in collections:
#     mbids_collection = get_mbids_in_collection(collection['id'], collection['name'], token)
#     print('------------------------------------------------')
#     print("Statistis for %s collection"%collection['name'])
#     print('------------------------------------------------')
#     print("Number of recordings: %d"%len(mbids_collection))
#     stats[collection['name']] = get_collection_stats(collection['name'], token, mbids_collection)
#     stats[collection['name']].update(dict(num_mbids=len(mbids_collection)))

In [58]:
# # Printing all the stats 
# for key, items in stats.items():
#     print('----------------------------------------')
#     print("Stats for %s:"%key)
#     for ent, nums in items.items():
#         if ent == 'num_mbids':
#             print("Total number of recordings %d"%nums)
#         elif ent != 'length':
#             print("Total number of unique %s are:%d"%(ent, len(set(nums))))
#         else:
#             print("Total length of the recordings: %d"%np.sum(nums))

In [5]:
dn.set_token('')

In [6]:
mbids = ca.get_recordings(recording_detail=True)

In [7]:
mbids[0]

{'mbid': '88166f7e-a85d-4c7a-91ec-2f16831b7e79',
 'title': 'Maya Tita Swaroopini',
 'length': 1331000,
 'artists': [{'artist': {'mbid': '4676e6c8-2862-435b-a02c-4d1077101885',
    'name': 'M. D. Ramanathan'},
   'instrument': {'mbid': 'd92884b7-ee0c-46d5-96f3-918196ba8c5b',
    'name': 'Voice'},
   'lead': True,
   'attributes': 'lead vocals'}],
 'raaga': [{'uuid': '9cedca68-4a9d-4170-bec3-0d1db1ff730e',
   'name': 'Māyāmāḷavagauḷa',
   'common_name': 'mayamalava gaula'}],
 'taala': [{'uuid': '8c6c26db-e01a-4eef-ae0b-9f7e31a926e8',
   'name': 'Rūpaka',
   'common_name': 'rupaka'}],
 'form': [{'name': 'Kriti'}],
 'work': [{'mbid': '89f4de10-3245-4747-82c2-c85f2c9875f0',
   'title': 'Mayatita Svarupini'}],
 'concert': [{'mbid': '8fb848f1-caf8-4830-aba6-e2ecb1aa5696',
   'title': 'Classical Vocal'}],
 'album_artists': [{'mbid': '4676e6c8-2862-435b-a02c-4d1077101885',
   'name': 'M. D. Ramanathan'}]}