# 00-download_dataset_audio

Explanation XX

In [1]:
import configparser
import importlib
import logging
import os
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd

from compmusic import dunya
from tqdm.notebook import tqdm

## Init logger

In [2]:
importlib.reload(logging)  # fix jupyter logging: https://stackoverflow.com/a/21475297
logging.basicConfig(level=logging.INFO)

# create logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.info("Logger initiated...")

INFO:__main__:Logger initiated...


## Read relevant parts of the configuration

In [3]:
config_dir = "../config"

config = configparser.ConfigParser()
config_file = config.read(os.path.join(config_dir, 'config.ini'))
logger.info(f"Reading configuration from {config_file}")

annotation_github_file = config["dataset"]["annotation_file"]
secrets_file = os.path.join(config_dir, config["secrets"]["file"])
staging_dir = os.path.join("../", config["preprocessing"]["staging_dir"])

INFO:__main__:Reading configuration from ['../config/config.ini']


## Read secrets; never commit these to the repo!!!


In [4]:
if not os.path.exists(secrets_file):
    raise FileNotFoundError(f"{secrets_file} does not exist")

secrets_file = config.read(secrets_file)
logger.info(f"Reading secrets from {secrets_file}")

dunya_token = config["tokens"]["dunya"]

INFO:__main__:Reading secrets from ['../config/./secrets.ini']


## Read annotations from `otmm_makam_recognition_dataset`

We read the latest version as of April 2020, which was generated for the paper:

    Karakurt, A., Şentürk S., & Serra X. (2016). MORTY: A Toolbox for Mode Recognition and Tonic Identification. 3rd International Digital Libraries for Musicology Workshop. New York, USA

### Read from github

In [5]:
logger.info(f"Reading annotations from {annotation_github_file}")
annotations = pd.read_json(annotation_github_file)
annotations["mb_url"] = annotations["mbid"]
annotations["mbid"] = annotations["mbid"].str.split(pat = "/").apply(lambda a: a[-1])


INFO:__main__:Reading annotations from https://raw.githubusercontent.com/sertansenturk/otmm_makam_recognition_dataset/dlfm2016-fix1/annotations.json


### Validate annotations

In [6]:
num_recordings = 1000
num_recordings_per_makam = 50
num_makams = num_recordings / num_recordings_per_makam  # 20

assert len(annotations.mbid) == num_recordings, f"There are less than {num_recordings} recordings"
assert len(annotations.mbid.unique()) == num_recordings, "MusicBrainz ID (MBIDs) are not unique"

makam_counts = annotations.makam.value_counts()
assert len(makam_counts) == num_makams, "There are less than {num_makams} makams"
np.testing.assert_array_equal(makam_counts.unique(), [50])


## Download recordings from Dunya makam corpus


### Authenticate to dunya

In [7]:
dunya.set_token(dunya_token)


## Populate `dunya_uid`s

The MBIDs in `CompMusic makam music corpus` may be outdated, i.e. they may not be pointing to the master MBID. `otmm_makam_recognition_dataset` patches such recordings with an extra `dunya_uid` key. 

Below, we merge the `mbid` and `dunya_uid`'s to ensure we send the correct requests to Dunya API.

In [8]:
annotations.loc[annotations["dunya_uid"].isna(), "dunya_uid"] = \
    annotations.loc[annotations["dunya_uid"].isna(), "mbid"]

display(annotations)

Unnamed: 0,mbid,verified,tonic,makam,observations,dunya_uid,mb_url
0,00f1c6d9-c8ee-45e3-a06f-0882ebcb4e2f,False,256.0,Acemasiran,,00f1c6d9-c8ee-45e3-a06f-0882ebcb4e2f,http://musicbrainz.org/recording/00f1c6d9-c8ee...
1,168f7c75-84fb-4316-99d7-acabadd3b2e6,False,115.2,Acemasiran,,168f7c75-84fb-4316-99d7-acabadd3b2e6,http://musicbrainz.org/recording/168f7c75-84fb...
2,24f549dd-3fa4-4e9b-a356-778fbbfd5cad,False,232.5,Acemasiran,,24f549dd-3fa4-4e9b-a356-778fbbfd5cad,http://musicbrainz.org/recording/24f549dd-3fa4...
3,407bb0b4-f19b-42ab-8c0a-9f1263126951,False,233.5,Acemasiran,,407bb0b4-f19b-42ab-8c0a-9f1263126951,http://musicbrainz.org/recording/407bb0b4-f19b...
4,443819eb-6092-420c-bd86-d946a0ad6555,False,219.6,Acemasiran,,443819eb-6092-420c-bd86-d946a0ad6555,http://musicbrainz.org/recording/443819eb-6092...
...,...,...,...,...,...,...,...
995,3bf62832-d655-458c-af3e-11594f487162,False,146.0,Ussak,,3bf62832-d655-458c-af3e-11594f487162,http://musicbrainz.org/recording/3bf62832-d655...
996,632656b7-6a0f-476a-80cd-ced396bdb57c,False,179.0,Ussak,,632656b7-6a0f-476a-80cd-ced396bdb57c,http://musicbrainz.org/recording/632656b7-6a0f...
997,722df65d-6bb7-4280-8ae5-298c72602133,False,170.0,Ussak,,722df65d-6bb7-4280-8ae5-298c72602133,http://musicbrainz.org/recording/722df65d-6bb7...
998,b576d7fc-3238-4b97-9e19-89acac3cda9e,False,249.0,Ussak,,b576d7fc-3238-4b97-9e19-89acac3cda9e,http://musicbrainz.org/recording/b576d7fc-3238...


### Download and save to a temporary folder

Here, we do not work with tempfiles, because we may have to manually add some recordings if anything fails below. We create the temporary folder inside the `data` folder which is inside the shared Docker volumes, to easily move the audio around.

In [9]:
audio_tmp_folder = os.path.join(staging_dir, "mre_audio")
Path(audio_tmp_folder).mkdir(parents=True, exist_ok=True)

failed_mbids = dict()
for idx, anno in tqdm(annotations.iterrows(), total=num_recordings):
    tmp_file = os.path.join(audio_tmp_folder, f"{anno.mbid}.mp3")
    
    if not os.path.exists(tmp_file):
        try:
            mp3_content = dunya.docserver.get_mp3(anno.dunya_uid)
            with open(tmp_file, "wb") as f:
                f.write(mp3_content)
        except dunya.conn.HTTPError as e:
            if "404 Client Error: Not Found for url:" in str(e):
                logger.error("%d/%d: %s. Skipping..." % (idx, num_recordings, str(e)))
                failed_mbids[anno.mbid] = {
                    "type": "dunya.conn.HTTPError",
                    "reason": "404_url_not_found",
                    "message": str(e)
                }
            else:
                raise e


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




### Ensure the filenames and mbid's all match

If there are any missing or extra files, we stop here so we don't accidentally corrupt the data/pipeline.

In [10]:
onlyfiles = [os.path.splitext(f)[0]
             for f in os.listdir(audio_tmp_folder)
             if os.path.isfile(os.path.join(audio_tmp_folder, f))]
assert sorted(list(onlyfiles)) == sorted(list(annotations.mbid))

## Log recordings as mlflow artifacts

In [11]:
%pdb

mlflow.set_experiment('dataset_preparation')
with mlflow.start_run() as mlflow_run:
    run_id = mlflow_run.info.run_id
    sdfwef
#     mlflow.active_run()
#     mlflow.log_param("name", "test")
#     mlflow.log_param("waether", "sunny")
#     mlflow.log_param("mood", "great")
    
#     for i in range(10):
#         mlflow.log_metric("i", i)
#     mlflow.log_artifact("mlflow_example.ipynb")

Automatic pdb calling has been turned ON
INFO: 'dataset_preparation' does not exist. Creating a new experiment


NameError: name 'sdfwef' is not defined

> [0;32m<ipython-input-11-b5e30616cd49>[0m(6)[0;36m<module>[0;34m()[0m
[0;32m      4 [0;31m[0mmlflow[0m[0;34m.[0m[0mset_experiment[0m[0;34m([0m[0;34m'dataset_preparation'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      5 [0;31m[0;32mwith[0m [0mmlflow[0m[0;34m.[0m[0mstart_run[0m[0;34m([0m[0;34m)[0m [0;32mas[0m [0mmlflow_run[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 6 [0;31m    [0msdfwef[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      7 [0;31m[0;31m#     mlflow.active_run()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m[0;31m#     mlflow.log_param("name", "test")[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  mlflow_run


<ActiveRun: >


ipdb>  mlflow_run.info


<RunInfo: artifact_uri='/data/artifacts/2/d8041d43ada9435486781364be55668f/artifacts', end_time=None, experiment_id='2', lifecycle_stage='active', run_id='d8041d43ada9435486781364be55668f', run_uuid='d8041d43ada9435486781364be55668f', start_time=1586904379827, status='RUNNING', user_id='sertansenturk'>


ipdb>  q


### Remove staging folder