# 01-download_dataset_audio

This notebook downloads the audio recordings listed in `annotations.json` from the [Dunya makam music corpus](https://dunya.compmusic.upf.edu/). The audio recordings are saved as artifacts in an `mlflow` run called `download_audio` under an experiment named `data_processing`.

In [1]:
import configparser
import importlib
import logging
import os
import subprocess
import tempfile

import mlflow
import pandas as pd

from compmusic import dunya
from tqdm.notebook import tqdm


## Stop if the audio was already fetched in a previous run


In [None]:
# Names of the mlflow experiment and of the run this notebook creates.
experiment_name = "data_processing"
run_name = "download_audio"

# Refuse to continue when a run with the same name has already been logged,
# so that previously downloaded audio is never overwritten silently.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is not None:
    run_name_filter = f"tags.mlflow.runName = '{run_name}'"
    annotation_runs = mlflow.search_runs(
        experiment_ids=experiment.experiment_id,
        filter_string=run_name_filter,
    )

    # shape[0] is the number of rows, i.e. the number of matching runs
    assert annotation_runs.shape[0] == 0, (
        f"There is already a run for {run_name}:{', '.join(annotation_runs.run_id)}. "
        "Overwriting is not permitted. Please inspect the run in the mlflow UI "
        "and manually make the necessary corrections.")


## Init logger

In [2]:
# Reload the logging module before configuring it; this is the workaround
# for jupyter logging described in https://stackoverflow.com/a/21475297
importlib.reload(logging)

log_level = logging.INFO
logging.basicConfig(level=log_level)

# Logger shared by the remaining cells of the notebook
logger = logging.getLogger(__name__)
logger.setLevel(log_level)

logger.info("Logger initiated...")

INFO:__main__:Logger initiated...


## Read relevant parts of the configuration

In [3]:
config_dir = "../config"
config_path = os.path.join(config_dir, 'config.ini')

# ConfigParser.read() silently ignores files it cannot open and returns an
# empty list, which would only surface later as an opaque KeyError on
# config["secrets"]. Fail early with a clear message instead.
if not os.path.exists(config_path):
    raise FileNotFoundError(f"{config_path} does not exist")

config = configparser.ConfigParser()
# config_file holds the list of files that were successfully parsed
config_file = config.read(config_path)
logger.info(f"Reading configuration from {config_file}")

# Path of the secrets file, given in the config relative to the config directory
secrets_file = os.path.join(config_dir, config["secrets"]["file"])


INFO:__main__:Reading configuration from ['../config/config.ini']


## Read secrets; never commit these to the repo!!!


In [4]:
if not os.path.exists(secrets_file):
    raise FileNotFoundError(f"{secrets_file} does not exist")

# Merge the secrets into the already loaded configuration. The list returned
# by config.read() is stored under its own name so that `secrets_file` stays
# a path string and this cell can be re-run without os.path.exists() failing
# on a list.
secrets_files_read = config.read(secrets_file)
logger.info(f"Reading secrets from {secrets_files_read}")

# API token used to authenticate against Dunya
dunya_token = config["tokens"]["dunya"]

INFO:__main__:Reading secrets from ['../config/./secrets.ini']


## Read annotations from mlflow


In [33]:
experiment_name = "data_processing"
annotation_run_name = "download_annotations"
annotation_filename = "annotations.json"

# The annotations are produced by an earlier run; without the experiment
# there is nothing to read, so stop with an explicit message instead of an
# AttributeError on `None.experiment_id`.
experiment = mlflow.get_experiment_by_name(experiment_name)
assert experiment is not None, (
    f"There is no experiment named {experiment_name}. "
    "Please run ./notebooks/00-download_dataset_annotations.ipynb first.")

annotation_run = mlflow.search_runs(
    experiment_ids=experiment.experiment_id,
    filter_string=f"tags.mlflow.runName = '{annotation_run_name}'")

# Exactly one annotation run is expected: zero means the annotations were
# never downloaded, several mean it is ambiguous which one to use.
assert len(annotation_run) > 0, (
    f"There is no run named {annotation_run_name}. "
    "Please run ./notebooks/00-download_dataset_annotations.ipynb first.")
assert len(annotation_run) == 1, (
    f"There are more than one runs for {annotation_run_name}:{', '.join(annotation_run.run_id)}. "
    "Please inspect the run in the mlflow UI and manually make the necessary corrections.")

display(annotation_run)

# artifact_uri shows the absolute path in the mlflow container
# change it to the relative path in the jupyter container
# note: os.path.join would not work because os.path.join("..", abs_path) => abs_path
artifact_uri = ".." + annotation_run.iloc[0].artifact_uri

annotation_file = os.path.join(artifact_uri, annotation_filename)
annotations = pd.read_json(annotation_file, orient="records")
display(annotations.head())

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,tags.dataset_annotation_file,tags.mlflow.source.git.commit,tags.dataset_git_tag,tags.mlflow.user,tags.dataset_num_recordings,tags.dataset_num_recordings_per_makam,tags.mlflow.source.type,tags.mlflow.source.git.repoURL,tags.dataset_url,tags.mlflow.runName,tags.mlflow.source.name,tags.dataset_name
0,4c5d3a2ff4d249778460b9a4d97b6392,1,FINISHED,/data/artifacts/1/4c5d3a2ff4d249778460b9a4d97b...,2020-06-07 17:31:58.301000+00:00,2020-06-07 17:31:58.405000+00:00,https://raw.githubusercontent.com/sertansentur...,d086c8c17e1532dbf3e0e98ae19ac09677a21c81,dlfm2016-fix1,sertansenturk,1000,50,NOTEBOOK,git@github.com:sertansenturk/makam_recognition...,https://raw.githubusercontent.com/sertansentur...,download_annotations,./notebooks/00-download_dataset_annotations.ipynb,otmm_makam_recognition_dataset


Unnamed: 0,mbid,verified,tonic,makam,observations,dunya_uid,mb_url
0,00f1c6d9-c8ee-45e3-a06f-0882ebcb4e2f,False,256.0,Acemasiran,,00f1c6d9-c8ee-45e3-a06f-0882ebcb4e2f,http://musicbrainz.org/recording/00f1c6d9-c8ee...
1,168f7c75-84fb-4316-99d7-acabadd3b2e6,False,115.2,Acemasiran,,168f7c75-84fb-4316-99d7-acabadd3b2e6,http://musicbrainz.org/recording/168f7c75-84fb...
2,24f549dd-3fa4-4e9b-a356-778fbbfd5cad,False,232.5,Acemasiran,,24f549dd-3fa4-4e9b-a356-778fbbfd5cad,http://musicbrainz.org/recording/24f549dd-3fa4...
3,407bb0b4-f19b-42ab-8c0a-9f1263126951,False,233.5,Acemasiran,,407bb0b4-f19b-42ab-8c0a-9f1263126951,http://musicbrainz.org/recording/407bb0b4-f19b...
4,443819eb-6092-420c-bd86-d946a0ad6555,False,219.6,Acemasiran,,443819eb-6092-420c-bd86-d946a0ad6555,http://musicbrainz.org/recording/443819eb-6092...


## Download recordings from Dunya makam corpus


### Authenticate to dunya

In [34]:
# Hand the token loaded from the secrets configuration to the dunya client.
# NOTE(review): presumably required before the docserver requests made in the
# download cell -- confirm against the compmusic documentation.
dunya.set_token(dunya_token)


### Download audio and save to mlflow as artifacts

In [36]:
mlflow.set_experiment(experiment_name)
with mlflow.start_run(run_name=run_name) as mlflow_run:
    run_id = mlflow_run.info.run_id

    # Record the provenance of this run. subprocess is used instead of the
    # IPython `!` shell magic so that the cell is plain Python and raises
    # CalledProcessError (rather than an IndexError on an empty result) when
    # the notebook is not executed from a git checkout with a remote.
    git_repo_url = subprocess.check_output(
        ["git", "config", "--get", "remote.origin.url"]).decode("utf-8").strip()
    git_commit = subprocess.check_output(
        ["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
    notebook_name = "./notebooks/01-download_dataset_audio.ipynb"
    mlflow.set_tags({
        "mlflow.source.type": "NOTEBOOK",
        "mlflow.source.name": notebook_name,
        "mlflow.source.git.commit": git_commit,
        "mlflow.source.git.repoURL": git_repo_url,
        "mlflow.audio_source": "https://dunya.compmusic.upf.edu"
    })

    # Download everything into a temporary directory and log the directory
    # content as artifacts in one go; the directory is removed afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # mbid -> description of why the recording could not be downloaded
        failed_mbids = dict()
        num_recordings = len(annotations)
        for idx, anno in tqdm(annotations.iterrows(), total=num_recordings):
            tmp_file = os.path.join(tmp_dir, f"{anno.mbid}.mp3")

            try:
                mp3_content = dunya.docserver.get_mp3(anno.dunya_uid)
            except dunya.conn.HTTPError as e:
                # Only a missing recording is tolerated; any other HTTP
                # error aborts the run.
                if "404 Client Error: Not Found for url:" in str(e):
                    logger.error(
                        "%d/%d: %s. Skipping...", idx, num_recordings, e)
                    failed_mbids[anno.mbid] = {
                        "type": "dunya.conn.HTTPError",
                        "reason": "404_url_not_found",
                        "message": str(e)
                    }
                else:
                    # bare raise keeps the original traceback intact
                    raise
            else:
                with open(tmp_file, "wb") as f:
                    f.write(mp3_content)

        # Report the recordings which were skipped; previously this
        # information was collected but never surfaced.
        if failed_mbids:
            logger.warning(
                "%d/%d recordings could not be downloaded: %s",
                len(failed_mbids), num_recordings, ", ".join(failed_mbids))
        mlflow.set_tag("num_failed_recordings", len(failed_mbids))

        mlflow.log_artifacts(tmp_dir)


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


