# DALI Sandbox

Exercises pulling metadata out of DALI, version 1 and version 2.  Below I pull out the lyrics for Taylor Swift and Muse and save the transcripts in a local directory as `transcripts/artist_name/song_id.txt`.

In [99]:
import os
import json
import sys

import tqdm
import numpy as np
import pandas as pd

import DALI as dali_code
from DALI import utilities

In [100]:
# Load the DALI annotations for the selected dataset version.
dali_version = 1  # switch to 2 to load the version-2 metadata instead

# The metadata folder is looked up under the current working directory.
dali_data_path = "{}/metadata_v{}".format(os.getcwd(), dali_version)
dali_data = dali_code.get_the_DALI_dataset(dali_data_path, skip=[], keep=[])

In [106]:
# One known song id per DALI version, used as the sample entry for this notebook.
sample_ids = {
    1: '0091064bdc72469ca7096d3a0db74562',
    2: 'fff64f1669c34253928802a9ed246d2f',
}

if dali_version == 1:
    # the dataset info file is only loaded for version 1
    dali_info = dali_code.get_info(dali_data_path + '/info/DALI_DATA_INFO.gz')

if dali_version in sample_ids:
    entry = dali_data[sample_ids[dali_version]]
    # pretty-print the entry's info dictionary
    print(json.dumps(entry.info,indent=2))

{
  "dataset_version": 1.0,
  "ground-truth": false,
  "artist": "Five For Fighting",
  "title": "100 Years",
  "scores": {
    "NCC": 0.8314642198566967,
    "manual": 0.0
  },
  "audio": {
    "url": "fJN9YIOl1xE",
    "path": "None",
    "working": true
  },
  "id": "0091064bdc72469ca7096d3a0db74562",
  "metadata": {
    "album": "The Battle for Everything",
    "release_date": "2004",
    "cover": "https://e-cdns-images.dzcdn.net/images/cover/a9da82ee07fc50da6052e27e9c97e6cd/1000x1000-000000-80-0-0.jpg",
    "genres": [
      "Rock"
    ],
    "language": "english"
  }
}


# Parse the lyrics from a single song

In [107]:
# Show a few individual fields of the entry's info dictionary, one output each.
for info_key in ('metadata', 'audio', 'artist', 'ground-truth'):
    display(entry.info[info_key])

{'album': 'The Battle for Everything',
 'release_date': '2004',
 'cover': 'https://e-cdns-images.dzcdn.net/images/cover/a9da82ee07fc50da6052e27e9c97e6cd/1000x1000-000000-80-0-0.jpg',
 'genres': ['Rock'],
 'language': 'english'}

{'url': 'fJN9YIOl1xE', 'path': 'None', 'working': True}

'Five For Fighting'

False

In [108]:
# Gather the text of every annotated line and show the song as one string.
lyrics = [line['text'] for line in entry.annotations['annot']['lines'] if 'text' in line]
' '.join(lyrics)

"i'm fifteen for a moment caught in between ten and twenty and i'm just dreaming counting the ways to where you are i'm twenty two for a moment and she feels better than ever and we're on fire making our way back from mars fifteen there's still time for you time to buy and time to lose fifteen, there's never a wish better than this when you only got a hundred years to live i'm thirty three for a moment but still the man, but you see i'm of age kid on the way babe a family on my mind i'm forty five for a moment the sea is high and i'm heading into a crisis chasing the years of my life fifteen there's still time for you time to buy and time to lose yourself within a morning star fif teen i'm all right with you fifteen, nev-er a wish better than-this when you onlygot a hundredyears to live half timegoes by suddenly you're wise- another blink of aneye sixty seven isgone the sun isget ting high we're moving on i'mnine ty nine for a moment timefor just anothermo ment and i'm just dreaming co

In [109]:
# Same as above at paragraph level; paragraphs are separated by a blank line.
lyrics2 = [para['text'] for para in entry.annotations['annot']['paragraphs'] if 'text' in para]
'\n\n'.join(lyrics2)

"i'm fifteen for a moment caught in between ten and twenty and i'm just dreaming counting the ways to where you are\n\ni'm twenty two for a moment and she feels better than ever and we're on fire making our way back from mars\n\nfifteen there's still time for you time to buy and time to lose fifteen, there's never a wish better than this when you only got a hundred years to live\n\ni'm thirty three for a moment but still the man, but you see i'm of age kid on the way babe a family on my mind\n\ni'm forty five for a moment the sea is high and i'm heading into a crisis chasing the years of my life\n\nfifteen there's still time for you time to buy and time to lose yourself within a morning star\n\nfif teen i'm all right with you fifteen, nev-er a wish better than-this when you onlygot a hundredyears to live\n\nhalf timegoes by suddenly you're wise- another blink of aneye sixty seven isgone the sun isget ting high we're moving on\n\ni'mnine ty nine for a moment timefor just anothermo ment 

# Find short utterances that would fit with wav2vec2 analysis.

In [160]:
# finds utterances...
def find_utterances(dali_entry, min_duration=1.0, max_duration=28.0):
    """
    find all utterances (annotated lines) in a single DALI "entry" or song.
    return the metadata as a list of dictionaries, one per kept utterance.

    note: only keep utterances that are strictly longer than min_duration
          AND strictly shorter than max_duration (1.0s and 28.0s by default).

    keys of each returned dictionary:
    file_id: the entry's id (dali_entry.info['id']); uuid that aligns with
             the audio song base filename
    utt_id: index of the line within dali_entry.annotations['annot']['lines'].
            lines that were filtered out leave gaps, so utt_id is NOT the
            position of the utterance in the returned list.
    start_time: start time in seconds of utterance
    stop_time: stop time in seconds of utterance
    duration: duration of utterance in seconds (stop_time - start_time)
    text: transcript of snippet

    dali_entry: structure for accessing metadata for dali songs.
    min_duration: exclusive lower bound on the utterance duration.
    max_duration: exclusive upper bound on the utterance duration.
    """
    # Read the id from the entry that was passed in, not from a notebook global.
    file_id = dali_entry.info['id']

    utterances = []
    for line_idx, line in enumerate(dali_entry.annotations['annot']['lines']):
        # lines without timing information cannot be cut out of the audio
        if 'time' not in line:
            continue

        start_time, stop_time = line['time'][0], line['time'][1]
        duration = stop_time - start_time

        # only keep utterances that fit the criteria.
        if min_duration < duration < max_duration:
            utterances.append({
                'file_id': file_id,
                'utt_id': line_idx,
                'start_time': start_time,
                'stop_time': stop_time,
                'duration': duration,
                'text': line['text'],
            })
    return utterances

# Run the filter on the sample entry and report how many lines survive it.
utt_array = find_utterances(entry)
n_kept = len(utt_array)
n_lines = len(entry.annotations['annot']['lines'])
print(f"{n_kept} lines fit criteria out of {n_lines}")

47 lines fit criteria out of 47


# Creating Single Song Utterances from DALI

In [154]:
from IPython.display import Audio
import torchaudio

# Play one utterance (the second entry of utt_array) straight from the full-song audio.
song_utt = utt_array[1]
# this finds the file, parses the file, then plays the snippet.
filename = f"dali_datasets/test/{entry.info['id']}.wav"
if os.path.isfile(filename):
    wav, sr = torchaudio.load(filename)

    # convert the utterance boundaries from seconds to sample indices
    start_samp = int(song_utt['start_time'] * sr)
    stop_samp = int(song_utt['stop_time'] * sr)
    print(song_utt['text'])
    # display() is needed because the player is no longer the cell's last expression;
    # index 0 selects the first row of the loaded tensor.
    display(Audio(wav[0,start_samp:stop_samp],rate=sr))
else:
    # Nothing to play: stop here rather than failing (or silently reusing a
    # stale wav/sr from an earlier run) in the slicing code.
    print(f"File: {filename} does not exist.")

caught in between ten and twenty and i'm


In [213]:
# Automation of the cell above for a single entry...

def create_file_name(fileid,linenum):
    """Return the wav file name of one utterance: "<fileid>_<linenum>.wav"."""
    return "{}_{}.wav".format(fileid, linenum)

def create_metadata_file(utt_array, destination_dir="./",create_csv=False):
    """
    create_metadata_file - build the metadata table for a HuggingFace DataSet
    from an array of utterances.

    utt_array - list of dictionary objects, each one generated by find_utterances.
    destination_dir - directory that receives metadata.csv (only used when create_csv is True).
    create_csv - when True, also write the table to <destination_dir>/metadata.csv.

    returns the dataframe (columns file_name and transcription) used to create the csv.
    """
    # transcript column: the utterance text, upper-cased
    transcripts = []
    for song_utt in utt_array:
        transcripts.append(song_utt['text'].upper())

    # file name column: must match the names of the saved utterance wav files
    file_names = []
    for song_utt in utt_array:
        file_names.append(create_file_name(song_utt['file_id'], song_utt['utt_id']))

    metadata_df = pd.DataFrame({'file_name':file_names, 'transcription':transcripts})

    if not create_csv:
        return metadata_df

    # optionally persist the table next to the audio
    print("create_metadata_file: Saving to CSV...")
    metadata_df.to_csv(f"{destination_dir}/metadata.csv",index=False)
    return metadata_df

def create_utt_files(utt_array, audio_dir, destination_dir=None):
    """
    create_utt_files - cut one wav file per utterance out of the full-song audio.

    utt_array - list of utterance dictionaries as generated by find_utterances
                (keys used here: file_id, utt_id, start_time, stop_time).
    audio_dir - directory holding the source audio, one file per song
                named <file_id>.wav
    destination_dir - directory the utterance files are written to
                      (created if it does not exist yet).
                      if None, then use audio_dir.

    returns True when every utterance was written (also when utt_array is
    empty, since there is nothing to write), False as soon as a source
    audio file is not found in audio_dir.
    """
    if destination_dir is None:
        destination_dir = audio_dir

    if not utt_array:
        # nothing to cut; also avoids indexing utt_array[0] below
        return True

    print(f"Saving Files from: {utt_array[0]['file_id']}")

    # The source audio is loaded once per song instead of once per utterance.
    loaded_file_id = None
    wav = None
    sr = None

    # iterate through all utterances
    for song_utt in utt_array:
        # The source file is identified by the utterance itself, not by a
        # notebook-level variable, so the function works for any entry.
        file_id = song_utt['file_id']

        if file_id != loaded_file_id:
            filename = f"{audio_dir}/{file_id}.wav"
            if not os.path.isfile(filename):
                # this is ok because we are trying to make utterance files
                # from a metadata file where some files may or may not be
                # downloaded.  so the user is warned, and we give up and
                # let the user figure out how to get that file.
                print(f"File: {filename} does not exist....moving on.")
                return False
            wav, sr = torchaudio.load(filename)
            loaded_file_id = file_id
            # make sure there is somewhere to write to before the first save
            os.makedirs(destination_dir, exist_ok=True)

        # convert the utterance boundaries from seconds to sample indices
        start_samp = int(song_utt['start_time'] * sr)
        stop_samp = int(song_utt['stop_time'] * sr)

        # crop row 0 of the loaded audio and reshape it to (1, n_samples) for saving
        newwav = wav[0,start_samp:stop_samp].reshape((1,-1))
        newwav_filename = create_file_name(file_id, song_utt['utt_id'])

        # saving utterance...
        torchaudio.save(f"{destination_dir}/{newwav_filename}",newwav,sample_rate=sr)

    return True

def create_utt_corpus_from_entry(dali_entry, audio_dir, destination_dir=None):
    """
    Build the utterance corpus for one DALI song: cut the song's audio into
    one wav file per utterance and assemble the matching metadata table, to
    support importing the song into a Huggingface Dataset object.

    dali_entry - DALI entry (song) whose annotated lines define the utterances.
    audio_dir - directory that is searched for the full-song source audio.
    destination_dir - location where the utterance audio will be stored.
                      if None, then use audio_dir.

    returns the metadata dataframe built by create_metadata_file, or False
    when create_utt_files reports that the source audio was not found.
    """
    # Step 1: list the utterances (file id, timing, transcript) from the DALI labels.
    utterances = find_utterances(dali_entry)

    # Step 2: write one wav file per utterance; give up on this song when the
    # source audio is not available in audio_dir.
    if not create_utt_files(utterances, audio_dir, destination_dir):
        return False

    # Step 3: build the metadata table for the files that were just written.
    return create_metadata_file(utterances, destination_dir)

In [215]:
# Build the utterance corpus for the currently selected song (`entry`).
audio_dir = "dali_datasets/test"            # where the full-song wav files are looked up
destination_dir = "dali_datasets/newtest"   # where the utterance files are written

md_df = create_utt_corpus_from_entry(
    dali_entry=entry,
    audio_dir=audio_dir,
    destination_dir=destination_dir,
)

Saving Files from: b1b6bc336f78441b8b31da555ccf59d8
File: dali_datasets/test/b1b6bc336f78441b8b31da555ccf59d8.wav does not exist....moving on.


# Find all Entries in DALI Test Metadata

In [216]:
# Load the DALI test-set listing and pull out the song ids it references.
dali_test_metadata_df = pd.read_csv("DALI_TestSet4ALT.csv")
dali_ids = dali_test_metadata_df['DALI_ID'].tolist()

In [218]:
audio_dir = "dali_datasets/test"
destination_dir = "dali_datasets/newtest"

# the utterance files are written here, so make sure the directory exists
os.makedirs(destination_dir, exist_ok=True)

# Collect the per-song metadata tables and concatenate them once after the
# loop: concatenating inside the loop copies the growing table on every
# iteration and starts from an empty frame.
song_metadata_frames = []

for i, dali_id in enumerate(dali_ids):
    entry = dali_data[dali_id]
    df_temp = create_utt_corpus_from_entry(entry, audio_dir, destination_dir)

    # create_utt_corpus_from_entry returns False when the song's audio is missing
    if isinstance(df_temp,pd.DataFrame):
        print(f"{i}: creating dataframe for {entry.info['id']}")
        song_metadata_frames.append(df_temp)
    else:
        print(f"Skip: {entry.info['id']}")

if song_metadata_frames:
    final_metadata_df = pd.concat(song_metadata_frames, ignore_index=True, sort=False)
else:
    # no song could be processed: keep the expected (empty) table layout
    final_metadata_df = pd.DataFrame({"file_name":[],"transcription":[]})



Saving Files from: 44a2455abc0e4fb397a396d2cd1ebeb9
0: creating dataframe for 44a2455abc0e4fb397a396d2cd1ebeb9
Saving Files from: 15d6e9e88ced41dfbff38ba2f3e1d885
1: creating dataframe for 15d6e9e88ced41dfbff38ba2f3e1d885
Saving Files from: a59e44a4c910443a87f068b177200fdc
2: creating dataframe for a59e44a4c910443a87f068b177200fdc
Saving Files from: ae91bcda73944695b7756ddc066c3e02
3: creating dataframe for ae91bcda73944695b7756ddc066c3e02
Saving Files from: d6e3cf403653490f8366bf77cbc0f186
4: creating dataframe for d6e3cf403653490f8366bf77cbc0f186
Saving Files from: 7a1642003f574713a6e25e5ee549fce6
5: creating dataframe for 7a1642003f574713a6e25e5ee549fce6
Saving Files from: 8eb15ad6d17f41b68009fe3848930dee
6: creating dataframe for 8eb15ad6d17f41b68009fe3848930dee
Saving Files from: b63d71b7ec6b4c5e9f53a87f83fcd73e
7: creating dataframe for b63d71b7ec6b4c5e9f53a87f83fcd73e
Saving Files from: ac0279efad294261b4c7dc0b86eaaec5
8: creating dataframe for ac0279efad294261b4c7dc0b86eaaec5
S

In [234]:
#
#  Saving Metadata file...
#
# final_metadata_df is the combined table built in the loop over the test-set
# songs; the name metadata_df only exists inside create_metadata_file.
final_metadata_df.to_csv(f"{destination_dir}/metadata.csv",index=False)

# Sampling Utterance Files

In [261]:
# Pick a random test-set song, then load and play one of its saved utterance files.
dali_id = dali_test_metadata_df['DALI_ID'].sample(1).iloc[0]
entry = dali_data[dali_id]
utt_array = find_utterances(entry)
ID = 23  # position within utt_array of the utterance to play
if ID >= len(utt_array):
    # the randomly drawn song may have fewer utterances than the requested position
    print(f"Song {dali_id} has only {len(utt_array)} utterances, so utterance {ID} is not available.")
else:
    song_utt = utt_array[ID]
    # Utterance files are named after utt_id (the line index in the DALI
    # annotations), which differs from the position in utt_array as soon as
    # some lines were filtered out.
    filename = f"{destination_dir}/{create_file_name(entry.info['id'], song_utt['utt_id'])}"
    if os.path.isfile(filename):
        wav, sr = torchaudio.load(filename)
        print(song_utt['text'])
        # display() is needed because the player is no longer the cell's last expression
        display(Audio(wav,rate=sr))
    else:
        print(f"File: {filename} does not exist.")

and stare


# Total Duration of DALI Test (as utterances)

In [231]:
# Add up the duration of every kept utterance across all test-set songs.
total_song_duration = 0
for dali_id in dali_ids:
    entry = dali_data[dali_id]
    utt_array = find_utterances(entry)
    song_durations = np.array([song_utt['duration'] for song_utt in utt_array])
    total_song_duration += np.sum(song_durations)
print(f"Total Duration of Entry: {total_song_duration:.1f}s or {total_song_duration/60:.1f}min or {total_song_duration/3600:.2f}hr")

Total Duration of Entry: 32696.4s or 544.9min or 9.08hr


# Looking for all artists

In [65]:
# expected number of songs by version...
dali_version1_nsongs = 5358
dali_version2_nsongs = 7756

# one artist name per song (duplicates kept so songs per artist can be counted later)
artists = []
for entry in tqdm.tqdm(dali_data.values(), desc="Processing"):
    artists.append(entry.info['artist'])

n_unique_artists = len(set(artists))
print(f"Found {n_unique_artists} artists")

Processing: 100%|███████████████████████████████████████████████████████████████████████| 5358/5358 [00:00<00:00, 128482.95it/s]

Found 2274 artists





In [66]:
# Top 20 Artists, ranked by number of songs
ds = pd.Series(artists)
ds.value_counts().head(20)

Demi Lovato         43
Muse                39
Taylor Swift        29
Volbeat             28
Linkin Park         28
ABBA                27
Die Ärzte           27
Green Day           27
Take That           24
Elvis Presley       24
Glee Cast           24
Mumford & Sons      23
Avril Lavigne       23
Queen               23
The Beatles         22
System Of A Down    22
Metallica           21
Madonna             21
Three Days Grace    21
Blink-182           21
Name: count, dtype: int64

## Get All Taylor Lyrics in DALI and Save to Disk

In [68]:
# Root directory for all saved transcripts: <current working directory>/transcripts/
SAVE_DIR = "{}/transcripts/".format(os.getcwd())
save_dir_missing = not os.path.exists(SAVE_DIR)
if save_dir_missing:
    os.mkdir(SAVE_DIR)

In [69]:
# SAVE_DIR already ends with a separator; os.path.join avoids the doubled slash.
ARTIST_DIR = os.path.join(SAVE_DIR, "taylor_swift")
os.makedirs(ARTIST_DIR, exist_ok=True)

artist_count = 0
for song_id, entry in tqdm.tqdm(dali_data.items(), desc="Processing"):
    if entry.info['artist'].lower() == "taylor swift":
        artist_count += 1
        # one transcript per song: paragraph texts separated by a blank line
        paras = [para['text'] for para in entry.annotations['annot']['paragraphs'] if 'text' in para]
        lyrics = '\n\n'.join(paras)
        # 'w' (not 'a'): re-running the cell must overwrite the transcript
        # instead of appending a second copy of the lyrics to the same file.
        with open(os.path.join(ARTIST_DIR, f"{song_id}.txt"), 'w', encoding='utf-8') as file:
            file.write(lyrics)
print(f"Found {artist_count} Taylor Swift Songs")

Processing: 100%|███████████████████████████████████████████████████████████████████████| 5358/5358 [00:00<00:00, 276816.62it/s]

Found 29 Taylor Swift Songs





## Get All Muse Lyrics in DALI and Save to Disk

In [70]:
# SAVE_DIR already ends with a separator; os.path.join avoids the doubled slash.
ARTIST_DIR = os.path.join(SAVE_DIR, "muse")
os.makedirs(ARTIST_DIR, exist_ok=True)

artist_count = 0
for song_id, entry in tqdm.tqdm(dali_data.items(), desc="Processing"):
    if entry.info['artist'].lower() == "muse":
        artist_count += 1
        # one transcript per song: paragraph texts separated by a blank line
        paras = [para['text'] for para in entry.annotations['annot']['paragraphs'] if 'text' in para]
        lyrics = '\n\n'.join(paras)
        # 'w' (not 'a'): re-running the cell must overwrite the transcript
        # instead of appending a second copy of the lyrics to the same file.
        with open(os.path.join(ARTIST_DIR, f"{song_id}.txt"), 'w', encoding='utf-8') as file:
            file.write(lyrics)
print(f"Found {artist_count} Muse Songs")

Processing: 100%|███████████████████████████████████████████████████████████████████████| 5358/5358 [00:00<00:00, 248376.22it/s]

Found 39 Muse Songs





# Clean up Transcripts from Directories