# Generate DALI Test Utterances

In [9]:
import os
import json
import sys
import re

import tqdm
import numpy as np
import pandas as pd

from IPython.display import Audio
import torchaudio

import DALI as dali_code
from DALI import utilities

In [2]:
# Load the DALI annotation metadata for the selected dataset version.
dali_version = 1  # alternative value: 2

# The metadata folder is looked up inside the current working directory.
dali_data_path = f"{os.getcwd()}/metadata_v{dali_version}"
dali_data = dali_code.get_the_DALI_dataset(dali_data_path, skip=[], keep=[])

In [26]:
# Source directory: full-length songs, one wav per DALI id.
audio_dir = "dali_datasets/test_fullsongs" # used for pulling in full songs from DALI Test
# Target directory: per-utterance wav clips (and metadata.csv) are written here.
destination_dir = "dali_datasets/test"  # used for placing utterances from full songs 

# Find short utterances that would fit with wav2vec2 analysis.

In [3]:
# finds utterances...
def find_utterances(dali_entry):
    """
    find all usable utterances (lyric lines) in a single DALI "entry" or song.
    save and return the metadata as a list of dictionaries.

    note: only utterances strictly longer than 1.0s AND strictly shorter
          than 28 seconds are kept.  if a line with a negative duration is
          found, a warning is printed and the search stops early, returning
          the utterances collected up to that point.

    dali_entry: structure for accessing metadata for dali songs.  it must
                provide info['id'] and annotations['annot']['lines'], where
                each line is a dict with the keys:
                  - time (tuple) with start and stop time in seconds (floats)
                  - text (string) transcript of line

    each returned dictionary has the keys:
      file_id: uuid that aligns with the audio song base filename
      utt_id: unique id for that song utterance (0, 1, 2, ... in line order)
      start_time: start time in seconds of utterance
      stop_time: stop time in seconds of utterance
      duration: duration of utterance in seconds
      text: transcript of snippet
    """
    # Take the id from the entry that was passed in, not from whatever global
    # happens to be named `entry` in the notebook.
    file_id = dali_entry.info['id']
    arr = []

    for d in dali_entry.annotations['annot']['lines']:

        start, stop = d['time']
        assert(isinstance(start,float))
        assert(isinstance(stop,float))

        duration = stop-start

        if duration < 0:
            print(f"find_utterances: time stamp errors! Duration Negative: {duration}.  Ending Entry Search Early.")
            return arr

        # only keep utterances that fit the criteria.
        if (1.0 < duration < 28.0):
            # utt_id counts kept utterances only, so it equals the list index.
            arr.append({'file_id':file_id, 'utt_id':len(arr), 'start_time':start, 'stop_time':stop, 'duration':duration, 'text':d['text']})

    return arr

# Try the search on one known song and report how many lines pass the filter.
entry = dali_data['874ff998bef04d939b8b21f7434abacd']
utt_array = find_utterances(entry)
total_lines = len(entry.annotations['annot']['lines'])
print(f"{len(utt_array)} lines fit criteria out of {total_lines}")

44 lines fit criteria out of 48


In [4]:
# Inspect one utterance record to check the metadata fields.
utt_array[42]

{'file_id': '874ff998bef04d939b8b21f7434abacd',
 'utt_id': 42,
 'start_time': 156.97942857142854,
 'stop_time': 159.658,
 'duration': 2.678571428571445,
 'text': "that i don't need you again"}

# Creating Single Song Utterances from DALI

In [18]:


# Locate the full-song audio for `entry`, cut out one utterance and play it.
# (Replace the fixed index with a loop over utt_array to audition them all.)
song_utt = utt_array[20]
filename = f"{audio_dir}/{entry.info['id']}.wav"
if os.path.isfile(filename):
    wav, sr = torchaudio.load(filename)
    # utterance boundaries converted from seconds to sample indices
    start_samp = int(song_utt['start_time'] * sr)
    stop_samp = int(song_utt['stop_time'] * sr)
    print(song_utt['text'])

    # only channel 0 of the loaded audio is played
    display(Audio(wav[0,start_samp:stop_samp],rate=sr))
else:
    print(f"File: {filename} does not exist.")

that i don't need you again


In [19]:
# Automation of the cell above for a single entry...

def create_file_name(fileid,linenum):
    """Return the wav file name "<fileid>_<linenum>.wav" for one utterance."""
    return "{}_{}.wav".format(fileid, linenum)

def create_metadata_file(utt_array, destination_dir="./",create_csv=False):
    """
    create_metadata_file - build the HuggingFace DataSet metadata table for a list of utterances

    utt_array - list of utterance dictionaries as generated by find_utterances
                (keys used here: file_id, utt_id, text).
    destination_dir - directory that receives metadata.csv; only used when
                      create_csv is True.
    create_csv - when True, the table is also written to
                 <destination_dir>/metadata.csv (without the index column).

    returns a dataframe with the columns file_name and transcription, where
    the transcription is the upper-cased utterance text.
    """
    columns = {'file_name': [], 'transcription': []}
    for song_utt in utt_array:
        columns['file_name'].append(create_file_name(song_utt['file_id'], song_utt['utt_id']))
        columns['transcription'].append(song_utt['text'].upper())

    metadata_df = pd.DataFrame(columns)

    if not create_csv:
        return metadata_df

    print("create_metadata_file: Saving to CSV...")
    metadata_df.to_csv(f"{destination_dir}/metadata.csv",index=False)
    return metadata_df

def create_utt_files(utt_array, audio_dir, destination_dir=None):
    """
    create_utt_files - cut every utterance out of its full-song wav file and
    save each one as its own wav file.

    utt_array - list of utterance dictionaries as generated by find_utterances
                (keys used here: file_id, utt_id, start_time, stop_time).
    audio_dir - directory holding the full-song audio, named <file_id>.wav.
    destination_dir - directory the utterance files are written to.
                      if None, then use audio_dir.

    returns True when every utterance was written (also for an empty
    utt_array, where there is nothing to do) and False as soon as a source
    song is missing from audio_dir.
    """
    if destination_dir is None:
        destination_dir=audio_dir

    if not utt_array:
        # nothing to cut; also avoids indexing into an empty list below
        return True

    print(f"Saving Audio Files from: {utt_array[0]['file_id']}.wav")

    # The source song is loaded once and reused for consecutive utterances of
    # the same file instead of being re-read for every utterance.
    loaded_id = None
    wav, sr = None, None

    # iterate through all utterances
    for song_utt in utt_array:

        file_id = song_utt['file_id']
        if file_id != loaded_id:
            # the path comes from the utterance itself, not from a global entry
            filename = f"{audio_dir}/{file_id}.wav"
            if not os.path.isfile(filename):
                # this is ok because we are trying to make utterance files
                # from a metadata file where some files may or may not be
                # downloaded.  so the user is warned, and we give up on this
                # song and let the caller decide what to do about it.
                print(f"File: {filename} does not exist....moving on.")
                return False
            wav, sr = torchaudio.load(filename)
            loaded_id = file_id

        start_samp = int(song_utt['start_time'] * sr)
        stop_samp = int(song_utt['stop_time'] * sr)

        # crop channel 0 to the utterance and reshape to (1, n) for saving
        newwav = wav[0,start_samp:stop_samp].reshape((1,-1))
        newwav_filename = create_file_name(song_utt['file_id'], song_utt['utt_id'])

        # saving utterance...
        torchaudio.save(f"{destination_dir}/{newwav_filename}",newwav,sample_rate=sr)

    return True

def create_utt_corpus_from_entry(dali_entry, audio_dir, destination_dir=None):
    """
    Cut one DALI song into utterance wav files and build the matching
    metadata table, to support importing the song into a Huggingface
    Dataset object.

    dali_entry - DALI entry (song) whose lines are searched with
                 find_utterances.
    audio_dir - directory holding the full-song source audio.
    destination_dir - location where the utterance audio is stored.
                      if None, then use audio_dir.

    returns the metadata dataframe from create_metadata_file when the
    utterance files were created, or False when create_utt_files reported
    failure (source file not found in audio_dir).  The metadata is only
    returned here; create_csv is left at its default, so no csv is written.
    """
    # utterance metadata (file, timing and transcript details) for this song
    utt_array = find_utterances(dali_entry)

    # write the wav snippets; a falsy result means the source audio is missing
    if not create_utt_files(utt_array, audio_dir, destination_dir):
        return False

    return create_metadata_file(utt_array, destination_dir)

In [20]:
# for testing...
# md_df = create_utt_corpus_from_entry(entry, audio_dir, destination_dir=None)

# Find all Entries in DALI Test Metadata

In [21]:
# Load the DALI Test set listing and collect the song ids it names.
dali_test_metadata_df = pd.read_csv("DALI_TestSet4ALT.csv")
dali_ids = list(dali_test_metadata_df['DALI_ID'])

In [22]:
#
# Creates Metadata and Utterances Cuts from audio_dir.  Places the utterance
# files in destination_dir.  If destination_dir is None, then they all go in
# audio_dir.
#
# Per-song frames are collected in a list and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame every time.
# The list starts with an empty frame so the result always has both columns,
# even when every song is skipped.
#
metadata_frames = [pd.DataFrame({"file_name":[],"transcription":[]})]

for i, dali_id in enumerate(dali_ids):
    entry = dali_data[dali_id]
    df_temp = create_utt_corpus_from_entry(entry, audio_dir, destination_dir)

    # a DataFrame means the song was processed; anything else means skipped
    if isinstance(df_temp,pd.DataFrame):
        print(f"{i}: creating dataframe for {entry.info['id']}")
        metadata_frames.append(df_temp)
    else:
        print(f"Skip: {entry.info['id']}")

final_metadata_df = pd.concat(metadata_frames, ignore_index=True, sort=False)



Saving Audio Files from: 44a2455abc0e4fb397a396d2cd1ebeb9.wav
0: creating dataframe for 44a2455abc0e4fb397a396d2cd1ebeb9
Saving Audio Files from: 15d6e9e88ced41dfbff38ba2f3e1d885.wav
1: creating dataframe for 15d6e9e88ced41dfbff38ba2f3e1d885
Saving Audio Files from: a59e44a4c910443a87f068b177200fdc.wav
2: creating dataframe for a59e44a4c910443a87f068b177200fdc
Saving Audio Files from: ae91bcda73944695b7756ddc066c3e02.wav
3: creating dataframe for ae91bcda73944695b7756ddc066c3e02
Saving Audio Files from: d6e3cf403653490f8366bf77cbc0f186.wav
4: creating dataframe for d6e3cf403653490f8366bf77cbc0f186
Saving Audio Files from: 7a1642003f574713a6e25e5ee549fce6.wav
5: creating dataframe for 7a1642003f574713a6e25e5ee549fce6
Saving Audio Files from: 8eb15ad6d17f41b68009fe3848930dee.wav
6: creating dataframe for 8eb15ad6d17f41b68009fe3848930dee
Saving Audio Files from: b63d71b7ec6b4c5e9f53a87f83fcd73e.wav
7: creating dataframe for b63d71b7ec6b4c5e9f53a87f83fcd73e
Saving Audio Files from: ac0279e

In [23]:
# Sanity check: (number of utterances, number of metadata columns).
final_metadata_df.shape

(10186, 2)

In [27]:
# Write the combined utterance metadata next to the utterance wav files,
# without the dataframe index column.
metadata_path = f"{destination_dir}/metadata.csv"
final_metadata_df.to_csv(metadata_path, index=False)

In [28]:
# Preview the first rows of the combined metadata.
final_metadata_df.head()

Unnamed: 0,file_name,transcription
0,44a2455abc0e4fb397a396d2cd1ebeb9_0.wav,YOU SAID THAT LOVE WAS JUST A STATE OF MIND
1,44a2455abc0e4fb397a396d2cd1ebeb9_1.wav,A PUZZLE MADE OF PIECES YOU CAN'T FIND
2,44a2455abc0e4fb397a396d2cd1ebeb9_2.wav,AND FOR ME YOU NEVER REALLY HAD THE TIME
3,44a2455abc0e4fb397a396d2cd1ebeb9_3.wav,"I WAS BLIND, OH-OH"
4,44a2455abc0e4fb397a396d2cd1ebeb9_4.wav,AND EVERYTHING THAT YOU MEANT TO ME


In [7]:
# Reload the utterance metadata from disk and display it.
final_metadata_df = pd.read_csv("dali_datasets/test/metadata.csv")
final_metadata_df

Unnamed: 0,file_name,transcription
0,44a2455abc0e4fb397a396d2cd1ebeb9_0.wav,YOU SAID THAT LOVE WAS JUST A STATE OF MIND
1,44a2455abc0e4fb397a396d2cd1ebeb9_1.wav,A PUZZLE MADE OF PIECES YOU CAN'T FIND
2,44a2455abc0e4fb397a396d2cd1ebeb9_2.wav,AND FOR ME YOU NEVER REALLY HAD THE TIME
3,44a2455abc0e4fb397a396d2cd1ebeb9_3.wav,"I WAS BLIND, OH-OH"
4,44a2455abc0e4fb397a396d2cd1ebeb9_4.wav,AND EVERYTHING THAT YOU MEANT TO ME
...,...,...
10181,b0c1c41a5a024f47ae1eca7a8b5ca59b_11.wav,NO BETTER WORLD
10182,b0c1c41a5a024f47ae1eca7a8b5ca59b_12.wav,LET THIS END -
10183,b0c1c41a5a024f47ae1eca7a8b5ca59b_13.wav,MO - THERS CRY
10184,b0c1c41a5a024f47ae1eca7a8b5ca59b_14.wav,OUR BOYS DIE


In [23]:
# Reload the saved utterance metadata so the transcripts can be normalised.
final_metadata_df = pd.read_csv("dali_datasets/test/metadata.csv")

def clean_columns(row):
    """
    Normalise the transcript of one metadata row.

    row: mapping (e.g. a DataFrame row) with a 'transcription' string.

    The text is lower-cased, a space is appended, the punctuation characters
    , ? . ! - ; : " ` & ( ) are removed, every "in' " (a contraction followed
    by a space) becomes "in ", and every digit is replaced by its spelled-out
    name padded with spaces.  Digits are handled one at a time, so "10"
    becomes " one  zero ".

    returns the cleaned string; it keeps the space appended before cleaning.
    """
    text = row['transcription'].lower()

    # raw string: the backslashes are regex escapes, not string escapes
    chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\`\&\(\)]'
    # the appended space lets a transcript-final "in'" match the rule below
    res = re.sub(chars_to_ignore_regex, '', text + " ")
    res = res.replace("in' ", "in ")

    # "0" is spelled "zero" (it was previously, and wrongly, spelled "ten")
    digit_words = {
        "1": "one", "2": "two", "3": "three", "4": "four", "5": "five",
        "6": "six", "7": "seven", "8": "eight", "9": "nine", "0": "zero",
    }
    for digit, word in digit_words.items():
        res = res.replace(digit, f" {word} ")
    return res

# Normalise every transcript, replacing the 'transcription' column in place.
final_metadata_df['transcription'] = final_metadata_df.apply(clean_columns,axis=1)

# Join all the text in the 'transcription' column into a single string
text_concatenated = ''.join(final_metadata_df['transcription'])

# Extract unique characters from the concatenated string
unique_characters = set(text_concatenated)

# Show the character inventory left after cleaning, and its size.
print(sorted(unique_characters))
print(len(unique_characters))

[' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
28


In [22]:
# Display the metadata to inspect the current transcripts.
final_metadata_df

Unnamed: 0,file_name,transcription
0,44a2455abc0e4fb397a396d2cd1ebeb9_0.wav,you said that love was just a state of mind
1,44a2455abc0e4fb397a396d2cd1ebeb9_1.wav,a puzzle made of pieces you can't find
2,44a2455abc0e4fb397a396d2cd1ebeb9_2.wav,and for me you never really had the time
3,44a2455abc0e4fb397a396d2cd1ebeb9_3.wav,i was blind ohoh
4,44a2455abc0e4fb397a396d2cd1ebeb9_4.wav,and everything that you meant to me
...,...,...
10181,b0c1c41a5a024f47ae1eca7a8b5ca59b_11.wav,no better world
10182,b0c1c41a5a024f47ae1eca7a8b5ca59b_12.wav,let this end
10183,b0c1c41a5a024f47ae1eca7a8b5ca59b_13.wav,mo thers cry
10184,b0c1c41a5a024f47ae1eca7a8b5ca59b_14.wav,our boys die


# Sampling Utterance Files

In [27]:
# Pick a random DALI Test song and a random utterance from it, then load and
# play the matching utterance file from destination_dir.
dali_id = dali_test_metadata_df['DALI_ID'].sample(1).iloc[0]
entry = dali_data[dali_id]
utt_array = find_utterances(entry)
ID = np.random.randint(len(utt_array))
print(f"DALI ID: {dali_id}, Utterance: {ID}")
song_utt = utt_array[ID]
filename = f"{destination_dir}/{entry.info['id']}_{ID}.wav"

print(song_utt['text'])
if os.path.isfile(filename):
    wav, sr = torchaudio.load(filename)
    # play only when the file was actually loaded; otherwise a stale (or
    # undefined) wav/sr from an earlier cell would be played instead
    display(Audio(wav,rate=sr))
else:
    print(f"File: {filename} does not exist.")

DALI ID: b4759e8b2f5d4f3d88ef8111c0c3cfea, Utterance: 2
our common goal


In [125]:
# Scratch check of the transcript clean-up rules on the sampled utterance.
print(song_utt['text'])
# raw string: the backslashes are regex escapes, not string escapes
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"]'
# the appended space lets an utterance-final "in'" match the rule below
pass1 = re.sub(chars_to_ignore_regex, '', song_utt['text'] + " ")
print(pass1)

# Second rule: drop the apostrophe from "in'" contractions followed by a space.
pass1.replace("in' ", "in ")


pistons poppin' ain't no stoppin now
pistons poppin' ain't no stoppin now 


"pistons poppin ain't no stoppin now "

# Total Duration of DALI Test (as utterances)

In [10]:
# Add up the duration of every kept utterance across all DALI Test songs.
total_song_duration = 0
for dali_id in dali_ids:
    entry = dali_data[dali_id]
    utt_array = find_utterances(entry)
    durations = np.array([utt_song['duration'] for utt_song in utt_array])
    total_song_duration += np.sum(durations)
print(f"Total Duration of Entry: {total_song_duration:.1f}s or {total_song_duration/60:.1f}min or {total_song_duration/3600:.2f}hr")

Total Duration of Entry: 32696.4s or 544.9min or 9.08hr


# Attempt Source Separation

In [11]:
# Show the id of the entry currently held in `entry`.
print(entry.info['id'])

b1b6bc336f78441b8b31da555ccf59d8


In [17]:
# Load one wav file and play a fixed excerpt of channel 0
# (261120 samples starting at sample 200000).
filename = "dali_datasets/test/7357de99882d49cb9a0564a8ce4d60f4.wav"
if os.path.isfile(filename):
    wav, sr = torchaudio.load(filename)
    # play only when the file was actually loaded; otherwise a stale (or
    # undefined) wav/sr from an earlier cell would be used
    display(Audio(wav[0,200000:200000+261120],rate=sr))
else:
    print(f"File: {filename} does not exist.")

In [9]:
from sdx23.my_submission.src import tfc_tdf_v3
from ml_collections import ConfigDict
import yaml
import torch

# NOTE(review): torch.load unpickles the checkpoint, which can execute
# arbitrary code - only load checkpoint files from a trusted source.
checkpoint = torch.load("./mdx_AB/modelA/step100000_seed2.ckpt", map_location=torch.device('cpu'))

# NOTE(review): yaml.FullLoader is more permissive than yaml.safe_load;
# consider safe_load if the config only holds plain data - TODO confirm.
with open("mdx_AB/modelA/config.yaml") as f:
    config = ConfigDict(yaml.load(f, Loader=yaml.FullLoader))

model = tfc_tdf_v3.TFC_TDF_net(config)
# The model is only used for inference below, so switch it to eval mode
# (assumes TFC_TDF_net is a torch.nn.Module - it exposes load_state_dict).
model.eval()
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [11]:
import torch

# Take the same excerpt of channel 0 as a (1, n) tensor, then stack two
# copies along a new dimension 1 to build the input for the model.
excerpt_start, excerpt_len = 200000, 261120
wav2 = wav[0, excerpt_start:excerpt_start + excerpt_len].reshape(1, -1)
print(wav2.shape)
wav_stack = torch.stack([wav2, wav2], dim=1)
wav_stack.shape

torch.Size([1, 261120])


torch.Size([1, 2, 261120])

In [12]:
# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    output = model(wav_stack)

Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/SpectralOps.cpp:879.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


In [13]:
# Check the shape of the model output.
output.shape

torch.Size([1, 4, 2, 261120])

In [16]:
# Listen to output index 0 along dimension 1 (first channel).
# NOTE(review): the playback rate is hard-coded to 16000 rather than the `sr`
# returned by torchaudio.load - confirm the two match.
Audio(output[0,0,0,:].detach().numpy(),rate=16000)

In [18]:
# Listen to output index 1 along dimension 1 (first channel).
# NOTE(review): the playback rate is hard-coded to 16000 rather than the `sr`
# returned by torchaudio.load - confirm the two match.
Audio(output[0,1,0,:].detach().numpy(),rate=16000)

In [19]:
# Listen to output index 2 along dimension 1 (first channel).
# NOTE(review): the playback rate is hard-coded to 16000 rather than the `sr`
# returned by torchaudio.load - confirm the two match.
Audio(output[0,2,0,:].detach().numpy(),rate=16000)

In [20]:
# Listen to output index 3 along dimension 1 (first channel).
# NOTE(review): the playback rate is hard-coded to 16000 rather than the `sr`
# returned by torchaudio.load - confirm the two match.
Audio(output[0,3,0,:].detach().numpy(),rate=16000)