In [1]:
import os
import re
import shutil
from pathlib import Path

import librosa
import pandas as pd

In [2]:
# Region codes iterated over by the cells below; each one is used to build a "CORAAL/<region>" path.
regions = ["ATL","DCB","ROC","VLD"]

In [3]:
# For each region, add up the duration of every audio file found one level
# below CORAAL/<region>/ and print the region name followed by its total in hours.
for region in regions:
    total_dur = 0 #in seconds
    if region[0] == '.': #skip .DS_Store
        continue
    path = f"CORAAL/{region}/"
    for directory in os.listdir(path):
        # hidden entries and entries whose fifth character is "m" (metadata) are ignored
        if directory[0] == '.' or directory[4] == "m":
            continue
        dir_path = f"{path}/{directory}"
        for file in os.listdir(dir_path):
            # skip hidden files and names starting with "t" (presumably non-audio - TODO confirm)
            if file[0] in ('.', 't'):
                continue
            file_path = f"{dir_path}/{file}"
            y,sr = librosa.load(file_path)
            total_dur+=librosa.get_duration(y=y,sr=sr)
    print(region)
    print(total_dur/60/60) #in hours

ATL
8.620638788107836
DCB
46.059122209624576
ROC
13.185999974804734
VLD
11.482827676996726


How many hours of audio do we keep if we only keep the first file associated with each speaker? E.g. we have 
**VLD_se0_ag3_m_01_1.wav** and VLD_se0_ag3_m_01_2.wav but we only keep the bolded file.

In [4]:
# Same walk as the previous cell, but a file only counts when the character
# just before its 4-character extension is "1" (e.g. "..._01_1.wav").
for region in regions:
    total_dur = 0 #in seconds
    if region[0] == '.': #skip .DS_Store
        continue
    path = f"CORAAL/{region}"
    for directory in os.listdir(path):
        # hidden entries and entries whose fifth character is "m" are ignored
        if directory[0] == '.' or directory[4] == "m":
            continue
        dir_path = f"{path}/{directory}"
        for file in os.listdir(dir_path):
            if file[0] in ('.', 't'):
                continue
            if file[-5] != '1':
                continue
            file_path = f"{dir_path}/{file}"
            y,sr = librosa.load(file_path)
            total_dur+=librosa.get_duration(y=y,sr=sr)
    print(region)
    print(total_dur/60/60) #in hours

ATL
8.343416565885613
DCB
42.473344368858655
ROC
12.011416679264299
VLD
10.555055467372137


This does not reduce the amount of data sufficiently. We may need to randomly select speakers or select the speakers with the shortest interviews.

In [5]:
# Load the tab-separated metadata table of each region, then stack them into one frame.
# NOTE(review): these are absolute, user-specific paths, while the audio cells use
# paths relative to the working directory ("CORAAL/...") - confirm both point at the same tree.
atl_df = pd.read_csv("/Users/aheuser/Documents/CORAAL/ATL/ATL_metadata_2020.05.txt",delimiter="\t")
dcb_df = pd.read_csv("/Users/aheuser/Documents/CORAAL/DCB/DCB_metadata_2018.10.06.txt",delimiter="\t")
roc_df = pd.read_csv("/Users/aheuser/Documents/CORAAL/ROC/ROC_metadata_2020.05.txt",delimiter="\t")
vld_df = pd.read_csv("/Users/aheuser/Documents/CORAAL/VLD/VLD_metadata_2021.07.txt",delimiter="\t")
#Note that VLD_ag2_f_01_2 is not in VLD audio folders. This is a mistake in the metadata.
# NOTE(review): the comment above names VLD_ag2_f_01_2, but the line below targets speaker
# VLD_se0_ag2_f_02 - confirm which file name is the intended one.
# The assignment overwrites CORAAL.File on every row belonging to that speaker.
vld_df.loc[vld_df["CORAAL.Spkr"] == "VLD_se0_ag2_f_02","CORAAL.File"] = "VLD_se0_ag2_f_02_1"
# ignore_index=True renumbers the combined rows from 0
metadata = pd.concat([atl_df,dcb_df,roc_df,vld_df],ignore_index=True)

In [6]:
# Keep only the rows flagged as the primary speaker.
metadata = metadata[metadata['Primary.Spkr'].eq('yes')] #removes 10 files

In [7]:
# Display the combined metadata table after the primary-speaker filter.
metadata

Unnamed: 0,CORAAL.Sub,Version.Created,Version.Modified,CORAAL.Spkr,CORAAL.File,Audio.Folder,Tarball,Primary.Spkr,SLAAP.Collection,SLAAP.Spkr,...,CORAAL.Word.Count,Is.Misc.Tier,Notes,Date.of.Interview,Region.in.City,LOR,LOR.Percent,Relationship.To.Others.In.Corpus,Age.Group.old,Consent.Level..WDC.ATL.SGA.
0,ATL,v.2020.05,,ATL_se0_ag2_f_01,ATL_se0_ag2_f_01_1,ATL_wav_part03,ATL_audio_part03_2020.05.tar.gz,yes,atl,atl001,...,6881,,,,,,,,,
1,ATL,v.2020.05,,ATL_se0_ag2_m_01,ATL_se0_ag2_m_01_1,ATL_wav_part04,ATL_audio_part04_2020.05.tar.gz,yes,atl,atl002,...,7166,,,,,,,,,
2,ATL,v.2020.05,,ATL_se0_ag2_f_02,ATL_se0_ag2_f_02_1,ATL_wav_part03,ATL_audio_part03_2020.05.tar.gz,yes,atl,atl003,...,5252,yes,,,,,,,,
3,ATL,v.2020.05,,ATL_se0_ag1_m_01,ATL_se0_ag1_m_01_1,ATL_wav_part01,ATL_audio_part01_2020.05.tar.gz,yes,atl,atl004,...,7223,,,,,,,,,
4,ATL,v.2020.05,,ATL_se0_ag1_f_01,ATL_se0_ag1_f_01_1,ATL_wav_part01,ATL_audio_part01_2020.05.tar.gz,yes,atl,atl005,...,4586,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,VLD,v.2021.07,,VLD_se0_ag3_m_03,VLD_se0_ag3_m_03_1,VLD_wav_part03,VLD_audio_part03_2021.07.tar.gz,yes,sga,sga006,...,9279,,,11/21/18,,,,,30 to 50,4.0
119,VLD,v.2021.07,,VLD_se0_ag4_f_01,VLD_se0_ag4_f_01_1,VLD_wav_part04,VLD_audio_part04_2021.07.tar.gz,yes,sga,sga004,...,8475,,,6/25/17,,,,,51+,4.0
120,VLD,v.2021.07,,VLD_se0_ag4_f_02,VLD_se0_ag4_f_02_1,VLD_wav_part04,VLD_audio_part04_2021.07.tar.gz,yes,sga,sga007,...,7943,,,2/9/19,,,,,51+,4.0
121,VLD,v.2021.07,,VLD_se0_ag4_m_01,VLD_se0_ag4_m_01_1,VLD_wav_part04,VLD_audio_part04_2021.07.tar.gz,yes,sga,sga008,...,10292,,,2/9/19,,,,,51+,4.0


In [8]:
def select_shortest(df,hours):
    """Pick one recording per speaker, shortest first, until a duration budget is reached.

    Rows are visited in ascending order of ``CORAAL.Length.of.Transcript``.
    A row is accepted when its speaker id (``CORAAL.File`` without its last
    two characters) has not been accepted yet; its length is then added to a
    running total. The budget is checked *before* each row is considered, so
    the final total may exceed ``hours * 60 * 60`` by up to one recording.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``CORAAL.Length.of.Transcript``, ``CORAAL.File``
        and ``Audio.Folder``.
    hours : int or float
        Budget; it is multiplied by 3600 before being compared with the
        summed lengths, so the lengths are treated as seconds.

    Returns
    -------
    tuple of (set, set)
        The accepted ``CORAAL.File`` values, and the folder prefixes built
        from the first and third ``_``-separated fields of ``Audio.Folder``
        joined as ``<first>_audio_<third>``.

    The total duration and the number of accepted speakers are printed.
    """
    length_col = "CORAAL.Length.of.Transcript"
    by_length = df[[length_col,"CORAAL.File","Audio.Folder"]].sort_values(length_col)

    chosen_files = set()
    chosen_folders = set()
    seen_speakers = set()
    elapsed = 0
    for _, record in by_length.iterrows():
        # Once the running total is no longer below the budget nothing else can be accepted.
        if not elapsed < hours*60*60:
            break
        name = record["CORAAL.File"]
        # Dropping the trailing "_<n>" leaves the part of the file name shared by one speaker's files.
        who = name[:-2]
        if who in seen_speakers:
            continue
        elapsed += record[length_col]
        chosen_files.add(name)
        parts = record["Audio.Folder"].split('_')
        chosen_folders.add(f"{parts[0]}_audio_{parts[2]}")
        seen_speakers.add(who)

    print(f"Total duration: {elapsed} seconds")
    print(f"Total number of speakers: {len(seen_speakers)}")
    return chosen_files,chosen_folders

In [9]:
# Trial run on the ATL metadata as loaded (atl_df is not primary-speaker filtered) with a budget of 7 hours.
atl_files,atl_folders = select_shortest(atl_df,7)

Total duration: 27000.0 seconds
Total number of speakers: 12


In [10]:
#to double check
# Re-measure the selected ATL recordings from the audio files themselves so the
# result can be compared with the metadata-based total printed by select_shortest.
total_dur = 0
region = 'ATL'
path = f"CORAAL/{region}"
for directory in os.listdir(path):
    # Greedy ".+_" matches up to the last underscore; dropping that final character
    # leaves the entry name without its last "_"-separated field.
    # re.match returns None when there is no such prefix, so skip those entries
    # instead of failing on None[0].
    prefix = re.match(".+_",directory)
    if prefix is None or prefix[0][:-1] not in atl_folders:
        continue
    dir_path = f"{path}/{directory}"
    # A non-directory entry can share a selected prefix; os.listdir would raise on it.
    if not os.path.isdir(dir_path):
        continue
    for file in os.listdir(dir_path):
        # file[:-4] drops the 4-character extension before comparing with the selected names
        if file[:-4] in atl_files:
            file_path = f"{dir_path}/{file}"
            y,sr = librosa.load(file_path)
            total_dur+=librosa.get_duration(y=y,sr=sr)
print(total_dur/60/60) #in hours

7.503833270345175


Success! We're down to 7.5 hours and we've maximized the number of speakers we're representing (12/13). Now to generalize the code to the other regions.

In [11]:
# Run the shortest-first selection on each region's slice of the filtered
# metadata (2.2-hour budget per region) and pool the results.
regions = ["ATL","DCB","ROC","VLD"]
files = set()
folders = set()
for region in regions:
    print(region)
    df = metadata[metadata["CORAAL.Sub"].eq(region)]
    reg_files,reg_folders = select_shortest(df,2.2)
    files.update(reg_files)
    folders.update(reg_folders)

ATL
Total duration: 9217.0 seconds
Total number of speakers: 5
DCB
Total duration: 9547.7 seconds
Total number of speakers: 12
ROC
Total duration: 7943.299999999999 seconds
Total number of speakers: 5
VLD
Total duration: 10304.2 seconds
Total number of speakers: 5


In [12]:
#double checking again
# Re-measure the selected recordings of every region from the audio files,
# printing one total per region and a grand total at the end.
actual_total_dur = 0
for region in regions:
    total_dur = 0 #in seconds
    path = f"CORAAL/{region}"
    for directory in os.listdir(path):
        # Greedy ".+_" matches up to the last underscore; dropping that final character
        # leaves the entry name without its last "_"-separated field.
        # re.match returns None when there is no such prefix, so skip those entries
        # instead of failing on None[0].
        prefix = re.match(".+_",directory)
        if prefix is None or prefix[0][:-1] not in folders:
            continue
        dir_path = f"{path}/{directory}"
        # A non-directory entry can share a selected prefix; os.listdir would raise on it.
        if not os.path.isdir(dir_path):
            continue
        for file in os.listdir(dir_path):
            # file[:-4] drops the 4-character extension before comparing with the selected names
            if file[:-4] in files:
                file_path = f"{dir_path}/{file}"
                y,sr = librosa.load(file_path)
                total_dur+=librosa.get_duration(y=y,sr=sr)
    print(region)
    print(f"Duration: {total_dur/60/60} hours")
    actual_total_dur+=total_dur
print(f"\nTotal duration: {actual_total_dur/60/60} hours")

ATL
Duration: 2.56183330813807 hours
DCB
Duration: 2.660611111111111 hours
ROC
Duration: 2.20481945074326 hours
VLD
Duration: 2.864577802973041 hours

Total duration: 10.291841672965482 hours


Finally, we compile our chosen audio files into a new folder, giving each file a dummy (aliased) name.

In [13]:
# Copy every selected recording into a single folder under the alias
# "interview_<n>.wav" and record the original-name -> alias pairs in `conversion`.
#commented out code is for original folder, current folder was just used for testing purposes
#this commenting out/testing folder pattern continues throughout the notebook
#aliased_dir = "CORAAL/Condensed/Audio10/aliased"
aliased_dir = "CORAAL/Condensed/Audio10/aliased_test"
# Create the destination up front so the copies cannot fail on a missing folder.
os.makedirs(aliased_dir, exist_ok=True)

conversion_rows = []
counter = 0
for region in regions:
    path = f"CORAAL/{region}"
    # NOTE: os.listdir returns entries in arbitrary order, so the alias numbers
    # depend on the order the file system reports the files in.
    for directory in os.listdir(path):
        # re.match returns None for entries without a "<something>_" prefix; skip them.
        prefix = re.match(".+_",directory)
        if prefix is None or prefix[0][:-1] not in folders:
            continue
        dir_path = f"{path}/{directory}"
        # A non-directory entry can share a selected prefix; os.listdir would raise on it.
        if not os.path.isdir(dir_path):
            continue
        for file in os.listdir(dir_path):
            if file[:-4] in files:
                file_path = f"{dir_path}/{file}"
                alias = f"interview_{counter}.wav"
                # shutil.copy raises on failure and handles paths containing spaces,
                # unlike a shell "cp" launched through os.system.
                shutil.copy(file_path, f"{aliased_dir}/{alias}")
                conversion_rows.append([file, alias])
                counter+=1

# Build the table once instead of growing it one row at a time.
conversion = pd.DataFrame(conversion_rows, columns = ["original_file_name", "new_file_name"])

In [14]:
# Save the original-name -> alias table; to_csv also writes the row index as the first column by default.
conversion.to_csv("CORAAL/Condensed/DummyNameConversion10.csv")

Copy the corresponding transcripts into the condensed folder.

In [15]:
# Copy the transcript of every selected recording out of each region's
# "<region>_textfiles_<date>" folder into one destination folder.
folder_dates = {"ATL": "2020.05", "DCB": "2018.10.06", "ROC": "2020.05","VLD":"2021.07"}
#transcript_dir = "CORAAL/Condensed/Transcripts10/coraal"
transcript_dir = "CORAAL/Condensed/Transcripts10/test"
# Create the destination up front so the copies cannot fail on a missing folder.
os.makedirs(transcript_dir, exist_ok=True)
for region in regions:
    path = f"CORAAL/{region}_textfiles_{folder_dates[region]}"
    for textfile in os.listdir(path):
        # textfile[:-4] drops the 4-character extension before comparing with the selected names
        if textfile[:-4] in files:
            # shutil.copy raises on failure and handles paths containing spaces,
            # unlike a shell "cp" launched through os.system.
            shutil.copy(f"{path}/{textfile}", f"{transcript_dir}/{textfile}")

In [16]:
# Turn the conversion table into two lookups keyed on extension-less names:
# conv_dict maps original -> alias, flip_conv maps alias -> original.
pre_conv_dict = conversion.to_dict("list")
original_stems = [og[:-4] for og in pre_conv_dict['original_file_name']]
alias_stems = [new[:-4] for new in pre_conv_dict['new_file_name']]
conv_dict = dict(zip(original_stems, alias_stems))
flip_conv = dict(zip(alias_stems, original_stems))

In [17]:
#to provide shortcut in case of notebook restart
# Displays the alias -> original-name dictionary built in the previous cell.
flip_conv

{'interview_0': 'ATL_se0_ag2_f_01_1',
 'interview_1': 'ATL_se0_ag1_m_04_2',
 'interview_2': 'ATL_se0_ag1_f_03_1',
 'interview_3': 'ATL_se0_ag1_f_01_1',
 'interview_4': 'ATL_se0_ag1_f_02_1',
 'interview_5': 'DCB_se1_ag1_f_01_1',
 'interview_6': 'DCB_se1_ag4_f_01_1',
 'interview_7': 'DCB_se1_ag3_f_02_1',
 'interview_8': 'DCB_se1_ag3_m_02_2',
 'interview_9': 'DCB_se2_ag3_m_03_2',
 'interview_10': 'DCB_se2_ag3_m_02_2',
 'interview_11': 'DCB_se3_ag4_m_02_5',
 'interview_12': 'DCB_se1_ag2_m_03_1',
 'interview_13': 'DCB_se2_ag4_f_03_1',
 'interview_14': 'DCB_se3_ag1_m_01_2',
 'interview_15': 'DCB_se3_ag3_m_02_2',
 'interview_16': 'DCB_se1_ag2_m_02_3',
 'interview_17': 'ROC_se0_ag3_f_01_1',
 'interview_18': 'ROC_se0_ag2_m_01_2',
 'interview_19': 'ROC_se0_ag3_f_02_2',
 'interview_20': 'ROC_se0_ag2_f_04_1',
 'interview_21': 'ROC_se0_ag1_m_02_1',
 'interview_22': 'VLD_se0_ag3_m_01_1',
 'interview_23': 'VLD_se0_ag3_m_02_1',
 'interview_24': 'VLD_se0_ag3_f_01_2',
 'interview_25': 'VLD_se0_ag4_f_02_

In [18]:
def convert_to_original(source_dir,dest_dir,ext,mapping=None):
    """Copy aliased files into ``dest_dir`` under their original names.

    Every ``*.<ext>`` file directly inside ``source_dir`` is copied to
    ``dest_dir/<mapping[stem]>.<ext>``, where ``stem`` is the file name
    without its extension.

    Parameters
    ----------
    source_dir : str or path-like
        Folder holding the aliased files.
    dest_dir : str or path-like
        Folder receiving the renamed copies; created if it does not exist.
    ext : str
        File extension without the leading dot (e.g. ``"wav"``).
    mapping : dict, optional
        Alias stem -> original stem. Defaults to the notebook-level
        ``flip_conv`` dictionary.

    Raises
    ------
    KeyError
        If a file's stem is not a key of ``mapping``.
    """
    if mapping is None:
        mapping = flip_conv
    os.makedirs(dest_dir, exist_ok=True)
    # sorted() materialises the listing before any copy happens and makes the order deterministic.
    for file in sorted(Path(source_dir).glob(f"*.{ext}")):
        key = file.stem
        # shutil.copy raises on failure and is safe for paths containing spaces,
        # unlike a shell "cp" launched through os.system.
        shutil.copy(file, Path(dest_dir) / f"{mapping[key]}.{ext}")

In [19]:
# Copy the aliased audio back out under the original file names.
source = "CORAAL/Condensed/Audio10/aliased_test"
#dest = "CORAAL/Condensed/Audio10/og_names"
dest = "CORAAL/Condensed/Audio10/og_names_test"
convert_to_original(source,dest,"wav")

In [20]:
#used the same code to convert the aliased transcripts back to names matching those of CORAAL
source = "/Users/aheuser/Documents/CORAAL/Condensed/Transcripts10/revvers/aliased"
#dest = "CORAAL/Condensed/Transcripts10/revvers/og_filenames"
dest = "CORAAL/Condensed/Transcripts10/revvers/og_filenames_test"
convert_to_original(source,dest,"json")
#following commented out code uses the original names
#source = "/Users/aheuser/Documents/CORAAL/Condensed/Transcripts10/aal_revvers/aliased"
#dest = "/Users/aheuser/Documents/CORAAL/Condensed/Transcripts10/aal_revvers/og_filenames_test"
#convert_to_original(source,dest,"json")

In [21]:
def convert_to_original_amberscript(source_dir,dest_dir,ext,suffix_len=8,mapping=None):
    """Copy suffixed alias files into ``dest_dir`` under their original names.

    Works like a plain alias -> original rename, except that the last
    ``suffix_len`` characters of each file's stem are discarded before the
    stem is looked up in ``mapping``.

    Parameters
    ----------
    source_dir : str or path-like
        Folder holding the ``*.<ext>`` files to rename.
    dest_dir : str or path-like
        Folder receiving the renamed copies; created if it does not exist.
    ext : str
        File extension without the leading dot (e.g. ``"json"``).
    suffix_len : int, optional
        Number of trailing stem characters to drop before the lookup
        (default 8, the value that was previously hard-coded).
    mapping : dict, optional
        Alias stem -> original stem. Defaults to the notebook-level
        ``flip_conv`` dictionary.

    Raises
    ------
    KeyError
        If a trimmed stem is not a key of ``mapping``.
    """
    if mapping is None:
        mapping = flip_conv
    os.makedirs(dest_dir, exist_ok=True)
    # sorted() materialises the listing before any copy happens and makes the order deterministic.
    for file in sorted(Path(source_dir).glob(f"*.{ext}")):
        stem = file.stem
        # len(stem) - suffix_len (rather than -suffix_len) keeps suffix_len=0 meaning "drop nothing".
        key = stem[:max(len(stem) - suffix_len, 0)]
        # shutil.copy raises on failure and is safe for paths containing spaces,
        # unlike a shell "cp" launched through os.system.
        shutil.copy(file, Path(dest_dir) / f"{mapping[key]}.{ext}")

In [22]:
# Rename the Amberscript JSON output back to the original file names.
# NOTE(review): unlike the other cells, the active dest here is the non-test folder
# and the "_test" variant is the one commented out - confirm this is intended.
source = "/Users/aheuser/Documents/CORAAL/Condensed/Amberscript/og_json"
#dest = "/Users/aheuser/Documents/CORAAL/Condensed/Amberscript/og_json_renamed_test"
dest = "/Users/aheuser/Documents/CORAAL/Condensed/Amberscript/og_json_renamed"
convert_to_original_amberscript(source,dest,"json")