In [None]:
# Import packages
import pandas as pd
import os
import librosa
import soundfile as sf
from datasets import Dataset, DatasetDict, Audio
from huggingface_hub import login

# Login to Hugging Face
# The access token is read from the environment so that no credential is ever
# written into the notebook and an already-exported token is not overwritten.
# When neither variable is set, `token` is None and login() falls back to its
# own interactive prompt.
login(token=os.environ.get('HUGGINGFACE_HUB_TOKEN') or os.environ.get('HF_TOKEN'))

In [6]:
# Set data path
data_dir = 'edacc_v1.0/'

# Load accent info
# The survey's accent question is renamed to the short column name 'Accent' and
# only the two identifier columns plus the accent are kept. The trailing
# .copy() makes acc_df an independent frame rather than a slice of the full
# survey table, so overwriting its 'Accent' column later does not trigger
# pandas' SettingWithCopyWarning.
acc_df = (
    pd.read_csv(data_dir+'linguistic_background.csv')
    .rename(columns={'How would you describe your accent in English? (e.g. Italian, Glaswegian)': 'Accent'})
    .loc[:, ['CONVERSATION_ID', 'PARTICIPANT_ID', 'Accent']]
    .copy()
)

# Map to standardized categories (based on data inspection)
# Keys are the standardized accent labels. Each value lists the raw strings that
# should be replaced by that label: either a free-text accent answer or a
# participant ID of the form 'EDACC-Cnn-X' (both are looked up in the reverse
# mapping built from this dict).
# Matching is by exact string, so trailing spaces and accented characters in the
# entries below are significant (e.g. 'Latin American' and 'Latín American' are
# two different keys).
# NOTE(review): 'European' is listed under both 'Polish' and 'Filipino'. When
# the dict is inverted the later entry wins, so 'European' resolves to
# 'Filipino' -- confirm this is intended.
accent_map = {
    'American': [
        'Slightly American', 'Mostly West Coast American with some Australian Intonation', 'American accent ',
        'American, I guess.', 'American ', 'American with a slight accent', 'American-ish', 'Midwestern United States'],
    'Jamaican': ['Jamaican ', 'Jamaican accent ', 'EDACC-C52-A', 'EDACC-C52-B'],
    'English': ['South London', 'Southern London', 'English ', 'British', 'English with Scottish inflections'],
    'Scottish': ['Scottish (Fife)', 'Glaswegian (not slang)', 'Glaswegian'],
    'Irish': ['Irish/ Dublin', 'South Dublin Irish', 'Southern Irish'],
    'French': ['EDACC-C18-A'],
    'Spanish': ['Spanish accent', 'Spanish American'],
    'Italian': [
        'italian mixed with American and British English ', 'Italian mixed with American accent',
        'italian', 'Neutral English, Italian'],
    'Lithuanian': ['Lithuanian (eastern European)'],
    'Romanian': ['Romanian '],
    # 'European' is also listed under 'Filipino' at the end of this dict
    'Polish': ['European', 'EDACC-C24-B'],
    'Eastern European': ['East-European', 'Neutral accent'],
    'Chinese': ['Chinese accent or mixed accent(US, UK, China..) perhaps', 'Chinese '],
    'Vietnamese': ['Vietnamese accent', 'Slight Vietnamese accent', 'Vietnamese English', 'EDACC-C59-A'],
    'Indian': ['Standard Indian English', 'Indian ', 'Neutral'],
    'Pakistani': ['Indian / Pakistani accent', 'Pakistani/American'],
    'Egyptian': ['Egyptian '],
    'Nigerian': ['Afrian', 'EDACC-C45-A'],
    'Ghanaian': ['Ghanaian ', 'EDACC-C45-B'],
    'Kenyan': ['African accent'],
    'South African': ['South African English'],
    'Brazilian': ['Brazilian accent'],
    'Ecuadorian': ['Latin American'],
    'Chilean': ['South American'],
    'Colombian': ['Latín American', 'Latin'],
    'International': ['Trans-Atlantic', 'Generic middle class white person ', 'Standard American,Scottish', 'North American'],
    'Sri Lankan': ['Asian', 'EDACC-C10-B'],
    'Russian': ['EDACC-C24-A'],
    # Duplicate of the 'European' entry under 'Polish'; being last, this one wins
    'Filipino': ['European']
}

# Create reverse mapping for lookup
# Inverts accent_map into {raw accent string or participant ID: standardized
# accent}. Entries are visited in accent_map order, so a raw value listed under
# more than one accent (e.g. 'European') ends up mapped to the accent that
# appears last.
reverse_map = {
    raw_value: standard_accent
    for standard_accent, raw_values in accent_map.items()
    for raw_value in raw_values
}

# Update accent using reverse mapping
def update_accent(row):
    """Return the standardized accent for one row of the accent table.

    The row's raw 'Accent' value is looked up in reverse_map first; if it has
    no entry, the row's 'PARTICIPANT_ID' is tried instead. A row that matches
    neither keeps its raw accent value unchanged.

    Args:
        row: Mapping-like row providing 'PARTICIPANT_ID' and 'Accent'.

    Returns:
        The mapped accent label, or the original 'Accent' value.
    """
    speaker = row['PARTICIPANT_ID']
    raw_accent = row['Accent']

    # Candidate keys in priority order: accent text, then participant ID
    for key in (raw_accent, speaker):
        if key in reverse_map:
            return reverse_map[key]

    return raw_accent

# Apply accent map to data
# update_accent is called once per row and its results replace the 'Accent' column
acc_df['Accent'] = acc_df.apply(update_accent, axis='columns')

In [9]:
# Set data paths
# The split folders keep their trailing slash because file names are appended
# to them by plain string concatenation; `data` is the folder that is listed
# for audio files.
dev, test = f'{data_dir}dev/', f'{data_dir}test/'
data = f'{data_dir}data'

# Read in CSV files
def read(filen):
    """Return the lines of a text file as a list of strings.

    Args:
        filen: Path of the file to read.

    Returns:
        list[str]: One entry per line, with line endings kept.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(filen, 'r', encoding='utf-8') as file:
        return file.readlines()

# Split transcripts into features
def split_trans(line):
    """Split one transcript line into its code and its transcript text.

    The first whitespace-delimited token is the code; everything after it is
    the transcript. A line that holds only a code yields an empty transcript
    instead of failing.

    Args:
        line: One raw line of the transcript file.

    Returns:
        tuple[str, str]: (code, transcript).

    Raises:
        ValueError: If the line is blank.
    """
    # split() without a separator treats tabs and runs of spaces as a single
    # delimiter, so the transcript never starts with stray whitespace.
    parts = line.strip().split(maxsplit=1)
    if not parts:
        raise ValueError('cannot parse a blank transcript line')
    code = parts[0]
    transcript = parts[1] if len(parts) > 1 else ''
    return code, transcript

# Split segments into features
def split_seg(line):
    """Split one segment line into code, file, start and end.

    Fields may be separated by any run of whitespace; fields beyond the fourth
    are ignored. All four values are returned as strings.

    Args:
        line: One raw line of the segments file.

    Returns:
        tuple[str, str, str, str]: (code, file, start, end).

    Raises:
        ValueError: If the line has fewer than four fields.
    """
    # split() without a separator collapses tabs and repeated spaces, which
    # split(' ') would turn into empty or shifted fields.
    fields = line.split()
    if len(fields) < 4:
        raise ValueError(
            f'expected 4 fields (code, file, start, end) in segment line, got {len(fields)}: {line!r}')
    code, file, start, end = fields[:4]
    return code, file, start, end

# Create transcript dataframe
def make_trans_df(file):
    """Load a transcript file into a dataframe with columns Code and Transcript.

    Args:
        file: Path of the transcript file; each non-blank line is parsed with
            split_trans.

    Returns:
        pandas.DataFrame: One row per non-blank line.
    """
    # Blank lines (e.g. a trailing newline at the end of the file) carry no
    # record and are skipped instead of being handed to the parser.
    rows = [split_trans(line) for line in read(file) if line.strip()]
    return pd.DataFrame(rows, columns=['Code', 'Transcript'])

# Create segment dataframe
def make_seg_df(file):
    """Load a segments file into a dataframe with columns Code, File, Start, End.

    Args:
        file: Path of the segments file; each non-blank line is parsed with
            split_seg.

    Returns:
        pandas.DataFrame: One row per non-blank line; all values are strings as
        returned by split_seg.
    """
    # Blank lines (e.g. a trailing newline at the end of the file) carry no
    # record and are skipped instead of being handed to the parser.
    rows = [split_seg(line) for line in read(file) if line.strip()]
    return pd.DataFrame(rows, columns=['Code', 'File', 'Start', 'End'])

# Function to process files
def process_files(folder):
    """Cut the recordings of one split into per-segment clips and build a Dataset.

    Reads 'text.csv' and 'segments.csv' from `folder`. Every file in the
    module-level `data` directory whose name (without extension) appears in the
    segments table is loaded, each of its segments is written to
    '<data_dir>segments/<code>.wav', and the clip is paired with its transcript
    and with an accent looked up in the module-level `acc_df`.

    Args:
        folder: Path of the split directory, ending in a path separator (file
            names are appended to it by string concatenation).

    Returns:
        datasets.Dataset with columns 'code', 'audio' (cast to the Audio
        feature), 'transcript' and 'accent'.

    Raises:
        KeyError: If a segment has no transcript, or the participant derived
            for it has no row in `acc_df`.
    """

    # Create dataframes for transcripts and segments
    trans_df = make_trans_df(folder+'text.csv')
    seg_df = make_seg_df(folder+'segments.csv')

    # Build both lookup tables once instead of filtering a dataframe for every
    # segment. drop_duplicates keeps the first row per key, i.e. the same row a
    # filter followed by taking element 0 would select.
    transcripts = trans_df.drop_duplicates('Code').set_index('Code')['Transcript'].to_dict()
    accents = acc_df.drop_duplicates('PARTICIPANT_ID').set_index('PARTICIPANT_ID')['Accent'].to_dict()

    # The clips go to a dedicated folder; create it up front because sf.write
    # does not create missing directories.
    seg_dir = data_dir+'segments'
    os.makedirs(seg_dir, exist_ok=True)

    # One dict per exported clip
    records = []

    # Sorted so the row order does not depend on the order os.listdir returns
    for file in sorted(os.listdir(data)):

        # Get audio file path
        audio_path = os.path.join(data, file)
        audio_file = os.path.splitext(file)[0]

        # Skip files that have no segments in this split before loading any audio
        file_segs = seg_df[seg_df['File'] == audio_file]
        if file_segs.empty:
            continue

        # Load audio file (sr=None keeps the file's own sampling rate)
        audio, sr = librosa.load(audio_path, sr=None)

        for i, seg in enumerate(file_segs.itertuples(index=False)):
            code = seg.Code

            # Convert start and end times (seconds) to sample offsets
            start_sample = int(float(seg.Start) * sr)
            end_sample = int(float(seg.End) * sr)

            # Extract and save the audio segment
            audio_seg = audio[start_sample:end_sample]
            audio_seg_path = os.path.join(seg_dir, f'{code}.wav')
            sf.write(audio_seg_path, audio_seg, sr)

            # Get corresponding transcript
            if code not in transcripts:
                raise KeyError(f'no transcript found for segment {code!r}')
            transcript = transcripts[code]

            # Get corresponding accent
            # NOTE(review): the speaker is inferred purely from the segment's
            # position within its recording (even -> 'A', odd -> 'B'), which
            # presumes the two speakers strictly alternate. TODO confirm
            # against the corpus' own speaker labels.
            participant_id = f"{code[:9]}-{'A' if i%2 == 0 else 'B'}"
            if participant_id not in accents:
                raise KeyError(f'no accent entry found for participant {participant_id!r}')
            accent = accents[participant_id]

            # Save data
            records.append({
                'code': code,
                'audio': audio_seg_path,
                'transcript': transcript,
                'accent': accent
            })

    # Create a DataFrame (explicit columns keep the schema even with no records)
    df = pd.DataFrame(records, columns=['code', 'audio', 'transcript', 'accent'])

    # Convert DataFrames to HuggingFace Datasets
    dataset = Dataset.from_pandas(df, preserve_index=False)

    # Define the audio feature so the stored paths are treated as audio files
    dataset = dataset.cast_column('audio', Audio())

    return dataset

In [10]:
# Combine into a DatasetDict
# Each split folder is processed in turn ('dev' first, then 'test') and stored
# under its split name.
dataset_dict = DatasetDict({
    split_name: process_files(split_folder)
    for split_name, split_folder in {'dev': dev, 'test': test}.items()
})

# Push to HuggingFace Hub
dataset_dict.push_to_hub('sage-bergerson/edacc_whisper')