In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_xml_to_dataframe(xml_file_path):
    """Flatten a speaker/recording corpus XML file into a pandas DataFrame.

    The root element is searched for direct ``<speaker>`` children, and each
    speaker for direct ``<recording>`` children. One row is produced per
    recording; the speaker attributes are repeated on every row that belongs
    to that speaker.

    Args:
        xml_file_path: Path (or file object) accepted by ``ET.parse``.

    Returns:
        pd.DataFrame with the columns ``speaker_id``, ``speaker_age``,
        ``speaker_gender``, ``audio``, ``md5sum``, ``duration``,
        ``pdp_score`` and ``transcription``, in that order. Attribute values
        are kept exactly as the strings found in the XML (no numeric
        conversion). A missing attribute, a missing ``<orth>`` element or an
        empty ``<orth>`` element yields a null value. The columns are present
        even when the file contains no recordings.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    # Fixed column order so an empty corpus still yields a usable frame.
    columns = [
        'speaker_id',
        'speaker_age',
        'speaker_gender',
        'audio',
        'md5sum',
        'duration',
        'pdp_score',
        'transcription',
    ]

    root = ET.parse(xml_file_path).getroot()

    # One dict per recording; the DataFrame is built once at the end.
    data = []

    for speaker in root.findall('speaker'):
        # Speaker-level attributes are shared by all of the speaker's recordings.
        speaker_fields = {
            'speaker_id': speaker.get('id'),
            'speaker_age': speaker.get('age'),
            'speaker_gender': speaker.get('gender'),
        }

        for recording in speaker.findall('recording'):
            # find() returns None when the recording has no <orth> child;
            # compare with `is not None` because Element truthiness depends
            # on its number of children, not on its existence.
            orth_element = recording.find('orth')
            orth = orth_element.text if orth_element is not None else None

            data.append({
                **speaker_fields,
                'audio': recording.get('audio'),
                'md5sum': recording.get('md5sum'),
                'duration': recording.get('duration'),
                'pdp_score': recording.get('pdp_score'),
                'transcription': orth,
            })

    return pd.DataFrame(data, columns=columns)


In [2]:
# Parse the training manifest into a frame with one row per recording.
xml_file_path = 'nchlt_afr.trn.xml'
df = parse_xml_to_dataframe(xml_file_path)
# Plain-text dump of the frame; pandas abbreviates long frames to their first and last rows.
print(df)

      speaker_id speaker_age speaker_gender  \
0            001          19           male   
1            001          19           male   
2            001          19           male   
3            001          19           male   
4            001          19           male   
...          ...         ...            ...   
63126        202          20         female   
63127        202          20         female   
63128        202          20         female   
63129        202          20         female   
63130        202          20         female   

                                             audio  \
0      nchlt_afr/audio/001/nchlt_afr_001m_0003.wav   
1      nchlt_afr/audio/001/nchlt_afr_001m_0004.wav   
2      nchlt_afr/audio/001/nchlt_afr_001m_0006.wav   
3      nchlt_afr/audio/001/nchlt_afr_001m_0008.wav   
4      nchlt_afr/audio/001/nchlt_afr_001m_0010.wav   
...                                            ...   
63126  nchlt_afr/audio/202/nchlt_afr_202f_0644.wav   
631

In [3]:
# Preview the first 10 parsed training recordings.
df.head(10)

Unnamed: 0,speaker_id,speaker_age,speaker_gender,audio,md5sum,duration,pdp_score,transcription
0,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0003.wav,2cc592cfd516e696357efd021e9226e7,3.96,-0.79,registrasiesertifikaat uitreik mits
1,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0004.wav,8f9737571c066560f5b4605e3ed3b3ba,2.64,-0.6827,dienste ten beste
2,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0006.wav,655a6c519ac261de5eb17b79e32156a0,5.04,-0.8597,sibw standaard inkomstebelasting
3,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0008.wav,daa046ae16623dddd3639d05e6266a04,3.24,-0.5397,vernaamste dienste verskaf
4,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0010.wav,13c56f34abe99b51c4350b28f1309864,5.88,-0.7039,voorbedryfsuitgawes ingevolge artikel
5,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0017.wav,f884ac03fb206ad7efef4476c4fdb2d9,2.4,-0.9158,ek daarmee moontlike
6,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0020.wav,75300f51ffb1892f68c4c19026190fdf,3.6,-0.8662,finansiële jaarverslag voorlê
7,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0024.wav,8749dcaa3c7483a539f02538f2e0929f,3.0,-0.8348,kinders jaarliks getoets
8,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0026.wav,1845f21a852cf56393083d2189174fc4,3.48,-0.9104,opportunistiese infeksies intree
9,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0028.wav,011e0f11570147f574a4cdd41491a8af,2.28,-0.7673,plaaslike hospitale sê


In [4]:
# (rows, columns) of the parsed training frame.
df.shape

(63131, 8)

In [8]:
# The parser stores XML attribute values as strings, so cast duration to float before doing arithmetic on it.
df['duration'] = df['duration'].astype(float)

In [10]:
# Sum of all training durations divided by 3600 (presumably seconds -> hours; confirm the unit of `duration`).
df['duration'].sum()/3600

53.7031

In [13]:
# Parse the test manifest with the same helper; note that this rebinds xml_file_path.
xml_file_path = 'nchlt_afr.tst.xml'
df_test = parse_xml_to_dataframe(xml_file_path)
# Show the (rows, columns) shape, then preview the first 10 test recordings.
print(df_test.shape)
df_test.head(10)

(3002, 8)


Unnamed: 0,speaker_id,speaker_age,speaker_gender,audio,md5sum,duration,pdp_score,transcription
0,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0004.wav,af3bbad361866af20c31022f27242d46,2.16,-0.2015,toerismewette
1,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0009.wav,5b0f18dc5dcee2c587a56cf23db3c8d1,4.08,-0.9231,voorwaardelike dranklisensie [s] toeken
2,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0016.wav,7c8f0186c32896397a51c0e584a20005,3.84,-0.8298,afal bestuursplan werksessies
3,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0017.wav,b4a2ac8e473935a687e78b74da7ba8fa,3.48,-0.4785,vuurwapen behoorlik weg
4,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0018.wav,b4ffa3b0f02ac5c0c61b11eadc1693b5,1.68,-0.2487,kultuur
5,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0019.wav,d1d73ae8eaa59da9358e9617ba0bf017,2.64,-0.4007,byvoeglike naamwoord plus
6,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0021.wav,60cf24f468a558f823d2cd202d01bd46,3.12,-0.2102,petisies aanlyn indien
7,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0022.wav,b3a64427c2ce52b21dfe40e19662db20,3.48,-0.7908,jong beroepslui meesters
8,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0023.wav,02fdac7d6fe568337ec3c0d647e663df,2.64,-0.4089,gebeur indien ek
9,500,19,male,nchlt_afr/audio/500/nchlt_afr_500m_0024.wav,a0fd03249ab072f1a288feb97e7324a7,3.12,-0.5097,volgens sekere kriteria


In [14]:
# Duration values come out of the parser as strings; cast to float so they can be summed.
df_test['duration'] = df_test['duration'].astype(float)
# Sum of all test durations divided by 3600 (presumably seconds -> hours; confirm the unit of `duration`).
df_test['duration'].sum()/3600

2.6647

In [15]:
# Distinct speaker ids present in the test frame.
df_test.speaker_id.unique()

array(['500', '501', '502', '503', '504', '505', '506', '507'],
      dtype=object)

Take a sample from the train data to set aside as a validation split, and keep the remaining data as the training set.

In [18]:
# Draw 3000 rows from the full training frame; the fixed random_state makes the sample reproducible.
# NOTE(review): rows are sampled without regard to speaker_id, so a validation speaker
# may also appear in the remaining training rows — confirm this is acceptable.
df_val = df.sample(3000, random_state=42)

In [21]:
# Sum of the validation durations divided by 3600 (presumably seconds -> hours; confirm the unit of `duration`).
df_val['duration'].sum()/3600

2.5568000000000004

In [22]:
# Training rows are those whose audio path was not drawn into the validation sample.
# The mask is a plain boolean array: True where the row stays in training.
mask = ~df['audio'].isin(df_val['audio']).to_numpy()

# Keep only the unmasked rows and renumber them from zero
df_train = df.loc[mask].reset_index(drop=True)

In [23]:
# (rows, columns) of the training split after removing the validation rows.
df_train.shape

(60131, 8)

In [24]:
# Preview the first 10 rows of the training split.
df_train.head(10)

Unnamed: 0,speaker_id,speaker_age,speaker_gender,audio,md5sum,duration,pdp_score,transcription
0,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0003.wav,2cc592cfd516e696357efd021e9226e7,3.96,-0.79,registrasiesertifikaat uitreik mits
1,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0004.wav,8f9737571c066560f5b4605e3ed3b3ba,2.64,-0.6827,dienste ten beste
2,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0006.wav,655a6c519ac261de5eb17b79e32156a0,5.04,-0.8597,sibw standaard inkomstebelasting
3,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0008.wav,daa046ae16623dddd3639d05e6266a04,3.24,-0.5397,vernaamste dienste verskaf
4,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0010.wav,13c56f34abe99b51c4350b28f1309864,5.88,-0.7039,voorbedryfsuitgawes ingevolge artikel
5,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0017.wav,f884ac03fb206ad7efef4476c4fdb2d9,2.4,-0.9158,ek daarmee moontlike
6,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0020.wav,75300f51ffb1892f68c4c19026190fdf,3.6,-0.8662,finansiële jaarverslag voorlê
7,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0024.wav,8749dcaa3c7483a539f02538f2e0929f,3.0,-0.8348,kinders jaarliks getoets
8,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0026.wav,1845f21a852cf56393083d2189174fc4,3.48,-0.9104,opportunistiese infeksies intree
9,1,19,male,nchlt_afr/audio/001/nchlt_afr_001m_0028.wav,011e0f11570147f574a4cdd41491a8af,2.28,-0.7673,plaaslike hospitale sê


In [26]:
from datasets import Dataset, DatasetDict
def create_dataset(filepaths, transcriptions):
    """Pair audio file paths with their transcriptions in a column mapping.

    Args:
        filepaths: Collection of audio file paths; stored under the "file" key.
        transcriptions: Collection of transcriptions; stored under the "text" key.

    Returns:
        dict with the keys "file" and "text" (in that order) holding the two
        arguments unchanged.
    """
    column_names = ("file", "text")
    column_values = (filepaths, transcriptions)
    return dict(zip(column_names, column_values))

In [27]:
# Build one Dataset per split from its audio-path and transcription columns
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict(create_dataset(split_df['audio'], split_df['transcription']))
    for split_df in (df_train, df_val, df_test)
)

# Bundle the three splits under their split names
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})


In [28]:
# Display the split names, features and row counts of the assembled DatasetDict.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['file', 'text'],
        num_rows: 60131
    })
    validation: Dataset({
        features: ['file', 'text'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['file', 'text'],
        num_rows: 3002
    })
})

In [29]:
# Write all three splits (train / validation / test) to the relative path "dataset_afr_only".
dataset_dict.save_to_disk("dataset_afr_only")

Saving the dataset (0/1 shards):   0%|          | 0/60131 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3002 [00:00<?, ? examples/s]