In [2]:
import xml.etree.ElementTree as ET
import gzip
import zipfile
import os
import shutil
import io
import pandas as pd
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import librosa
from pydub import AudioSegment

In [3]:
def resample_audio(file_path, target_sr=16000):
    audio, sr = librosa.load(file_path, sr=None)
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    return audio


def create_dataset(filepaths, transcriptions):        
    dataset_dict = {
        'audio': filepaths,
        'transcription': transcriptions
    }
    
    return dataset_dict


In [4]:
def extract_corpuslink_names(xml_file):
    with open(xml_file, 'r') as file:
        xml_content = file.read()

    # remove entity reference as it interferes with the parsing.
    xml_content = xml_content.replace('&cgnSessionsPrefix;', '')
    root = ET.fromstring(xml_content)
    
    # List to store the 'name' attributes
    names = []

    #Find the name attributes for each corpus link.
    corpus = root.find('{http://www.mpi.nl/IMDI/Schema/IMDI}Corpus')
    for corpuslink in corpus.findall('{http://www.mpi.nl/IMDI/Schema/IMDI}CorpusLink'):
        
        name = corpuslink.get('Name')
        if name is not None:
            names.append(name)

    return names

In [5]:
def copy_files_to_directory(file_list, dest_directory):
    # Check if destination directory exists
    if os.path.exists(dest_directory):
        # If it exists, delete it and recreate
        shutil.rmtree(dest_directory)
    
    # Create the destination directory
    os.makedirs(dest_directory)
    
    print("Starting Copying files to new directory.")
    for file_path in tqdm(file_list):
        # Get the base name of the file
        file_name = os.path.basename(file_path)
        
        # Construct the destination file path
        dest_file_path = os.path.join(dest_directory, file_name)
        
        # Copy the file to the destination directory
        shutil.copy2(file_path, dest_file_path)


In [6]:
def text_in_range(lst, v1, v2):
    """
    Return subset of list lst between (but not including) the indices of the last instance of value v1 and
    the first instance of value v2.
    """
    
    try:
        idx_1 = len(lst) - 1 - lst[::-1].index(v1)
        idx_2 = len(lst) - 1 - lst[::-1].index(v2) #lst.index(v2)
    except:
        print("Failed to find one of the indices")
        print(v1, v2)
        return []
    
    return lst[idx_1 + 1:idx_2]

def text_from_subset(subset):
    ignore_list = ['TextGrid', 'IntervalTier', 'BACKGROUND', 'COMMENT']
    frags = []
    for s in subset:
        if len(s) > 2 and s[0] == '"':
            if not s.replace('"', '') in ignore_list:
                frags.append(s.replace('"', ''))
    frags = ' '.join(frags)
    return frags

def annotation_to_split_transcripts(file_data, max_len=30):
    transcripts = []
    split_points = []
    
    #1. Collect all timestamps
    timestamps = []
    
    lines = file_data.split('\n')
    lines = lines[0:lines.index('"BACKGROUND"')] #Retain only the actual transcript data
    
    for line in lines:
        try:
            l = float(line)
            if len(line) > 3:
                timestamps.append(line) 
        except:
            pass
    
    #2. From all timestamps, select some to create ranges <30 seconds
    #Remove duplicats and sort from small to large
    timestamps = [x.zfill(7) for x in timestamps]
    timestamps = sorted(list(set(timestamps)))
    timestamps = [x.lstrip('0') for x in timestamps]
    
    tmp = []
    for i in range(len(timestamps)):
        if timestamps[i][0] == '.':
            tmp.append('0' + timestamps[i])
        else:
            tmp.append(timestamps[i])
    
    timestamps = tmp
    
    #print(timestamps)
    
    last_point = 0
    for i in range(len(timestamps)):
        if float(timestamps[i]) - last_point >= max_len: #Range is too big as of this step. Backtrack one step and use that as the split point.
            split_points.append(timestamps[i-1])
            last_point = float(timestamps[i-1])
    
    #3. Extract the text for each range
    
    #First range:
    subset = text_in_range(lines, timestamps[0], split_points[0])
    transcripts.append(text_from_subset(subset))
    #Middle ranges:
    for i in range(len(split_points) - 1):
        subset = text_in_range(lines, split_points[i], split_points[i+1])
        transcripts.append(text_from_subset(subset))
    #Final range:
    subset = text_in_range(lines, split_points[-1], timestamps[-1])
    transcripts.append(text_from_subset(subset))
    
    split_points = [float(x) for x in split_points]
    
    return transcripts, split_points


def split_audio(file_path, timestamps, output_folder='audio'):
    """
    Splits a wav file into multiple smaller files based on the given timestamps.
    
    :param file_path: Path to the input wav file.
    :param timestamps: List of timestamps (in seconds) where the file should be split.
    :param output_folder: Folder where the split files should be saved. 
    """
    # Convert timestamps to ms
    timestamps = [x*1000 for x in timestamps]
    
    # Load the audio file
    audio = AudioSegment.from_wav(file_path)

    # Initial and final timestamp (start of the audio and end of the audio)
    start_time = 0
    timestamps.append(len(audio))  # Add the end of the audio file as the final timestamp

    session_name = file_path.split('/')[-1][:-4]
    
    file_names = []
    
    # Split and export audio segments
    for i, end_time in enumerate(timestamps):
        if start_time < end_time:
            segment = audio[start_time:end_time]
            f_name = f"{output_folder}/{session_name}_{i+1}.wav"
            segment.export(f_name, format="wav")
            file_names.append(f_name)
            start_time = end_time
    
    return file_names

In [7]:
def get_names(corpus_file, audio_dir):
    #Extract the session names from the subcorpus
    names = extract_corpuslink_names(corpus_file)
    
    #From the subcorpus, only select those files that are in the relevant folder (e.g. read speech)
    files = os.listdir(audio_dir)
    rt_names = []
    for name in names:
        if name + ".wav" in files:
            rt_names.append(name)
    
    #Intermediate output
    print(f'Total amount of files in the subcorpus: {len(names)}')
    print(f'Total amount of files selected: {len(rt_names)}')
    
    return rt_names

def check_recording_length(names):
    #Load the recordings metadata to check the text is long enough
    df = pd.read_csv('data/meta/text/recordings.txt', sep='\t')
    df_sel = df[df['recordingID'].isin(names)]
    audio_len = df_sel.secCount.sum()/3600
    print(f'Total found audio data: {audio_len.round(1)}h')    
    

In [32]:
def create_dataset_from_cgn(corpus_file, audio_dir, transcript_dir, dataset_name = "my_dataset", max_len=30):
    #Get the session names from the subcorpus that are in the relevant folder
    rt_names = get_names(corpus_file, audio_dir)
    
    #Check how many hours of audio data there are.
    check_recording_length(rt_names)
    
    # List all .gz files in the transcription directory
    gz_files = [f for f in os.listdir(transcript_dir) if f.endswith('.gz')]

    #Set up audio directory
    audio_dir_out = 'audio'
    # Check if destination directory exists
    if os.path.exists(audio_dir_out):
        # If it exists, delete it and recreate
        shutil.rmtree(audio_dir_out)
    
    # Create the destination directory
    os.makedirs(audio_dir_out)
    
    
    audio_names = []
    transcript_list = []
    
    print('Start Chunking Files.')
    
    for gz_file in tqdm(gz_files):
        if gz_file[:-7] in rt_names: #We only need the selected files. 
            gz_file_path = os.path.join(transcript_dir, gz_file)

            # Open the .gz file
            with gzip.open(gz_file_path, 'rb') as f:
                # Read the contents of the file
                file_content = f.read()
                text_content = file_content.decode('ansi')
                text_content = text_content.replace('\r', '')
                
                #Split the audio file and transcript into smaller chunks
                transcripts, split_points = annotation_to_split_transcripts(text_content, max_len=max_len)
                split_file_names = split_audio(f'{audio_dir}/{gz_file[:-7]}.wav', split_points)
                
                audio_names.extend(split_file_names)
                transcript_list.extend(transcripts)
    
    print('Finished chunking Files.')
    
    #Finally, create and return a Dataset Object.    
    dataset = Dataset.from_dict(create_dataset(audio_names, transcript_list))
    
    dataset.save_to_disk(dataset_name)
    print("Dataset saved to disk")
    
    print("Zipping dataset and audio files")
    with zipfile.ZipFile(dataset_name + '.zip', 'w') as zipf:
        for root, _, files in os.walk('audio'):
            for file in files:
                zipf.write(os.path.join(root, file))
        for root, _, files in os.walk(dataset_name):
            for file in files:
                zipf.write(os.path.join(root, file))
    
    print("Removing audio folder and dataset file.")
    shutil.rmtree('audio')
    shutil.rmtree(dataset_name)
    print(f"Dataset has been successfully created. It can be found in {dataset_name}.zip")
    
    return dataset

In [33]:
corpus_file = 'data/meta/imdi/corpora/regN1A.imdi' #South Holland
audio_dir = 'data/audio/wav/comp-o/nl/'
transcript_dir = 'data/annot/text/ort/comp-o/nl'
dataset_name = "dataset_south_hollandic_v3"

dataset = create_dataset_from_cgn(corpus_file, audio_dir, transcript_dir, dataset_name=dataset_name, max_len=10)

Total amount of files in the subcorpus: 3611
Total amount of files selected: 182
Total found audio data: 19.4h
Start Chunking Files.


100%|████████████████████████████████████████████████████████████████████████████████| 561/561 [00:31<00:00, 17.70it/s]

Finished chunking Files.





Saving the dataset (0/1 shards):   0%|          | 0/7773 [00:00<?, ? examples/s]

Dataset saved to disk
Zipping dataset and audio files
Removing audio folder and dataset file.
Dataset has been successfully created. It can be found in dataset_south_hollandic_v3.zip


In [34]:
corpus_file = 'data/meta/imdi/corpora/regV3.imdi' #West-Flanders
audio_dir = 'data/audio/wav/comp-o/vl/'
transcript_dir = 'data/annot/text/ort/comp-o/vl'
dataset_name = "dataset_west_flemish_v2"

dataset = create_dataset_from_cgn(corpus_file, audio_dir, transcript_dir, dataset_name=dataset_name, max_len=10)

Total amount of files in the subcorpus: 1039
Total amount of files selected: 201
Total found audio data: 6.6h
Start Chunking Files.


100%|█████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:10<00:00, 110.00it/s]

Finished chunking Files.





Saving the dataset (0/1 shards):   0%|          | 0/2803 [00:00<?, ? examples/s]

Dataset saved to disk
Zipping dataset and audio files
Removing audio folder and dataset file.
Dataset has been successfully created. It can be found in dataset_west_flemish_v2.zip


In [35]:
corpus_file = 'data/meta/imdi/corpora/regV2.imdi' #East-Flanders
audio_dir = 'data/audio/wav/comp-o/vl/'
transcript_dir = 'data/annot/text/ort/comp-o/vl'
dataset_name = "dataset_east_flemish_v2"

dataset = create_dataset_from_cgn(corpus_file, audio_dir, transcript_dir, dataset_name=dataset_name, max_len=10)

Total amount of files in the subcorpus: 1179
Total amount of files selected: 215
Total found audio data: 6.8h
Start Chunking Files.


100%|█████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:11<00:00, 103.41it/s]

Finished chunking Files.





Saving the dataset (0/1 shards):   0%|          | 0/2903 [00:00<?, ? examples/s]

Dataset saved to disk
Zipping dataset and audio files
Removing audio folder and dataset file.
Dataset has been successfully created. It can be found in dataset_east_flemish_v2.zip


In [21]:
dataset

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 941
})

In [23]:
dataset[1]

{'audio': 'audio/fv800281_2.wav',
 'transcription': 'Muntes autoriteit en verantwoordelijkheid strekten zich uit over alle domeinen van het leven van zijn parochianen. een fanfare zonder zijn zegen was niet denkbaar. als pastoor voedde hij niet zozeer op tot een persoonlijke geloofshouding hij hield met gezag voor wat men als christen moest geloven en doen en zelf leefde hij dat ideaal voor. Peer Claes de broer van Ernest'}

In [28]:
with gzip.open('data/annot/text/ort/comp-o/nl/fn001011.ort.gz', 'rb') as f:
    # Read the contents of the file
    file_content = f.read()
    text_content = file_content.decode('ansi')
    text_content = text_content.replace('\r', '')

    #Split the audio file and transcript into smaller chunks
    transcripts, split_points = annotation_to_split_transcripts(text_content, max_len=10)

In [29]:
transcripts

['yâ*v Abî Al-Mohtaram waarde vader. ik ben het Karima. op het moment dat ik je dit schrijf',
 'zit ik veilig in Nederland en stort ik me volop in mijn zo lang begeerde studie. maak je maar niet ongerust over mij. ik kom er wel',
 'ook al is de weg geplaveid met hindernissen. Ba maak je ook maar geen zorgen om jezelf. je kunt me vertrouwen hoewel dat niet je sterkste kant is',
 'mij vertrouwen geven. ik zal je naam niet te schande maken. ik weet wat het rechte pad is. jouw eer',
 'noch die van oom Mansour zullen door mij bezoedeld worden. als je lasterpraatjes ter ore komen weet dan dat het slecht bedoelde geruchten zijn.',
 'achterklap en kwaadsprekerij zijn in de Marokkaanse gemeenschap geen zeldzaamheid dat weet jij net zo goed als ik. Ba',
 'nu ik na die twaalf jaar gevangenschap mijn vrijheid heb gekregen wil ik je vertellen wie ik was en hoe ik geworden ben wie ik ben. weet je nog Ba',
 'toen ik geboren werd in ons dorp Beni Touzine? dat dorpje hoog boven op een berg in de Rif wa

In [30]:
split_points

[9.634,
 19.147,
 29.075,
 37.851,
 47.825,
 56.286,
 64.894,
 74.868,
 83.579,
 92.356,
 101.934,
 110.766,
 119.801,
 129.646,
 139.136,
 148.924,
 158.566,
 167.167,
 175.805,
 185.512,
 195.449,
 204.547,
 213.968,
 222.828,
 232.07]

In [31]:
sp = [0] + split_points
sp2 = split_points + [0]

for i in range(len(sp)):
    print(sp2[i] - sp[i])

9.634
9.512999999999998
9.928
8.776
9.974000000000004
8.460999999999999
8.608000000000004
9.97399999999999
8.710999999999999
8.777000000000001
9.578000000000003
8.832000000000008
9.034999999999997
9.844999999999985
9.490000000000009
9.788000000000011
9.641999999999996
8.600999999999999
8.638000000000005
9.706999999999994
9.937000000000012
9.097999999999985
9.420999999999992
8.860000000000014
9.24199999999999
-232.07
