# Access the CDLI datasets

## Imports and Login to HF

In [2]:
import datasets
from huggingface_hub import hf_hub_download
from IPython.display import Audio, display
import pandas as pd
import itables

In [3]:
from huggingface_hub import login, whoami
HF_TOKEN = input()
login(token=HF_TOKEN)

In [4]:
whoami()['fullname']

'Samuel Wanyua'

## Load the datasets

While the datasets are public, you need to request access (and accept the Terms & Conditions). When done, you will be able to load the data here (otherwise this will cause an error).

Dataset sites:

* https://huggingface.co/datasets/cdli/kenyan_swahili_nonstandard_speech_v0
* https://huggingface.co/datasets/cdli/kenyan_english_nonstandard_speech_v0


### Audio Samples

* also called "utterances"

In [5]:
dataset_name = 'cdli/kenyan_english_nonstandard_speech_v0'
ds_en = datasets.load_dataset(dataset_name, split='test', streaming=False)
ds_en

README.md:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

data/test-00000-of-00004.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

data/test-00001-of-00004.parquet:   0%|          | 0.00/293M [00:00<?, ?B/s]

data/test-00002-of-00004.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

data/test-00003-of-00004.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1698 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'audio_id', 'speaker_id', 'language', 'prompt_type', 'prompt_id', 'recording_data', 'recording_time', 'recording_environment', 'recording_device', 'transcription', 'audio_length', 'transcript_length'],
    num_rows: 1698
})

#### Inspect individual samples

* shows all metadata
* also has the audio (name of file, sampling rate, loaded into array)
* you can filter ...

In [8]:
ds_en[0]

{'audio': {'path': 'KES001_EN001.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          6.10351562e-05, -9.15527344e-05,  9.15527344e-05], shape=(431616,)),
  'sampling_rate': 16000},
 'audio_id': 'KES001_EN001',
 'speaker_id': 'KES001',
 'language': 'KENG',
 'prompt_type': 'Picture Prompt',
 'prompt_id': 'IM_001',
 'recording_data': '5/8/2025',
 'recording_time': '8:55:00 am',
 'recording_environment': 'Indoor with low Background Noise',
 'recording_device': 'Dji mic wireless microphone',
 'transcription': 'Like  a four star five star? Restaurant an appetizer menu serving  prawns and medium rare steak with aaa  sauce with nice presentation ',
 'audio_length': 26.976,
 'transcript_length': 22}

In [12]:
# get speaker 1 and speaker 2 data as an example
ds_en_speaker1 = ds_en.filter(lambda example: example['speaker_id']=='KES001')
ds_en_speaker2 = ds_en.filter(lambda example: example['speaker_id'] == 'KES002')
print(f"speaker 1: {len(ds_en_speaker1)}\nspeaker 2 : {len(ds_en_speaker2)}")
# len(ds_en_speaker1)


speaker 1: 104
speaker 2 : 106


#### Play Audio

In [13]:
# get audio file and transcript
audio_data = ds_en_speaker1[0]['audio']['array']
sample_rate = ds_en_speaker1[0]['audio']['sampling_rate']
print('True transcript:', ds_en_speaker1[0]['transcription'])
Audio(audio_data, rate=sample_rate)


True transcript: Like  a four star five star? Restaurant an appetizer menu serving  prawns and medium rare steak with aaa  sauce with nice presentation 


### Speaker-Level Metadata

In addition to the utterances along with their metadata, we also have some additional information on the speakers, which we can share.
This information will be helpful later when evaluating and analyzing the data, hence you need to know how to use it.

You can see the metadata as a separate file ('speaker_metadata.csv') on the HuggingFace website, eg see here:
https://huggingface.co/datasets/cdli/kenyan_english_nonstandard_speech_v0/tree/main

In [14]:
metadata_file_path = hf_hub_download(
        repo_id=dataset_name,
        filename="speaker_metadata.csv",
        repo_type="dataset"
    )
metadata_df = pd.read_csv(metadata_file_path)


speaker_metadata.csv:   0%|          | 0.00/5.46k [00:00<?, ?B/s]

#### Individual Fields

In [19]:
metadata_df

Unnamed: 0,speaker_id,gender,age,severity_speech_impairment,type_nonstandard_speech,etiology
0,KES001,Female,30-40,Moderate (requires effort to understand),Dysarthria,Cerebral Palsy
1,KES002,Female,30-40,Severe (frequent breakdowns),Dysarthria,Cerebral Palsy
2,KES003,Male,25-30,Profound (communication very difficult or impo...,Stuttering (Disfluency Disorders),Cerebral Palsy
3,KES004,Male,25-30,Severe (frequent breakdowns),Stuttering (Disfluency Disorders),Neurological disorder
4,KES005,Male,18-24,Moderate (requires effort to understand),Stuttering (Disfluency Disorders),Neurological disorder
5,KES006,Male,30-40,Mild (easily understood with minimal effort),Articulation Disorders,Cerebral Palsy
6,KES008,Male,18-24,Moderate (requires effort to understand),"Stuttering (Disfluency Disorders),",Autism Spectrum Disorder (ASD)
7,KES012,Female,30-40,Moderate (requires effort to understand),Stuttering (Disfluency Disorders),Neurodevelopmental disorder
8,KES015,Male,30-40,Profound (communication very difficult or impo...,"Dysarthria, Stuttering (Disfluency Disorders)",Cerebral Palsy
9,KES018,Male,30-40,Mild (easily understood with minimal effort),Fluency or Posody,Multiple Sclerosis (MS)


In [21]:
print(f"Types of Non-standard speech: {metadata_df['type_nonstandard_speech'].unique()}\n\nEtiology types: {metadata_df['etiology'].unique()}")

Types of Non-standard speech: ['Dysarthria' 'Stuttering (Disfluency Disorders)' 'Articulation Disorders'
 'Stuttering (Disfluency Disorders), '
 'Dysarthria, Stuttering (Disfluency Disorders)' 'Fluency or Posody'
 'Stuttering (Disfluency Disorders), Articulation Disorders'
 'Fluency or Posody, Dysarthria' 'breathy voice' nan 'Breathy voice']

Etiology types: ['Cerebral Palsy' 'Neurological disorder' 'Autism Spectrum Disorder (ASD)'
 'Neurodevelopmental disorder' 'Multiple Sclerosis (MS)'
 'Parkinson’s Disease' 'Down Syndrome' 'Neurodevelopmental' 'Stroke'
 'Neurodevelopmental disorder ']


In [15]:
metadata_df.gender.value_counts().round(2)

gender
Male      35
Female    17
Name: count, dtype: int64

In [16]:
metadata_df.severity_speech_impairment.value_counts()

severity_speech_impairment
Moderate (requires effort to understand)                 24
Mild (easily understood with minimal effort)             18
Severe (frequent breakdowns)                              7
Profound (communication very difficult or impossible)     2
Typical Speech (No noticeable impairment)                 1
Name: count, dtype: int64

In [22]:
metadata_df.etiology.value_counts().round(2)

etiology
Cerebral Palsy                    25
Neurodevelopmental disorder       12
Parkinson’s Disease                4
Neurodevelopmental                 3
Neurological disorder              2
Multiple Sclerosis (MS)            2
Autism Spectrum Disorder (ASD)     1
Down Syndrome                      1
Stroke                             1
Neurodevelopmental disorder        1
Name: count, dtype: int64

In [23]:
metadata_df.type_nonstandard_speech.value_counts().round(2)

type_nonstandard_speech
Dysarthria                                                   21
Stuttering (Disfluency Disorders)                            19
Dysarthria, Stuttering (Disfluency Disorders)                 3
Fluency or Posody                                             2
Stuttering (Disfluency Disorders),                            1
Articulation Disorders                                        1
Stuttering (Disfluency Disorders), Articulation Disorders     1
Fluency or Posody, Dysarthria                                 1
breathy voice                                                 1
Breathy voice                                                 1
Name: count, dtype: int64

#### Further analysis

In [24]:

itables.show(metadata_df, column_filters="header",)

0
Loading ITables v2.5.2 from the internet...  (need help?)


# Example: 

* lets find a speaker with severe speech impairment and speech disorder not being Stuttering
* filter for their data and listen to a few examples
* what are they saying ? Do you understand them ?

In [25]:
metadata_df[(metadata_df.severity_speech_impairment=='Severe (frequent breakdowns)') & (metadata_df.type_nonstandard_speech=='Dysarthria')]

Unnamed: 0,speaker_id,gender,age,severity_speech_impairment,type_nonstandard_speech,etiology
1,KES002,Female,30-40,Severe (frequent breakdowns),Dysarthria,Cerebral Palsy
37,KES016,Female,18-24,Severe (frequent breakdowns),Dysarthria,Cerebral Palsy
39,KES007,Male,30-40,Severe (frequent breakdowns),Dysarthria,Cerebral Palsy
44,KES033,Female,30-40,Severe (frequent breakdowns),Dysarthria,Cerebral Palsy
51,KES052,Female,30-40,Severe (frequent breakdowns),Dysarthria,Cerebral Palsy


In [26]:
speaker_id = 'KES003'
ds_en_selected_speaker = ds_en.filter(lambda example: example['speaker_id']==speaker_id)
len(ds_en_selected_speaker)



Filter:   0%|          | 0/1698 [00:00<?, ? examples/s]

16

In [None]:
# get audio file and transcript

for i in range(0,3):
    print('>> next example')
    audio_data = ds_en_selected_speaker[i]['audio']['array']
    sample_rate = ds_en_selected_speaker[i]['audio']['sampling_rate']
    print('True transcript:', ds_en_selected_speaker[i]['transcription'])
    display(Audio(audio_data, rate=sample_rate))