# Access the CDLI datasets

## Imports and Login to HF

In [None]:
import datasets
from huggingface_hub import hf_hub_download
from IPython.display import Audio, display
import pandas as pd
import itables

In [None]:
from getpass import getpass

from huggingface_hub import login, whoami

# Ask for the access token without echoing it: with plain input() the secret
# ends up in the cell output and is stored with the notebook when it is saved
# or shared.
# Surrounding whitespace (easy to pick up when copy-pasting) is stripped.
HF_TOKEN = getpass("Hugging Face access token: ").strip()
login(token=HF_TOKEN)

In [None]:
# Sanity check for the login above: show the 'fullname' field of the
# account information returned by whoami().
account_info = whoami()
account_info["fullname"]

## Load the datasets

While the datasets are public, you first need to request access (and accept the Terms & Conditions). Once access has been granted, you can load the data here; otherwise the loading cells below will raise an error.

Dataset sites:

* https://huggingface.co/datasets/cdli/kenyan_swahili_nonstandard_speech_v0
* https://huggingface.co/datasets/cdli/kenyan_english_nonstandard_speech_v0


### Audio Samples

* also called "utterances"

In [None]:
# Repository id of the English dataset; reused further down to fetch the
# speaker metadata file from the same repository.
dataset_name = "cdli/kenyan_english_nonstandard_speech_v0"

# Load the 'test' split as a regular (non-streaming) dataset.
ds_en = datasets.load_dataset(
    path=dataset_name,
    split="test",
    streaming=False,
)

# Display the dataset summary as the cell output.
ds_en

#### Inspect individual samples

* shows all metadata
* also has the audio (name of file, sampling rate, loaded into array)
* you can filter ...

In [None]:
# Show the first utterance of the loaded split: its metadata fields together
# with the audio entry (displayed as the cell output).
ds_en[0]

In [None]:
# Get the data of speaker 1.
# Only the 'speaker_id' column is handed to the predicate (input_columns), so
# the filter does not have to build the full example, including the audio,
# for every row just to compare an id.
ds_en_speaker1 = ds_en.filter(
    lambda candidate_id: candidate_id == 'KES001',
    input_columns=['speaker_id'],
)

# Number of utterances recorded for this speaker.
len(ds_en_speaker1)


#### Play Audio

In [None]:
# Get the audio and transcript of the speaker's first utterance.
# The row is fetched once and reused; indexing the dataset separately for
# each field would load the same example (and its audio) three times.
first_sample = ds_en_speaker1[0]
audio_data = first_sample['audio']['array']
sample_rate = first_sample['audio']['sampling_rate']
print('True transcript:', first_sample['transcription'])

# As the last expression of the cell, this renders an audio player.
Audio(audio_data, rate=sample_rate)


### Speaker-Level Metadata

In addition to the utterances along with their metadata, we also have some additional information on the speakers, which we can share.
This information will be helpful later when evaluating and analyzing the data, hence you need to know how to use it.

You can see the metadata as a separate file ('speaker_metadata.csv') on the HuggingFace website, e.g. here:
https://huggingface.co/datasets/cdli/kenyan_english_nonstandard_speech_v0/tree/main

In [None]:
# Fetch 'speaker_metadata.csv' from the dataset repository; hf_hub_download
# returns the path of the downloaded file.
metadata_file_path = hf_hub_download(
    repo_id=dataset_name,
    filename="speaker_metadata.csv",
    repo_type="dataset",
)

# Read the speaker-level metadata into a DataFrame.
metadata_df = pd.read_csv(metadata_file_path)


#### Individual Fields

In [None]:
# Number of speakers per gender.
# value_counts() returns integer counts, so the former .round(2) had no
# effect; pass normalize=True if proportions are wanted instead.
metadata_df['gender'].value_counts()

In [None]:
# Number of speakers per severity level of speech impairment.
metadata_df['severity_speech_impairment'].value_counts()

In [None]:
# Number of speakers per etiology.
# value_counts() returns integer counts, so the former .round(2) had no
# effect; pass normalize=True if proportions are wanted instead.
metadata_df['etiology'].value_counts()

In [None]:
# Number of speakers per type of nonstandard speech.
# value_counts() returns integer counts, so the former .round(2) had no
# effect; pass normalize=True if proportions are wanted instead.
metadata_df['type_nonstandard_speech'].value_counts()

#### Further analysis

In [None]:

# Render the speaker metadata as an interactive table with the column
# filters placed in the header.
itables.show(
    metadata_df,
    column_filters="header",
)

# Example: 

* let's find a speaker with severe speech impairment whose speech disorder is not stuttering (below we pick dysarthria)
* filter for their data and listen to a few examples
* what are they saying? Do you understand them?

In [None]:
# Speakers whose impairment is rated severe and whose type of nonstandard
# speech is dysarthria.
is_severe = metadata_df['severity_speech_impairment'] == 'Severe (frequent breakdowns)'
is_dysarthria = metadata_df['type_nonstandard_speech'] == 'Dysarthria'
metadata_df[is_severe & is_dysarthria]

In [None]:
# Speaker picked from the metadata selection above.
speaker_id = 'KES003'

# Keep only this speaker's utterances. Only the 'speaker_id' column is handed
# to the predicate (input_columns), so the filter does not have to build the
# full example, including the audio, for every row just to compare an id.
ds_en_selected_speaker = ds_en.filter(
    lambda candidate_id: candidate_id == speaker_id,
    input_columns=['speaker_id'],
)

# Number of utterances recorded for this speaker.
len(ds_en_selected_speaker)



In [None]:
# Print the transcript and play the audio of the first few utterances of the
# selected speaker.

# Cap at the number of available utterances so that a speaker with fewer than
# three recordings does not raise an IndexError.
num_examples = min(3, len(ds_en_selected_speaker))

for i in range(num_examples):
    print('>> next example')
    # Fetch the row once per iteration instead of once per field.
    sample = ds_en_selected_speaker[i]
    audio_data = sample['audio']['array']
    sample_rate = sample['audio']['sampling_rate']
    print('True transcript:', sample['transcription'])
    # display() is needed inside a loop; only a cell's last expression is
    # rendered automatically.
    display(Audio(audio_data, rate=sample_rate))