In [None]:
# Render matplotlib figures with the interactive ipympl (widget) backend.
%matplotlib ipympl

In [None]:
import os
from pathlib import Path
from glob import glob
import json

import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

import IPython.display as Ipd

import torch
import torchaudio
import pandas as pd

#b2aiprep is a library with various functions to load and process your files
import b2aiprep.process as b2p
import b2aiprep.demographics as b2dm

In [None]:
# Locate the corpus directory and read the full participant information file
data_path = Path.cwd() / 'bridge2ai-Voice' / 'bridge2ai-voice-corpus-1'
# data_path = Path.home().joinpath('data', 'bridge2ai', 'bridge2ai-voice-corpus-1')

df = b2dm.load_csv_file(data_path.joinpath('bridge2ai_voice_data.csv'))


def _load_instrument(instrument, count_label):
    """Extract one repeat instrument from ``df`` and report how many rows it has.

    Args:
        instrument: member of ``b2dm.RepeatInstrument`` selecting the set of columns.
        count_label: text printed in front of the row count.

    Returns:
        The data frame returned by ``b2dm.get_df_of_repeat_instrument``.
    """
    instrument_df = b2dm.get_df_of_repeat_instrument(df, instrument)
    print(count_label, len(instrument_df))
    return instrument_df


# One data frame per set of columns ("repeat instrument") in the CSV file.

# participants
participants_df = _load_instrument(b2dm.RepeatInstrument.PARTICIPANT, 'Number of participants:')

# session info
sessions_df = _load_instrument(b2dm.RepeatInstrument.SESSION, 'Number of sessions:')

# maps the subject id (record_id) to acoustic_task_id and acoustic_task_name
acoustic_tasks_df = _load_instrument(b2dm.RepeatInstrument.ACOUSTIC_TASK, 'Number of Acoustic Tasks:')

# recording info
recordings_df = _load_instrument(b2dm.RepeatInstrument.RECORDING, 'Number of Recordings:')

# demographics
generic_demographics_df = _load_instrument(b2dm.RepeatInstrument.GENERIC_DEMOGRAPHICS, 'Number of Demographics:')

# confounders
generic_confounders_df = _load_instrument(b2dm.RepeatInstrument.GENERIC_CONFOUNDERS, 'Number of Confounders:')

# PHQ-9 depression: individual question scores
phq9_df = _load_instrument(b2dm.RepeatInstrument.GENERIC_PHQ9_DEPRESSION, 'Number of PHQ9 entries:')

# GAD-7 anxiety: individual question scores
gad7_df = _load_instrument(b2dm.RepeatInstrument.GENERIC_GAD7_ANXIETY, 'Number of GAD7 entries:')

Each of the above variables is a pandas "DataFrame". The easiest way to preview these dataframes is to use the `.head()` method, which displays the first 5 rows.

In [None]:
# Preview the first five rows of the participant table.
participants_df.head()

In [None]:
# Count the distinct record_id values in the participant table.
participants_df['record_id'].nunique()

Below is an example of how you can create one Python object which has *all* of the information for an individual patient.
This is fairly verbose, but gives you a good idea of all the information available.

In [None]:
def _matching_records(frame, column, value):
    """Return the rows of ``frame`` whose ``column`` equals ``value``, as a list of dicts."""
    return frame[frame[column] == value].to_dict('records')


def _first_or_none(records):
    """Return the first item of ``records``, or None when the list is empty."""
    return records[0] if records else None


# Nested structure being built: participant -> sessions -> acoustic tasks -> recordings
participants = []

for participant in participants_df.to_dict('records'):
    participants.append(participant)
    participant['sessions'] = _matching_records(sessions_df, 'record_id', participant['record_id'])

    for session in participant['sessions']:
        session_id = session['session_id']

        # a session can hold several acoustic tasks ...
        session['acoustic_tasks'] = _matching_records(
            acoustic_tasks_df, 'acoustic_task_session_id', session_id
        )
        # ... and an acoustic task can hold several recordings
        for task in session['acoustic_tasks']:
            task['recordings'] = _matching_records(
                recordings_df, 'recording_acoustic_task_id', task['acoustic_task_id']
            )

        # at most one demographics entry is kept per session (None if there is none)
        session['generic_demographics'] = _first_or_none(
            _matching_records(generic_demographics_df, 'demographics_session_id', session_id)
        )
        # at most one confounders entry is kept per session (None if there is none)
        session['generic_confounders'] = _first_or_none(
            _matching_records(generic_confounders_df, 'confounders_session_id', session_id)
        )

# Show everything collected for the first participant
print(json.dumps(participants[0], indent=2))

Alternatively, we can look across all values for a field such as `age`.

In [None]:
# List every distinct value that appears in the 'age' column of the participant data frame
participants_df['age'].unique()

Note that not all of these are numbers! For example, `'90 or older'` is a string.

## Acoustic tasks

Let's look at the acoustic tasks dataframe.

In [None]:
# Preview the first five rows of the acoustic task table.
acoustic_tasks_df.head()

Each row in the above corresponds to a different acoustic task: an audio check, prolonged vowels, etc. The `value_counts()` method of a pandas Series (here, a single DataFrame column) counts how often each unique value occurs in that column.

In [None]:
# Count how many rows there are for each acoustic task name.
acoustic_tasks_df['acoustic_task_name'].value_counts()

Let's grab the rainbow passage for the first `record_id` in the dataset, and see if we can load in the corresponding audio / spectrogram data.

In [None]:
# Work with the participant that appears in the first row of the acoustic task table
record_id = acoustic_tasks_df['record_id'].values[0]

# Boolean mask that picks out this participant's 'Rainbow Passage' row(s)
is_this_participant = acoustic_tasks_df['record_id'] == record_id
is_rainbow_passage = acoustic_tasks_df['acoustic_task_name'] == 'Rainbow Passage'
idx = is_this_participant & is_rainbow_passage

rainbow_rows = acoustic_tasks_df.loc[idx]
display(rainbow_rows)

# .values[0] takes the first matching value (this assumes at least one row matched)
acoustic_task_session_id = rainbow_rows['acoustic_task_session_id'].values[0]
acoustic_task_id = rainbow_rows['acoustic_task_id'].values[0]

It's useful to disambiguate a few of these columns:

- `record_id`: a unique identifier for each participant
- `recording_id`: a unique identifier for each audio recording (and subsequently each audio spectrogram)
- `acoustic_task_session_id`: the identifier of the session in which the acoustic task was recorded.
- `acoustic_task_id`: a unique identifier for each acoustic task *for* each acoustic session.

This is a great opportunity to take a look at the data dictionary which has all of this information.

Now that we know the acoustic task identifier, we can look up the `recording_id` associated with this acoustic task.

In [None]:
# Boolean mask of the recordings that belong to the selected acoustic task
idx = recordings_df['recording_acoustic_task_id'] == acoustic_task_id

task_recordings = recordings_df.loc[idx]
display(task_recordings)

# Keep the ID of the first matching recording
recording_id = task_recordings['recording_id'].values[0]

This is very tedious! We've written a function to get recording IDs for a given task.

In [None]:
# Use the b2aiprep helper to look up the recordings of the 'Rainbow Passage' task,
# then preview the first rows of the result.
recordings = b2dm.get_recordings_for_acoustic_task(df, acoustic_task='Rainbow Passage')
recordings.head()

### Audio data

The audio data has been processed into a PyTorch file, with the following dictionary keys:

- 'specgram'
- 'melfilterbank'
- 'mfcc'
- 'opensmile'
- 'sample_rate'
- 'checksum'
- 'transcription'

The audio data is stored in a subfolder, "data". Let's build the path to it:

In [None]:
# Directory, inside the corpus folder, that holds the per-recording feature files
audio_path = data_path / 'data'

In [None]:
# NOTE: this hard-coded ID replaces the recording_id that was looked up from the tables earlier.
recording_id = '529F2F35-ECC1-42EB-81F0-5D28E0CE4E75'

# Reuse audio_path instead of rebuilding the same 'data' directory path.
# torch.load unpickles the file, so only load feature files that come from a trusted source.
features = torch.load(audio_path / f"{recording_id}_features.pt")

# Show which features are stored for this recording
features.keys()

In [None]:
# Plot the spectrogram of the loaded recording on a log10 scale
log_specgram = torch.log10(features['specgram'].T)

fig, axs = plt.subplots()
b2p.plot_spectrogram(log_specgram, ax=axs)
fig.tight_layout()

We can also reconstruct the audio for the spectrogram, but note that the spectrogram has been modified to protect privacy, so the reconstructed audio sounds a bit unusual!

In [None]:
# Window and hop durations, in milliseconds, used for the Griffin-Lim inversion
WINDOW_MS = 25
HOP_MS = 15

specgram = features['specgram']
sr = features['sample_rate']

# Derive the FFT size from the size of axis 1 of the stored spectrogram
# (assumes specgram is laid out as (time, frequency) -- TODO confirm)
n_fft = 2 * (specgram.shape[1] - 1)

# Convert the durations from milliseconds to a number of samples
win_length = int(sr * WINDOW_MS / 1000)
hop_length = int(sr * HOP_MS / 1000)

griffin_lim = torchaudio.transforms.GriffinLim(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    power=2,
)
reconstructed_waveform = griffin_lim(specgram.T)

# NOTE(review): playback uses 15/25 of the stored sample rate; the reason is not
# stated in this notebook -- confirm before changing.
playback_rate = sr * 15 / 25
Ipd.display(Ipd.Audio(data=reconstructed_waveform, rate=playback_rate))

We have provided a helper function to load in all the spectrograms for the above Rainbow Passage task.

In [None]:
# Load the 'specgram' feature for the recordings found above, reading from audio_path.
# The result's keys are used as recording IDs when plotting (presumably a dict -- confirm against b2aiprep).
spectrograms = b2dm.load_features_for_recordings(recordings, audio_path, 'specgram')

You can modify the index below (`i = 0`) to see different spectrograms.

In [None]:
# Plot the spectrogram of the i-th loaded recording on a log10 scale
i = 0
recording_id = list(spectrograms.keys())[i]
log_specgram = torch.log10(spectrograms[recording_id].T)

fig, axs = plt.subplots()
b2p.plot_spectrogram(log_specgram, ax=axs)
fig.tight_layout()