Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dashboard #55

Merged
merged 21 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
99c6c2e
init streamlit dashboard
alistairewj Jun 10, 2024
60d3742
add number of individuals with non-null for first q to print statement
alistairewj Jun 14, 2024
34fce26
add vscode hidden folder to ignore
alistairewj Jun 14, 2024
4fd82d9
move script into dashboard
alistairewj Jun 17, 2024
dc17ef3
format, fix memory consumption, load data dictionary from GitHub repo
alistairewj Jun 17, 2024
89e3805
add cli argument to call streamlit (using an admittedly unsupported API)
alistairewj Jun 17, 2024
1d596bd
add bids_dir as input arg to dashboard
alistairewj Jun 17, 2024
eca1063
add arg based bids loading, fix up search and demographics pages
alistairewj Jun 17, 2024
1418252
reorder tabs
alistairewj Jun 17, 2024
436907f
init streamlit dashboard
alistairewj Jun 10, 2024
eadf065
add number of individuals with non-null for first q to print statement
alistairewj Jun 14, 2024
3090da1
add vscode hidden folder to ignore
alistairewj Jun 14, 2024
9c5439c
move script into dashboard
alistairewj Jun 17, 2024
00d8de5
format, fix memory consumption, load data dictionary from GitHub repo
alistairewj Jun 17, 2024
73a2a13
add cli argument to call streamlit (using an admittedly unsupported API)
alistairewj Jun 17, 2024
36baf27
add bids_dir as input arg to dashboard
alistairewj Jun 17, 2024
9af1bca
add arg based bids loading, fix up search and demographics pages
alistairewj Jun 17, 2024
636f189
reorder tabs
alistairewj Jun 17, 2024
c292b0d
Merge branch 'alistair/streamlit_dashboard' of https://github.com/sen…
ibevers Jun 24, 2024
8465697
Fix demographics not finding path bug
ibevers Jun 24, 2024
b7d82e9
Fix SSL certificate bug
ibevers Jun 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# vscode configuration
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,10 @@ b2aiprep-cli prepsummerdata \
[desired output path for .tar file]
```

## Streamlit dashboard

A dashboard is provided to help navigate the data in the BIDS format. Launch the dashboard from the repository folder, passing the folder that contains the BIDS data:

```sh
streamlit run src/b2aiprep/app/Dashboard.py [path to BIDS data folder]
```
94 changes: 0 additions & 94 deletions docs/semantic-search-streamlit.py

This file was deleted.

130 changes: 130 additions & 0 deletions src/b2aiprep/app/Dashboard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import argparse
from pathlib import Path
import sys

import streamlit as st
import pandas as pd
import numpy as np
import altair as alt

from b2aiprep.dataset import VBAIDataset

def parse_args(args):
    """Parse the dashboard's command-line arguments.

    Args:
        args: Argument strings without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        An ``argparse.Namespace`` whose ``bids_dir`` attribute is the folder
        with the BIDS data, or ``'output'`` when no folder is given.
    """
    # The first positional parameter of ArgumentParser is ``prog``; the text has
    # to be passed as ``description`` to be shown as the help description.
    parser = argparse.ArgumentParser(description='Dashboard for audio data in BIDS format.')
    # nargs='?' makes the positional optional; without it argparse treats the
    # argument as required and never falls back to ``default``.
    parser.add_argument(
        'bids_dir',
        nargs='?',
        default='output',
        help='Folder with the BIDS data',
    )
    return parser.parse_args(args)

# Resolve the BIDS folder named on the command line and fail fast when it is absent.
bids_dir = Path(parse_args(sys.argv[1:]).bids_dir).resolve()
if not bids_dir.exists():
    raise ValueError(f"Folder {bids_dir} does not exist.")

# Store the folder in the session state only once; the Demographics page reads
# st.session_state.bids_dir to build its own dataset.
if 'bids_dir' not in st.session_state:
    st.session_state.bids_dir = bids_dir.as_posix()

dataset = VBAIDataset(st.session_state.bids_dir)

# Browser tab title and icon for the landing page.
st.set_page_config(
    page_title="b2ai voice",
    page_icon="👋",
)

st.write("# Bridge2AI Voice Data Dashboard")

# Hint shown in the sidebar; presumably refers to the page navigation that
# Streamlit lists there for the scripts in the pages/ folder.
st.sidebar.success("Choose an option above.")

# NOTE(review): the text below talks about providing a path "below", but this
# script takes the BIDS folder from the command line (see parse_args) and
# renders no path input - confirm the wording is still intended.
st.markdown(
    """
This dashboard allows you to explore the voice data collected by Bridge2AI.

You should first load in the data below by providing a path to the BIDS-like
formatted data folder. Once you've done that, you can explore the data.

"""
)

# Heading for the session summary that follows.
st.markdown(
    """## Session information"""
)

# every user has a sessionschema which we can get info for the users from
df = dataset.load_and_pivot_questionnaire('sessionschema')
n_sessions = df.shape[0]
n_subjects = df['record_id'].nunique()

# Count the sessions of each subject once and derive every per-subject
# statistic from that single result instead of regrouping for each number.
sessions_per_subject = df.groupby('record_id').size()
n_subj_gt_1 = sessions_per_subject.gt(1).sum()
# With no subjects the maximum and the share would otherwise come out as NaN.
max_num_sessions = sessions_per_subject.max() if n_subjects else 0
frac_subj_gt_1 = n_subj_gt_1 / n_subjects if n_subjects else 0.0

df_recordings = dataset.load_and_pivot_questionnaire('recordingschema')
n_recordings = df_recordings.shape[0]

st.write(
    f"""
* {n_subjects} subjects have participated.
* {n_sessions} sessions have been conducted.
* {n_subj_gt_1} subjects ({frac_subj_gt_1:1.2%}) have more than one session (max = {max_num_sessions}).
* {n_recordings} recordings have been made.
"""
)

def _site_bar_chart(counts, value_label):
    """Return a 600x400 bar chart of per-site counts.

    ``counts`` is a Series indexed by site; ``value_label`` names the count
    column and is used as the x-axis field.
    """
    table = counts.reset_index(name='count')
    table.columns = ['Site', value_label]
    bars = alt.Chart(table).mark_bar().encode(x=value_label, y='Site')
    return bars.properties(width=600, height=400)


st.write(
    """
## Subjects by site
"""
)
# one row per (subject, site) pair so that repeat sessions are not double counted
subject_site_pairs = df[['record_id', 'session_site']].drop_duplicates()
subjects_per_site = subject_site_pairs.groupby('session_site').size()
st.altair_chart(_site_bar_chart(subjects_per_site, 'Number of participants'))

st.write(
    """
## Sessions by site
"""
)

# every row of df is counted here, i.e. all sessions recorded at the site
sessions_per_site = df.groupby('session_site').size()
st.altair_chart(_site_bar_chart(sessions_per_site, 'Number of sessions'))

# Dividing by 3600 expresses the duration in hours (presumably it is stored in
# seconds - confirm against the session schema); rows without a value are dropped.
duration_hours = (df['session_duration'].astype(float) / 3600.0).dropna()

st.write(
    """
## Session durations
"""
)

# altair histogram of session durations
duration_hist = (
    alt.Chart(duration_hours.reset_index())
    .mark_bar()
    .encode(
        alt.X("session_duration:Q", bin=alt.Bin(maxbins=20), title="Session duration (hours)"),
        y='count()',
    )
    .properties(width=600, height=400)
)

st.altair_chart(duration_hist)
118 changes: 118 additions & 0 deletions src/b2aiprep/app/pages/1_Demographics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import streamlit as st
import pandas as pd
import altair as alt

from b2aiprep.dataset import VBAIDataset

# Browser tab title and icon for the Demographics page.
st.set_page_config(page_title="Demographics", page_icon="📈")

# Page title, sidebar header and a one-line description of the page.
st.markdown("# Demographics")
st.sidebar.header("Demographics")
st.write(
    """This page overviews the demographics of the dataset."""
)

def get_bids_data():
    """Return a VBAIDataset for the folder stored in ``st.session_state.bids_dir``."""
    # TODO: allow user to specify input folder
    return VBAIDataset(st.session_state.bids_dir)

schema_name = 'qgenericdemographicsschema'
dataset = get_bids_data()
df = dataset.load_and_pivot_questionnaire(schema_name)

# st.markdown("## Age Distribution")
# st.write(df['age'].describe())

# Sections in display order: (heading, column name or column-name fragment,
# whether the answer is spread over several columns sharing that fragment).
chart_sections = [
    ("Gender Identity", 'gender_identity', False),
    ("Sexual Orientation", 'sexual_orientation', False),
    ("Race", 'race___', True),
    ("Ethnicity", 'ethnicity', False),
    ("Marital Status", 'marital_status___', True),
    ("Employment Status", 'employ_status___', True),
]

for heading, key, spans_columns in chart_sections:
    st.markdown(f"## {heading}")
    if spans_columns:
        # sum every column whose name contains the fragment (presumably one
        # indicator column per answer option - confirm against the schema)
        matching_columns = [name for name in df.columns if key in name]
        section_counts = df[matching_columns].sum()
    else:
        section_counts = df[key].value_counts()
    st.bar_chart(section_counts)

# we need to do some harmonization of the USA / CA salaries
st.markdown("## Household Income")
income_cols = ['household_income_usa', 'household_income_ca']
income = df[income_cols].copy()

# Parse the upper bound of each income bracket. The pattern is anchored at the
# end of the answer, so it captures the *last* dollar amount (which is the only
# one when the answer holds a single amount).
for col in income_cols:
    upper_text = income[col].str.extract(r'\$(\d+,\d+)\s*$', expand=False)
    income[f'{col}_upper'] = pd.to_numeric(upper_text.str.replace(',', ''), errors='coerce')

# Rank the bounds over BOTH countries at once. Ranking each column separately
# numbers the brackets by whichever answers happen to be present, so the same
# bracket could end up with different sequence numbers for the USA and Canada.
all_bounds = pd.concat([income[f'{col}_upper'] for col in income_cols]).dropna()
bound_to_seq = {bound: seq for seq, bound in enumerate(sorted(all_bounds.unique()), start=1)}

# sequence number -> answer text, collected from both countries
income_labels = {}
for col in income_cols:
    # the sequence number grows with the bracket; -1 marks a missing or unparsed answer
    seq_num = income[f'{col}_upper'].map(bound_to_seq).fillna(-1).astype(int)
    # "Prefer not to answer" gets its own slot (0) below every real bracket
    declined = income[col].str.contains('Prefer not to answer', na=False)
    seq_num = seq_num.mask(declined, 0)
    income[f'{col}_seq_num'] = seq_num

    ranked = seq_num >= 0
    for seq, label in zip(seq_num[ranked], income.loc[ranked, col]):
        income_labels.setdefault(seq, label)

income['seq_num'] = income[[f'{col}_seq_num' for col in income_cols]].max(axis=1)

income['country'] = 'Missing'
income.loc[income['household_income_usa'].notnull(), 'country'] = 'USA'
income.loc[income['household_income_ca'].notnull(), 'country'] = 'Canada'

income_grouped = pd.crosstab(income['seq_num'], income['country'])
n_missing = (income['seq_num'] == -1).sum()
# crosstab sorts its index, so this lists the labels from the lowest bracket up
bracket_order = [income_labels[seq] for seq in income_grouped.index if seq in income_labels]
income_grouped.index = income_grouped.index.map(income_labels)
# crosstab only creates columns for countries present in the data; reindex so
# a dataset without USA or Canadian answers charts zeros instead of raising KeyError
income_grouped = income_grouped.reindex(columns=['USA', 'Canada'], fill_value=0)
income_grouped.index.name = 'Household Income (CAD or USD)'
# st.write(income_grouped)

# bar chart of the counts per bracket, coloured by country
income_grouped = income_grouped.reset_index()
income_grouped = income_grouped.melt(
    id_vars='Household Income (CAD or USD)',
    var_name='Country',
    value_name='Count',
)
chart = (
    alt.Chart(income_grouped)
    .mark_bar()
    .encode(
        # an ordinal axis sorts its labels as text by default; pass the bracket
        # order explicitly so the bars run from the lowest to the highest income
        x=alt.X(
            'Household Income (CAD or USD):O',
            sort=bracket_order,
            axis=alt.Axis(title='Income'),
        ),
        y=alt.Y('Count:Q', axis=alt.Axis(title='Count')),
        color='Country:N',
        tooltip=['Household Income (CAD or USD)', 'Count', 'Country'],
    )
)
st.altair_chart(chart, use_container_width=True)
st.write(f"{n_missing} missing a household income.")


# Show the complete pivoted demographics table for manual inspection.
st.markdown("## Full dataframe")
st.write(df)
Loading
Loading