# MetaChest

This notebook generates `metachest.csv`.



### Generating CSV

In [None]:
from os.path import join

import pandas as pd
import seaborn as sns
from matplotlib.ticker import EngFormatter

from common import read_toml

# Directory used for all CSV input/output in this notebook, taken from the
# `metachest_dir` key of `config.toml`.
metachest_dir = read_toml('config.toml')['metachest_dir']

Load the per-dataset CSV files (this fails if any of them is missing) and concatenate them into a single dataframe:

In [None]:
# Per-dataset CSV files expected inside `metachest_dir` (one `<name>.csv` each).
ds_names = ('chestxray14', 'chexpert', 'mimic', 'padchest')
# Stack all datasets into one frame. `ignore_index=True` gives every row a
# unique label; without it each file's own 0..n-1 index is repeated, so a
# label-based lookup such as `df.loc[0]` would return one row per dataset.
df = pd.concat(
    (
        pd.read_csv(join(metachest_dir, f'{ds_name}.csv'))
        for ds_name in ds_names
    ),
    ignore_index=True,
)
# NOTE(review): assumes the first 5 columns are metadata and every remaining
# column is a pathology label -- confirm against the per-dataset CSV schema.
PATHOLOGIES = sorted(df.columns[5:])
# Keep the leading columns in place and order the pathology columns alphabetically.
df = df[list(df.columns[:5]) + PATHOLOGIES]
df.head(10)

General stats:

In [None]:
# Integer label matrix: missing annotations (NaN) are counted as 0.
# NOTE(review): assumes labels are 0/1 after filling -- confirm no other codes
# (e.g. "uncertain" markers) are present in the source CSVs.
labels_df = df[PATHOLOGIES].fillna(0).astype(int)
n_samples = labels_df.shape[0]
n_labels = labels_df.sum().sum()
# Label cardinality: total label sum divided by the number of samples
# (mean number of positive labels per sample when labels are 0/1).
label_cardinality = n_labels / n_samples
# Label density: per-sample label sum divided by the number of label columns,
# averaged over all samples.
label_density = (labels_df.sum(axis=1) / labels_df.shape[1]).sum() / n_samples
print(
    f"Number of samples:\t {n_samples}",
    f"Number of labels:\t {n_labels}",
    f"Label cardinality:\t {label_cardinality}",  # fixed typo: was "cadinality"
    f"Label density:\t\t {label_density}",
    sep='\n'
)

Images per dataset:

In [None]:
# Count the non-null `name` entries within each `dataset` group.
df.groupby('dataset')['name'].count()

Compute the per-dataset, per-pathology label counts and their totals:

In [None]:
# Sum every pathology column within each dataset.
ds_sum_df = (
    df[['dataset'] + PATHOLOGIES]
    .groupby('dataset')
    .sum()
    .astype(int)
)

# Totals per dataset (row sums) and per pathology (column sums), largest first.
ds_sum_sr = ds_sum_df.sum(axis=1).sort_values(ascending=False)
pt_sum_sr = ds_sum_df.sum(axis=0).sort_values(ascending=False)

# Reorder rows and columns so the largest datasets / pathologies come first.
ds_sum_df = ds_sum_df.reindex(list(ds_sum_sr.index))
ds_sum_df = ds_sum_df[list(pt_sum_sr.index)]

# Same table with an extra 'total' column and an extra 'total' row.
total_df = ds_sum_df.assign(total=ds_sum_df.sum(axis=1))
total_df.loc['total', :] = total_df.sum(axis=0)
# Cast back to int after the new row has been added.
total_df = total_df.astype(int)

Distribution plot:

In [None]:
# Transpose so pathologies are on the rows (one bar each) and datasets on the
# columns (one stacked segment each), then switch to display-friendly names.
df_plot = ds_sum_df.T.rename(
    columns={
        'chestxray14': 'ChestX-ray14',
        'chexpert': 'CheXpert',
        'mimic': 'MIMIC',
        'padchest': 'PadChest',
    },
    # e.g. 'pleural_effusion' -> 'Pleural effusion'
    index=lambda label: label.replace('_', ' ').capitalize(),
)
ax = df_plot.plot.barh(
    stacked=True,
    figsize=(8, 5),
    color=sns.color_palette('deep'),
)
# Engineering notation keeps large counts readable on the x axis.
ax.xaxis.set_major_formatter(EngFormatter())
ax.set_xlabel('Number of x-ray images')
ax.xaxis.set_tick_params(labelsize=10)
# Trailing semicolon keeps the Legend object's repr out of the cell output.
ax.legend(title='Dataset');

In [None]:
# Show the totals table transposed: pathologies as rows, datasets as columns.
total_df.transpose()

## Save

In [None]:
# Write the combined table into `metachest_dir`; the row index is not saved.
path = join(metachest_dir, 'metachest.csv')
df.to_csv(path_or_buf=path, index=False)
# Display where the file was written.
path
