# MetaChest Exploration

In [None]:
from os import makedirs
from os.path import join

import matplotlib.pyplot as plt
import matplotlib.transforms as mtransforms
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import EngFormatter

from common import read_toml

In [None]:
# Human-readable display names for the identifiers that appear in the data
# (datasets, meta-sets, pathologies); passed to DataFrame.rename() below to
# relabel tables and plots.
keywords = dict(
    # column header
    dataset='Dataset',
    # datasets
    chestxray14='ChestX-ray14',
    chexpert='CheXpert',
    metachest='MetaChest',
    mimic='MIMIC',
    padchest='PadChest',
    # meta-sets
    mtrn='Meta-Train',
    mval='Meta-Validation',
    mtst='Meta-Test',
    # pathologies
    atelectasis='Atelectasis',
    cardiomegaly='Cardiomegaly',
    consolidation='Consolidation',
    edema='Edema',
    effusion='Effusion',
    emphysema='Emphysema',
    fibrosis='Fibrosis',
    hernia='Hernia',
    infiltration='Infiltration',
    lung_opacity='Lung opacity',
    mass='Mass',
    nodule='Nodule',
    pleural_thickening='Pleural thickening',
    pneumonia='Pneumonia',
    pneumothorax='Pneumothorax',
    # aggregate row/column
    total='Total',
)

# Render all figure text through LaTeX, loading the cmbright package.
plt.rcParams['text.latex.preamble'] = r'\usepackage{cmbright}'
plt.rcParams['text.usetex'] = True
plt.rcParams['font.family'] = 'serif'

# Shared colour cycle for every bar plot and histogram in the notebook.
color = sns.color_palette(palette='deep')

# Data root comes from the local config; figures are written to <root>/plots.
metachest_dir = read_toml('config.toml')['metachest_dir']
plot_dir = join(metachest_dir, 'plots')
makedirs(plot_dir, exist_ok=True)

## General

In [None]:
# Load the full MetaChest table and preview its first rows.
df = pd.read_csv(join(metachest_dir, 'metachest.csv'))
df.head(n=10)

General stats:

In [None]:
# Pathology label columns, sorted alphabetically.
# NOTE(review): assumes the first five CSV columns are metadata and every
# remaining column is a pathology label — confirm against the CSV schema.
PATHOLOGIES = sorted(df.columns[5:])
# Column order used when displaying per-dataset tables.
cols_order = ['metachest', 'chexpert', 'mimic', 'chestxray14', 'padchest']

# Missing labels are treated as 0 so the matrix can be summed as integers.
labels_df = df[PATHOLOGIES].fillna(0).astype(int)
n_samples = labels_df.shape[0]
n_labels = labels_df.sum().sum()
# Label cardinality: average label sum per sample.
label_cardinality = n_labels / n_samples
# Label density: average per-sample label sum divided by the number of label columns.
label_density = (labels_df.sum(axis=1) / labels_df.shape[1]).sum() / n_samples
print(
    f"Number of samples:\t {n_samples}",
    f"Number of labels:\t {n_labels}",
    f"Label cardinality:\t {label_cardinality}",
    f"Label density:\t\t {label_density}",
    sep='\n'
)
print('\nImages per dataset:')
# Count only the 'name' column instead of counting every column and discarding the rest.
df.groupby('dataset')['name'].count()

Compute total dataframe:

In [None]:
# Sum every pathology column per dataset (rows: datasets, columns: pathologies).
ds_sum_df = df.groupby('dataset')[PATHOLOGIES].sum().astype(int)

# Totals per dataset and per pathology, largest first; their indexes give
# the display order for rows and columns.
ds_sum_sr = ds_sum_df.sum(axis=1).sort_values(ascending=False)
pt_sum_sr = ds_sum_df.sum(axis=0).sort_values(ascending=False)
ds_sum_df = ds_sum_df.reindex(index=ds_sum_sr.index, columns=pt_sum_sr.index)

# Totals table: append a 'total' column and a 'metachest' row holding the
# sums over all datasets.
total_df = ds_sum_df.copy()
total_df['total'] = total_df.sum(axis=1)
total_df.loc['metachest'] = total_df.sum(axis=0)
# Cast back to int after adding the new row.
total_df = total_df.astype(int)

# Display with pathologies as rows, readable names, blanks for zero counts.
(
    total_df.T[cols_order]
    .rename(index=keywords, columns=keywords)
    .replace(0, '')
    .style.format(thousands=',')
)


Distribution plot:

In [None]:
def plot_general(ds_sum_df):
    """Draw and save the stacked per-pathology bar chart of image counts.

    Parameters
    ----------
    ds_sum_df : pd.DataFrame
        Counts with datasets as rows and pathologies as columns; it is
        transposed so each horizontal bar is one pathology, stacked by dataset.

    The figure is written to ``plot_dir`` as both PDF and JPG.
    """
    counts = ds_sum_df.T.rename(index=keywords, columns=keywords)
    ax = counts.plot.barh(stacked=True, figsize=(8, 5), color=color)

    # Engineering notation on the count axis (e.g. 100 k instead of 100000).
    ax.xaxis.set_major_formatter(EngFormatter())
    ax.set(xlabel='Number of x-ray images', xlim=(0, 130000))
    ax.tick_params(axis='x', labelsize='small')
    ax.tick_params(axis='y', labelsize='medium')
    ax.legend(title='Dataset', fontsize='medium')

    plt.tight_layout()
    plt.savefig(join(plot_dir, 'metachest_general.pdf'))
    plt.savefig(join(plot_dir, 'metachest_general.jpg'))


plot_general(ds_sum_df)

Pathology co-occurrence matrix.

In [None]:
def plot_coocc(df):
    """Plot, save and return the pathology co-occurrence matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose columns from position 5 onward are pathology labels
        (NOTE(review): positional assumption — confirm against the CSV
        schema). Missing labels are treated as 0.

    Returns
    -------
    pd.DataFrame
        Square matrix indexed by display names; entry (i, j) is the dot
        product of label columns i and j.

    The heatmap is written to ``plot_dir`` as both PDF and JPG.
    """
    labels = df.iloc[:, 5:].fillna(0).astype(int)
    label_mat = labels.to_numpy()
    # X^T X: for 0/1 labels, entry (i, j) counts rows where both are set.
    coocc_mat = label_mat.T.dot(label_mat)

    paths = [keywords[p] for p in labels.columns]
    coocc = pd.DataFrame(coocc_mat, index=paths, columns=paths)
    # The matrix is symmetric, so hide the strict upper triangle.
    mask = np.triu(np.ones(coocc.shape, dtype=bool), k=1)

    fig, ax = plt.subplots(figsize=(10, 5))
    # Shift annotations horizontally so right-aligned numbers sit inside the cell.
    trans_offset = mtransforms.offset_copy(ax.transData, fig=fig, x=0.18)

    sns.heatmap(
        coocc,
        linewidths=1,
        cmap='RdPu',
        annot=True,
        annot_kws={"fontsize": 'x-small', 'ha': 'right',
                   'transform': trans_offset},
        fmt='g',
        cbar_kws={'label': 'Number of images with both pathologies',
                  'format': EngFormatter()},
        mask=mask,
        ax=ax
    )

    ax.set_xticklabels(ax.get_xticklabels(),
                       rotation=30, horizontalalignment='right')
    # The colorbar is the last axes added to the figure.
    fig.axes[-1].tick_params(labelsize='x-small')

    ax.set_title('Pathology co-occurrence matrix of MetaChest')
    fig.tight_layout()
    fig.savefig(join(plot_dir, 'metachest_coocu.pdf'))
    fig.savefig(join(plot_dir, 'metachest_coocu.jpg'))

    return coocc


plot_coocc(df)

Age distribution.

In [None]:
def plot_age(df: pd.DataFrame) -> pd.Series:
    """Plot a histogram of the ``age`` column and return its summary statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Table with an ``age`` column.

    Returns
    -------
    pd.Series
        Output of ``Series.describe()`` for the ``age`` column.
    """
    age_sr = df.age
    # One bin per distinct age. nunique() ignores NaN, whereas unique() would
    # count missing ages as an extra value and inflate the number of bins.
    # Keep at least one bin so an all-missing column does not request zero bins.
    bins = max(age_sr.nunique(), 1)
    ax = age_sr.hist(bins=bins, color=color[0])
    ax.set_xlabel('Age')
    ax.set_ylabel('Number of x-ray images')
    return age_sr.describe()


plot_age(df)

## Meta-sets

In [None]:
# Read the pathology lists assigned to each meta-set from the 'complete' config.
complete_config = read_toml(join(metachest_dir, 'mtl', 'complete.toml'))
mtrn, mval, mtst = (complete_config[key] for key in ('mtrn', 'mval', 'mtst'))
# Meta-sets in train / validation / test order.
mclasses = [mtrn, mval, mtst]

def build_metasets_table(total_df):
    """Build a per-meta-set table of counts with header and total rows.

    For each meta-set (``mtrn``, ``mval``, ``mtst``, read from the
    module-level lists of the same names) the result contains, in order:
    a header row named after the meta-set filled with zeros, one row per
    class in that meta-set, and a ``'total'`` row with the column sums.

    Parameters
    ----------
    total_df : pd.DataFrame
        Counts with the meta-set classes as columns; the rows of
        ``total_df`` become the columns of the result.

    Returns
    -------
    pd.DataFrame
        Integer table with the three meta-set sections stacked vertically.
    """
    mset_classes = {'mtrn': mtrn, 'mval': mval, 'mtst': mtst}

    sections = []
    for mset_name, classes in mset_classes.items():
        df_mset = total_df[classes].T
        df_mset.loc['total', :] = df_mset.sum(axis=0)
        # Zero-filled header row sized from the actual columns, not a fixed
        # count, so the table works for any number of columns.
        header = pd.DataFrame(0, index=[mset_name], columns=df_mset.columns)
        sections.append(pd.concat([header, df_mset], axis=0))

    # Adding the 'total' row may upcast to float; restore integer counts.
    return pd.concat(sections).astype(int)

# Show the meta-set table with readable names and blanks instead of zeros.
(
    build_metasets_table(total_df)[cols_order]
    .rename(index=keywords, columns=keywords)
    .replace(0, '')
    .style.format(thousands=',')
)

In [None]:

def plot_metasets(ds, mclasses, figsize=(8, 5)):
    """Draw and save one stacked bar panel per meta-set.

    Parameters
    ----------
    ds : pd.DataFrame
        Counts whose columns include every class listed in ``mclasses``;
        each panel plots ``ds[mset].T`` as stacked horizontal bars.
    mclasses : list of list of str
        Class names per meta-set, in train / validation / test order.
        Panels are paired with three fixed titles, so at most three
        meta-sets are drawn.
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``.

    The figure is written to ``plot_dir`` as both PDF and JPG.
    """
    titles = ['Meta-Train (Seen) ', 'Meta-Val (Unseen) ', 'Meta-Test (Unseen) ']
    fig, axs = plt.subplots(
        nrows=len(mclasses), ncols=1, tight_layout=True,
        # Panel heights proportional to the number of classes in each meta-set.
        gridspec_kw={'height_ratios': [len(mset) for mset in mclasses]},
        figsize=figsize,
        # Always return a 2-D array so a single meta-set is still iterable.
        squeeze=False,
    )
    for mset, title, ax in zip(mclasses, titles, axs[:, 0]):
        ds_mset = ds[mset]
        ds_mset = ds_mset.rename(columns=keywords, index=keywords)
        ds_mset.T.plot.barh(stacked=True, ax=ax, color=color)

        # Negative pad places the title inside the panel, top-right.
        ax.set_title(title, fontsize='large', loc='right', y=1.0, pad=-14)
        ax.set_xlim(0, 130000)
        if 'Test' in title:
            # Only the Meta-Test panel carries the x axis and the legend.
            ax.xaxis.set_tick_params(labelsize='small')
            ax.xaxis.set_major_formatter(EngFormatter())
            ax.set_xlabel('Number of x-ray images')
            ax.legend(loc='lower right', fontsize='medium')
        else:
            ax.set_xticks([])
            ax.get_legend().remove()
        ax.tick_params(axis='y', which='major', labelsize='medium')

    # Lay out and save once, after every panel has been drawn.
    fig.tight_layout()
    fig.savefig(join(plot_dir, 'metachest_metasets.pdf'))
    fig.savefig(join(plot_dir, 'metachest_metasets.jpg'))


plot_metasets(ds_sum_df, mclasses)