# Pretrain Dataset Visualizations

Visualizations ready to be put into the thesis!

In [None]:
# Shared colour palette: defined once so every figure encodes the same
# category with the same colour.

# Colours for the source datasets (MURA and LERA are used by the dataset pie
# chart; internal/btxrd are presumably for figures outside the pre-train set).
mura_color, lera_color = "sienna", "orange"
internal_color, btxrd_color = "royalblue", "lightsteelblue"

# Colours for the two class labels (0 and 1).
label_0_color, label_1_color = "black", "gainsboro"

# Colours for the data splits.
train_val_color, test_color = "darkgreen", "mediumseagreen"

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the pre-train metadata table that all figures below are built from.
pretrain_df = pd.read_csv('../../visualizations/data/pretrain/metadata.csv')

# Discard the first column in place (presumably the index column written out
# when the CSV was saved - confirm against the file).
leading_column = pretrain_df.columns[0]
pretrain_df.drop(columns=leading_column, inplace=True)

# Last expression of the cell: shows the first rows as a sanity check.
pretrain_df.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Donut chart: share of pre-train samples contributed by each source dataset.
# value_counts() orders by descending frequency, so wedge 0 is the largest
# dataset. Both Series are indexed by dataset name, not by wedge number.
dataset_counts = pretrain_df['dataset'].value_counts()
dataset_percentages = pretrain_df['dataset'].value_counts(normalize=True)

# Radial thickness of the donut ring; the pie's outer radius is 1.0.
ring_width = 0.5

fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))

# NOTE(review): colours are matched to wedges by position, which assumes MURA
# is the more frequent dataset - confirm against the data.
colors = [mura_color, lera_color]
wedges, texts = ax.pie(dataset_counts, wedgeprops=dict(width=ring_width), startangle=-40, colors=colors)

# Shared styling for the boxed call-out annotation drawn outside the ring.
bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"),
          bbox=bbox_props, zorder=0, va="center")

for i, p in enumerate(wedges):
    # Unit-circle direction of the wedge's angular midpoint.
    ang = (p.theta2 - p.theta1) / 2. + p.theta1
    ang_rad = np.deg2rad(ang)
    y = np.sin(ang_rad)
    x = np.cos(ang_rad)

    # Use .iloc: `i` is the wedge position, while the Series labels are
    # dataset names. Plain `series[i]` relies on a deprecated fallback.
    percent_label = f'{dataset_percentages.iloc[i]*100:.1f}%'

    if i == 0:
        # First (largest) wedge: write the percentage inside the ring, at the
        # radius halfway between the inner and outer edge.
        r = 1.0 - ring_width / 2
        ax.text(r * x, r * y,
                percent_label,
                ha='center', va='center', color='white', fontsize=9)
    else:
        # Remaining wedges: boxed call-out outside the ring, connected to the
        # wedge edge by an elbow line. Text is aligned away from the pie.
        horizontalalignment = "right" if x < 0 else "left"
        connectionstyle = f"angle,angleA=0,angleB={ang}"
        kw["arrowprops"].update({"connectionstyle": connectionstyle})
        ax.annotate(percent_label,
                    xy=(x, y),
                    xytext=(1.35 * np.sign(x), 1.4 * y),
                    horizontalalignment=horizontalalignment, **kw)

ax.set_title(f"Distribution of MURA and LERA in Pre-Train dataset\nTotal: {dataset_counts.sum()}")

ax.legend(wedges, dataset_counts.keys(), loc='center right', bbox_to_anchor=(1, 0, 0.5, 1))

plt.show()


In [None]:
# Donut chart: distribution of the class labels in the pre-train dataset.
# value_counts() orders by descending frequency, so the wedge order follows
# label frequency rather than label value.
label_column = pretrain_df['label']
label_counts = label_column.value_counts()
label_percentages = label_column.value_counts(normalize=True)

fig, ax = plt.subplots(figsize=(6, 3), subplot_kw={"aspect": "equal"})

# Percentages are rendered by matplotlib itself and placed inside the ring.
wedges, texts, autotexts = ax.pie(
    label_counts,
    colors=[label_0_color, label_1_color],
    wedgeprops={"width": 0.5},
    startangle=-20,
    pctdistance=0.7,
    autopct='%.1f%%',
)

# Percentage text colours alternate per wedge so each stays readable against
# its wedge: white on the first wedge, black on the second.
colors = ["w", "black"]
for position, pct_text in enumerate(autotexts):
    pct_text.set_color(colors[position % len(colors)])

# Common font size and centring for the percentage texts and wedge labels.
plt.setp(autotexts, fontsize=9, ha="center", va="center")
plt.setp(texts, size=9)

ax.set_title(f"Distribution of Labels in Pre-Train Dataset\nTotal: {label_counts.sum()}")
ax.legend(
    wedges,
    label_counts.keys(),
    loc='center right',
    bbox_to_anchor=(1, 0, 0.5, 1),
)

plt.show()

In [None]:
# Stacked horizontal bar chart: number of samples per anatomy site, split into
# the train and test portions of the pre-train dataset.

# Count one column per (anatomy_site, set) group and pivot the split into
# columns. fill_value=0 keeps the table integer-valued when a site does not
# occur in one of the splits (a plain unstack() would insert NaN there).
anatomy_site_value_counts = (
    pretrain_df.groupby(['anatomy_site', 'set'])['dataset']
    .count()
    .unstack(fill_value=0)
)

# NOTE(review): the two split columns are swapped by position and then
# relabelled, which assumes exactly two 'set' values that sort as
# (test, train) - confirm against the data.
anatomy_site_value_counts = anatomy_site_value_counts.iloc[:, [1, 0]]
anatomy_site_value_counts.columns = ['Train', 'Test']
anatomy_site_value_counts = (
    anatomy_site_value_counts
    .rename_axis('Anatomy Site')
    .sort_values('Train')
)

# Per-site totals drive the bar-end labels, the x-axis limit and the title.
site_totals = anatomy_site_value_counts.sum(axis=1)

ax = anatomy_site_value_counts.plot(
    kind='barh',
    stacked=True,
    color=[train_val_color, test_color],
    title=f'Anatomy Site Occurrences in Pre-Train dataset\nTotal: {site_totals.sum()}',
    xlabel='Count',
)

# Write each site's total just to the right of its stacked bar.
for bar_position, total in enumerate(site_totals):
    ax.text(
        total + 100,        # a small offset to the right of the bar
        bar_position,       # bars are drawn at positions 0..n-1 in row order
        f'{int(total):,}',  # thousands-separated total
        va='center'
    )

# Extra room on the right so the labels of the longest bars are not clipped.
ax.set_xlim(0, site_totals.max() + 1300)

plt.tight_layout()
plt.show()

In [None]:
# Stacked horizontal bar chart: number of samples per anatomy site, split by
# label (0 is shown as "Normal", 1 as "Abnormal" in the legend). The figure is
# also written to disk as an SVG.

# Count one column per (anatomy_site, label) group and pivot the label into
# columns. fill_value=0 avoids NaN for a site that lacks one of the labels,
# which would otherwise break the int() conversions below.
anatomy_site_value_counts = (
    pretrain_df.groupby(['anatomy_site', 'label'])['set']
    .count()
    .unstack(fill_value=0)
)

# Order the sites by ascending total, so rows 0..3 are the four smallest bars.
site_totals = anatomy_site_value_counts[0] + anatomy_site_value_counts[1]
anatomy_site_value_counts = anatomy_site_value_counts.loc[site_totals.sort_values().index]

ax = anatomy_site_value_counts.plot(kind='barh', stacked=True, color=[label_0_color, label_1_color], xlabel='Count')

# Number of leading (smallest) rows labelled outside the bar - presumably
# because those bars are too short to hold text inside.
outside_label_rows = 4

for i, (idx, row) in enumerate(anatomy_site_value_counts.iterrows()):
    normal_count = int(row[0])
    abnormal_count = int(row[1])

    if i < outside_label_rows:
        # Combined "normal/abnormal" label to the right of the whole bar.
        ax.text(
            row.sum() + 10,      # a small offset to the right of the bar
            i,                   # bar position
            f'{normal_count}/{abnormal_count}',  # label text
            va='center',
            ha='left',
            color='black',
        )
    else:
        # Normal count, centred in the first (label 0) segment; white text
        # for contrast against the dark segment colour.
        ax.text(
            normal_count / 2,    # centre of the label-0 segment
            i,                   # bar position
            str(normal_count),
            va='center',
            ha='center',
            color='white'
        )

        # Abnormal count, centred in the second (label 1) segment.
        ax.text(
            normal_count + abnormal_count / 2,  # centre of the label-1 segment
            i,                                  # bar position
            str(abnormal_count),
            va='center',
            ha='center',
            color='black'
        )

# Leave a little room on the right for the outside labels.
ax.set_xlim(0, anatomy_site_value_counts.sum(axis=1).max() + 50)
ax.set_ylabel('Anatomy Site')
plt.legend(labels=['Normal', 'Abnormal'], loc='lower right')

plt.tight_layout()
# Save before show(): the figure may be empty once it has been shown.
plt.savefig("../../visualizations/data/pretrain/label_over_anatomy_site_distribution.svg")
plt.show()