# Downstream Dataset Visualizations

Visualizations ready to be put into the thesis!

In [None]:
# Named matplotlib colours shared by the figures in this notebook.

# Per-dataset colours (INTERNAL and BTXRD are plotted below;
# presumably MURA and LERA are further datasets -- TODO confirm).
mura_color, lera_color = 'sienna', 'orange'
internal_color, btxrd_color = 'royalblue', 'lightsteelblue'

# Colours for the two binary label values 0 and 1.
label_0_color, label_1_color = 'black', 'gainsboro'

# Colours for the data splits.
train_val_color, test_color = 'darkgreen', 'mediumseagreen'

# Colours for patient sex.
male_color, female_color = 'darkslategray', 'teal'

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the downstream metadata table from CSV.
metadata_path = '../../visualizations/data/downstream/metadata.csv'
downstream_df = pd.read_csv(metadata_path)

# Discard the first CSV column (presumably a saved row index -- TODO confirm),
# then give the seven remaining columns readable names.
leading_column = downstream_df.columns[0]
downstream_df = downstream_df.drop(columns=leading_column)
downstream_df.columns = ['Dataset', 'Tumor', 'Entity', 'Anatomy Site', 'Sex', 'Age', 'Set']

# Last expression of the cell: preview the first rows.
downstream_df.head()

In [None]:
# Summary statistics of patient age, printed for a quick sanity check.
#
# pandas reductions are used instead of the numpy ones on purpose:
#   * they skip missing ages instead of turning every statistic into NaN;
#   * `.std()` is the sample standard deviation (ddof=1), i.e. the same
#     convention as the `.mean()` / `.std()` calls that feed the LaTeX table,
#     so both cells report the same numbers.
age_series = downstream_df['Age']
age = age_series.to_numpy()  # raw array, kept in case another cell expects it

min_age = age_series.min()
max_age = age_series.max()
median_age = age_series.median()
mean_age = age_series.mean()
std_age = age_series.std()

print(f'Min Age: {min_age}, Max Age: {max_age}, Median Age: {median_age}, Mean Age: {mean_age}, Std Age: {std_age}')

In [None]:
# Print a LaTeX table (two side-by-side minipages) summarising the downstream
# dataset: age, dataset, label, gender and anatomy site on the left, tumour
# entities on the right. Raw strings keep the LaTeX backslashes readable; the
# emitted text is what matters, so every literal mirrors the LaTeX verbatim.

# Numbers shared by all sections; every percentage is relative to all rows.
total_patients = len(downstream_df)
age_mean = downstream_df['Age'].mean()
age_std = downstream_df['Age'].std()


def _section_header(title):
    """Return the bold header row of a count/percentage section."""
    return r"\textbf{" + title + r"} & \textbf{\#} & \textbf{\%} \\"


def _table_row(label, count, indent=""):
    """Return one 'label & count & percent' row (percent of all patients)."""
    share = (count / total_patients) * 100
    return rf"{indent}{label} & {count:,} & {share:.1f} \\"


# --- Table preamble and opening of the left minipage ---
print(
    r"\begin{table}[t]",
    r"\centering",
    r"\caption{Demographic and Clinical Characteristics of Downstream Dataset}",
    r"\label{tab:downstream_characteristics}",
    r"\begin{minipage}{0.48\textwidth}",
    r"\centering",
    r"\begin{tabular}{lcc}",
    r"\toprule",
    sep="\n",
)

# --- Age ---
print(r"\textbf{Age} & \textbf{mean ± std} & \\")
print(rf"All patients & {age_mean:.1f} ± {age_std:.1f} &  \\")
print(r"\midrule")

# --- Dataset (most frequent first, as returned by value_counts) ---
print(_section_header("Dataset"))
dataset_counts = downstream_df['Dataset'].value_counts()
for dataset_name, n_rows in dataset_counts.items():
    print(_table_row(dataset_name, n_rows))
print(r"\midrule")

# --- Label (0 is shown as "No Tumour", anything else as "Tumour") ---
print(_section_header("Label"))
tumor_counts = downstream_df['Tumor'].value_counts()
for tumor_flag, n_rows in tumor_counts.items():
    if tumor_flag == 0:
        print(_table_row("No Tumour", n_rows))
    else:
        print(_table_row("Tumour", n_rows))
print(r"\midrule")

# --- Gender ('F' is shown as "Female", anything else as "Male") ---
print(_section_header("Gender"))
gender_counts = downstream_df['Sex'].value_counts()
for sex_code, n_rows in gender_counts.items():
    if sex_code == 'F':
        print(_table_row("Female", n_rows))
    else:
        print(_table_row("Male", n_rows))
print(r"\midrule")

# --- Anatomy site, alphabetical ---
print(_section_header("Anatomy Site (alph.)"))
anatomy_counts = downstream_df['Anatomy Site'].value_counts().sort_index()
for site_name, n_rows in anatomy_counts.items():
    print(_table_row(site_name.capitalize(), n_rows))
print(r"\bottomrule")

# --- Close the left minipage, open the right one ---
print(
    r"\end{tabular}",
    r"\end{minipage}",
    r"\hfill",
    r"\begin{minipage}{0.48\textwidth}",
    r"\centering",
    r"\begin{tabular}{lcc}",
    r"\toprule",
    sep="\n",
)

# --- Entity, alphabetical within each dataset sub-group ---
print(_section_header("Entity (alph.)"))

# Rows whose entity is 'undefined' are listed first, only if there are any.
undefined_count = int((downstream_df['Entity'] == 'undefined').sum())
if undefined_count > 0:
    print(_table_row("undefined (no tumour)", undefined_count, indent=r"\quad "))

# Entity names listed under the INTERNAL heading; every other name goes under BTXRD.
internal_entity_types = ['Enchondrom', 'Riesenzelltumor', 'Chondrosarkom', 'Osteochondrom',
                         'Chondroblastom', 'NOF', 'Osteosarkom', 'Knochenzyste, aneurysmatische',
                         'Dysplasie, fibröse', 'Ewing-Sarkom']

entity_counts = downstream_df['Entity'].value_counts().sort_index()
entity_counts = entity_counts.drop('undefined', errors='ignore')
from_internal = entity_counts.index.isin(internal_entity_types)

print(r"\quad INTERNAL & & \\")
internal_total = 0
for entity_name, n_rows in entity_counts[from_internal].items():
    internal_total += n_rows
    print(_table_row(entity_name, n_rows, indent=r"\quad\quad "))

print(r"\quad BTXRD & & \\")
btxrd_total = 0
for entity_name, n_rows in entity_counts[~from_internal].items():
    btxrd_total += n_rows
    print(_table_row(entity_name, n_rows, indent=r"\quad\quad "))

# --- Close the right minipage and the table ---
print(
    r"\bottomrule",
    r"\end{tabular}",
    r"\end{minipage}",
    r"\end{table}",
    sep="\n",
)

In [None]:
# Stacked horizontal bar chart: male / female row counts per anatomy site,
# saved as SVG.

# Count rows per (anatomy site, sex). `fill_value=0` gives a site that lacks one
# sex a count of 0 instead of NaN; NaN would break the int() conversions and the
# sorting below.
anatomy_site_value_counts = (
    downstream_df.groupby(['Anatomy Site', 'Sex'])
    .count()
    .unstack(fill_value=0)
)
# count() yields the same table for every remaining column; 'Dataset' is used.
anatomy_site_value_counts = anatomy_site_value_counts['Dataset']

# Select the columns by label (male first, matching the colour order below) and
# sort the sites by their total count, smallest first.
anatomy_site_value_counts = (
    anatomy_site_value_counts[['M', 'F']]
    .assign(total=lambda counts: counts['M'] + counts['F'])
    .sort_values('total')
    .drop(columns='total')
)

ax = anatomy_site_value_counts.plot(kind='barh', stacked=True, color=[male_color, female_color], xlabel='Count')

sex_pairs = zip(anatomy_site_value_counts['M'], anatomy_site_value_counts['F'])
for bar_pos, (male_count, female_count) in enumerate(sex_pairs):
    male_count = int(male_count)
    female_count = int(female_count)

    if bar_pos == 0:
        # The first (smallest) bar gets a combined "male/female" label to its
        # right. NOTE(review): the cut-off of one bar is hard-coded, presumably
        # because that bar is too short for inside labels -- re-check when the
        # data changes.
        ax.text(
            male_count + female_count + 10,
            bar_pos,
            f'{male_count}/{female_count}',
            va='center',
            ha='left',
            color='black',
        )
    else:
        # Male count centred in the male segment.
        ax.text(
            male_count / 2,
            bar_pos,
            str(male_count),
            va='center',
            ha='center',
            color='white'
        )
        # Female count centred in the female segment, which starts where the
        # male segment ends.
        ax.text(
            male_count + female_count / 2,
            bar_pos,
            str(female_count),
            va='center',
            ha='center',
            color='white'
        )

# Extra room on the right for the label placed outside the bar.
ax.set_xlim(0, anatomy_site_value_counts.sum(axis=1).max() + 50)

plt.tight_layout()
plt.savefig("../../visualizations/data/downstream/gender_over_anatomy_site_distribution.svg")
plt.show()

In [None]:
# Stacked horizontal bar chart: INTERNAL / BTXRD row counts per anatomy site,
# saved as SVG.

# Count rows per (anatomy site, dataset). `fill_value=0` gives a site that is
# missing from one dataset a count of 0 instead of NaN; NaN would break the
# int() conversions and the sorting below.
anatomy_site_value_counts = (
    downstream_df.groupby(['Anatomy Site', 'Dataset'])
    .count()
    .unstack(fill_value=0)
)
# count() yields the same table for every fully populated column; 'Entity' is used.
anatomy_site_value_counts = anatomy_site_value_counts['Entity']

# Select the columns by label (INTERNAL first, matching the colour order below)
# and sort the sites by their total count, smallest first.
anatomy_site_value_counts = (
    anatomy_site_value_counts[['INTERNAL', 'BTXRD']]
    .assign(total=lambda counts: counts['INTERNAL'] + counts['BTXRD'])
    .sort_values('total')
    .drop(columns='total')
)

ax = anatomy_site_value_counts.plot(kind='barh', stacked=True, color=[internal_color, btxrd_color], xlabel='Count')

dataset_pairs = zip(anatomy_site_value_counts['INTERNAL'], anatomy_site_value_counts['BTXRD'])
for bar_pos, (internal_count, btxrd_count) in enumerate(dataset_pairs):
    internal_count = int(internal_count)
    btxrd_count = int(btxrd_count)

    if bar_pos in [0, 1, 2]:
        # The three smallest bars get a combined "internal/btxrd" label to their
        # right. NOTE(review): the cut-off of three bars is hard-coded,
        # presumably because those bars are too short for inside labels --
        # re-check when the data changes.
        ax.text(
            internal_count + btxrd_count + 10,
            bar_pos,
            f'{internal_count}/{btxrd_count}',
            va='center',
            ha='left',
            color='black',
        )
    else:
        # INTERNAL count centred in the INTERNAL segment.
        ax.text(
            internal_count / 2,
            bar_pos,
            str(internal_count),
            va='center',
            ha='center',
            color='white'
        )
        # BTXRD count centred in the BTXRD segment, which starts where the
        # INTERNAL segment ends; black text for the light segment colour.
        ax.text(
            internal_count + btxrd_count / 2,
            bar_pos,
            str(btxrd_count),
            va='center',
            ha='center',
            color='black'
        )

# Extra room on the right for the labels placed outside the bars.
ax.set_xlim(0, anatomy_site_value_counts.sum(axis=1).max() + 50)

plt.tight_layout()
plt.savefig("../../visualizations/data/downstream/downstream_over_anatomy_site_distribution.svg")
plt.show()

In [None]:
# Stacked horizontal bar chart: rows with Tumor == 0 / Tumor == 1 per anatomy
# site, saved as SVG.

# Count rows per (anatomy site, tumour label). `fill_value=0` gives a site that
# lacks one label a count of 0 instead of NaN; NaN would break the int()
# conversions and the sorting below.
anatomy_site_value_counts = (
    downstream_df.groupby(['Anatomy Site', 'Tumor'])
    .count()
    .unstack(fill_value=0)
)
# count() yields the same table for every fully populated column; 'Entity' is used.
anatomy_site_value_counts = anatomy_site_value_counts['Entity']

# Select the columns by label (0 first, matching the colour order below) and
# sort the sites by their total count, smallest first.
anatomy_site_value_counts = (
    anatomy_site_value_counts[[0, 1]]
    .assign(total=lambda counts: counts[0] + counts[1])
    .sort_values('total')
    .drop(columns='total')
)

ax = anatomy_site_value_counts.plot(kind='barh', stacked=True, color=[label_0_color, label_1_color], xlabel='Count')

label_pairs = zip(anatomy_site_value_counts[0], anatomy_site_value_counts[1])
for bar_pos, (no_tumor_count, tumor_count) in enumerate(label_pairs):
    no_tumor_count = int(no_tumor_count)
    tumor_count = int(tumor_count)

    if bar_pos in [0, 1, 2]:
        # The three smallest bars get a combined "no tumour/tumour" label to
        # their right. NOTE(review): the cut-off of three bars is hard-coded,
        # presumably because those bars are too short for inside labels --
        # re-check when the data changes.
        ax.text(
            no_tumor_count + tumor_count + 10,
            bar_pos,
            f'{no_tumor_count}/{tumor_count}',
            va='center',
            ha='left',
            color='black',
        )
    else:
        # No-tumour count centred in the no-tumour segment (white on the dark colour).
        ax.text(
            no_tumor_count / 2,
            bar_pos,
            str(no_tumor_count),
            va='center',
            ha='center',
            color='white'
        )
        # Tumour count centred in the tumour segment, which starts where the
        # no-tumour segment ends (black on the light colour).
        ax.text(
            no_tumor_count + tumor_count / 2,
            bar_pos,
            str(tumor_count),
            va='center',
            ha='center',
            color='black'
        )

# Extra room on the right for the labels placed outside the bars.
ax.set_xlim(0, anatomy_site_value_counts.sum(axis=1).max() + 50)

plt.tight_layout()
plt.savefig("../../visualizations/data/downstream/tumor_over_anatomy_site_distribution.svg")
plt.show()

In [None]:
import matplotlib.patches as mpatches

# Horizontal bar chart of tumour-entity counts. Bars are grouped by source
# dataset (BTXRD entities first, then INTERNAL ones, each sorted by count) and
# each group is marked by a coloured box drawn left of the axes.

# Entity names treated as INTERNAL; every other entity is treated as BTXRD.
internal_entity_types = ['Enchondrom', 'Riesenzelltumor', 'Chondrosarkom', 'Osteochondrom',
 'Chondroblastom', 'NOF', 'Osteosarkom', 'Knochenzyste, aneurysmatische',
 'Dysplasie, fibröse', 'Ewing-Sarkom']
# Not used below; kept as a reference list of entity names.
btxrd_entity_types = ['osteochondroma',
       'multiple osteochondromas', 'simple bone cyst', 'giant cell tumor',
       'osteofibroma', 'synovial osteochondroma', 'other bt', 'osteosarcoma',
       'other mt']

entity_value_counts = downstream_df['Entity'].value_counts()
entity_value_counts = entity_value_counts.sort_values(ascending=True)
# errors='ignore': do not fail when the data contains no 'undefined' entity.
entity_value_counts = entity_value_counts.drop('undefined', errors='ignore')
entity_value_counts = pd.DataFrame({
    'Entity': entity_value_counts.index.to_list(),
    'Count': entity_value_counts.values,
    'INTERNAL': [entity in internal_entity_types for entity in entity_value_counts.index],
})
# False sorts before True, so the non-INTERNAL entities come first.
entity_value_counts = entity_value_counts.set_index('Entity').sort_values(by=['INTERNAL', 'Count'])

fig, ax = plt.subplots(figsize=(10, 8))
entity_value_counts['Count'].plot(kind='barh', ax=ax, title=f'Tumour Entity Occurrences in Downstream dataset\nTotal: {entity_value_counts["Count"].sum()}', xlabel='Count')

# Write the count just past the end of each bar.
for bar_pos, count in enumerate(entity_value_counts['Count']):
    ax.text(count + 5, bar_pos, f'{int(count):,}', va='center', ha='left')

# Bar positions belonging to each dataset group.
internal_indices = [pos for pos, is_internal in enumerate(entity_value_counts['INTERNAL']) if is_internal]
btxrd_indices = [pos for pos, is_internal in enumerate(entity_value_counts['INTERNAL']) if not is_internal]

# NOTE(review): the returned limits are not used, but querying them makes
# matplotlib resolve its pending autoscale before the boxes are added --
# presumably this keeps the negative-x boxes from stretching the x-axis.
# Confirm before removing these two lines.
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# One translucent box per group, spanning the group's bars. x = -265 and
# width = 255 are data coordinates; they appear hand-tuned to the current
# counts and label widths, so re-check them when the data changes.
for group_indices, group_color in ((internal_indices, internal_color), (btxrd_indices, btxrd_color)):
    if not group_indices:
        continue
    first_bar = min(group_indices)
    last_bar = max(group_indices)
    group_box = plt.Rectangle((-265, first_bar - 0.4), 255, last_bar - first_bar + 0.8,
                              facecolor=group_color, alpha=0.3, edgecolor=group_color,
                              linewidth=2, clip_on=False)
    ax.add_patch(group_box)

# Legend explaining the two box colours.
internal_patch = mpatches.Patch(color=internal_color, alpha=0.3, label='INTERNAL')
btxrd_patch = mpatches.Patch(color=btxrd_color, alpha=0.3, label='BTXRD')
ax.legend(handles=[internal_patch, btxrd_patch], bbox_to_anchor=(-0.01, 1.1))

plt.tight_layout()
plt.show()

In [None]:
# Kernel-density estimate of the 'Age' column over the whole table, saved as SVG.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=downstream_df, x='Age', color=internal_color, linewidth=2, ax=ax)

ax.set_xlabel('Age (years)')
ax.set_ylabel('Density')
ax.set_xlim(0, None)  # start the visible x-range at zero; upper limit stays automatic
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig("../../visualizations/data/downstream/age_distribution.svg")
plt.show()

In [None]:
# One age density panel per value of the 'Dataset' column, side by side.
datasets = downstream_df['Dataset'].unique()

# squeeze=False always returns a 2-D grid of axes, so a single dataset needs
# no special case; the grid has one row, which is taken here.
fig, axes = plt.subplots(1, len(datasets), figsize=(12, 5), squeeze=False)
axes = axes[0]

for panel, dataset in zip(axes, datasets):
    subset = downstream_df[downstream_df['Dataset'] == dataset]

    # INTERNAL gets its own colour; any other dataset uses the BTXRD colour.
    if dataset == 'INTERNAL':
        curve_color = internal_color
    else:
        curve_color = btxrd_color

    sns.kdeplot(data=subset, x='Age', color=curve_color, linewidth=2, ax=panel)

    panel.set_title(f'{dataset} (n={len(subset)})')
    panel.set_xlabel('Age (years)')
    panel.set_ylabel('Density')
    panel.set_xlim(0, None)  # start the visible x-range at zero
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# One age density panel per anatomy site, laid out on a three-column grid.

# Missing site values are skipped: they have no rows to plot and no
# `.capitalize()` for the panel title.
anatomy_sites = downstream_df['Anatomy Site'].dropna().unique()
n_sites = len(anatomy_sites)

# Grid size: three columns and as many rows as needed (at least one, so the
# figure can still be created when there are no sites).
cols = 3
rows = max(1, (n_sites + cols - 1) // cols)

# squeeze=False always returns a 2-D array of axes, so flattening works for any
# number of sites. Wrapping the result in a list for the single-site case would
# nest the whole axes array inside the list, because a one-row grid with three
# columns is still an array.
fig, axes = plt.subplots(rows, cols, figsize=(15, 4 * rows), squeeze=False)
axes = axes.flatten()

for panel, site in zip(axes, anatomy_sites):
    site_data = downstream_df[downstream_df['Anatomy Site'] == site]

    sns.kdeplot(data=site_data, x='Age', color=internal_color, linewidth=2, ax=panel)

    panel.set_title(f'{site.capitalize()} (n={len(site_data)})')
    panel.set_xlabel('Age (years)')
    panel.set_ylabel('Density')
    panel.set_xlim(0, None)  # start the visible x-range at zero
    panel.grid(True, alpha=0.3)

# Hide the grid cells left over after the last site.
for spare_panel in axes[n_sites:]:
    spare_panel.set_visible(False)

plt.suptitle('Age Distribution by Anatomy Site', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
from matplotlib.ticker import PercentFormatter

# Share of tumour rows per age group, drawn as a line plot and saved as SVG.

# Left-closed age bins: [0, 10), [10, 20), ..., [60, 200).
age_bins = [0, 10, 20, 30, 40, 50, 60, 200]
age_labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60+']

# Adds an 'Age_Group' column to the shared dataframe.
downstream_df['Age_Group'] = pd.cut(downstream_df['Age'], bins=age_bins, labels=age_labels, right=False)

# Per age group: sum of the 'Tumor' column and number of non-missing 'Tumor'
# values. observed=False is spelled out so that every age group keeps a row even
# when it is empty, independent of the pandas default for categorical keys.
age_tumor_stats = (
    downstream_df.groupby('Age_Group', observed=False)['Tumor']
    .agg(Tumor_Count='sum', Total_Count='count')
    .reset_index()
)
# Stored as a fraction in [0, 1]; the axis formatter below renders it as a percentage.
age_tumor_stats['Tumor_Percentage'] = age_tumor_stats['Tumor_Count'] / age_tumor_stats['Total_Count']

fig = plt.figure(figsize=(10, 5))
sns.set_style("whitegrid")
plot = sns.lineplot(
    data=age_tumor_stats,
    x='Age_Group',
    y='Tumor_Percentage',
    marker='o'
)
plot.set_ylim(0, 1)
# Show the 0-1 fractions as 0%-100% so the ticks agree with the axis label.
plot.yaxis.set_major_formatter(PercentFormatter(xmax=1))
plot.set_ylabel('Tumor Percentage')
plot.set_xlabel('Age')
fig.savefig("../../visualizations/data/downstream/tumor_percentage_per_age.svg")