# Exploratory Data Analysis - BTXRD

In [None]:
import pandas as pd

# Lift pandas' display limits so wide tables and long cell values are shown in full.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Load the BTXRD metadata sheet and preview its first rows.
btxrd_df = pd.read_excel('~/btxrd_dataset/BTXRD/dataset.xlsx')
btxrd_df.head(5)

In [None]:
# Print the column index, then the number of rows, each on its own line.
print(btxrd_df.columns, len(btxrd_df), sep='\n')

In [None]:
def convert_one_hot_into_categorical(df, one_hot_columns, category_column_name):
    """Collapse a group of one-hot columns into a single label column.

    The frame is modified in place: the label column is added and the
    one-hot columns are dropped.

    Args:
        df: DataFrame holding the one-hot columns.
        one_hot_columns: Names of the columns that form one one-hot group.
        category_column_name: Name of the label column to create.

    Returns:
        A tuple ``(df, n_unspecified, n_multiple)``: the same (mutated)
        frame, the number of rows whose flags sum to 0 (labelled
        ``'undefined'``) and the number of rows whose flags sum to more
        than 1 (labelled with the comma-joined names of the set columns).
    """
    df[category_column_name] = None

    unspecified_count = 0
    ambiguous_count = 0

    for idx, record in df.iterrows():
        flag_total = sum(record[name] for name in one_hot_columns)
        set_columns = [name for name in one_hot_columns if record[name] == 1]

        if flag_total == 0:
            unspecified_count += 1
            label = 'undefined'
        elif flag_total > 1:
            ambiguous_count += 1
            label = ', '.join(set_columns)
        elif set_columns:
            label = set_columns[0]
        else:
            # No flag equals 1 although the sum is neither 0 nor > 1: keep None.
            continue

        df.at[idx, category_column_name] = label

    # The one-hot columns are redundant once the label column exists.
    df.drop(columns=one_hot_columns, inplace=True)

    return df, unspecified_count, ambiguous_count


In [None]:
# One-hot anatomy site columns of the metadata sheet
anatomy_sites = [
    'hand', 'ulna', 'radius', 'humerus', 'foot',
    'tibia', 'fibula', 'femur', 'hip bone',
    'ankle-joint', 'knee-joint', 'hip-joint',
    'wrist-joint', 'elbow-joint', 'shoulder-joint',
]

# Collapse them into a single 'anatomy_site' column.
# The helper edits btxrd_df in place and returns that same frame.
anatomy_site_conversion = convert_one_hot_into_categorical(btxrd_df, anatomy_sites, 'anatomy_site')
btxrd_categorized_df = anatomy_site_conversion[0]
no_anatomy_site_encoding_errors = anatomy_site_conversion[1]
multiple_anatomy_site_encoding_errors = anatomy_site_conversion[2]

print(f"Found {no_anatomy_site_encoding_errors} nothing specifed errors.")
print(f"Found {multiple_anatomy_site_encoding_errors} multiple specified errors.")
btxrd_categorized_df.head()

In [None]:
# Columns needed to compare the anatomy site label with the tumor flag.
site_vs_tumor_columns = ['image_id', 'anatomy_site', 'tumor']

# Build the mask from btxrd_categorized_df itself instead of btxrd_df, so the
# cell does not depend on both names pointing at the same object.
anatomy_site_is_undefined = btxrd_categorized_df['anatomy_site'] == 'undefined'

unknown_anatomy_site_df = btxrd_categorized_df.loc[anatomy_site_is_undefined, site_vs_tumor_columns]
# check if all of them are without tumor
print(unknown_anatomy_site_df['tumor'].value_counts()) # -> yes they are
print(unknown_anatomy_site_df.head())

# check if all known anatomy sites are with tumor
known_anatomy_site_df = btxrd_categorized_df.loc[~anatomy_site_is_undefined, site_vs_tumor_columns]
print(known_anatomy_site_df['tumor'].value_counts()) # -> yes they are

In [None]:
# Rows whose anatomy_site label contains a comma, i.e. several sites were flagged.
# The label column is read from btxrd_categorized_df (the frame the conversion
# returned) rather than via the btxrd_df alias; ',' is matched literally.
has_multiple_sites = btxrd_categorized_df['anatomy_site'].str.contains(',', na=False, regex=False)
multiple_site_df = btxrd_categorized_df[has_multiple_sites]

# Display count of multiple site occurrences
print(f"Total entries with multiple anatomy sites: {len(multiple_site_df)}")

# Count the frequency of each combination
multiple_site_counts = multiple_site_df['anatomy_site'].value_counts()
print("\nFrequency of each multiple site combination:")
print(multiple_site_counts)

multiple_site_df.head()


> Note: A check of the first few images with multiple anatomy sites shows that they do in fact contain all of the listed bones.

In [None]:
# Collapse the one-hot tumor type columns into a single 'tumor_type' column
tumor_types = ['osteochondroma',
       'multiple osteochondromas', 'simple bone cyst', 'giant cell tumor',
       'osteofibroma', 'synovial osteochondroma', 'other bt', 'osteosarcoma',
       'other mt']

btxrd_categorized_df, no_tumor_type_encoding_errors, multiple_tumor_type_encoding_errors = convert_one_hot_into_categorical(
    btxrd_categorized_df, tumor_types, 'tumor_type'
)


print(f"Found {no_tumor_type_encoding_errors} nothing specifed errors.")
print(f"Found {multiple_tumor_type_encoding_errors} multiple specified errors.")

tumor_type_columns = ['image_id', 'tumor_type', 'tumor']
tumor_type_is_undefined = btxrd_categorized_df['tumor_type'] == 'undefined'

# check if all rows with an undefined tumor type are without tumor
unknown_tumor_type_df = btxrd_categorized_df.loc[tumor_type_is_undefined, tumor_type_columns]
print("\nTumor type undefined:")
print(unknown_tumor_type_df['tumor'].value_counts()) # -> yes they are
# check if all known tumor types are with tumor
known_tumor_type_df = btxrd_categorized_df.loc[~tumor_type_is_undefined, tumor_type_columns]
print("\nTumor type defined:")
print(known_tumor_type_df['tumor'].value_counts()) # -> yes they are

# Rows whose tumor_type label contains a comma, i.e. several tumor types were flagged.
# Read from btxrd_categorized_df, the frame that is known to carry 'tumor_type'.
has_multiple_tumor_types = btxrd_categorized_df['tumor_type'].str.contains(',', na=False, regex=False)
multiple_site_df = btxrd_categorized_df[has_multiple_tumor_types]

# Display count of rows with several tumor types (the message previously said "anatomy sites")
print(f"\nTotal entries with multiple tumor types: {len(multiple_site_df)}")

# Count the frequency of each combination
multiple_site_counts = multiple_site_df['tumor_type'].value_counts()
print("\nFrequency of each multiple tumor type combination:")
print(multiple_site_counts)

In [None]:
# Preview the first five rows after the tumor type conversion.
btxrd_categorized_df.head(5)

In [None]:
# Collapse the one-hot columns below into a single 'shooting_angle' column.
# NOTE(review): 'upper limb' / 'lower limb' / 'pelvis' read like body regions rather
# than shooting angles, while the 'frontal' / 'lateral' / 'oblique' group handled
# elsewhere in this notebook reads like view angles - confirm the naming is intended.
shooting_angles = ['upper limb', 'lower limb', 'pelvis']
btxrd_categorized_df, no_shooting_angle_encoding_errors, multiple_shooting_angle_encoding_errors = convert_one_hot_into_categorical(
    btxrd_categorized_df, shooting_angles, 'shooting_angle'
)

# Author's observation from the original run: no categorization errors
print(f"Found {no_shooting_angle_encoding_errors} nothing specifed errors.")
print(f"Found {multiple_shooting_angle_encoding_errors} multiple specified errors.")

shooting_angle_columns = ['image_id', 'shooting_angle', 'tumor']
shooting_angle_is_undefined = btxrd_categorized_df['shooting_angle'] == 'undefined'

# tumor flag distribution for rows with an undefined shooting angle
unknown_shooting_angle_df = btxrd_categorized_df.loc[shooting_angle_is_undefined, shooting_angle_columns]
print("\nShooting angle undefined:")
print(unknown_shooting_angle_df['tumor'].value_counts())
# tumor flag distribution for rows with a defined shooting angle
known_shooting_angle_df = btxrd_categorized_df.loc[~shooting_angle_is_undefined, shooting_angle_columns]
print("\nShooting angle defined:")
print(known_shooting_angle_df['tumor'].value_counts())

# Rows whose shooting_angle label contains a comma, i.e. several values were flagged.
# Read from btxrd_categorized_df, the frame that is known to carry 'shooting_angle'.
has_multiple_shooting_angles = btxrd_categorized_df['shooting_angle'].str.contains(',', na=False, regex=False)
multiple_site_df = btxrd_categorized_df[has_multiple_shooting_angles]
# Display count of rows with several shooting angles
print(f"\nTotal entries with multiple shooting angles: {len(multiple_site_df)}")
# Count the frequency of each combination
multiple_site_counts = multiple_site_df['shooting_angle'].value_counts()
print("\nFrequency of each multiple site combination:")
print(multiple_site_counts)

In [None]:
# Collapse the one-hot locale columns into a single 'locale' column
locales = ['frontal', 'lateral', 'oblique']
btxrd_categorized_df, no_locale_encoding_errors, multiple_locale_encoding_errors = convert_one_hot_into_categorical(
    btxrd_categorized_df, locales, 'locale'
)


# Author's observation from the original run: no categorization errors
print(f"Found {no_locale_encoding_errors} nothing specifed errors.")
print(f"Found {multiple_locale_encoding_errors} multiple specified errors.")

locale_columns = ['image_id', 'locale', 'tumor']
locale_is_undefined = btxrd_categorized_df['locale'] == 'undefined'

# check if all rows with an undefined locale are without tumor
unknown_locale_df = btxrd_categorized_df.loc[locale_is_undefined, locale_columns]
print("\nLocale undefined:")
print(unknown_locale_df['tumor'].value_counts()) # -> yes they are
# check if all known locales are with tumor
known_locale_df = btxrd_categorized_df.loc[~locale_is_undefined, locale_columns]
print("\nLocale defined:")
print(known_locale_df['tumor'].value_counts()) # -> yes they are

# Rows whose locale label contains a comma, i.e. several locales were flagged.
# Read from btxrd_categorized_df, the frame that is known to carry 'locale'.
has_multiple_locales = btxrd_categorized_df['locale'].str.contains(',', na=False, regex=False)
multiple_site_df = btxrd_categorized_df[has_multiple_locales]
# Display count of rows with several locales
print(f"\nTotal entries with multiple locales: {len(multiple_site_df)}")
# Count the frequency of each combination
multiple_site_counts = multiple_site_df['locale'].value_counts()
print("\nFrequency of each multiple site combination:")
print(multiple_site_counts)

In [None]:
# Preview the first five rows once all one-hot groups have been collapsed.
btxrd_categorized_df.head(5)

In [None]:
# Summary statistics of the categorized frame, using describe()'s default column selection.
btxrd_categorized_df.describe()

In [None]:
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt

# Set the figure style
sns.set(style="whitegrid")

# Create histogram with KDE for age
plt.figure(figsize=(12, 6))
sns.histplot(data=btxrd_categorized_df, x='age', kde=True)
plt.title('Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

# For categorical columns, create bar plots
columns_to_plot = ['center', 'gender', 'tumor', 'benign', 'malignant',
                   'anatomy_site', 'tumor_type', 'shooting_angle', 'locale']

# Percentages are relative to the number of rows in the whole frame.
total = len(btxrd_categorized_df)

for col in columns_to_plot:
    # Get value counts and convert to DataFrame for proper seaborn plotting
    value_counts = btxrd_categorized_df[col].value_counts().reset_index()
    value_counts.columns = [col, 'count']

    # Calculate percentages
    value_counts['percentage'] = value_counts['count'] / total * 100

    # Sort by count in descending order; renumber rows so the index is the bar position
    value_counts = value_counts.sort_values('count', ascending=False).reset_index(drop=True)

    plt.figure(figsize=(12, 6))
    # Pin the bar order explicitly: without `order`, seaborn sorts numeric
    # categories by value, which need not match the count-sorted rows.
    ax = sns.barplot(x=col, y='count', data=value_counts,
                     order=value_counts[col].tolist())

    # Add count and percentage on top of bars. Iterating the two columns
    # (instead of iterrows) keeps the counts as integers in the label.
    bar_labels = zip(value_counts['count'], value_counts['percentage'])
    for position, (count, percentage) in enumerate(bar_labels):
        ax.text(position, count + 5,
                f"{count} ({percentage:.1f}%)",
                ha='center', fontsize=9)

    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')

    # Rotate x-axis labels if needed
    if len(value_counts) > 5 or col in ['anatomy_site', 'tumor_type']:
        plt.xticks(rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

In [None]:
# Start a fresh figure for the per-center chart
plt.figure(figsize=(12, 6))

# Rows: centers, columns: tumor flag, values: number of images
tumor_counts = (
    btxrd_categorized_df
    .groupby(['center', 'tumor'])
    .size()
    .unstack(fill_value=0)
)

# Use readable legend labels, but only when both flag values occur
if all(flag in tumor_counts.columns for flag in (0, 1)):
    tumor_counts = tumor_counts.rename(columns={0: 'No Tumor', 1: 'Tumor'})

# Grouped bars drawn on the current axes
tumor_counts.plot(kind='bar', ax=plt.gca())

plt.xlabel('Center')
plt.ylabel('Count')
plt.title('Distribution of Tumor vs No Tumor by Center')

# First pass: absolute counts just above each bar
for i, center in enumerate(tumor_counts.index):
    for j, col in enumerate(tumor_counts.columns):
        count = tumor_counts.loc[center, col]
        label_x = i + (j-0.5) * 0.3
        plt.text(label_x, count + 5, str(count),
                 ha='center', va='bottom', fontsize=9)

# Second pass: share within the center, written halfway up each bar
for i, center in enumerate(tumor_counts.index):
    total = tumor_counts.loc[center].sum()
    for j, col in enumerate(tumor_counts.columns):
        count = tumor_counts.loc[center, col]
        percentage = (count / total) * 100
        label_x = i + (j-0.5) * 0.3
        plt.text(label_x, count/2, f"{percentage:.1f}%",
                 ha='center', va='center', fontsize=9, color='white', fontweight='bold')

plt.tight_layout()
plt.legend(title='Status')
plt.show()

In [None]:
# Sanity check for tumor, benign, and malignant:
# every row flagged tumor == 1 must have exactly one of benign / malignant set.
# Select rows and columns in one .loc call and cast the result, instead of
# assigning into a chained-indexing copy (which raises SettingWithCopyWarning).
has_tumor = btxrd_categorized_df['tumor'] == 1
benign_malignant_tumor_df = btxrd_categorized_df.loc[has_tumor, ['benign', 'malignant']].astype(int)

# Number of the two flags set per row; 1 means exactly one of them
flags_set = benign_malignant_tumor_df['benign'] + benign_malignant_tumor_df['malignant']

if (flags_set == 1).all():
    print("Sanitiy check passed: benign and malignant are mutually exclusive.")
else:
    print("Sanitiy check failed: benign and malignant are NOT mutually exclusive.")
    # Display the rows that violate the condition
    print(benign_malignant_tumor_df[flags_set != 1])

In [None]:
import json
import os
import seaborn as sns
import pandas as pd

import matplotlib.pyplot as plt
from tqdm import tqdm  # For progress bar

# Keep only the rows flagged as containing a tumor
tumor_images = btxrd_categorized_df[btxrd_categorized_df['tumor'] == 1]
print(f"Found {len(tumor_images)} images with tumor=1")

# One record per image (dimensions) and one per rectangle shape (bounding box)
image_data = []
bbox_data = []

for image_id in tumor_images['image_id']:
    # The annotation file is named after the text before the first '.' of the image id
    annotation_name = f"{str(image_id).split('.')[0]}.json"
    json_path = os.path.join('~/btxrd_dataset/BTXRD/Annotations', annotation_name)

    try:
        with open(os.path.expanduser(json_path), 'r') as f:
            data = json.load(f)

        # Image dimensions are recorded only when both values are present and non-zero
        img_height = data.get('imageHeight')
        img_width = data.get('imageWidth')
        if img_height and img_width:
            image_data.append({
                'image_id': image_id,
                'width': img_width,
                'height': img_height,
                'area': img_height * img_width
            })

        # Every rectangle shape given by exactly two points becomes a bounding box record
        for shape in data.get('shapes', []):
            if shape.get('shape_type') != 'rectangle':
                continue
            points = shape.get('points', [])
            if len(points) != 2:
                continue

            x1, y1 = points[0]
            x2, y2 = points[1]
            bbox_width = abs(x2 - x1)
            bbox_height = abs(y2 - y1)

            bbox_data.append({
                'image_id': image_id,
                'width': bbox_width,
                'height': bbox_height,
                'area': bbox_width * bbox_height
            })

    except Exception as e:
        # Best effort: report the failing image and carry on with the next one
        print(f"Error processing {image_id}: {e}")

# Turn the collected records into dataframes
image_df = pd.DataFrame(image_data)
bbox_df = pd.DataFrame(bbox_data)

print(f"Extracted dimensions for {len(image_df)} images and {len(bbox_df)} bounding boxes.")

> NOTE: there are images with multiple bounding boxes!

In [None]:
# Bounding boxes whose image_id occurs more than once, i.e. images with several boxes
shares_image_with_other_box = bbox_df['image_id'].duplicated(keep=False)
multiple_bbox_df = bbox_df[shares_image_with_other_box]
print(f"Found {len(multiple_bbox_df)} bounding boxes for images with multiple bounding boxes.")

# Number of boxes per affected image id
multiple_bbox_counts = multiple_bbox_df['image_id'].value_counts()
print("\nFrequency of images with multiple bounding boxes:")
print(multiple_bbox_counts)

# Preview the first ten of these boxes
multiple_bbox_df.head(10)

In [None]:
def _plot_dimension_histograms(frame, panels):
    """Draw one figure with a KDE histogram per (column, color, title, xlabel) panel."""
    plt.figure(figsize=(18, 6))
    for panel_number, (column, color, title, xlabel) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), panel_number)
        sns.histplot(frame[column], kde=True, color=color)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Count')
    plt.tight_layout()
    plt.show()


# Image width / height / area
_plot_dimension_histograms(image_df, [
    ('width', 'blue', 'Distribution of Image Width', 'Width (pixels)'),
    ('height', 'green', 'Distribution of Image Height', 'Height (pixels)'),
    ('area', 'red', 'Distribution of Image Area', 'Area (pixels²)'),
])

# Bounding box width / height / area
_plot_dimension_histograms(bbox_df, [
    ('width', 'purple', 'Distribution of Bounding Box Width', 'Width (pixels)'),
    ('height', 'orange', 'Distribution of Bounding Box Height', 'Height (pixels)'),
    ('area', 'brown', 'Distribution of Bounding Box Area', 'Area (pixels²)'),
])

# Share of the image covered by each box: look up the image area by image_id
area_by_image_id = image_df.set_index('image_id')['area']
bbox_df['image_area'] = bbox_df['image_id'].map(area_by_image_id)
bbox_df['ratio'] = bbox_df['area'] / bbox_df['image_area']

ratio_mean = bbox_df['ratio'].mean()
ratio_median = bbox_df['ratio'].median()

plt.figure(figsize=(10, 6))
sns.histplot(bbox_df['ratio'], kde=True, color='teal')
plt.title('Distribution of Bounding Box to Image Area Ratio')
plt.xlabel('Ratio (bbox area / image area)')
plt.ylabel('Count')
plt.axvline(ratio_mean, color='red', linestyle='--',
            label=f'Mean: {ratio_mean:.4f}')
plt.axvline(ratio_median, color='green', linestyle='-.',
            label=f'Median: {ratio_median:.4f}')
plt.legend()
plt.show()

# Summary statistics for both tables
size_columns = ['width', 'height', 'area']
print("Image Dimensions Statistics:")
print(image_df[size_columns].describe())

print("\nBounding Box Dimensions Statistics:")
print(bbox_df[size_columns].describe())

## Example images

In [None]:
import os
import json
import numpy as np
from PIL import Image

import matplotlib.pyplot as plt
import matplotlib.patches as patches

def _tumor_status_label(img_row):
    """Return the title text for a metadata row from its tumor / benign / malignant flags."""
    if img_row['tumor'] != 1:
        return "No Tumor"
    if img_row['benign'] == 1:
        return "Benign Tumor"
    if img_row['malignant'] == 1:
        return "Malignant Tumor"
    return "Tumor (Unknown Type)"


def _draw_annotation_shapes(ax, annotation):
    """Overlay the shapes of a parsed annotation dict: red rectangle outlines, yellow polygon fills."""
    for shape in annotation.get('shapes', []):
        points = shape.get('points', [])
        shape_type = shape.get('shape_type', '')

        if shape_type == 'rectangle' and len(points) == 2:
            x1, y1 = points[0]
            x2, y2 = points[1]
            rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                     linewidth=2, edgecolor='r', facecolor='none')
            ax.add_patch(rect)
        elif shape_type == 'polygon' and len(points) > 2:
            # Convert points to numpy array so x and y can be sliced as columns
            poly_points = np.array(points)
            ax.fill(poly_points[:, 0], poly_points[:, 1], color='yellow', alpha=0.3)


def display_images_for_anatomy_site(site):
    """Display up to two example images for a given anatomy site with annotations.

    Rows are taken from the module-level ``btxrd_categorized_df``. One benign
    and one malignant example are preferred; remaining slots are filled with
    random rows of the same site. Each image is drawn with the rectangle and
    polygon shapes found in its annotation JSON. Load errors are printed and
    do not abort the remaining subplots.

    Args:
        site: Value of the ``anatomy_site`` column to show examples for.
    """
    # Get images for this anatomy site
    site_images = btxrd_categorized_df[btxrd_categorized_df['anatomy_site'] == site]

    if len(site_images) == 0:
        print(f"No images found for anatomy site: {site}")
        return

    # Try to get one benign and one malignant if possible
    benign_images = site_images[site_images['benign'] == 1]
    malignant_images = site_images[site_images['malignant'] == 1]

    selected_images = []

    # Add a benign example if available
    if len(benign_images) > 0:
        selected_images.append(benign_images.sample(1).iloc[0])

    # Add a malignant example if available (skipped when the benign pick is itself in the malignant set)
    if len(malignant_images) > 0 and (len(selected_images) == 0 or
                                      selected_images[0]['image_id'] not in malignant_images['image_id'].values):
        selected_images.append(malignant_images.sample(1).iloc[0])

    # If we don't have 2 examples yet, fill with random ones
    remaining_images = site_images[~site_images['image_id'].isin([img['image_id'] for img in selected_images])]
    while len(selected_images) < 2 and len(remaining_images) > 0:
        selected_images.append(remaining_images.sample(1).iloc[0])
        remaining_images = remaining_images[~remaining_images['image_id'].isin([img['image_id'] for img in selected_images])]

    if not selected_images:
        return

    fig, axes = plt.subplots(1, len(selected_images), figsize=(15, 6))

    # Ensure axes is always a list-like object even if there's only one subplot
    if len(selected_images) == 1:
        axes = [axes]

    for img_row, ax in zip(selected_images, axes):
        img_id = img_row['image_id']
        tumor_status = _tumor_status_label(img_row)

        img_path = os.path.join('~/btxrd_dataset/BTXRD/images', img_id)

        try:
            # The context manager closes the image file once the pixels have been drawn
            with Image.open(os.path.expanduser(img_path)) as img:
                ax.imshow(img, cmap='gray')
        except Exception as e:
            print(f"Error loading image {img_id}: {e}")
            ax.text(0.5, 0.5, f"Error loading image: {img_id}",
                    horizontalalignment='center', verticalalignment='center')
            ax.axis('off')
            continue

        # Annotation file is named after the text before the first '.' of the image id
        json_path = os.path.join('~/btxrd_dataset/BTXRD/Annotations', f"{str(img_id).split('.')[0]}.json")

        # A missing or malformed annotation is reported, the image is still shown
        try:
            with open(os.path.expanduser(json_path), 'r') as f:
                annotation = json.load(f)
            _draw_annotation_shapes(ax, annotation)
        except Exception as e:
            print(f"Error loading annotation for {img_id}: {e}")

        ax.set_title(f"{tumor_status}\n{img_id}")
        ax.axis('off')

    plt.suptitle(f"Anatomy Site: {site}", fontsize=16)
    plt.tight_layout()
    plt.subplots_adjust(top=0.85)
    plt.show()

# Keep only labels that name a single site; combined labels contain a comma
valid_anatomy_sites = []
for site_label in btxrd_categorized_df['anatomy_site'].unique():
    if ',' in site_label:
        continue
    valid_anatomy_sites.append(site_label)

# Show example images for every single-site label
for site in valid_anatomy_sites:
    display_images_for_anatomy_site(site)

## Anatomy site of healthy samples

In [None]:
# Load the healthy anatomy site sheet and normalise the 'choice' labels to lower case
healthy_anatomy_sites_df = pd.read_excel('~/btxrd_dataset/BTXRD/BTXRD_healthy_anatomy_sites.xlsx')
lowercase_choices = healthy_anatomy_sites_df['choice'].str.lower()
healthy_anatomy_sites_df = healthy_anatomy_sites_df.assign(choice=lowercase_choices)


# Show the first 10 rows
print("First 10 samples from BTXRD_healthy_anatomy_sites.xlsx:")
healthy_anatomy_sites_df.head(10)

In [None]:
# Frequency table of the 'choice' labels: one row per anatomy site
healthy_site_counts = healthy_anatomy_sites_df['choice'].value_counts().reset_index()
healthy_site_counts.columns = ['anatomy_site', 'count']

# Share of each site among all rows of the healthy sheet
total_healthy = len(healthy_anatomy_sites_df)
healthy_site_counts['percentage'] = healthy_site_counts['count'] / total_healthy * 100

# Most frequent site first
healthy_site_counts = healthy_site_counts.sort_values('count', ascending=False)

# Bar chart of the counts
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='anatomy_site', y='count', data=healthy_site_counts)

# Write "count (percentage%)" above each bar; the row index is used as x position
bar_labels = zip(
    healthy_site_counts.index,
    healthy_site_counts['count'],
    healthy_site_counts['percentage'],
)
for i, count, percentage in bar_labels:
    ax.text(i, count + 5,
            f"{count} ({percentage:.1f}%)",
            ha='center', fontsize=9)

plt.title('Distribution of Healthy Anatomy Sites')
plt.xlabel('Anatomy Site')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# List the labels used in the healthy sheet for comparison with the dataset labels
unique_healthy_sites = sorted(healthy_anatomy_sites_df['choice'].unique())
print("Mapping between healthy annotations and dataset annotations:")
print("Healthy dataset unique sites:", unique_healthy_sites)

In [None]:
def get_combined_anatomy_site_category(anatomy_sites: list[str]) -> str:
    """Map anatomy sites of the INTERNAL or the BTXRD dataset to a common category.

    A list is accepted because the BTXRD dataset sometimes gives several
    anatomy sites for one sample. The sites are joined with ``", "`` in the
    given order and looked up as one key, so the order must match the keys of
    the mapping below.

    Args:
        anatomy_sites: Site names from either dataset; may be empty.

    Returns:
        The common category, or ``"undefined"`` for an empty list.

    Raises:
        KeyError: If the (joined) site label is not part of the mapping.
            A warning is printed before the error is re-raised.
    """
    if len(anatomy_sites) == 0:
        return "undefined"

    # see https://www.notion.so/Combining-BTXRD-and-INTERNAL-1e0bfa858aa980dbad78d2a1f3a29ce4?pvs=4#1ecbfa858aa980859fa7d88da1238f1c
    mapping = {
        "Clavicula": "shoulder",
        "Scapula": "shoulder",
        "shoulder-joint": "shoulder",

        "Humerus": "upper arm",
        "humerus": "upper arm",
        "humerus, shoulder-joint": "upper arm",

        "elbow-joint": "elbow",
        "Ulna": "lower arm",
        "ulna": "lower arm",
        "Radius": "lower arm",
        "radius": "lower arm",
        "ulna, radius": "lower arm",
        "hand, radius": "lower arm",
        "hand, ulna, radius": "lower arm",

        "hand": "hand",
        "wrist-joint": "hand",
        "Manus": "hand",

        "Columna vertebralis": "spine",

        "Os pubis": "hip",
        "Os ischii": "hip",
        "Os sacrum": "hip",
        "Os ilium": "hip",
        "hip-joint": "hip",
        "hip bone": "hip",
        "hip bone, hip-joint": "hip",

        "Femur": "upper leg",
        "femur": "upper leg",
        "femur, hip bone": "upper leg",

        "Patella": "knee",
        "knee-joint": "knee",

        "Tibia": "lower leg",
        "Fibula": "lower leg",
        "tibia": "lower leg",
        "fibula": "lower leg",
        "ankle-joint": "lower leg",
        "tibia, fibula": "lower leg",
        "foot, tibia, fibula": "lower leg",

        "Pes": "foot",
        "foot": "foot",
        "foot, ankle-joint": "foot",

        "tibia, fibula, femur": "leg",
        "tibia, femur": "leg",
        "fibula, femur": "leg",
        "tibia, fibula, femur, hip bone": "leg",
        "tibia, fibula, hip bone": "leg",

        "ulna, radius, humerus": "arm",
        "ulna, humerus": "arm",
        "radius, humerus": "arm",
    }

    # A single site joins to itself, so one lookup covers single and combined
    # labels alike; the former special case for one element was unreachable.
    anatomy_site = ", ".join(anatomy_sites)
    try:
        return mapping[anatomy_site]
    except KeyError:
        print(f"Warning: Anatomy site '{anatomy_sites}' not found in mapping.")
        raise

def get_anatomy_site_from_sample(sample):
    """Return the combined anatomy site category of one raw BTXRD metadata row.

    Args:
        sample: Row (label-indexable, e.g. a pandas Series) that still holds
            the one-hot anatomy site columns listed below.

    Returns:
        The category produced by ``get_combined_anatomy_site_category`` for
        the columns whose value equals 1, in the column order below.
    """
    anatomy_sites = [
        'hand', 'ulna', 'radius', 'humerus', 'foot',
        'tibia', 'fibula', 'femur', 'hip bone',
        'ankle-joint', 'knee-joint', 'hip-joint',
        'wrist-joint', 'elbow-joint', 'shoulder-joint',
    ]
    # Names of the one-hot columns that are set to 1 for this sample
    site_flags = sample[anatomy_sites]
    flagged_sites = site_flags.index[site_flags == 1].tolist()
    return get_combined_anatomy_site_category(flagged_sites)

In [None]:
# Reload the raw sheet: the one-hot anatomy site columns are read again below
btxrd_df = pd.read_excel('~/btxrd_dataset/BTXRD/dataset.xlsx')


# Combined category for every tumor sample, and the labels of the healthy sheet
tumor_samples = btxrd_df[btxrd_df['tumor'] == 1]
anatomy_sites_tumor = list(
    map(get_anatomy_site_from_sample, (sample for _, sample in tumor_samples.iterrows()))
)
anatomy_sites_healthy = healthy_anatomy_sites_df['choice'].tolist()

# Both lists together must account for every row of the dataset
labelled_sample_count = len(anatomy_sites_healthy) + len(anatomy_sites_tumor)
assert labelled_sample_count == len(btxrd_df), "The number of anatomy sites from healthy and tumor samples does not match the total number of samples in the dataset."

In [None]:
# Frequency of every label in the healthy and in the tumor list
healthy_counts = pd.Series(anatomy_sites_healthy).value_counts()
tumor_counts = pd.Series(anatomy_sites_tumor).value_counts()

# Union of the labels of both groups, sorted
all_sites = sorted(set(healthy_counts.index).union(tumor_counts.index))

# One row per site with both counts; a site missing in one group counts as 0
combined_counts = pd.DataFrame(
    {
        'Healthy': [healthy_counts.get(site, 0) for site in all_sites],
        'Tumor': [tumor_counts.get(site, 0) for site in all_sites],
    },
    index=all_sites,
)

# Order the rows by their total (largest first); the helper column is dropped again
combined_counts['Total'] = combined_counts.sum(axis=1)
combined_counts = combined_counts.sort_values('Total', ascending=False).drop('Total', axis=1)

# Stacked bars: healthy at the bottom, tumor on top
fig, ax = plt.subplots(figsize=(14, 8))
combined_counts.plot(kind='bar', stacked=True, ax=ax, color=['#2ecc71', '#e74c3c'])

ax.set_title('Distribution of Anatomy Sites: Healthy vs. Tumor', fontsize=16)
ax.set_xlabel('Anatomy Site', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.legend(title='Category')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Write each non-zero count into the middle of its bar segment
segment_label_style = dict(ha='center', va='center', color='black', fontweight='bold')
for i, site in enumerate(combined_counts.index):
    healthy_count = combined_counts.loc[site, 'Healthy']
    tumor_count = combined_counts.loc[site, 'Tumor']
    total = healthy_count + tumor_count

    if healthy_count > 0:
        ax.text(i, healthy_count/2, f"{int(healthy_count)}", **segment_label_style)

    if tumor_count > 0:
        # The tumor segment starts where the healthy segment ends
        ax.text(i, healthy_count + tumor_count/2, f"{int(tumor_count)}", **segment_label_style)

plt.show()