# Exploratory Data Analysis - Internal Dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Spreadsheet that lists every included patient; it is read into one DataFrame.
file_path = '~/DATA/included_patients.xlsx'
included_patients_df = pd.read_excel(io=file_path)

In [None]:
# Lift pandas' column display limit so no column is elided, then preview
# the first ten rows of the table.
pd.options.display.max_columns = None
included_patients_df.iloc[:10]

In [None]:
# Show the column labels of the loaded patient table.
included_patients_df.columns

In [None]:
# Distinct pseudo accession numbers, ignoring rows where the value is missing.
accnr_series = included_patients_df['pseudo accnr']
unique_pseudo_accnrs = accnr_series[accnr_series.notna()].unique()

# Report how many distinct values exist, then list them one per line.
n_unique_accnrs = len(unique_pseudo_accnrs)
print(f"Number of unique pseudo accnr values: {n_unique_accnrs}")
print("\nUnique pseudo accnr values:")
for value in unique_pseudo_accnrs:
    print(f"- {value}")

In [None]:
# Categorical columns whose distinct values are inspected below.
columns_of_interest = ['entity', 'side', 'localisation_1', 'epi', 'direction']

# First pass: print the distinct values of every column. All headers except
# the very first are preceded by a blank line.
for position, column in enumerate(columns_of_interest):
    separator = "" if position == 0 else "\n"
    print(f"{separator}Unique {column} values:")
    print(included_patients_df[column].unique())

# Second pass: print how many distinct values each column holds.
# unique() keeps NaN, so a missing value counts as one distinct value here.
print("\nNumber of unique values:")
for column in columns_of_interest:
    n_distinct = len(included_patients_df[column].unique())
    print(f"{column}: {n_distinct}")

In [None]:
# Show pandas' default describe() summary for the patient table.
included_patients_df.describe()

## Sanity Check for Patients in Both Train and Test Sets

In [None]:
# Row masks selecting the two splits.
in_train = included_patients_df['dataset'] == 'train'
in_test = included_patients_df['dataset'] == 'test'

# Distinct patient numbers seen in each split.
train_patients = included_patients_df.loc[in_train, 'pat_nr'].unique()
test_patients = included_patients_df.loc[in_test, 'pat_nr'].unique()

# A patient number present in both splits means the split leaks patients.
patients_in_both = set(train_patients) & set(test_patients)

if not patients_in_both:
    print("✅ All clear! No patients appear in both train and test datasets.")
else:
    print(f"⚠️ ALERT: Found {len(patients_in_both)} patients with data in both train and test datasets!")
    print("\nPatients appearing in both datasets:")
    for patient in patients_in_both:
        # Count this patient's rows separately in each split.
        patient_rows = included_patients_df['pat_nr'] == patient
        train_count = len(included_patients_df[patient_rows & in_train])
        test_count = len(included_patients_df[patient_rows & in_test])
        print(f"- Patient {patient}: {train_count} entries in train, {test_count} entries in test")

# Per-split totals, printed regardless of the outcome above.
print(f"\nTotal patients in train dataset: {len(train_patients)}")
print(f"Total patients in test dataset: {len(test_patients)}")

In [None]:
def _format_percent(value):
    """Render a float with two decimals followed by a percent sign."""
    return f"{value:.2f}%"


# Rows per split, and the same figures as a share of all rows.
dataset_counts = included_patients_df['dataset'].value_counts()
dataset_percentages = dataset_counts.div(len(included_patients_df)).mul(100)

# Tabular summary: one line per split.
dataset_distribution = pd.DataFrame({
    'Dataset': dataset_counts.index,
    'Count': dataset_counts.values,
    'Percentage': dataset_percentages.values,
})

print("Dataset Distribution:")
print(dataset_distribution.to_string(index=False, float_format=_format_percent))

# Pie chart of the same distribution; the first wedge is slightly exploded.
pie_options = dict(
    labels=dataset_counts.index,
    autopct='%1.1f%%',
    colors=['skyblue', 'lightgreen'],
    explode=[0.05, 0],
    startangle=90,
    shadow=True,
)
plt.figure(figsize=(10, 6))
plt.pie(dataset_counts, **pie_options)
plt.title('Distribution of Datasets (Train vs Test)', fontsize=16)
plt.axis('equal')  # keep the pie circular
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Rows per sex value; the percentage is relative to ALL rows of the table.
sex_counts = included_patients_df['sex'].value_counts()
sex_percentages = sex_counts / len(included_patients_df) * 100

# Frame consumed by seaborn below.
sex_data = pd.DataFrame({
    'Sex': sex_counts.index,
    'Count': sex_counts.values,
    'Percentage': sex_percentages.values,
})

# Bar chart of the counts.
plt.figure(figsize=(8, 6))
sns.barplot(data=sex_data, x='Sex', y='Count', palette='pastel')

# Write each bar's percentage slightly above the bar.
for bar_position, (count, share) in enumerate(zip(sex_data['Count'], sex_data['Percentage'])):
    plt.text(bar_position, count + 10, f"{share:.1f}%", ha='center', fontsize=12)

plt.title('Distribution of Sex Attribute', fontsize=16)
plt.xlabel('Sex', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# How many rows each patient number has (value_counts ignores missing pat_nr).
patient_occurrences = included_patients_df['pat_nr'].value_counts()

# How many patients have 1, 2, 3, ... rows, ordered by the number of rows.
occurrence_counts = patient_occurrences.value_counts().sort_index()

# Frame consumed by seaborn below.
occurrence_data = pd.DataFrame({
    'Number of Occurrences': occurrence_counts.index,
    'Number of Patients': occurrence_counts.values
})

# Bar chart: x = rows per patient, y = number of patients with that many rows.
plt.figure(figsize=(10, 6))
sns.barplot(data=occurrence_data, x='Number of Occurrences', y='Number of Patients', palette='pastel')

# Write the exact patient count slightly above each bar.
for bar_position, n_patients in enumerate(occurrence_data['Number of Patients']):
    plt.text(bar_position, n_patients + 2, f"{n_patients}", ha='center', fontsize=12)

plt.title('Distribution of Patient Occurrences', fontsize=16)
plt.xlabel('Number of Occurrences per Patient', fontsize=14)
plt.ylabel('Number of Patients', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

# Total number of distinct patients. nunique() skips missing patient numbers,
# which keeps this total consistent with the distribution plotted above;
# len(unique()) would count NaN as one extra "patient".
total_patients = included_patients_df['pat_nr'].nunique()
print(f'Total number of patients: {total_patients}')

In [None]:
# Histogram (80 bins, with a KDE overlay) of the age at initial diagnosis.
age_at_diagnosis = included_patients_df['age_initialdiagnosis']
histogram_options = dict(
    kde=True,
    bins=80,
    color='skyblue',
    stat='count',
    edgecolor='black',
    linewidth=0.8,
)

plt.figure(figsize=(12, 6))
sns.histplot(age_at_diagnosis, **histogram_options)

# Titles, axis labels and tick sizes.
plt.title('Distribution of Age at Initial Diagnosis', fontsize=16)
plt.xlabel('Age at Initial Diagnosis', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Rows per entity as a two-column frame ('Entity', 'Count'), largest first.
entity_counts = (
    included_patients_df['entity']
    .value_counts()
    .rename_axis('Entity')
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

# Horizontal bar chart, one bar per entity.
plt.figure(figsize=(14, 10))
bar_plot = sns.barplot(data=entity_counts, x='Count', y='Entity', palette='viridis')

# Write each count just to the right of its bar.
for row_position, count in enumerate(entity_counts['Count']):
    bar_plot.text(count + 3, row_position, str(count), va='center', fontsize=10)

# Titles, axis labels and tick sizes.
plt.title('Distribution of Entity Types', fontsize=16)
plt.xlabel('Count', fontsize=14)
plt.ylabel('Entity Type', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=11)
plt.tight_layout()
plt.show()

In [None]:
def _plot_value_counts(column, label):
    """Draw a horizontal bar chart of the value counts of one column.

    Reads ``included_patients_df[column]``, counts its values, and plots one
    bar per value with the count written next to it. ``label`` is used in the
    figure title and as the y-axis label.

    Returns the counts frame (columns 'Entity' and 'Count') and the Axes that
    seaborn drew on.
    """
    counts = included_patients_df[column].value_counts().reset_index()
    counts.columns = ['Entity', 'Count']
    counts = counts.sort_values('Count', ascending=False)

    plt.figure(figsize=(14, 10))
    axis = sns.barplot(data=counts, x='Count', y='Entity', palette='viridis')

    # Write each count just to the right of its bar.
    for row_position, count in enumerate(counts['Count']):
        axis.text(count + 3, row_position, str(count), va='center', fontsize=10)

    plt.title(f'Distribution of {label}', fontsize=16)
    plt.xlabel('Count', fontsize=14)
    plt.ylabel(label, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=11)
    plt.tight_layout()
    plt.show()
    return counts, axis


# One chart per localisation column, in this order. After the loop,
# entity_counts and bar_plot refer to the last chart (localisation_2).
localisation_columns = [
    ('localisation_old', 'Localisation Old'),
    ('localisation_1', 'Localisation 1'),
    ('localisation_2', 'Localisation 2'),
]
for column_name, axis_label in localisation_columns:
    entity_counts, bar_plot = _plot_value_counts(column_name, axis_label)

> NOTE: I should use Localisation 1!

In [None]:
# Rows per year of initial diagnosis, in chronological order.
year_counts = included_patients_df['year_initialdiagnosis'].value_counts().sort_index()

# Frame consumed by seaborn; years are cast to int so labels show no ".0".
year_data = year_counts.rename_axis('Year').reset_index(name='Number of Samples')
year_data['Year'] = year_data['Year'].astype(int)

# Single-colour bar chart, one bar per year.
plt.figure(figsize=(14, 8))
sns.barplot(data=year_data, x='Year', y='Number of Samples', color='skyblue')

# Write each count slightly above its bar.
for bar_position, n_samples in enumerate(year_data['Number of Samples']):
    plt.text(bar_position, n_samples + 5, str(n_samples), ha='center', fontsize=10)

# Titles, axis labels and tick sizes.
plt.title('Number of Samples per Year', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Samples', fontsize=14)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Count occurrences of each value in the 'side' column
import math
def _format_percent(value):
    """Render a float with two decimals followed by a percent sign."""
    return f"{value:.2f}%"


# Rows per 'side' value, keeping missing values as their own entry.
side_counts = included_patients_df['side'].value_counts(dropna=False)
n_rows = len(included_patients_df)

# Display label -> raw value looked up in side_counts. Only these four
# values are tabulated and plotted.
side_categories = [
    ('Left (li)', 'li'),
    ('Right (re)', 're'),
    ('Empty (blank)', ' '),
    ('NA/Undefined', math.nan),
]
category_counts = [side_counts.get(raw_value, 0) for _, raw_value in side_categories]

# Tabular summary: count and share of all rows per category.
side_distribution = pd.DataFrame({
    'Side': [label for label, _ in side_categories],
    'Count': category_counts,
    'Percentage': [count / n_rows * 100 for count in category_counts],
})

print(side_distribution.to_string(index=False, float_format=_format_percent))

# Bar chart of the same four categories.
plt.figure(figsize=(10, 6))
bars = plt.bar(side_distribution['Side'], side_distribution['Count'])

# Label every bar with its count and its share of all rows.
for rectangle in bars:
    bar_height = rectangle.get_height()
    bar_count = int(bar_height)
    bar_share = bar_count / len(included_patients_df) * 100
    plt.text(
        rectangle.get_x() + rectangle.get_width() / 2.,
        bar_height + 5,
        f'{bar_count}\n({bar_share:.1f}%)',
        ha='center', va='bottom', fontsize=10,
    )

plt.title('Distribution of Side Values', fontsize=16)
plt.ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Work on the first ten rows only.
first_10 = included_patients_df.head(10)

# Corner columns of the bounding box: top-left (xtl, ytl), bottom-right (xbr, ybr).
bbox_columns = ['xtl', 'ytl', 'xbr', 'ybr']

if set(bbox_columns).issubset(included_patients_df.columns):
    # Width/height are corner differences; the area is their product.
    box_width = first_10['xbr'] - first_10['xtl']
    box_height = first_10['ybr'] - first_10['ytl']
    dimensions = pd.DataFrame({
        'Sample': first_10.index,
        **{name: first_10[name] for name in bbox_columns},
        'Width': box_width,
        'Height': box_height,
        'Area': box_width * box_height,
    })

    print("Bounding Box Dimensions for the First 10 Samples:")
    print(dimensions)
else:
    # At least one corner column is missing: report what is available instead.
    print("Bounding box coordinates (xtl, ytl, xbr, ybr) not found in the dataframe.")
    print("Available columns:", included_patients_df.columns.tolist())

In [None]:
# Width, height and area of every bounding box in the table.
box_width = included_patients_df['xbr'] - included_patients_df['xtl']
box_height = included_patients_df['ybr'] - included_patients_df['ytl']
bbox_dimensions = pd.DataFrame({
    'Width': box_width,
    'Height': box_height,
    'Area': box_width * box_height,
})

# One stacked panel per dimension: (column, bar colour, x-axis label).
fig, axes = plt.subplots(3, 1, figsize=(12, 18))
panels = [
    ('Width', 'skyblue', 'Width (pixels)'),
    ('Height', 'lightgreen', 'Height (pixels)'),
    ('Area', 'salmon', 'Area (pixels²)'),
]
for panel_axis, (dimension, bar_colour, x_label) in zip(axes, panels):
    # 50-bin histogram with a KDE overlay.
    sns.histplot(bbox_dimensions[dimension], kde=True, bins=50, color=bar_colour,
                 stat='count', edgecolor='black', linewidth=0.8, ax=panel_axis)
    panel_axis.set_title(f'Distribution of Bounding Box {dimension}', fontsize=16)
    panel_axis.set_xlabel(x_label, fontsize=14)
    panel_axis.set_ylabel('Count', fontsize=14)
    panel_axis.tick_params(labelsize=12)

plt.tight_layout()
plt.show()

# Numeric summary of the three dimensions.
print("Summary Statistics for Bounding Box Dimensions:")
print(bbox_dimensions.describe())

## Example images

In [None]:
import os
import matplotlib.patches as patches
from PIL import Image


def display_images_for_anatomy_site(site, n_examples=2, image_dir='~/DATA/initial_images',
                                    random_state=1):
    """Display example images for one anatomy site with their bounding boxes.

    Rows of the module-level ``included_patients_df`` whose ``localisation_1``
    matches *site* are sampled, and each sampled image is shown in its own
    subplot with the (xtl, ytl)-(xbr, ybr) rectangle drawn in red.

    Args:
        site: Value of ``localisation_1`` to show. A missing value (NaN/None)
            selects the rows whose ``localisation_1`` is missing.
        n_examples: Maximum number of images to show (non-negative).
        image_dir: Directory containing the image files named in the
            ``image`` column; a leading ``~`` is expanded.
        random_state: Seed passed to ``DataFrame.sample`` so the selection is
            reproducible.

    Returns:
        None. Messages are printed when no rows match or an image cannot be
        loaded; otherwise a figure is shown.
    """
    site_column = included_patients_df['localisation_1']
    # NaN never compares equal to anything (itself included), so a missing
    # site needs an explicit isna() mask instead of ==.
    if pd.isna(site):
        site_images = included_patients_df[site_column.isna()]
    else:
        site_images = included_patients_df[site_column == site]

    if site_images.empty:
        print(f"No images found for anatomy site: {site}")
        return

    selected_images = site_images.sample(n=min(n_examples, len(site_images)),
                                         random_state=random_state)
    # Only reachable when n_examples is 0.
    if selected_images.empty:
        print(f"No images available for anatomy site: {site}")
        return

    # squeeze=False always yields a 2-D array of Axes, so one and many
    # subplots are handled by the same loop.
    fig, axes = plt.subplots(1, len(selected_images), figsize=(15, 6), squeeze=False)

    for ax, (_, img_row) in zip(axes[0], selected_images.iterrows()):
        img_path = os.path.join(image_dir, img_row['image'])

        try:
            # The context manager closes the file handle; imshow converts the
            # image to an array before the block exits.
            with Image.open(os.path.expanduser(img_path)) as img:
                ax.imshow(img, cmap='gray')
        except Exception as e:
            # Best effort: report the failure in place of the image and carry
            # on with the remaining examples.
            print(f"Error loading image {img_path}: {e}")
            ax.text(0.5, 0.5, f"Error loading image: {img_path}",
                    horizontalalignment='center', verticalalignment='center')
        else:
            try:
                x1, y1 = img_row['xtl'], img_row['ytl']
                x2, y2 = img_row['xbr'], img_row['ybr']
                rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                         linewidth=2, edgecolor='r', facecolor='none')
                ax.add_patch(rect)
            except Exception as e:
                print(f"Error loading bounding box for {img_row}: {e}")

        ax.axis('off')

    plt.suptitle(f"Anatomy Site: {site}", fontsize=16)
    plt.tight_layout()
    # Leave room for the suptitle above the subplots.
    plt.subplots_adjust(top=0.85)
    plt.show()

# Show example images for every distinct value of localisation_1.
anatomy_sites = included_patients_df['localisation_1'].unique()
for anatomy_site in anatomy_sites:
    display_images_for_anatomy_site(anatomy_site)

In [None]:
# I found a faulty crop by accident, here it is:

import os
from PIL import Image

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

# Image whose pre-computed crop looked wrong.
image_id = "1.2.840.113654.2.70.1.134954549720182833992293760283424368191[1]13078_1.png"

# All table rows that reference this image.
image_row = included_patients_df[included_patients_df['image'] == image_id]

if image_row.empty:
    print(f"Image ID not found in the dataset: {image_id}")
else:
    # Use the first matching row.
    image_row = image_row.iloc[0]

    # Bounding box corners: top-left (xtl, ytl), bottom-right (xbr, ybr).
    xtl, ytl = image_row['xtl'], image_row['ytl']
    xbr, ybr = image_row['xbr'], image_row['ybr']

    # Original image and its pre-computed crop share the same file name.
    img_path = os.path.join('~/DATA/initial_images', image_row['image'])
    img_path_cropped = os.path.join('~/DATA/images_bounding_box_15_500_BILINEAR', image_row['image'])

    # Load both images BEFORE creating the figure, so a missing or unreadable
    # file does not leave an empty or half-drawn figure behind. copy() reads
    # the pixel data, which lets the file handle be closed right away.
    loaded_images = []
    try:
        for path in (img_path, img_path_cropped):
            with Image.open(os.path.expanduser(path)) as handle:
                loaded_images.append(handle.copy())
    except OSError as e:
        # Missing, unreadable and unidentifiable files all surface as OSError.
        # `path` is the file that actually failed.
        print(f"Error loading or processing image: {e}")
        print(f"Image path: {path}")
    else:
        img, cropped_img = loaded_images

        # Left: original with the bounding box; right: the stored crop.
        fig, axes = plt.subplots(1, 2, figsize=(15, 7))

        axes[0].imshow(img, cmap='gray')
        rect = patches.Rectangle((xtl, ytl), xbr - xtl, ybr - ytl,
                                 linewidth=2, edgecolor='r', facecolor='none')
        axes[0].add_patch(rect)
        axes[0].set_title('Original Image with Bounding Box', fontsize=14)
        axes[0].axis('off')

        axes[1].imshow(cropped_img, cmap='gray')
        axes[1].set_title('Cropped Region', fontsize=14)
        axes[1].axis('off')

        # One-line metadata caption below both panels.
        plt.figtext(0.5, 0.01,
                    f"Patient: {image_row['pat_nr']} | Age: {image_row['age_initialdiagnosis']} | "
                    f"Entity: {image_row['entity']} | Location: {image_row['localisation_1']} | "
                    f"Year: {int(image_row['year_initialdiagnosis']) if not pd.isna(image_row['year_initialdiagnosis']) else 'Unknown'}",
                    ha='center', fontsize=12)

        plt.tight_layout()
        plt.show()

In [None]:
# Distinct patients per entity as a two-column frame, largest group first.
patient_counts_by_entity = (
    included_patients_df
    .groupby('entity')['pat_nr']
    .nunique()
    .rename_axis('Entity')
    .reset_index(name='Number of Patients')
    .sort_values('Number of Patients', ascending=False)
)

# Horizontal bar chart, one bar per entity.
plt.figure(figsize=(14, 10))
bar_plot = sns.barplot(data=patient_counts_by_entity, x='Number of Patients', y='Entity', palette='viridis')

# Write each patient count just to the right of its bar.
for row_position, n_patients in enumerate(patient_counts_by_entity['Number of Patients']):
    bar_plot.text(n_patients + 1, row_position, str(n_patients), va='center', fontsize=10)

# Titles, axis labels and tick sizes.
plt.title('Number of Patients per Entity Type', fontsize=16)
plt.xlabel('Number of Patients', fontsize=14)
plt.ylabel('Entity Type', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=11)
plt.tight_layout()
plt.show()

# Distinct patients across the whole table.
total_patients = included_patients_df['pat_nr'].nunique()
print(f'Total number of unique patients: {total_patients}')