# Exploratory Data Analysis - MURA

In [None]:
import pandas as pd

# Locations of the four MURA v1.1 index files: per-study labels and
# per-image paths, for the training and validation splits.
train_labels_file_path = '~/muramskxrays/MURA-v1.1/train_labeled_studies.csv'
train_image_path_file_path = '~/muramskxrays/MURA-v1.1/train_image_paths.csv'
validation_file_path = '~/muramskxrays/MURA-v1.1/valid_labeled_studies.csv'
validation_image_path_file_path = '~/muramskxrays/MURA-v1.1/valid_image_paths.csv'

# All files are read with header=None, so the first row is treated as data
# and the columns carry integer names.
train_labels_df = pd.read_csv(train_labels_file_path, header=None)
train_image_paths_df = pd.read_csv(train_image_path_file_path, header=None)
validation_labels_df = pd.read_csv(validation_file_path, header=None)
validation_image_path_df = pd.read_csv(validation_image_path_file_path, header=None)

In [None]:
# Do not truncate long cell values (the image paths) when printing.
pd.set_option('display.max_colwidth', None)

# Print the first rows of every raw frame, training frames first.
for _raw_frame in (
    train_labels_df,
    train_image_paths_df,
    validation_labels_df,
    validation_image_path_df,
):
    print(_raw_frame.head())

In [None]:
# Extract anatomy site
def extract_anatomy_site(path):
    """Return the anatomy site encoded in an image path.

    The site is the text between the first ``XR_`` marker and the next ``/``;
    for example ``'.../XR_SHOULDER/patient00001/...'`` yields ``'SHOULDER'``.

    Args:
        path: Image path string.

    Returns:
        The anatomy site, or ``'Unknown'`` when ``path`` is not a string or
        contains no ``XR_`` marker. A message is printed in that case.
    """
    try:
        # Text after the first "XR_", cut at the next "/".
        return path.split('XR_')[1].split('/')[0]
    except (AttributeError, IndexError, TypeError):
        # Only parsing failures are handled here; a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        print(f"Error processing path: {path}")
        return 'Unknown'

# Derive the anatomy site from the first column (the image path) of both
# the training and the validation frame.
for _paths_df in (train_image_paths_df, validation_image_path_df):
    _paths_df['anatomy_site'] = _paths_df.iloc[:, 0].map(extract_anatomy_site)

# Function to extract study information
def extract_study_info(path):
    """Return the patient id and study number encoded in an image path.

    For ``'.../patient00001/study1_positive/image1.png'`` the result is
    ``('00001', '1')``: the patient id is the text between the first
    ``patient`` marker and the next ``/``, and the study number is the
    following path segment up to its first ``_`` with ``study`` removed.

    Args:
        path: Image path string.

    Returns:
        A ``(patient_id, study_number)`` tuple of strings, or
        ``('Unknown', 'Unknown')`` when the path cannot be parsed. A message
        is printed in that case.
    """
    try:
        # Everything after the first "patient" marker, split into segments.
        segments = path.split('patient')[1].split('/')
        patient_id = segments[0]
        # The segment after the patient id is the study folder.
        study_number = segments[1].split('_')[0].replace('study', '')
    except (AttributeError, IndexError, TypeError):
        # Only parsing failures are handled here; a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        print(f"Error processing path: {path}")
        return 'Unknown', 'Unknown'
    return patient_id, study_number

# Add patient and study columns to both frames, then give every column a name.
# The (patient_id, study_number) tuples are collected into one DataFrame per
# split instead of building a pd.Series per row, which is much slower on
# frames with tens of thousands of rows.
for _paths_df in (train_image_paths_df, validation_image_path_df):
    _study_info = pd.DataFrame(
        _paths_df.iloc[:, 0].map(extract_study_info).tolist(),
        index=_paths_df.index,
        columns=['patient_id', 'study_number'],
    )
    _paths_df[['patient_id', 'study_number']] = _study_info
    # Column 0 (the raw path) becomes 'image_path'; the other names are unchanged.
    _paths_df.columns = ['image_path', 'anatomy_site', 'patient_id', 'study_number']

In [None]:
# Print the first rows of both frames, training first.
for _parsed_frame in (train_image_paths_df, validation_image_path_df):
    print(_parsed_frame.head())

In [None]:
# Name the label columns so they can be joined on the study directory.
for _labels_df in (train_labels_df, validation_labels_df):
    _labels_df.columns = ['study_path', 'label']


def _with_study_labels(image_paths_df, labels_df):
    """Return ``image_paths_df`` with a ``label`` column joined in.

    A temporary ``study_path`` column (the image path up to and including its
    last ``/``) is added to ``image_paths_df`` and used as the key of a left
    merge with ``labels_df``; the returned frame no longer contains it.
    """
    image_paths_df['study_path'] = image_paths_df['image_path'].apply(
        lambda image_path: image_path.rpartition('/')[0] + '/'
    )
    labelled = image_paths_df.merge(labels_df, on='study_path', how='left')
    # The join key is only needed for the merge itself.
    return labelled.drop(columns=['study_path'])


train_image_paths_df = _with_study_labels(train_image_paths_df, train_labels_df)
validation_image_path_df = _with_study_labels(validation_image_path_df, validation_labels_df)

for _labelled_df in (train_image_paths_df, validation_image_path_df):
    print(_labelled_df.head())


In [None]:
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

# Number of images per split and overall
train_count, valid_count = len(train_image_paths_df), len(validation_image_path_df)
total_count = train_count + valid_count

# One row per split for plotting
data = pd.DataFrame({
    'Dataset': ['Training Set', 'Validation Set'],
    'Count': [train_count, valid_count],
})

plt.figure(figsize=(10, 6))
ax = sns.barplot(data=data, x='Dataset', y='Count')

# Write the count and its share of all images at mid-height of each bar
split_shares = data['Count'] / total_count * 100
for bar_index, (n_images, share) in enumerate(zip(data['Count'], split_shares)):
    ax.text(
        bar_index,
        n_images / 2,
        f'Count: {n_images}\n({share:.1f}%)',
        ha='center',
        va='center',
        color='white',
        fontweight='bold',
        fontsize=12,
    )

# Title and axis labels
ax.set_title('Distribution of Samples between Training and Validation Sets', fontsize=16)
ax.set_xlabel('Dataset', fontsize=14)
ax.set_ylabel('Number of Images', fontsize=14)

# Caption with the overall total, centred along the bottom edge of the figure
plt.figtext(0.5, 0.01, f'Total number of images: {total_count}',
            ha='center', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def _label_counts(labels):
    """Tabulate a 0/1 label Series as a frame with ``Label`` and ``Count`` columns.

    A label equal to 0 is shown as ``'Negative (0)'``; any other value as
    ``'Positive (1)'``.
    """
    counts = labels.value_counts().reset_index()
    counts.columns = ['Label', 'Count']
    counts['Label'] = counts['Label'].apply(
        lambda x: 'Negative (0)' if x == 0 else 'Positive (1)'
    )
    return counts


def _study_level(image_df):
    """Collapse an image-level frame to one row per study with its image count.

    ``anatomy_site`` is part of the key because a study is identified by its
    whole directory (site / patient / study); without it, studies that share
    patient id, study number and label under different anatomy folders would
    be merged into one row and undercounted.
    """
    return (
        image_df
        .groupby(['anatomy_site', 'patient_id', 'study_number', 'label'])
        .size()
        .reset_index(name='images_count')
    )


def _plot_label_bars(ax, data, title, text_offset):
    """Draw a ``Label``/``Count`` bar chart on ``ax`` and return the total count.

    Each bar is annotated with its count and percentage of the total, placed
    ``text_offset`` data units above the top of the bar.
    """
    sns.barplot(data=data, x='Label', y='Count', ax=ax)
    total = data['Count'].sum()
    for bar, count in zip(ax.patches, data['Count']):
        percentage = (count / total) * 100
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + text_offset,
                f'{count}\n({percentage:.2f}%)', ha='center', fontsize=12)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Label', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
    # Leave headroom above the tallest bar for the annotation text.
    ax.set_ylim(0, data['Count'].max() * 1.15)
    return total


# Get counts for image level
train_image_label_counts = _label_counts(train_image_paths_df['label'])
train_image_data = train_image_label_counts.sort_values('Label', ascending=False)

valid_image_label_counts = _label_counts(validation_image_path_df['label'])
valid_image_data = valid_image_label_counts.sort_values('Label', ascending=False)

# Get counts for study level
train_study_level = _study_level(train_image_paths_df)
train_study_label_counts = _label_counts(train_study_level['label'])
train_study_data = train_study_label_counts.sort_values('Label', ascending=False)

valid_study_level = _study_level(validation_image_path_df)
valid_study_label_counts = _label_counts(valid_study_level['label'])
valid_study_data = valid_study_label_counts.sort_values('Label', ascending=False)

# Training set: study-level and image-level distributions side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))
fig.suptitle('Distribution of Normal vs Abnormal Cases in Training Set', fontsize=18)
total_train_studies = _plot_label_bars(ax1, train_study_data, 'Study Level', 50)
total_train_images = _plot_label_bars(ax2, train_image_data, 'Image Level', 500)

plt.tight_layout()
# Make room for the figure-level title above the subplots.
plt.subplots_adjust(top=0.88)
plt.show()

# Validation set: study-level and image-level distributions side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))
fig.suptitle('Distribution of Normal vs Abnormal Cases in Validation Set', fontsize=18)
total_valid_studies = _plot_label_bars(ax1, valid_study_data, 'Study Level', 20)
total_valid_images = _plot_label_bars(ax2, valid_image_data, 'Image Level', 40)

plt.tight_layout()
plt.subplots_adjust(top=0.88)
plt.show()

In [None]:
def _plot_site_distribution(site_counts, title):
    """Plot images per anatomy site; return the axes and the total image count.

    Each bar is annotated with its count and its percentage of the total.
    """
    plt.figure(figsize=(12, 6))
    axes = sns.barplot(data=site_counts, x='Anatomy Site', y='Count')

    total = site_counts['Count'].sum()
    for patch, n_images in zip(axes.patches, site_counts['Count']):
        share = (n_images / total) * 100
        axes.text(patch.get_x() + patch.get_width() / 2, patch.get_height() + 5,
                  f'{n_images}\n({share:.1f}%)', ha='center', fontsize=10)

    plt.title(title, fontsize=16)
    plt.xlabel('Anatomy Site', fontsize=14)
    plt.ylabel('Count', fontsize=14)
    plt.xticks(fontsize=12, rotation=45)
    plt.yticks(fontsize=12)
    plt.tight_layout()
    plt.show()
    return axes, total


# Images per anatomy site, most frequent site first
train_site_counts = train_image_paths_df['anatomy_site'].value_counts().reset_index()
train_site_counts.columns = ['Anatomy Site', 'Count']
train_site_counts = train_site_counts.sort_values('Count', ascending=False)

valid_site_counts = validation_image_path_df['anatomy_site'].value_counts().reset_index()
valid_site_counts.columns = ['Anatomy Site', 'Count']
valid_site_counts = valid_site_counts.sort_values('Count', ascending=False)

# Training figure first, then validation
ax, total_train = _plot_site_distribution(
    train_site_counts, 'Distribution of Anatomy Sites in Training Set'
)
ax, total_valid = _plot_site_distribution(
    valid_site_counts, 'Distribution of Anatomy Sites in Validation Set'
)

In [None]:
def _plot_images_per_patient(distribution, title):
    """Plot how many patients have a given number of images.

    Each bar is annotated with the patient count and its percentage of all
    patients. The figure is laid out and shown before returning, so every
    figure gets its own ``tight_layout()``/``show()`` call.

    Returns:
        The axes and the total number of patients.
    """
    plt.figure(figsize=(16, 8))
    axes = sns.barplot(data=distribution, x='Images per Patient', y='Number of Patients')

    total = distribution['Number of Patients'].sum()
    for bar, count in zip(axes.patches, distribution['Number of Patients']):
        percentage = (count / total) * 100
        axes.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                  f'{count}\n({percentage:.1f}%)', ha='center', fontsize=10)

    plt.title(title, fontsize=16)
    plt.xlabel('Number of Images', fontsize=14)
    plt.ylabel('Number of Patients', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.tight_layout()
    plt.show()
    return axes, total


# Count images per patient
train_patient_counts = train_image_paths_df['patient_id'].value_counts().reset_index()
train_patient_counts.columns = ['Patient ID', 'Number of Images']

valid_patient_counts = validation_image_path_df['patient_id'].value_counts().reset_index()
valid_patient_counts.columns = ['Patient ID', 'Number of Images']

# Get the distribution of image counts per patient, ordered by image count
train_distribution = train_patient_counts['Number of Images'].value_counts().reset_index()
train_distribution.columns = ['Images per Patient', 'Number of Patients']
train_distribution = train_distribution.sort_values('Images per Patient')

valid_distribution = valid_patient_counts['Number of Images'].value_counts().reset_index()
valid_distribution.columns = ['Images per Patient', 'Number of Patients']
valid_distribution = valid_distribution.sort_values('Images per Patient')

# Plot the training set, then the validation set
ax, total_train = _plot_images_per_patient(
    train_distribution, 'Distribution of Images per Patient in Training Set'
)
ax, total_valid = _plot_images_per_patient(
    valid_distribution, 'Distribution of Images per Patient in Validation Set'
)

In [None]:
def _studies_per_patient(image_df):
    """Return one row per patient with the number of distinct studies.

    A study is identified by anatomy site together with study number: counting
    distinct study numbers alone would treat e.g. "study 1" under two
    different anatomy sites as a single study.
    """
    unique_studies = image_df.drop_duplicates(
        subset=['patient_id', 'anatomy_site', 'study_number']
    )
    per_patient = unique_studies.groupby('patient_id').size().reset_index()
    per_patient.columns = ['Patient ID', 'Number of Studies']
    return per_patient


def _plot_studies_per_patient(distribution, title):
    """Plot how many patients have a given number of studies.

    Each bar is annotated with the patient count and its percentage of all
    patients.

    Returns:
        The axes and the total number of patients.
    """
    plt.figure(figsize=(12, 6))
    axes = sns.barplot(data=distribution, x='Studies per Patient', y='Number of Patients')

    total = distribution['Number of Patients'].sum()
    for bar, count in zip(axes.patches, distribution['Number of Patients']):
        percentage = (count / total) * 100
        axes.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                  f'{count}\n({percentage:.1f}%)', ha='center', fontsize=10)

    plt.title(title, fontsize=16)
    plt.xlabel('Number of Studies', fontsize=14)
    plt.ylabel('Number of Patients', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.tight_layout()
    plt.show()
    return axes, total


# Count number of unique studies per patient
train_studies_per_patient = _studies_per_patient(train_image_paths_df)
valid_studies_per_patient = _studies_per_patient(validation_image_path_df)

# Get distribution of studies per patient, ordered by study count
train_study_distribution = train_studies_per_patient['Number of Studies'].value_counts().reset_index()
train_study_distribution.columns = ['Studies per Patient', 'Number of Patients']
train_study_distribution = train_study_distribution.sort_values('Studies per Patient')

valid_study_distribution = valid_studies_per_patient['Number of Studies'].value_counts().reset_index()
valid_study_distribution.columns = ['Studies per Patient', 'Number of Patients']
valid_study_distribution = valid_study_distribution.sort_values('Studies per Patient')

# Plot the training set, then the validation set
ax, total_train_patients = _plot_studies_per_patient(
    train_study_distribution, 'Distribution of Studies per Patient in Training Set'
)
ax, total_valid_patients = _plot_studies_per_patient(
    valid_study_distribution, 'Distribution of Studies per Patient in Validation Set'
)

## Example images

In [None]:
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Function to display example images using subplots
def display_example_images(site):
    """Show one normal and one abnormal training X-ray for ``site`` side by side.

    The first training image with label 0 and the first with label 1 for the
    given anatomy site are loaded from below ``~/muramskxrays/`` and drawn in
    a two-panel figure.

    Args:
        site: Anatomy site name as stored in the ``anatomy_site`` column.

    Returns:
        None. If the site has no normal or no abnormal image, a message is
        printed and nothing is plotted. If an image cannot be loaded, the
        error is printed and that panel is left empty.
    """
    site_rows = train_image_paths_df[train_image_paths_df['anatomy_site'] == site]

    # Pick the first image of each class and prepend the MURA dataset path.
    examples = {}
    for title, label in (('Normal', 0), ('Abnormal', 1)):
        matches = site_rows.loc[site_rows['label'] == label, 'image_path']
        if matches.empty:
            # Without this guard, .iloc[0] would raise an unexplained IndexError.
            print(f"No {title.lower()} example found for site: {site}")
            return
        examples[title] = os.path.expanduser('~/muramskxrays/' + matches.iloc[0])

    # Create a single figure with two subplots
    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    fig.suptitle(f'{site.upper()} X-ray Examples', fontsize=18)

    for panel, (title, image_path) in zip(axs, examples.items()):
        try:
            # The context manager closes the file handle once the pixel data
            # has been copied into the array.
            with Image.open(image_path) as img:
                panel.imshow(np.array(img), cmap='gray')
            panel.set_title(title)
            panel.axis('off')
        except Exception as e:
            # Best-effort display: report the failure and continue with the
            # remaining panel instead of aborting.
            print(f"Error loading {title.lower()} image: {e}")
            print(f"Path: {image_path}")

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to fit suptitle
    plt.show()

# Get unique anatomy sites
anatomy_sites = train_image_paths_df['anatomy_site'].unique()

# Display examples for every anatomy site. Iterating over the full array
# matters: slicing with [1:] would silently leave out the first site.
for site in anatomy_sites:
    print(f"\n--- {site.upper()} X-RAY EXAMPLES ---")
    display_example_images(site)
