# Exploratory Data Analysis - LERA

In [None]:
import pandas as pd

# Load the label table; the file has no header row, so the columns are
# addressed by their integer position (0, 1, 2, ...)
labels_csv = '~/lera_dataset/LERA Dataset/labels.csv'
lera_df = pd.read_csv(labels_csv, header=None)

# Peek at the first rows as a sanity check
lera_df.head()

In [None]:
# Summary statistics (pandas `describe` defaults) for the label table `lera_df`
lera_df.describe()

In [None]:
# Create a full dataset from all the image paths, s.t. each row is a single image
#
# Layout this cell relies on: <base_dir>/<case number>/ST-1/*.png, and in
# `lera_df` column 0 is matched against the case number, column 1 is used as
# the anatomy site (with the "XR " prefix removed) and column 2 as the label.

# Traverse the dataset and create a list of all images
import os

base_dir = '~/lera_dataset/LERA Dataset/'
base_dir = os.path.expanduser(base_dir)

# One sub-directory per case. os.listdir returns entries in arbitrary order,
# so sort to make the row order of the resulting frame reproducible.
subfolders = sorted(d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d)))

# Rows are collected in a plain list and converted to a DataFrame once at the
# end. Growing a DataFrame with pd.concat inside the loop copies the whole
# frame for every image and, because it starts from an empty frame, leaves the
# numeric columns with object dtype.
image_rows = []

for subfolder in subfolders:
    case_dir = os.path.join(base_dir, subfolder)
    study_dir = os.path.join(case_dir, 'ST-1')

    # A case without an ST-1 directory is reported like a case without images
    # instead of aborting the whole traversal.
    if os.path.isdir(study_dir):
        images = sorted(f for f in os.listdir(study_dir) if f.endswith('.png'))
    else:
        images = []

    if not images:
        print(f'No images found in {case_dir}')
        continue

    # The label lookup depends only on the case, so do it once per folder
    # rather than twice per image. Raises IndexError if the case number is
    # missing from the label table (same as before).
    case_number = int(subfolder)
    case_labels = lera_df[lera_df[0] == case_number]
    anatomy_site = case_labels.iloc[0, 1].replace("XR ", "")
    label = case_labels.iloc[0, 2]

    for image in images:
        image_rows.append({
            'image_path': os.path.join(study_dir, image),
            'case_number': case_number,
            'anatomy_site': anatomy_site,
            'label': label,
        })

# Passing `columns` keeps the expected schema even when no image was found.
lera_images_df = pd.DataFrame(image_rows, columns=['image_path', 'case_number', 'anatomy_site', 'label'])

> One folder does not contain any images, which makes it useless.

In [None]:
# Remove pandas' display limits on column count and cell width so the
# full image paths are visible in the preview below
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)
lera_images_df.head(10)

In [None]:
# Number of distinct case numbers that appear in the per-image table
case_ids = lera_images_df['case_number']
unique_cases = case_ids.nunique()
print(unique_cases)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Two panels side by side: cases on the left, images on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

status_names = {0: 'Normal', 1: 'Abnormal'}

# 1. Label counts per case (folder): reduce to one row per (case, label)
#    pair first so that a case with many images is only counted once
labels_per_case = lera_images_df[['case_number', 'label']].drop_duplicates()
case_counts = labels_per_case.astype({'label': int})['label'].value_counts().reset_index()

# 2. Label counts per image
image_counts = lera_images_df['label'].astype(int).value_counts().reset_index()

# Give both tables the same shape: readable status name, count, share in percent
for counts in (case_counts, image_counts):
    counts.columns = ['Status', 'Count']
    counts['Status'] = counts['Status'].map(status_names)
    counts['Percentage'] = (counts['Count'] / counts['Count'].sum()) * 100

# 3. Draw one bar chart per table; the last entry is the vertical gap
#    between the top of a bar and its text annotation
panels = (
    (ax1, case_counts, 'Distribution of Cases (Folders)', 'Number of Cases', 1),
    (ax2, image_counts, 'Distribution of Images', 'Number of Images', 20),
)
for axis, counts, title, y_label, text_offset in panels:
    sns.barplot(x='Status', y='Count', data=counts, ax=axis, palette='viridis', hue='Status')
    axis.set_title(title)
    axis.set_ylabel(y_label)
    # Rows are indexed 0..n-1 after reset_index, which matches the bar positions
    for position, (count, percentage) in enumerate(zip(counts['Count'], counts['Percentage'])):
        axis.text(position, count + text_offset, f'{count} ({percentage:.1f}%)',
                  ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import defaultdict

# Dataset root, with '~' resolved to the user's home directory
base_dir = os.path.expanduser('~/lera_dataset/LERA Dataset/')

# Immediate child directories of the dataset root
main_subfolders = [
    entry for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
]

# For every main folder: the names of its nested directories and their number
subfolder_names = {}
subfolder_counts = {}
for main_folder in main_subfolders:
    main_folder_path = os.path.join(base_dir, main_folder)
    nested_dirs = [
        entry for entry in os.listdir(main_folder_path)
        if os.path.isdir(os.path.join(main_folder_path, entry))
    ]
    subfolder_names[main_folder] = nested_dirs
    subfolder_counts[main_folder] = len(nested_dirs)

# Tabular view of the counts for plotting
subfolder_df = pd.DataFrame({
    'MainFolder': list(subfolder_counts),
    'SubfolderCount': list(subfolder_counts.values())
})

# How many main folders share each subfolder count, ordered by that count
count_distribution = subfolder_df['SubfolderCount'].value_counts().reset_index()
count_distribution.columns = ['Number of Subfolders', 'Count of Main Folders']
count_distribution = count_distribution.sort_values('Number of Subfolders')

# Reverse mapping: subfolder count -> main folders with that count
folders_by_count = defaultdict(list)
for main_folder, n_subfolders in subfolder_counts.items():
    folders_by_count[n_subfolders].append(main_folder)

# Bar chart of the distribution
plt.figure(figsize=(12, 7))
ax = sns.barplot(x='Number of Subfolders', y='Count of Main Folders', data=count_distribution)

ax.set_title('Distribution of Subfolder Counts', fontsize=16)
ax.set_xlabel('Number of Subfolders', fontsize=14)
ax.set_ylabel('Count of Main Folders', fontsize=14)

# Write each bar's value just above it
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_height)}',
                (bar_center, bar_height),
                ha='center', va='bottom',
                xytext=(0, 5), textcoords='offset points')

plt.tight_layout()
plt.show()

# Textual breakdown: which main folders fall into which bucket
print("Details of Main Folders by Number of Subfolders:")
for n_subfolders in sorted(folders_by_count):
    bucket = folders_by_count[n_subfolders]
    print(f"\n{len(bucket)} main folder(s) have {n_subfolders} subfolder(s):")
    for main_folder in sorted(bucket):
        print(f"  {main_folder}: {subfolder_names[main_folder]}")

> Insight: Every numbered subfolder has the subfolder ST-1.

In [None]:
# Folder names that would exist if every case from 1001 to 1182 were present
expected_folders = list(map(str, range(1001, 1183)))

# Expected names that have no matching directory on disk, in sorted order
missing_folders = sorted(set(expected_folders).difference(main_subfolders))

print(f"Missing folders ({len(missing_folders)}):")
print(missing_folders)

> There are 13 fewer folders than the authors state in their paper.

In [None]:
# Images per case first, then how many cases share each image count
images_per_case = lera_images_df.groupby(['case_number'])[['image_path']].count()
number_of_images = images_per_case.value_counts().reset_index()
number_of_images.columns = ['Number of Images', 'Count']

# Title and axis labels are set before the bars are drawn, as in the
# original cell; keep this order, since seaborn's own axis labelling
# interacts with labels that already exist
plt.figure(figsize=(12, 7))
plt.title('Distribution of Number of Images per Case')
plt.xlabel('Number of Images')
plt.ylabel('Number of Cases')
ax = sns.barplot(x='Number of Images', y='Count', data=number_of_images)

# Write each bar's value just above it
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_height)}',
                (bar_center, bar_height),
                ha='center', va='bottom',
                xytext=(0, 5), textcoords='offset points')
plt.show()

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Two panels side by side: cases on the left, images on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

# 1. Anatomy sites per case: keep one row per (case, site) pair so that
#    each case contributes once regardless of its number of images
case_anatomy = lera_images_df[['case_number', 'anatomy_site']].drop_duplicates()
case_anatomy_counts = case_anatomy['anatomy_site'].value_counts().reset_index()

# 2. Anatomy sites per image
image_anatomy_counts = lera_images_df['anatomy_site'].value_counts().reset_index()

# Give both tables the same shape: site name, count, share in percent
for site_counts in (case_anatomy_counts, image_anatomy_counts):
    site_counts.columns = ['Anatomy Site', 'Count']
    site_counts['Percentage'] = (site_counts['Count'] / site_counts['Count'].sum()) * 100

# One bar chart per table; the last entry is the vertical gap between the
# top of a bar and its text annotation
panels = (
    (ax1, case_anatomy_counts, 'Distribution of Anatomy Sites Across Cases', 'Number of Cases', 1),
    (ax2, image_anatomy_counts, 'Distribution of Anatomy Sites Across Images', 'Number of Images', 20),
)
for axis, site_counts, title, y_label, text_offset in panels:
    sns.barplot(x='Anatomy Site', y='Count', data=site_counts, ax=axis, hue='Anatomy Site')
    axis.set_title(title, fontsize=16)
    axis.set_xlabel('Anatomy Site', fontsize=14)
    axis.set_ylabel(y_label, fontsize=14)
    # Rows are indexed 0..n-1 after reset_index, which matches the bar positions
    for position, (count, percentage) in enumerate(zip(site_counts['Count'], site_counts['Percentage'])):
        axis.text(position, count + text_offset, f"{count} ({percentage:.1f}%)",
                  ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.show()

## Example images

In [None]:
from PIL import Image
import numpy as np

import matplotlib.pyplot as plt

# Show one normal (label=0) and one abnormal (label=1) example image for
# every anatomy site, one site per row.


def _show_example(ax, image_paths, title):
    """Draw the first image of `image_paths` on `ax` in grayscale.

    `image_paths` is a pandas Series of file paths. If it is empty, the
    panel is left blank and the title says so, instead of failing with an
    IndexError on `.iloc[0]`.
    """
    ax.axis('off')
    if image_paths.empty:
        ax.set_title(f"{title} - no example available")
        return
    # The context manager closes the underlying file as soon as the pixel
    # data has been copied into the array.
    with Image.open(image_paths.iloc[0]) as img:
        pixels = np.array(img.convert('L'))
    ax.imshow(pixels, cmap='gray')
    ax.set_title(title)


# Get unique anatomy sites
anatomy_sites = lera_images_df['anatomy_site'].unique()

# Set up the figure. squeeze=False keeps `axes` two-dimensional even when
# there is only a single anatomy site, so `axes[i, j]` always works.
fig, axes = plt.subplots(len(anatomy_sites), 2, figsize=(12, 5*len(anatomy_sites)), squeeze=False)

# For each anatomy site, display one normal and one abnormal image
for i, site in enumerate(anatomy_sites):
    # Filter dataframe for this anatomy site
    site_df = lera_images_df[lera_images_df['anatomy_site'] == site]

    normal_paths = site_df[site_df['label'] == 0]['image_path']
    abnormal_paths = site_df[site_df['label'] == 1]['image_path']

    _show_example(axes[i, 0], normal_paths, f"{site} - Normal (Label=0)")
    _show_example(axes[i, 1], abnormal_paths, f"{site} - Abnormal (Label=1)")

plt.tight_layout()
plt.show()