# Exploratory Data Analysis - Internal Dataset Non-Tumor Samples

These non-tumor samples were only provided late in the thesis (end of June).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Metadata table of the non-tumor samples; the "~" path is passed to pandas as-is.
file_path = '~/DATA/subset_1800_patients_new.csv'
healthies = pd.read_csv(file_path)
# Sanity check: size of the loaded table as (rows, columns).
print(healthies.shape)

In [None]:
# Display settings: list every column and never truncate long cell values,
# so the raw attribute strings can be read in full.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

healthies.head(10)

In [None]:
def extract_sex(df, sex_column):
    """Pull the sex code out of raw strings such as ``CS: 'M'``.

    Args:
        df: DataFrame holding the raw column.
        sex_column: Name of the column with the raw sex strings.

    Returns:
        A one-column DataFrame (column label ``0``) containing ``'M'`` or
        ``'F'`` where the pattern matched and a missing value elsewhere.
    """
    sex_pattern = r"CS:\s*'([MF])'"
    extracted = df[sex_column].str.extract(sex_pattern)

    # Anything other than the two expected codes is blanked out.
    is_known_code = extracted.isin(['M', 'F'])
    return extracted.where(is_known_code, None)
    
# Attach the parsed sex code as a new column; unparseable rows stay missing.
# (The result is a sex column, so it is no longer stored under the name `age_col`.)
sex_col = extract_sex(healthies, 'PatientSex')
healthies['sex_cleaned'] = sex_col

# Count the rows for which no valid sex code could be extracted.
number_of_undefined_sex = healthies['sex_cleaned'].isnull().sum()
print(f"Number of undefined sex entries: {number_of_undefined_sex}")

healthies.head(10)


In [None]:
def extract_age(df, age_column):
    """Parse ages given in years from raw strings such as ``AS: '045Y'``.

    Args:
        df: DataFrame holding the raw column.
        age_column: Name of the column with the raw age strings.

    Returns:
        A numeric Series (named ``0``) with the age in years. Entries that do
        not match the three-digit ``...Y`` pattern are NaN, which makes the
        Series float; it is integer only when every row parsed.
    """
    # The capture group can only ever hold exactly three digits, so unmatched
    # rows are already missing and no second validation pass is required.
    year_digits = df[age_column].str.extract(r"AS:\s*'(\d{3})Y'")[0]

    return pd.to_numeric(year_digits, errors='coerce')
    
# Derive the numeric age column from the raw attribute strings.
age_col = extract_age(healthies, 'PatientAge')
healthies['age_cleaned'] = age_col

# Rows whose age could not be parsed show up as missing values here.
number_of_undefined_age = healthies['age_cleaned'].isna().sum()
print(f"Number of undefined age entries: {number_of_undefined_age}")

healthies.head(10)

In [None]:
def extract_anatomy_site(df, anatomy_site_column):
    """Pull the body-part label out of raw strings such as ``CS: 'KNEE'``.

    Only labels consisting of upper-case ASCII letters and spaces match.

    Args:
        df: DataFrame holding the raw column.
        anatomy_site_column: Name of the column with the raw body-part strings.

    Returns:
        A one-column DataFrame (column label ``0``) with the extracted label,
        or a missing value where the pattern did not match.
    """
    site_pattern = r"CS:\s*'([A-Z ]+)'"
    extracted = df[anatomy_site_column].str.extract(site_pattern)

    # Normalise every non-match to None.
    has_label = extracted.notna()
    return extracted.where(has_label, None)
    
# Derive the cleaned body-part label from the raw attribute strings.
anatomy_site_col = extract_anatomy_site(healthies, 'BodyPartExamined')
healthies['anatomy_site_cleaned'] = anatomy_site_col

# Rows without a recognisable label show up as missing values here.
number_of_undefined_anatomy_site = healthies['anatomy_site_cleaned'].isna().sum()
print(f"Number of undefined anatomy_site entries: {number_of_undefined_anatomy_site}")

healthies.head(10)

In [None]:
# Keep only rows for which sex, age and anatomy site could all be parsed,
# and report how many rows are lost by doing so.
required_columns = ['sex_cleaned', 'age_cleaned', 'anatomy_site_cleaned']

original_count = healthies.shape[0]
healthies = healthies.dropna(subset=required_columns)
new_count = healthies.shape[0]
removed_count = original_count - new_count

print(
    f"Original number of rows: {original_count}",
    f"Number of rows after cleaning: {new_count}",
    f"Number of rows removed: {removed_count}",
    sep='\n',
)

In [None]:
import seaborn as sns

# Frequency table of the fine-grained anatomy sites (most frequent first).
anatomy_counts = (
    healthies['anatomy_site_cleaned']
    .value_counts()
    .rename_axis('Anatomy Site')
    .reset_index(name='Count')
)

# Bar chart with one bar per site; labels are rotated so they stay readable.
plt.figure(figsize=(12, 6))
sns.barplot(data=anatomy_counts, x='Anatomy Site', y='Count')
plt.title('Distribution of Anatomy Sites')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Alphabetical listing of the distinct sites, followed by their number.
print("List of all anatomy sites (sorted alphabetically):")
unique_sites = sorted(anatomy_counts['Anatomy Site'].to_list())
print(unique_sites)
print(f"Total number of unique anatomy sites: {len(anatomy_counts)}")

In [None]:
# Map the fine-grained (English and German) anatomy-site labels to coarse body
# regions. Sites outside the regions of interest are mapped to 'undefined'.
anatomy_mapping = {
    'ABDOMEN': 'undefined',
    'AC GELENK': 'shoulder',
    'AC JOINT': 'shoulder',
    'ANKLE': 'foot',
    'ARM': 'arm',
    'BECKEN': 'hip',
    'BEIN': 'leg',
    'BRUSTBEIN': 'undefined',
    'BWS': 'spine',
    'C SPINE': 'spine',
    'CALCANEUS': 'foot',
    'CHEST': 'undefined',
    'CLAVICLE': 'shoulder',
    'CLAVICULA': 'shoulder',
    'COCCYX': 'spine',
    'CSPINE': 'spine',
    'ELBOW': 'elbow',
    'ELLENBOGEN': 'elbow',
    'EXTREMITAT': 'undefined',
    'FEMUR': 'upper leg',
    'FERSENBEIN': 'foot',
    'FINGER': 'hand',
    'FOOT': 'foot',
    'FOREARM': 'lower arm',
    'FUSS': 'foot',
    'HAND': 'hand',
    'HANDGELENK': 'hand',
    'HIP': 'hip',
    'HUEFTE': 'hip',
    'HUMERUS': 'upper arm',
    'HWS': 'spine',
    'HWS DENS': 'spine',
    'ISG': 'hip',
    # Jochbogen = zygomatic arch, a facial bone: treated like the other
    # head sites (ORBITA, NASENBEIN, SKULL), not as part of the spine.
    'JOCHBOGEN': 'undefined',
    'KNEE': 'knee',
    'KNIE': 'knee',
    'KNIESCHEIBE': 'knee',
    'L SPINE': 'spine',
    'LEG': 'leg',
    'LSPINE': 'spine',
    'LUNGE': 'undefined',
    'LWS': 'spine',
    'NASAL BONES': 'undefined',
    'NASENBEIN': 'undefined',
    'OBERARM': 'upper arm',
    # Oberschenkel = thigh: same region as FEMUR (UNTERSCHENKEL is the lower leg).
    'OBERSCHENKEL': 'upper leg',
    'ORBITA': 'undefined',
    'PATELLA': 'knee',
    'PELVIS': 'hip',
    'RIBS': 'undefined',
    'RIPPEN': 'undefined',
    'SACRUM': 'spine',
    'SCAPULA': 'shoulder',
    'SCHULTER': 'shoulder',
    'SCHULTERBLATT': 'shoulder',
    'SHOULDER': 'shoulder',
    'SKULL': 'undefined',
    'SPINE': 'spine',
    'SPRUNGGELENK': 'foot',
    'T SPINE': 'spine',
    'THORAX': 'undefined',
    'THORAX BETT': 'undefined',
    'TIBIA': 'lower leg',
    'TIBIA FIBULA': 'lower leg',
    'TOES': 'foot',
    'TSPINE': 'spine',
    'UNTERSCHENKEL': 'lower leg',
    'WIRBELSAULE': 'spine',
    'WRIST': 'hand',
    'ZAHNMEDIZIN': 'undefined',
}

In [None]:
# Collapse the fine-grained site labels into coarse body regions.
mapped_sites = healthies['anatomy_site_cleaned'].map(anatomy_mapping)

# Labels that are missing from anatomy_mapping come back as NaN from .map().
# NaN is not equal to 'undefined', so such rows would survive a
# `!= 'undefined'` filter; report them and mark them 'undefined' explicitly.
unmapped_sites = sorted(
    healthies.loc[mapped_sites.isna(), 'anatomy_site_cleaned'].dropna().unique()
)
if unmapped_sites:
    print(f"Anatomy sites without a coarse mapping (set to 'undefined'): {unmapped_sites}")

healthies['anatomy_site_coarse'] = mapped_sites.fillna('undefined')
healthies.head(2)

In [None]:
# Drop every row whose coarse anatomy site is 'undefined' and report the loss.
original_count = len(healthies)

is_defined_site = healthies['anatomy_site_coarse'] != 'undefined'
healthies = healthies[is_defined_site]

new_count = len(healthies)
removed_count = original_count - new_count
print(
    f"Original number of rows: {original_count}",
    f"Number of rows after cleaning: {new_count}",
    f"Number of rows removed: {removed_count}",
    sep='\n',
)

In [None]:
import seaborn as sns

# Frequency table of the coarse anatomy sites (most frequent first).
anatomy_counts = (
    healthies['anatomy_site_coarse']
    .value_counts()
    .rename_axis('Anatomy Site')
    .reset_index(name='Count')
)

# Bar chart with one bar per coarse site.
plt.figure(figsize=(12, 6))
sns.barplot(data=anatomy_counts, x='Anatomy Site', y='Count')
plt.title('Distribution of Anatomy Sites')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Alphabetical listing of the distinct coarse sites, followed by their number.
print("List of all coarse anatomy sites (sorted alphabetically):")
unique_coarse_sites = sorted(anatomy_counts['Anatomy Site'].to_list())
print(unique_coarse_sites)
print(f"Total number of unique coarse anatomy sites: {len(anatomy_counts)}")

In [None]:
# Summary statistics that are both drawn into the plot and printed below.
ages = healthies['age_cleaned']
mean_age = ages.mean()
median_age = ages.median()

# Histogram (20 bins) with a KDE overlay, drawn on an explicit axes object.
plt.figure(figsize=(10, 6))
age_axis = plt.gca()
sns.histplot(data=healthies, x='age_cleaned', kde=True, bins=20, ax=age_axis)

# Vertical reference lines: mean (red, dashed) and median (green, dash-dot).
age_axis.axvline(mean_age, color='red', linestyle='--', label=f'Mean: {mean_age:.1f}')
age_axis.axvline(median_age, color='green', linestyle='-.', label=f'Median: {median_age:.1f}')

age_axis.set_title('Age Distribution of Patients')
age_axis.set_xlabel('Age (years)')
age_axis.set_ylabel('Count')
age_axis.legend()

# Textual summary of the same distribution.
print("Age Statistics:")
print(f"Min: {ages.min()}")
print(f"Max: {ages.max()}")
print(f"Mean: {mean_age:.2f}")
print(f"Median: {median_age:.2f}")
print(f"Standard Deviation: {ages.std():.2f}")

plt.tight_layout()
plt.show()

In [None]:
# Frequency table of the sex codes plus each code's share in percent.
sex_counts = (
    healthies['sex_cleaned']
    .value_counts()
    .rename_axis('Sex')
    .reset_index(name='Count')
)
total = sex_counts['Count'].sum()
sex_counts['Percentage'] = sex_counts['Count'].div(total).mul(100).round(1)

# NOTE(review): newer seaborn releases may warn when `palette` is given
# without `hue` - confirm against the installed version.
plt.figure(figsize=(8, 6))
bars = sns.barplot(data=sex_counts, x='Sex', y='Count', palette='viridis')

# Annotate each bar with "<count> (<percentage>%)", centred slightly above it.
# Patch i is paired with row i of sex_counts.
for idx, patch in enumerate(bars.patches):
    x_center = patch.get_x() + patch.get_width()/2
    y_label = patch.get_height() + 5
    label = f"{sex_counts['Count'].iloc[idx]} ({sex_counts['Percentage'].iloc[idx]}%)"
    bars.text(x_center, y_label, label, ha='center')

plt.title('Distribution of Patients by Sex')
plt.xlabel('Sex')
plt.ylabel('Count')

# Same numbers as plain text.
print("Sex Distribution:")
for _, row in sex_counts.iterrows():
    print(f"{row['Sex']}: {row['Count']} ({row['Percentage']}%)")

plt.tight_layout()
plt.show()

In [None]:
# Keep only the file reference plus the cleaned age, sex and coarse anatomy site.
# .copy() makes this an independent DataFrame, so the column assignments below
# do not operate on a slice of `healthies` (avoids pandas' SettingWithCopyWarning).
healthies_important = healthies[['FilePath', 'age_cleaned', 'sex_cleaned', 'anatomy_site_coarse']].copy()
# Rename the columns to the short names used from here on.
healthies_important.columns = ['file', 'age', 'sex', 'anatomy_site']
# Keep only the part of the path after the last slash.
healthies_important['file'] = healthies_important['file'].str.split('/').str[-1]

print(len(healthies_important))

healthies_important.head()

In [None]:
import os
import PIL
import numpy as np
from glob import glob

# A bare `import PIL` does not guarantee that the PIL.Image submodule is loaded;
# import it explicitly rather than relying on another library having done so.
import PIL.Image

image_folder = '/mnt/nfs/homedirs/benjamins/DATA/healthy_subset_new/'

# expanduser() only has an effect if image_folder starts with "~"; the expanded
# variable is the one used for all path joins.
image_folder_expanded = os.path.expanduser(image_folder)

# Preview up to five images side by side (1 row, 5 columns).
n_panels = 5
fig, axes = plt.subplots(1, n_panels, figsize=(20, 5))
preview = healthies_important.iloc[:n_panels]

for i, ax in enumerate(axes):
    # With fewer rows than panels, the surplus panels simply stay empty.
    if i < len(preview):
        try:
            # The file column holds the name without extension (DICOM UIDs
            # according to the original note); the image is "<name>.png".
            png_name = preview['file'].iloc[i] + '.png'
            image_path = os.path.join(image_folder_expanded, png_name)
            print(image_path)

            img = PIL.Image.open(image_path)
            ax.imshow(img, cmap='gray')
            ax.set_title(f"Image {i+1}\nAnatomy Site {preview['anatomy_site'].iloc[i]}\nAge: {preview['age'].iloc[i]}\nSex: {preview['sex'].iloc[i]}")
        except Exception as e:
            # Deliberately broad: a failing preview shows its error inside the
            # panel instead of aborting the whole cell.
            ax.text(0.5, 0.5, f"Error: {str(e)}", ha='center', va='center')
    # All panels are drawn without axes.
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
import os

# Keep only the rows whose PNG image is present on disk.
# If it is missing, the conversion from DICOM to PNG failed for that file.

image_folder_expanded = os.path.expanduser(image_folder)


def file_exists(filename):
    """Return True if ``<image folder>/<filename>.png`` exists."""
    return os.path.exists(os.path.join(image_folder_expanded, filename + '.png'))


original_count = len(healthies_important)

# Boolean mask over the rows: True where the PNG was found.
file_exists_mask = healthies_important['file'].apply(file_exists)
healthies_important = healthies_important[file_exists_mask]

new_count = len(healthies_important)
removed_count = original_count - new_count

print(
    f"Original number of rows: {original_count}",
    f"Number of rows after filtering: {new_count}",
    f"Number of rows removed (files not found): {removed_count}",
    sep='\n',
)

In [None]:
# Show the first rows of the current healthies_important table.
healthies_important.head()

In [None]:
# for each entry read the patient id from file + '.dcm' and save the cleaned dataframe
import pydicom
def get_patient_id_from_dicom_file(file):
    """Read the PatientID from ``<image folder>/<file>.dcm``.

    Args:
        file: File name without extension, relative to ``image_folder_expanded``.

    Returns:
        The PatientID stored in the DICOM file, or None if the element is
        absent or the file cannot be read (the error is printed).
    """
    # Construct the full path to the DICOM file
    dicom_file_path = os.path.join(image_folder_expanded, file + '.dcm')

    try:
        # Only a header element is needed, so stop reading before the pixel
        # data instead of loading the whole image for every row.
        dicom_data = pydicom.dcmread(dicom_file_path, stop_before_pixels=True)
        return dicom_data.PatientID if 'PatientID' in dicom_data else None
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole column.
        print(f"Error reading {dicom_file_path}: {e}")
        return None
    
# Look up the patient ID for every file and store it as a new 'patient_id' column.
# assign() builds a new DataFrame, so the column is not written into the
# boolean-filtered frame (avoids pandas' SettingWithCopyWarning).
healthies_important = healthies_important.assign(
    patient_id=healthies_important['file'].apply(get_patient_id_from_dicom_file)
)
healthies_important.head()

In [None]:
# Check whether any patient ID occurs in more than one row, and say so either way.
# Note: rows without a patient ID (None) also count as repeats of each other.
patient_ids = healthies_important['patient_id']
if patient_ids.is_unique:
    print("All patient IDs are unique.")
else:
    repeated = patient_ids[patient_ids.duplicated(keep=False)]
    print(
        f"Patient IDs are NOT unique: {len(repeated)} rows share "
        f"{repeated.nunique(dropna=False)} repeated patient IDs."
    )

In [None]:
# Replace the bare file name with the full path of the PNG image.
# assign() builds a new DataFrame instead of writing into the filtered frame
# (avoids pandas' SettingWithCopyWarning).
# NOTE: running this cell twice appends '.png' a second time.
healthies_important = healthies_important.assign(
    file=healthies_important['file'].apply(lambda x: os.path.join(image_folder_expanded, x + '.png'))
)
healthies_important.head(3)

In [None]:
# Export the cleaned table to a CSV file in the user's DATA directory.
# index=False: the DataFrame index is not written as a column.
output_file_path = os.path.expanduser('~/DATA/healthy_subset_new_cleaned.csv')
healthies_important.to_csv(output_file_path, index=False)
print(f"DataFrame exported to {output_file_path}")

## Create a test split

In [None]:
from sklearn.model_selection import StratifiedKFold


# One plain dict per row; the positional indices produced by the splitter
# index into this list.
data = healthies_important.to_dict(orient='records')
stratification_labels = [record["anatomy_site"] for record in data]

# ---- Split: Train/Val (80%) vs Test (20%) ---- #
# Five stratified, shuffled folds with a fixed seed; only the first fold is
# used, and its held-out part becomes the test set.
# NOTE(review): the split is stratified by anatomy site only and is not grouped
# by patient_id - confirm that this is intended.
sgkf1 = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
fold_iterator = sgkf1.split(data, stratification_labels)
train_val_indices, test_indices = next(fold_iterator)

train_val_dicts = [data[idx] for idx in train_val_indices]
test_dicts = [data[idx] for idx in test_indices]

### Save the created test split by appending the file paths and a test-set flag to test_set_split.csv

In [None]:
# Load the existing split table to which the new rows get appended further below.
test_set_split_df = pd.read_csv(os.path.expanduser('~/DATA/test_set_split.csv'))
test_set_split_df.head()

In [None]:
# Two-column table: image path plus a flag telling whether it is in the test set.
test_rows = pd.DataFrame({
    'image_path': [record['file'] for record in test_dicts],
    'test set': [True] * len(test_dicts),
})
train_val_rows = pd.DataFrame({
    'image_path': [record['file'] for record in train_val_dicts],
    'test set': [False] * len(train_val_dicts),
})

# Test rows first, then train/val rows; each part keeps its own 0-based index.
new_test_set_split_df = pd.concat([test_rows, train_val_rows])
print(len(new_test_set_split_df))

new_test_set_split_df.head()


In [None]:
# Concatenate the existing split table with the new rows (index renumbered)
# and, optionally, save the result.
# NOTE(review): this rebinds test_set_split_df, so re-running the cell appends
# the new rows again - reload the CSV before running it a second time.
test_set_split_df = pd.concat([test_set_split_df, new_test_set_split_df], ignore_index=True)


# BE CAREFUL: ONLY SAVE THIS IF YOU ARE SURE ABOUT WHAT YOU ARE DOING
# test_set_split_df.to_csv(os.path.expanduser('~/DATA/test_set_split.csv'), index=False)
# print("New test set split saved to ~/DATA/test_set_split.csv")