## Imports

In [None]:
import pandas as pd
import numpy as np
import random
import os

## Specify parameters and directories

In [None]:
# Type of images to train on - i.e. UNIL (unilateral squared radiographs) or CROP (segmented images).
# Only files in INPUT_DIR whose names end in f"{IMAGE_TYPE}.png" are picked up.
IMAGE_TYPE = "UNIL"

# Number of images per implant model class drawn into the balanced test and validation datasets
TEST_IMAGES_PER_CLASS = 15
VALID_IMAGES_PER_CLASS = 10

# Directories
# INPUT_DIR: folder containing the images; the notebook changes its working directory to it.
# OUTPUT_DIR: presumably where generated files should go - NOTE(review): confirm the export step actually uses it.
INPUT_DIR = ""
OUTPUT_DIR = ""

# Path (including file name) of the Excel file listing the segmentation training and test datasets
# ("Unet_train_test.xlsx"), so that images used to train the U-Net can be kept out of the
# classification test and validation sets - including them might favourably bias the results.
SEGMENTATION_DATASET_FILE_PATH = ""

## Making a dataframe containing all images with model labels

In [None]:
# Build `full_dataset`: one row per image file of the requested type, with
# its implant model label derived from the two-character filename prefix.

# Lookup from filename prefix to implant model name.
# NOTE(review): "03" and "04" map to the same name. The "01"/"02" pair
# (Collar / NilCol) suggests "04" may have been meant to be a NilCol variant;
# left unchanged because renaming it alters the class list - please confirm.
LABEL_NAMES_BY_PREFIX = {
    "01": "Hip_DepuySynthes_Corail_Collar",
    "02": "Hip_DepuySynthes_Corail_NilCol",
    "03": "Hip_JRIOrtho_FurlongEvolution_Collar",
    "04": "Hip_JRIOrtho_FurlongEvolution_Collar",
    "05": "Hip_SmithAndNephew_Anthology",
    "06": "Hip_SmithAndNephew_Polarstem_NilCol",
    "07": "Hip_Stryker_AccoladeII",
    "08": "Hip_Stryker_Exeter",
    "21": "Knee_Depuy_Synthes_Sigma",
    "22": "Knee_SmithAndNephew_GenesisII",
    "23": "Knee_SmithAndNephew_Legion",
    "24": "Knee_ZimmerBiomet_Oxford",
}

# NOTE: this changes the working directory for the rest of the session, so
# relative paths used later are resolved against INPUT_DIR.
os.chdir(INPUT_DIR)

# Keep only images of the requested type. A new list is built instead of
# removing entries while iterating (which silently skips neighbours), and the
# result is sorted because os.listdir() order is arbitrary and the seeded
# sampling further down is only reproducible for a fixed file order.
image_suffix = f"{IMAGE_TYPE}.png"
candidate_files = sorted(name for name in os.listdir() if name.endswith(image_suffix))

# Files with an unknown prefix cannot be labelled; drop them (and say so)
# rather than letting filenames and labels fall out of step.
unrecognised_files = [name for name in candidate_files if name[0:2] not in LABEL_NAMES_BY_PREFIX]
if unrecognised_files:
    print(f"Skipping {len(unrecognised_files)} file(s) with an unrecognised label prefix: {unrecognised_files}")

# make a list of all filenames
filenames = [name for name in candidate_files if name[0:2] in LABEL_NAMES_BY_PREFIX]

# make a list of all labels (integer based labels)
int_labels_all = [filename[0:2] for filename in filenames]

# make a list of all labels (string versions)
labels_all = [LABEL_NAMES_BY_PREFIX[label] for label in int_labels_all]

# Make a dataframe of filenames and corresponding labels
data_tuples = list(zip(filenames, labels_all))
full_dataset = pd.DataFrame(data_tuples, columns=['filenames', 'labels'])

In [None]:
# Tally the number of images available for each implant model class and show it
count_of_labels = full_dataset.groupby(by='labels').count()
print("Count of all labels", count_of_labels, sep="\n")

## Generating test dataset
##### This dataset will be reserved for final classification model testing only, to avoid information leak from hyperparameter optimisation

In [None]:
# Create a 1-D array of the image filenames used to train the unet, so that
# these can be kept out of the test dataset.
# Only the "filenames" column of the "training" sheet is read, as strings.
unet_images = pd.read_excel(SEGMENTATION_DATASET_FILE_PATH,
                            sheet_name="training",
                            usecols=["filenames"],
                            dtype=str)["filenames"].to_numpy()
# Kept as an alias: the flattened column used to be stored under this
# (misspelt) name while `unet_images` stayed 2-D.
unet_image = unet_images

# List of all model classes, sorted so the class order (and therefore the row
# order of the generated datasets) is the same on every run.
classes = sorted(set(full_dataset['labels']))

# Function to pull a random set of positions into a list
def list_random_numbers_inrange(rand_range, list_length, seed=5):
    """Return `list_length` distinct random positions for a list of length `rand_range`.

    Every valid position, 0 to rand_range - 1 inclusive, can be drawn, so the
    first and last entries of the list are eligible like any other.

    Args:
        rand_range: Length of the list the positions will index into.
        list_length: How many distinct positions to draw.
        seed: Seed applied to the global `random` generator before drawing,
            so the same arguments always give the same positions.

    Returns:
        A list of `list_length` distinct integers in [0, rand_range).

    Raises:
        ValueError: If `list_length` is negative or larger than `rand_range`.
    """
    # Re-seeding the module-level generator makes each call reproducible.
    random.seed(seed)
    return random.sample(range(rand_range), list_length)

# Create the balanced test dataframe: TEST_IMAGES_PER_CLASS images per class,
# none of which were used to train the unet. Every image that is not selected
# stays in training_df.

# Filenames used to train the unet. ravel() accepts both a 1-D array and an
# (n, 1) array of filenames.
# NOTE(review): assumes the unet sheet lists filenames exactly as they appear
# in full_dataset['filenames'] - confirm.
unet_filenames = np.asarray(unet_images).ravel()
# The exclusion is matched on the filename column (not the label) and computed
# once, because it does not depend on the class being processed.
not_used_for_unet = ~full_dataset['filenames'].isin(unet_filenames)

test_parts = []
training_df = full_dataset

for c in classes:
    # Row index labels of this class that were not used to train the unet
    class_indices_excluding_unet = full_dataset.index[
        (full_dataset['labels'] == c) & not_used_for_unet
    ].tolist()
    if len(class_indices_excluding_unet) < TEST_IMAGES_PER_CLASS:
        raise ValueError(
            f"Class {c} has only {len(class_indices_excluding_unet)} eligible images; "
            f"{TEST_IMAGES_PER_CLASS} are needed for the test dataset."
        )
    # Pull TEST_IMAGES_PER_CLASS random positions from that list
    class_indices_selection = list_random_numbers_inrange(list_length=TEST_IMAGES_PER_CLASS,
                                                          rand_range=len(class_indices_excluding_unet),
                                                          seed=6)
    class_indices_test = [class_indices_excluding_unet[i] for i in class_indices_selection]
    # Collect the selected rows (looked up by index label) for the test dataframe
    test_parts.append(full_dataset.loc[class_indices_test])
    # Remove those rows to leave behind a dataframe of training examples
    training_df = training_df.drop(index=class_indices_test)

# DataFrame.append was removed in pandas 2.0; concatenate the per-class parts once instead
test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

print("Dataframe containing separated balanced test dataset has been generated: test_df")
print("The number of images in the test dataset is:")
print(f"{len(test_df)} images in total")
print("or")
print(f"{len(test_df)/len(classes)} images per class")
print(f"training_df now has: {len(training_df)} images")


In [None]:
# Renumber the rows of training_df from 0 so the index has no gaps left by the
# removed test rows (gaps otherwise result in empty rows in later dataframes)
training_df.reset_index(drop=True, inplace=True)

files_train, labels_train = training_df['filenames'], training_df['labels']

## Generating training and validation datasets

In [None]:
# List of all included classes, sorted so the class order is the same on every run
classes = sorted(set(training_df['labels']))

# Create the balanced validation dataframe: VALID_IMAGES_PER_CLASS images per
# class, none of which were used to train the unet. Every image that is not
# selected stays in train_df.

# Filenames used to train the unet. ravel() accepts both a 1-D array and an
# (n, 1) array of filenames.
# NOTE(review): assumes the unet sheet lists filenames exactly as they appear
# in training_df['filenames'] - confirm.
unet_filenames = np.asarray(unet_images).ravel()
# The exclusion is matched on the filename column (not the label) and computed
# once, because it does not depend on the class being processed.
not_used_for_unet = ~training_df['filenames'].isin(unet_filenames)

valid_parts = []
train_df = training_df

for c in classes:
    # Row index labels of this class that were not used to train the unet
    class_indices_excluding_unet = training_df.index[
        (training_df['labels'] == c) & not_used_for_unet
    ].tolist()
    if len(class_indices_excluding_unet) < VALID_IMAGES_PER_CLASS:
        raise ValueError(
            f"Class {c} has only {len(class_indices_excluding_unet)} eligible images; "
            f"{VALID_IMAGES_PER_CLASS} are needed for the validation dataset."
        )
    # Pull VALID_IMAGES_PER_CLASS random positions from that list
    class_indices_selection = list_random_numbers_inrange(list_length=VALID_IMAGES_PER_CLASS,
                                                          rand_range=len(class_indices_excluding_unet),
                                                          seed=7)
    # The positions index the unet-excluded list, i.e. the same list whose length bounded the draw
    class_indices_valid = [class_indices_excluding_unet[i] for i in class_indices_selection]
    # Collect the selected rows (looked up by index label) for the validation dataframe
    valid_parts.append(training_df.loc[class_indices_valid])
    # Remove those rows to leave behind a dataframe of training examples
    train_df = train_df.drop(index=class_indices_valid)

# DataFrame.append was removed in pandas 2.0; concatenate the per-class parts once instead
valid_df = pd.concat(valid_parts) if valid_parts else pd.DataFrame()

print("Dataframe containing separated validation dataset has been generated: valid_df")
print("The number of images in the validation dataset is:")
print(f"{len(valid_df)} images in total")
print("or")
print(f"{len(valid_df)/len(classes)} images per class")
print(f"train_df now has: {len(train_df)} images")

## Exporting training, validation, and test dataframes into an Excel file

In [None]:
# Save the training, validation and test dataframes to one Excel workbook.
# The workbook is written into OUTPUT_DIR; when OUTPUT_DIR is empty the path
# is just the file name, i.e. the current working directory as before.
output_filename = f'classification_training_validation_test_{IMAGE_TYPE}.xlsx'
output_path = os.path.join(OUTPUT_DIR, output_filename)

if os.path.exists(output_path):
    # Never overwrite an existing split: later work depends on it staying fixed
    print(f"IMPORTANT: {output_path} already exists in the directory, if you want to overwrite it you need to manually delete the current version first.")
else:
    if OUTPUT_DIR:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
    with pd.ExcelWriter(output_path) as writer:
        # training dataset during hyperparameter optimisation
        train_df.to_excel(writer, sheet_name='TRAIN_DF', index=False)
        # validation dataset for hyperparameter optimisation
        valid_df.to_excel(writer, sheet_name='VALID_DF', index=False)
        # combined train and val sets
        training_df.to_excel(writer, sheet_name="FINAL_TRAIN_DF", index=False)
        # reserved dataset for final testing
        test_df.to_excel(writer, sheet_name="TEST_DF", index=False)