In [41]:
import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import numpy as np
from PIL import Image

from keras.applications.resnet50 import ResNet50
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [42]:
# Paths
aug_wsi_images = '/kaggle/input/augmented/augmented/wsi'
aug_tma_images = '/kaggle/input/augmented/augmented/tma'
model_file = '/kaggle/working/model.h5'

classes = ['CC', 'EC', 'HGSC', 'LGSC', 'MC']

# If the best prediction probability is below that threshold, it's labelled as 'Other'
threshold = 0.3

In [43]:
# To convert labels to one-hot vectors and vice-versa
def values_to_one_hot(values, classes):
    vector = []
    for value in values:
        one_hot = np.zeros(5)
        one_hot[classes.index(value)] = 1
        vector.append(one_hot)
    return np.array(vector)

def one_hot_to_values(vector, classes):
    values = []
    for one_hot in vector:
        if np.max(one_hot) < threshold:
            value = 'Other'
        else:
            value = classes[np.argmax(one_hot)]
        values.append(value)
    return np.array(values)

# To load images and labels from 'root' folder
def load_data(root):
    files = os.listdir(root)
    n = len(files)
    
    images = []
    labels = []
    confidences = []    

    # Loading augmented images and labels
    for i, file in enumerate(files):
        if file.endswith(".png"):
            image_id = os.path.splitext(file)[0]
            label_path = os.path.join(root, f"{image_id}.txt")

            if os.path.exists(label_path):
                with open(label_path, "r") as label_file:
                    label, confidence = label_file.read().strip().split()
                image_path = os.path.join(root, file)
                image = Image.open(image_path)
                image = image.resize((224,224)) # Resizing to 244x244 for Resnet model
                images.append(np.array(image))
                labels.append(label)
                confidences.append(confidence)
        print(f'Loading images ({root}): {i+1} / {n}',end='\r')
    print()

    # Converting labels to one-hot vectors 
    labels_one_hot = values_to_one_hot(labels, classes)
    # Reshaping image array for model training
    images = np.array(images).reshape(-1, 224, 224, 3)
    
    return images, labels_one_hot, confidences

In [44]:
# Loading all training images 
images_wsi, labels_wsi, confidences_wsi = load_data(aug_wsi_images)
images_tma, labels_tma, confidences_tma = load_data(aug_tma_images)

images = np.concatenate((images_wsi, images_tma))
labels_one_hot = np.concatenate((labels_wsi, labels_tma))

Loading images (/kaggle/input/augmented/augmented/wsi): 1168 / 1168
Loading images (/kaggle/input/augmented/augmented/tma): 80 / 80


In [45]:
# To test with original data

train_csv = '/kaggle/input/UBC-OCEAN/train.csv'
train_images = '/kaggle/input/UBC-OCEAN/train_images/'
train_thumbnails = '/kaggle/input/UBC-OCEAN/train_thumbnails/'

def load_image(idx):
    try:
        image = Image.open(train_thumbnails+str(idx)+'_thumbnail.png')
    except:
        image = Image.open(train_images+str(idx)+'.png')
    image = image.resize((224,224)) 
    image = np.array(image)
    return image

training_df = pd.read_csv(train_csv)
n = len(training_df)

images_val = []
labels_val = []

for i, (idx, label) in enumerate(zip(training_df['image_id'], training_df['label'])):
    image = load_image(idx)
    images_val.append(image)
    labels_val.append(label)
    print(f'Loading images ({train_images}): {i+1} / {n}',end='\r')

images_val = np.array(images_val).reshape(-1, 224, 224, 3)
labels_one_hot_val = values_to_one_hot(labels_val, classes)

Loading images: 538 / 538

In [46]:
# Loading pre-trained ResNet50 model
# adding a final layer to change the number of output classes
base_model = ResNet50(weights='imagenet', include_top=False)
x = base_model.output
x = GlobalAveragePooling2D()(x)
predictions = Dense(len(classes), activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Not changing pre-trained layers
for layer in base_model.layers:
  layer.trainable = False

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training model
model.fit(images, labels_one_hot, epochs=20, batch_size=64, validation_data=(images_val, labels_one_hot_val))

# Predicting validation set labels
predicted = one_hot_to_values(model.predict(images_val), classes)

print(f'Balanced accuracy: {balanced_accuracy_score(labels_val, predicted)}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Balanced accuracy: 0.24237404559985204


In [47]:
# Saving model as .h5 file
model.save(model_file)