In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from PIL import Image

from keras.applications.resnet50 import ResNet50
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [2]:
# Paths
train_csv = '/kaggle/input/UBC-OCEAN/train.csv'
train_images = '/kaggle/input/UBC-OCEAN/train_images/'
train_thumbnails = '/kaggle/input/UBC-OCEAN/train_thumbnails/'
model_file = '/kaggle/working/model.h5'


classes = ['CC', 'EC', 'HGSC', 'LGSC', 'MC']

# If the best prediction probability is below that threshold, it's labelled as 'Other'
threshold = 0.3

In [3]:
# Functions to convert labels to one-hot vectors and vice-versa
def values_to_one_hot(values, classes):
    vector = []
    for value in values:
        one_hot = np.zeros(5)
        one_hot[classes.index(value)] = 1
        vector.append(one_hot)
    return np.array(vector)

def one_hot_to_values(vector, classes):
    values = []
    for one_hot in vector:
        if np.max(one_hot) < threshold:
            value = 'Other'
        else:
            value = classes[np.argmax(one_hot)]
        values.append(value)
    return np.array(values)

# Function to load image and preprocess it
def load_image(idx):
    # Using TMAs and WSI thumbnails
    try:
        image = Image.open(train_thumbnails+str(idx)+'_thumbnail.png')
    except:
        image = Image.open(train_images+str(idx)+'.png')
    # Resizing to 244x244 for Resnet model
    image = image.resize((224,224)) 
    image = np.array(image)
    return image
    

In [4]:
# Reading csv file
training_df = pd.read_csv(train_csv)
n = len(training_df)

# Loading all training images 
image_ids = []
images = []
labels = []
for i, (idx, label) in enumerate(zip(training_df['image_id'], training_df['label'])):
    image = load_image(idx)
    image_ids.append(idx)
    images.append(image)
    labels.append(label)
    print(f'Loading images: {i+1} / {n}',end='\r')

# Converting labels to one-hot vectors 
labels_one_hot = values_to_one_hot(labels, classes)
# Reshaping image array for model training
images = np.array(images).reshape(-1, 224, 224, 3)

Loading images: 538 / 538

In [11]:
# Loading pre-trained ResNet50 model
# adding a final layer to change the number of output classes
base_model = ResNet50(weights='imagenet', include_top=False)
x = base_model.output
x = GlobalAveragePooling2D()(x)
predictions = Dense(len(classes), activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Not changing pre-trained layers
for layer in base_model.layers:
  layer.trainable = False

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train-valitation split
X_train, X_val, y_train, y_val = train_test_split(images, labels_one_hot, test_size=0.3)

# Training model
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_val, y_val))

# Predicting validation set labels
predicted = one_hot_to_values(model.predict(X_val), classes)
y_val = one_hot_to_values(y_val, classes)

print(f'Balanced accuracy: {balanced_accuracy_score(y_val, predicted)}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Balanced accuracy: 0.336962481962482


In [6]:
# Saving model as .h5 file
model.save(model_file)