In [None]:
# Input data files are available in the read-only "../input/" directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#import numpy as np
#import pandas as pd

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))



# Using TMAs and thumbnails

**Imports**

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import pickle
import cv2
import keras_cv
import keras_core as keras
import pandas as pd
import numpy as np
import tensorflow as tf
from PIL import Image

from keras.applications.resnet50 import ResNet50
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

To generate a submission, set `submission` to `True` in the cell below.

In [None]:
# Run-mode switch read by the cells below.
#   False: load the training images, train the model, and pickle the model and encoders.
#   True:  load the pickled model and encoders, predict on the test images, and
#          write submission.csv.
submission = False

**Loading images**

In [None]:
if not submission:
    # The encoders are fitted on the training labels at the end of this cell; later
    # cells pickle them and use label_encoder to map class indices back to labels.
    label_encoder = LabelEncoder()
    try:
        one_hot_encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only accepts the older `sparse` keyword.
        one_hot_encoder = OneHotEncoder(sparse=False)

    training_df = pd.read_csv('/kaggle/input/UBC-OCEAN/train.csv')
    n_training_images = len(training_df)

    image_ids = []
    images = []
    labels = []
    for i, (idx, label) in enumerate(zip(training_df['image_id'], training_df['label'])):
        # Prefer the thumbnail and fall back to the full-size image only when the
        # thumbnail file is missing; any other error should surface, not be hidden.
        try:
            image = Image.open("/kaggle/input/UBC-OCEAN/train_thumbnails/"+str(idx)+'_thumbnail.png')
        except FileNotFoundError:
            image = Image.open("/kaggle/input/UBC-OCEAN/train_images/"+str(idx)+'.png')
        with image:
            # Resize first (cheap on large images), then force 3 channels so every
            # array is exactly (224, 224, 3) regardless of the PNG's colour mode.
            pixels = np.array(image.resize((224, 224)).convert('RGB'))
        image_ids.append(idx)
        images.append(pixels)
        labels.append(label)
        print(f'Loading images: {i + 1} / {n_training_images}', end='\r')

    # Label strings -> integer codes -> one-hot rows, one row per image.
    labels_one_hot = one_hot_encoder.fit_transform(label_encoder.fit_transform(labels).reshape(-1, 1))
    images = np.array(images).reshape(-1, 224, 224, 3)

**Training model**

In [None]:
if not submission:
    # One output unit per encoded class, taken from the one-hot matrix width
    # instead of a hard-coded count.
    num_classes = labels_one_hot.shape[1]

    # ImageNet-pretrained ResNet50 backbone with a new pooled softmax head.
    base_model = ResNet50(weights='imagenet', include_top=False)
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    predictions = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=base_model.input, outputs=predictions)

    # Freeze the backbone so only the new head is trained.
    for layer in base_model.layers:
        layer.trainable = False

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # NOTE(review): images are fed as raw 0-255 pixels; the ResNet50 ImageNet weights
    # are normally paired with resnet50.preprocess_input. If that is added here, the
    # same preprocessing must be applied before model.predict in the submission cell.

    # Fixed seed makes the hold-out split reproducible; stratifying on the class index
    # keeps class proportions the same in both parts. Stratification requires every
    # class to have at least two samples.
    class_indices = np.argmax(labels_one_hot, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        images,
        labels_one_hot,
        test_size=0.3,
        stratify=class_indices,
        random_state=42,
    )

    model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))

    # Turn predicted probabilities and one-hot targets back into label strings
    # before scoring with balanced accuracy.
    predicted = model.predict(X_test)
    predicted = label_encoder.inverse_transform(np.argmax(predicted, axis=1))
    y_test = label_encoder.inverse_transform(np.argmax(y_test, axis=1))

    print(f'Balanced accuracy: {balanced_accuracy_score(y_test, predicted)}')

**Saving and loading model**

In [None]:
# Persist the trained model and both encoders after a training run, or restore
# them from the attached dataset for a submission run.
if submission:
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load artifacts produced by a trusted run of this notebook.
    restored = []
    for artifact_path in (
        '/kaggle/input/ml-course-project/model.pkl',
        '/kaggle/input/ml-course-project/l_encoder.pkl',
        '/kaggle/input/ml-course-project/o_h_encoder.pkl',
    ):
        with open(artifact_path, 'rb') as handle:
            restored.append(pickle.load(handle))
    model, label_encoder, one_hot_encoder = restored
else:
    for artifact_path, artifact in (
        ('model.pkl', model),
        ('l_encoder.pkl', label_encoder),
        ('o_h_encoder.pkl', one_hot_encoder),
    ):
        with open(artifact_path, 'wb') as handle:
            pickle.dump(artifact, handle)

**Loading test images and making predictions**

In [None]:
if submission:
    test_df = pd.read_csv('/kaggle/input/UBC-OCEAN/test.csv')

    test_image_ids = []
    test_images = []
    for idx in test_df['image_id']:
        # Prefer the thumbnail and fall back to the full-size image only when the
        # thumbnail file is missing; any other error should surface, not be hidden.
        try:
            image = Image.open("/kaggle/input/UBC-OCEAN/test_thumbnails/"+str(idx)+'_thumbnail.png')
        except FileNotFoundError:
            image = Image.open("/kaggle/input/UBC-OCEAN/test_images/"+str(idx)+'.png')
        with image:
            # Resize first (cheap on large images), then force 3 channels so every
            # array is exactly (224, 224, 3) regardless of the PNG's colour mode.
            pixels = np.array(image.resize((224, 224)).convert('RGB'))
        test_image_ids.append(idx)
        test_images.append(pixels)

    test_images = np.array(test_images).reshape(-1, 224, 224, 3)

    # Highest-probability class index per image, decoded back to its label string.
    test_predicted = model.predict(test_images)
    test_predicted = label_encoder.inverse_transform(np.argmax(test_predicted, axis=1))

    test_df["label"] = test_predicted

    # Write only the image_id and label columns to the submission file.
    submission_df = test_df[["image_id", "label"]]
    submission_df.to_csv("submission.csv", index=False)

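# Augmentation

The cells in this section are currently disabled: their code is wrapped in triple-quoted strings, so nothing in them is executed.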

In [None]:

# DISABLED CELL: everything below is wrapped in a triple-quoted string, so none of
# it executes -- the imports do not run and `augmenting_generator` is not defined.
# The string is the cell's only expression. Remove the surrounding ''' to re-enable.
#
# What the disabled code describes: a generator that takes random, rotated, square
# crops from a PIL image, resizes each to (aug_side_px, aug_side_px), optionally
# flips it top-to-bottom, skips crops whose share of all-zero pixels exceeds
# `boring_cutoff`, and yields `(crop, 1.0)` until `n_crops` crops were accepted or
# more than 3 * n_crops attempts were made.
#
# NOTE(review): the `tumor_class` and `mask` parameters are accepted but never read
# inside the function body.
'''import os
import random
import math
from pathlib import Path
import pandas as pd
import PIL.Image
import PIL
import numpy as np
from tqdm import tqdm
PIL.Image.MAX_IMAGE_PIXELS = None  # Otherwise it thinks that the images are oversized and may be compression bombs

boring_cutoff = 0.3  # pictures with bigger proportion of boring pixels will be discarded
borders_expansion = 0.1  # Expand picture by how much on the sides? Useful for better representing borders
aug_side_proportion = 0.1  # Crop lenght as a fraction of longest side of input
aug_side_proportion_var = 0.1  # Crop length variation
aug_side_px = 786  # Crop will be downscaled to size (aug_side_px, aug_side_px)
do_random_flip = True  # Whether to allow augmenter to flip cuts.

def augmenting_generator(picture: PIL.Image, n_crops: int, tumor_class=None, mask=None):
    max_side = max(picture.size)

    def p_to_px(p):
        return int(p * max_side)

    def pre_crop_size(angle, size):
        pc_size = angle % (math.tau / 4)
        pc_size = min(pc_size, math.tau / 4 - pc_size)
        pc_size = size * (math.sin(pc_size) + math.cos(pc_size))
        return pc_size

    succesful = 0
    total = 0
    while succesful < n_crops:
        # Failsafe: one black image shall never just indefinitely hang the entire code
        if total > 3 * n_crops:
            break

        angle = random.uniform(0, math.tau)
        size = aug_side_proportion * (1.0 + random.uniform(-aug_side_proportion_var, aug_side_proportion_var))
        pc_size = pre_crop_size(angle, size)

        chosen = []
        for side_prop in [x / max_side for x in picture.size]:
            chosen.append(random.uniform(-borders_expansion, side_prop + borders_expansion - pc_size))
        pre_crop = picture.crop((p_to_px(chosen[0]), p_to_px(chosen[1]),
                                 p_to_px(chosen[0]) + p_to_px(pc_size), p_to_px(chosen[1]) + p_to_px(pc_size)))

        pre_crop = pre_crop.rotate(angle / math.tau * 360.0)
        rot_off = (pre_crop.size[0] - p_to_px(size)) // 2
        pre_crop = pre_crop.crop((rot_off, rot_off, pre_crop.size[0] - rot_off, pre_crop.size[1] - rot_off))

        crop = pre_crop.resize((aug_side_px, aug_side_px), PIL.Image.Resampling.LANCZOS)
        if do_random_flip and random.randint(0, 1) == 1:
            crop = crop.transpose(PIL.Image.FLIP_TOP_BOTTOM)

        # Ouch, I don't like this code. Looks not performant at all.
        total += 1
        boring = np.sum(np.sum(np.array(crop), axis=2) == 0) / aug_side_px / aug_side_px
        if boring > boring_cutoff:
            continue

        succesful += 1
        yield crop, 1.0'''

In [None]:
# DISABLED CELL: the configuration below is wrapped in a triple-quoted string and
# does not execute. It describes per-image crop counts for TMA and non-TMA images,
# the input/mask/output paths, and the listing of source and mask files.
# If re-enabled, it needs `Path`, `pd` and `os` to be imported first.
'''tma_crops = 2000  # Per image, there are 25 TMA images in the training set
non_tma_crops = 300  # Per image, there are 513 non-TMA images

source_path = Path('/kaggle/input/UBC-OCEAN/train_images')
mask_path = Path('/kaggle/input/ubc-ovarian-cancer-competition-supplemental-masks')
to_path = Path('/kaggle/working/augmented')
main_csv = Path('/kaggle/input/UBC-OCEAN/train.csv')

csv_data = pd.read_csv(main_csv)
source_files = list(Path(source_path).rglob("*.[pP][nN][gG]"))
mask_files = os.listdir(mask_path)
to_path.mkdir(parents=True, exist_ok=True)'''

In [None]:
# DISABLED CELL: the loop below is wrapped in a triple-quoted string and does not
# execute. It describes iterating over the source images, looking up each image's
# label and `is_tma` flag in the CSV, and saving every generated crop as a PNG
# plus a `.txt` file containing "<tumor_class> <confidence>".
#
# NOTE(review): `n_crops` is computed from `is_tma`, but the generator is called
# with the literal `n_crops=20`, so the computed value is unused.
# NOTE(review): the `.txt` file is opened without a `with` block or explicit close.
'''for pat in tqdm(source_files):
    p = pat.parts[-1]
    info = csv_data.loc[csv_data['image_id'] == int(p.split('.')[0])]
    tumor_class = list(info['label'])[0]
    is_tma = list(info['is_tma'])[0]
    n_crops = tma_crops if is_tma else non_tma_crops
    to_path_suffix = 'tma' if is_tma else 'wsi'

    img = PIL.Image.open(pat)
    mask: PIL.Image = None
    if p in mask_files:
        mask = PIL.Image.open(mask_path / p)
    i = 0
    for crop, confidence in augmenting_generator(picture=img, n_crops=20, tumor_class=tumor_class, mask=mask):
        save_to = to_path / to_path_suffix / Path(*pat.parts[2:-1])
        save_to.mkdir(parents=True, exist_ok=True)
        fn = f'crop_{str(p).split(".")[0]}_{i}'
        crop.save(save_to / str(fn + '.png'))
        open(save_to / str(fn + '.txt'), 'w').write(f'{tumor_class} {confidence}')
        i += 1
    img.close()
    if mask is not None:
        mask.close()'''