In [None]:
import os
import numpy as np
import pandas as pd
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.model_selection import GroupKFold

In [None]:
def auto_select_accelerator():
    """Pick a TensorFlow distribution strategy for the available hardware.

    Tries to resolve, connect to and initialise a TPU cluster and to wrap it
    in a TPU strategy. If any of those steps raises ``ValueError`` the
    strategy returned by ``tf.distribute.get_strategy()`` is used instead.

    Returns:
        The selected strategy object. The number of replicas it reports is
        printed before returning.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # Prefer the stable class; the ``experimental`` alias is deprecated
        # and is only used when the installed TensorFlow lacks the stable one.
        strategy_cls = getattr(tf.distribute, "TPUStrategy", None)
        if strategy_cls is None:
            strategy_cls = tf.distribute.experimental.TPUStrategy
        strategy = strategy_cls(tpu)
        print("Running on TPU:", tpu.master())
    except ValueError:
        # TPU set-up failed: fall back to TensorFlow's current strategy.
        strategy = tf.distribute.get_strategy()
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

    return strategy


def build_decoder(with_labels=True, target_size=(256, 256), ext='jpg'):
    """Build a function that turns an image path into a resized float image.

    Args:
        with_labels: When true, the returned function takes ``(path, label)``
            and returns ``(image, label)``; otherwise it takes only ``path``.
        target_size: Size passed to ``tf.image.resize``.
        ext: Image format, one of ``'png'``, ``'jpg'`` or ``'jpeg'``. Matching
            is case-insensitive and a leading dot is ignored.

    Returns:
        The decoding function described above.

    Raises:
        ValueError: If ``ext`` is not a supported format. This is raised when
            the decoder is built, not when it is first applied.
    """
    # Validate up front so a bad extension fails here rather than deep inside
    # a dataset pipeline.
    fmt = str(ext).lower().lstrip('.')
    if fmt not in ('png', 'jpg', 'jpeg'):
        raise ValueError("Image extension not supported")

    def decode(path):
        file_bytes = tf.io.read_file(path)

        if fmt == 'png':
            img = tf.image.decode_png(file_bytes, channels=3)
        else:
            img = tf.image.decode_jpeg(file_bytes, channels=3)
        # Cast to float and divide by 255.0 before resizing.
        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, target_size)

        return img

    def decode_with_labels(path, label):
        return decode(path), label

    return decode_with_labels if with_labels else decode


def build_augmenter(with_labels=True):
    """Build the augmentation function applied to decoded images.

    The augmentation is a random left/right flip followed by a random
    up/down flip.

    Args:
        with_labels: When true, the returned function takes ``(img, label)``
            and passes the label through untouched; otherwise it takes only
            the image.

    Returns:
        The augmentation function described above.
    """
    def _random_flips(image):
        flipped = tf.image.random_flip_left_right(image)
        return tf.image.random_flip_up_down(flipped)

    if not with_labels:
        return _random_flips

    def _random_flips_keep_label(image, target):
        return _random_flips(image), target

    return _random_flips_keep_label


def build_dataset(paths, labels=None, bsize=128, cache=True,
                  decode_fn=None, augment_fn=None,
                  augment=True, repeat=True, shuffle=1024,
                  cache_dir=""):
    """Assemble a batched ``tf.data`` pipeline from image paths.

    Stages are applied in this order: decode, cache, augment, repeat,
    shuffle, batch, prefetch. Each optional stage is skipped when its flag is
    falsy.

    Args:
        paths: Image paths to slice the dataset from.
        labels: Optional labels sliced alongside ``paths``. When ``None`` the
            dataset yields images only.
        bsize: Batch size.
        cache: Whether to cache the decoded images.
        decode_fn: Decoding function; defaults to ``build_decoder`` with or
            without labels depending on ``labels``.
        augment_fn: Augmentation function; defaults to ``build_augmenter``
            with or without labels depending on ``labels``.
        augment: Whether to apply ``augment_fn``.
        repeat: Whether to repeat the dataset indefinitely.
        shuffle: Shuffle buffer size; a falsy value disables shuffling.
        cache_dir: Value handed to ``Dataset.cache``. When non-empty and
            ``cache`` is ``True`` the directory is created first.

    Returns:
        The resulting dataset.
    """
    has_labels = labels is not None

    if cache is True and cache_dir != "":
        os.makedirs(cache_dir, exist_ok=True)

    if decode_fn is None:
        decode_fn = build_decoder(has_labels)
    if augment_fn is None:
        augment_fn = build_augmenter(has_labels)

    autotune = tf.data.experimental.AUTOTUNE
    source = (paths, labels) if has_labels else paths

    pipeline = tf.data.Dataset.from_tensor_slices(source)
    pipeline = pipeline.map(decode_fn, num_parallel_calls=autotune)
    if cache:
        # Cached before augmentation, so random flips are applied after the
        # cache stage rather than being stored in it.
        pipeline = pipeline.cache(cache_dir)
    if augment:
        pipeline = pipeline.map(augment_fn, num_parallel_calls=autotune)
    if repeat:
        pipeline = pipeline.repeat()
    if shuffle:
        pipeline = pipeline.shuffle(shuffle)

    return pipeline.batch(bsize).prefetch(autotune)

In [None]:
# Name of the Kaggle dataset whose GCS bucket path is looked up below.
COMPETITION_NAME = "hpa-768768"
# Distribution strategy for whatever accelerator is available.
strategy = auto_select_accelerator()
# Global batch size: 16 samples for each replica in the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 16
# GCS path of the dataset named by COMPETITION_NAME.
GCS_DS_PATH = KaggleDatasets().get_gcs_path(COMPETITION_NAME)

In [None]:
# Display the resolved GCS path as the cell output.
GCS_DS_PATH

In [None]:
# Label table for the green channel and the image paths derived from it.
# NOTE(review): `load_dir` and `df` are reassigned by a later cell, and
# `paths` / `labels` are not read again in the cells visible here.
load_dir = f"/kaggle/input/{COMPETITION_NAME}/"
df = pd.read_csv('../input/classification-label-csv-green/df_green.csv')
# Columns at positions 2..20 (19 columns) are used as the label columns.
# NOTE(review): presumably one column per class; confirm against the CSV.
label_cols = df.columns[2:21]
# Full GCS path per image ID, with a '.png' suffix appended.
paths = GCS_DS_PATH + '/' + df['ID'] + '.png'
labels = df[label_cols].values

In [None]:
from kaggle_datasets import KaggleDatasets
import tensorflow as tf

# Class name -> integer id. The id is the position of the name in this list.
label2id_dict = {
    name: idx
    for idx, name in enumerate((
        'Nucleoplasm',
        'Nuclear Membrane',
        'Nucleoli',
        'Nucleoli Fibrillar Center',
        'Nuclear Speckles',
        'Nuclear Bodies',
        'Endoplasmic Reticulum',
        'Golgi Apparatus',
        'Intermediate Filaments',
        'Actin Filaments',
        'Microtubules',
        'Mitotic Spindle',
        'Centrosome',
        'Plasma Membrane',
        'Mitochondria',
        'Aggresome',
        'Cytosol',
        'Vesicles',
        'Negative',
    ))
}

# Images for this run are read from a dedicated GCS bucket, not from the
# Kaggle dataset path and not from the local competition `train/` folder.
COMPETITION_NAME = "hpa-single-cell-image-classification"
load_dir = "gs://green_channels/"

# Load the label table and attach the full image path for every ID.
# NOTE(review): a previously disabled step restricted the table to rows with
# a single label; all rows are kept here.
df = pd.read_csv('../input/classification-label-csv-green/df_green.csv')
df['path'] = df["ID"].apply(lambda x: load_dir + x + ".png")

In [None]:
# Columns at positions 2..20 of the label table are used as the label columns.
label_cols = df.columns[2:21]

# Suffix appended to every ID when building a path; empty here.
# NOTE(review): presumably the IDs in the split CSVs already match the object
# names in `load_dir`; confirm.
file_format = ""


def _load_split(csv_path):
    """Read one split CSV and derive its image paths and label matrix.

    Uses the notebook-level ``load_dir``, ``file_format`` and ``label_cols``.

    Args:
        csv_path: Path of the split's CSV file.

    Returns:
        A ``(frame, paths, labels)`` tuple: the frame with a ``path`` column
        added, the path series, and the values of the label columns.
    """
    split_df = pd.read_csv(csv_path)
    split_paths = load_dir + split_df['ID'] + file_format
    split_df["path"] = split_paths
    return split_df, split_paths, split_df[label_cols].values


training_df, train_paths, train_labels = _load_split(
    "../input/human-cell-atlas-training/training_df.csv")
valid_df, valid_paths, valid_labels = _load_split(
    "../input/human-cell-atlas-training/valid_df.csv")
test_df, test_paths, test_labels = _load_split(
    "../input/human-cell-atlas-training/test_df.csv")

# 1% random sample of the full label table.
# NOTE(review): the sample is unseeded, so it differs between runs.
sample_df = df.sample(frac=0.01)
sample_paths = load_dir + sample_df['ID'] + file_format
sample_labels = sample_df[label_cols].values


In [None]:
# Candidate square input resolutions; IMS is the index of the one in use
# (index 7 -> 600).
IMSIZE = (224, 240, 260, 300, 380, 456, 528, 600, 675)
IMS = 7

# Both decoders rely on build_decoder's default `ext`.
# NOTE(review): earlier cells build '.png' paths; confirm the default image
# format is the intended one for the files being read.
decoder = build_decoder(target_size=(IMSIZE[IMS],) * 2, with_labels=True)
test_decoder = build_decoder(target_size=(IMSIZE[IMS],) * 2, with_labels=False)

# Training pipeline: build_dataset defaults (cache, augment, repeat, shuffle).
train_dataset = build_dataset(
    train_paths,
    labels=train_labels,
    decode_fn=decoder,
    bsize=BATCH_SIZE,
)

# Validation pipeline: single pass, fixed order, no augmentation.
valid_dataset = build_dataset(
    valid_paths,
    labels=valid_labels,
    decode_fn=decoder,
    bsize=BATCH_SIZE,
    augment=False,
    repeat=False,
    shuffle=False,
)

# Test pipeline: images only, uncached, single pass, fixed order.
test_dataset = build_dataset(
    test_paths,
    decode_fn=test_decoder,
    bsize=BATCH_SIZE,
    cache=False,
    augment=False,
    repeat=False,
    shuffle=False,
)

In [None]:
# Width of the output layer: one unit per label column, or a single unit when
# the label array has no second dimension.
try:
    n_labels = train_labels.shape[1]
except IndexError:
    # Only the "no second dimension" case is handled; any other error (for
    # example `train_labels` not being defined) should surface.
    n_labels = 1

# Build and compile under the strategy scope so the model is created for the
# selected accelerator.
with strategy.scope():
    model = tf.keras.Sequential([
        # ImageNet-pretrained backbone without its classification head.
        tf.keras.applications.ResNet152V2(
            input_shape=(IMSIZE[IMS], IMSIZE[IMS], 3),
            weights='imagenet',
            include_top=False),
        tf.keras.layers.GlobalAveragePooling2D(),
        # Independent sigmoid per label, paired with binary cross-entropy.
        tf.keras.layers.Dense(n_labels, activation='sigmoid')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(multi_label=True)])

    model.summary()

In [None]:
# Suffix used in the names of the files written by this run.
colour = '_green'
# Whole batches per pass over the training paths; the training dataset
# repeats by default, so the epoch length has to be given explicitly.
steps_per_epoch = len(train_paths) // BATCH_SIZE

# Keep only the weights with the lowest validation loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=f'ResNet152V2_model{colour}.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
# Lower the learning rate after 3 epochs without validation-loss improvement,
# never going below 1e-6.
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    mode='min',
    patience=3,
    min_lr=1e-6,
)

In [None]:
# Train for 20 epochs of `steps_per_epoch` batches each, validating on the
# validation dataset and running the checkpoint / LR-reduction callbacks.
history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=20,
    callbacks=[checkpoint, lr_reducer],
    verbose=1,
)

In [None]:
# Per-epoch training history as a DataFrame (one column per recorded key),
# written to a CSV named after the run's colour suffix.
hist_dfResNet152V2 = pd.DataFrame(history.history)
hist_dfResNet152V2.to_csv(f'history_ResNet152V2{colour}.csv')

In [None]:

import matplotlib.pyplot as plt
def plot_hist(hist):
    """Plot the training and validation curves of the tracked metric.

    The metric is the first column that is not the loss, not a learning-rate
    column and not a ``val_`` column. Its validation counterpart
    (``val_<metric>``) is drawn as well when present. Columns are looked up
    by name, so their order in ``hist`` does not matter.

    Args:
        hist: Training history with one column per recorded key.

    Raises:
        ValueError: If ``hist`` contains no metric column.
    """
    columns = list(hist.columns)
    metric_columns = [
        col for col in columns
        if col not in ("loss", "lr", "learning_rate")
        and not str(col).startswith("val_")
    ]
    if not metric_columns:
        raise ValueError("history contains no metric column to plot")

    metric = metric_columns[0]
    plt.plot(hist[metric])
    legend = ["train"]

    val_metric = f"val_{metric}"
    if val_metric in columns:
        plt.plot(hist[val_metric])
        legend.append("validation")

    # Label the figure with the metric actually plotted.
    plt.title(f"Model {metric}")
    plt.ylabel(metric)
    plt.xlabel("epoch")
    plt.legend(legend, loc="upper left")


def plot_loss(hist):
    """Plot the training and validation loss per epoch.

    The ``loss`` column is always drawn; ``val_loss`` is drawn as well when
    present. Columns are looked up by name, so their order in ``hist`` does
    not matter.

    Args:
        hist: Training history with at least a ``loss`` column.
    """
    plt.plot(hist["loss"])
    legend = ["train"]

    if "val_loss" in list(hist.columns):
        plt.plot(hist["val_loss"])
        legend.append("validation")

    plt.title("Model Loss")
    plt.ylabel("Loss")
    plt.xlabel("epoch")
    plt.legend(legend, loc="upper right")

# Training history as a DataFrame, then the loss curves drawn from it.
new_data_frame = pd.DataFrame(history.history)
plot_loss(new_data_frame)


In [None]:
# Draw the metric curves from the same history frame.
plot_hist(new_data_frame)

In [None]:
import seaborn as sn
from sklearn.metrics import classification_report, confusion_matrix


# Per-label scores for every test image, then the index of the highest score.
# NOTE(review): the model is compiled as multi-label (sigmoid outputs), so
# taking a single argmax class presumably assumes one label per test image;
# confirm.
prediction_probs = model.predict(test_dataset, verbose=1)
prediction_classes = prediction_probs.argmax(axis=-1)

In [None]:
# Ground-truth class ids as plain ints.
# NOTE(review): assumes every "Label" value of the test split converts to a
# single int; confirm.
y_true = [int(label) for label in test_df["Label"]]

# Confusion matrix of true vs. predicted class, drawn as an annotated heatmap.
cmat = confusion_matrix(y_true, prediction_classes)
figure = plt.figure(figsize=(12, 12))
sn.heatmap(data=cmat, annot=True, fmt='')




In [None]:
# Print the per-class report for the predicted classes against y_true.
print(classification_report(y_true, prediction_classes))