Import Libraries

In [None]:
import getpass
import os
import sys
import zipfile
from os import listdir

import matplotlib as plt
import numpy as np
import pandas as pd
from keras import backend
from keras.layers import Conv2D
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import MaxPooling2D
from keras.models import Sequential
from keras.optimizers import SGD
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from matplotlib import pyplot
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split

Download and Unzip Dataset

In [None]:
import kaggle

In [None]:
os.environ['KAGGLE_USERNAME'] = "sanjayratnayake"

In [None]:
os.environ['KAGGLE_KEY'] = "6f91a84af950a57266320cc14164d052"

In [None]:
!kaggle competitions download -c human-protein-atlas-image-classification

In [None]:
with zipfile.ZipFile('/home/sanjaya_ratnayake_d/human-protein-atlas-image-classification.zip', 'r') as zip_ref:
    zip_ref.extractall('/home/sanjaya_ratnayake_d')

Load CSV and Prepare Dataframe of files/labels

In [None]:
filename = "/home/sanjaya_ratnayake_d/train.csv"

In [None]:
mapping_csv = pd.read_csv(filename)

In [None]:
mapping_csv.head()

In [None]:
mapping_csv['Id'] = mapping_csv['Id'] + '_green'

In [None]:
folder = "/home/sanjaya_ratnayake_d/train/"

In [None]:
idtarget = {}
for filename in listdir(folder):
    if filename.endswith('_green.png'):
        pass
    else:
        idtarget[filename[:-4]] = 28

In [None]:
df2 = pd.DataFrame.from_dict(idtarget, orient='index')

In [None]:
df3 = df2.reset_index()

In [None]:
df3.columns = ['Id', 'Target']

In [None]:
df3.shape

In [None]:
mapping_csv.shape

In [None]:
df3['Target'] = df3['Target'].astype(object)

In [None]:
dffinal = pd.concat([mapping_csv, df3], axis=0)

In [None]:
dffinal = dffinal.reset_index()

In [None]:
dffinal = dffinal.drop(['index'], axis=1)

In [None]:
dffinal['Target'] = dffinal['Target'].astype(str)

In [None]:
dffinal.shape

In [None]:
folder2x = "/home/sanjaya_ratnayake_d/trainset/"

In [None]:
dflist = []
for file in os.listdir(folder2x):
    dflist.append(file[:-4])

In [None]:
filtered = dffinal['Id'].isin(dflist)

In [None]:
dffinal2 = dffinal[filtered]

In [None]:
dffinal3 = dffinal2.reset_index(drop=True)

In [None]:
dffinal3

In [None]:
dffinal3.shape

In [None]:
#File mappings of filename to label
def create_file_mapping(dffinal3):
    """Map every image id to its list of label tokens.

    Args:
        dffinal3: DataFrame with an 'Id' column and a 'Target' column whose
            values are whitespace-separated label strings (e.g. "16 0").

    Returns:
        dict mapping each id to the list of label strings obtained by
        splitting its 'Target' value. If an id occurs more than once, the
        last row wins.
    """
    # Iterate over the column values rather than looking rows up by label:
    # dffinal3['Id'][i] only works when the index is exactly 0..n-1 and raises
    # KeyError on a filtered frame that has not been re-indexed.
    return {
        name: tags.split()
        for name, tags in zip(dffinal3['Id'], dffinal3['Target'])
    }

In [None]:
#One Hot Encode tags for individual files
def one_hot_encode(tags, mapping, num_classes=29):
    """Build a multi-hot label vector from a collection of class indices.

    Args:
        tags: iterable of class indices, given as ints or int-like strings.
        mapping: unused; kept so that existing callers keep working.
        num_classes: length of the returned vector. Defaults to 29, the
            value that used to be hard-coded.

    Returns:
        1-D float numpy array of length ``num_classes`` holding 1 at every
        index listed in ``tags`` and 0 elsewhere.

    Raises:
        IndexError: if a tag lies outside ``0 .. num_classes - 1``.
        ValueError: if a tag cannot be converted to int.
    """
    encoding = np.zeros(num_classes)
    for tag in tags:
        index = int(tag)
        # Without this check a negative tag would silently wrap around and
        # mark a class counted from the end of the vector.
        if not 0 <= index < num_classes:
            raise IndexError(
                'tag {} is outside the valid range 0..{}'.format(index, num_classes - 1)
            )
        encoding[index] = 1
    return encoding

In [None]:
#Load all images into memory
def load_dataset(path, file_mapping):
    """Load every image in ``path`` together with its encoded labels.

    Args:
        path: directory containing the image files.
        file_mapping: dict mapping a file name without its 4-character
            extension to a list of label strings.

    Returns:
        (X, y): ``X`` is a uint8 array stacking all images, each loaded with
        target_size=(512, 512); ``y`` is a uint8 array stacking the matching
        label vectors produced by one_hot_encode.

    Raises:
        KeyError: if a file in ``path`` has no entry in ``file_mapping``.
    """
    photos, targets = list(), list()
    # List the directory that was passed in. The previous version iterated
    # over the notebook-global folder2x and so ignored the `path` argument
    # for everything except building the file path.
    for n, filename in enumerate(listdir(path), start=1):
        print("picture number {}".format(n))
        # os.path.join works whether or not `path` ends with a separator.
        photo = img_to_array(load_img(os.path.join(path, filename), target_size=(512,512)))
        # The mapping is keyed by the file name without '.png'.
        tags = file_mapping[filename[:-4]]
        photos.append(photo)
        targets.append(one_hot_encode(tags, file_mapping))
    X = np.asarray(photos, dtype = 'uint8')
    y = np.asarray(targets, dtype = 'uint8')
    return X, y

In [None]:
# Build the {image id: [label strings]} lookup from the filtered label table.
file_mapping = create_file_mapping(dffinal3)

In [None]:
# Same directory string as folder2x.
folder3x = "/home/sanjaya_ratnayake_d/trainset/"

In [None]:
# Reads all images into memory and prints one progress line per image.
X, y = load_dataset(folder3x, file_mapping)

In [None]:
# savez_compressed was never imported as a bare name, so call it through
# numpy. Arrays passed positionally are stored under the keys 'arr_0' and
# 'arr_1', which is what load_dataset2 reads back.
np.savez_compressed('protein_data.npz', X, y)

In [None]:
#load train and test data
def load_dataset2():
    """Load the cached arrays from 'protein_data.npz' and split them 70/30.

    Returns:
        (trainX, trainY, testX, testY): the split produced by
        train_test_split with test_size=0.3 and random_state=1.
    """
    # `load` and `sk` were never defined; use the names the notebook actually
    # imports (np and train_test_split). The context manager closes the
    # archive once both arrays have been read into memory.
    with np.load('protein_data.npz') as data:
        X, y = data['arr_0'], data['arr_1']
    trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.3, random_state=1)
    print(trainX.shape, trainY.shape, testX.shape, testY.shape)
    return trainX, trainY, testX, testY

In [None]:
#Calculate fbeta score
def fbeta(y_true, y_pred, beta=2):
    """F-beta score written with Keras backend ops, usable as a Keras metric.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predictions; clipped to [0, 1] and rounded, so a
            value of 0.5 or above is what can count as a positive.
        beta: weight of recall relative to precision (default 2).

    Returns:
        Scalar tensor: the F-beta value computed along axis 1 and then
        averaged over the remaining entries.
    """
    # assumes both tensors are (batch, classes), so axis 1 runs over classes - TODO confirm
    y_pred = backend.clip(y_pred, 0, 1)
    #calculate true positives, false positives, false negatives
    tp = backend.sum(backend.round(backend.clip(y_true * y_pred, 0, 1)), axis=1)
    fp = backend.sum(backend.round(backend.clip(y_pred - y_true, 0, 1)), axis=1)
    fn = backend.sum(backend.round(backend.clip(y_true - y_pred, 0, 1)), axis=1)
    #calculate precision (epsilon guards against division by zero)
    p = tp / (tp + fp + backend.epsilon())
    #calculate recall
    r = tp / (tp + fn + backend.epsilon())
    #calculate fbeta per row, then average over the rows
    bb = beta ** 2
    # Named `score` so the sklearn fbeta_score import is not shadowed, and
    # with the closing parenthesis that was missing from backend.mean(...).
    score = backend.mean((1 + bb) * (p * r) / (bb * p + r + backend.epsilon()))
    return score

In [None]:
#define model
def define_model(in_shape=(512,512,3), out_shape=29):
    """Build and compile the convolutional multi-label classifier.

    The network is four blocks of two 3x3 same-padded ReLU convolutions
    followed by 2x2 max pooling (512, 128, 64 and 32 filters), then a
    29-unit ReLU dense layer and a sigmoid output layer.

    Args:
        in_shape: shape of one input image (default (512, 512, 3)).
        out_shape: number of sigmoid output units (default 29).

    Returns:
        A Sequential model compiled with SGD(lr=0.01, momentum=0.9),
        binary cross-entropy loss and the fbeta metric.
    """
    model = Sequential()
    # NOTE(review): the filter count shrinks with depth, and 512 filters on
    # full-resolution 512x512 inputs is very memory-hungry - confirm this
    # ordering is intended.
    filter_counts = (512, 128, 64, 32)
    for block_index, filters in enumerate(filter_counts):
        # Only the very first layer declares the input shape.
        extra = {'input_shape': in_shape} if block_index == 0 else {}
        model.add(Conv2D(filters, (3,3), activation='relu', kernel_initializer='he_uniform', padding='same', **extra))
        model.add(Conv2D(filters, (3,3), activation='relu', kernel_initializer='he_uniform', padding='same'))
        model.add(MaxPooling2D((2,2)))
    model.add(Flatten())
    model.add(Dense(29, activation='relu', kernel_initializer='he_uniform'))
    # One independent sigmoid per class, because an image can carry several labels.
    model.add(Dense(out_shape, activation='sigmoid'))
    #compile model
    opt = SGD(lr=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[fbeta])
    return model

In [None]:
datagen = ImageDataGenerator(rescale=1.0/255.0)

In [None]:
train_it = datagen.flow(trainX, trainY, batch_size=128)
test_it = datagen.flow(testX, testY, batch_size=128)

In [None]:
history = model.fit_generator(train_it, steps_per_epoch=len(train_it),
    validation_data=test_it, validation_steps=len(test_it), epochs=50, verbose=0)

In [None]:
loss, fbeta = model.evaluate_generator(test_it, steps=len(test_it), verbose=0)
print('> loss=%.3f, fbeta=%.3f' % (loss, fbeta))

In [None]:
def summarize_diagnostics(history):
    """Plot the loss and F-beta learning curves and save them as a PNG.

    Args:
        history: object whose ``history`` attribute is a dict containing the
            per-epoch series 'loss', 'val_loss', 'fbeta' and 'val_fbeta'.

    The figure is written to '<script name>_plot.png' in the working
    directory and then closed; nothing is returned.
    """
    # plot loss
    pyplot.subplot(211)
    pyplot.title('Cross Entropy Loss')
    pyplot.plot(history.history['loss'], color='blue', label='train')
    pyplot.plot(history.history['val_loss'], color='orange', label='test')
    # Without a legend the label= arguments above are never shown.
    pyplot.legend()
    # plot fbeta
    pyplot.subplot(212)
    pyplot.title('Fbeta')
    pyplot.plot(history.history['fbeta'], color='blue', label='train')
    pyplot.plot(history.history['val_fbeta'], color='orange', label='test')
    pyplot.legend()
    # save plot to file
    # NOTE(review): under Jupyter sys.argv[0] is presumably the kernel
    # launcher script, so the file is not named after the notebook - confirm.
    filename = os.path.basename(sys.argv[0])
    pyplot.savefig(filename + '_plot.png')
    pyplot.close()

In [None]:
# run the test harness for evaluating a model
def run_test_harness():
    """Load the cached data, train a fresh model, report test metrics and plot curves."""
    # cached arrays, already split into train and test parts
    x_train, y_train, x_test, y_test = load_dataset2()
    # batches of 128 with pixel values rescaled to 0..1
    scaler = ImageDataGenerator(rescale=1.0/255.0)
    train_batches = scaler.flow(x_train, y_train, batch_size=128)
    test_batches = scaler.flow(x_test, y_test, batch_size=128)
    # build the network and train it silently for 50 epochs
    net = define_model()
    fit_log = net.fit_generator(
        train_batches,
        steps_per_epoch=len(train_batches),
        validation_data=test_batches,
        validation_steps=len(test_batches),
        epochs=50,
        verbose=0,
    )
    # loss and F-beta on the held-out part
    test_loss, test_fbeta = net.evaluate_generator(
        test_batches, steps=len(test_batches), verbose=0
    )
    print('> loss=%.3f, fbeta=%.3f' % (test_loss, test_fbeta))
    # save the learning curves
    summarize_diagnostics(fit_log)

In [None]:
# Runs the whole pipeline: load the cached arrays, train for 50 epochs,
# print the test loss / F-beta and save the learning-curve plot.
run_test_harness()

In [None]:
# NOTE(review): this is the same directory string as folder2x / folder3x, i.e.
# the images the training arrays were built from - confirm that predicting on
# them is intended.
folder4x = "/home/sanjaya_ratnayake_d/trainset/"

In [None]:
def load_dataset3(folder4x):
    """Load every image in a directory, without labels.

    Args:
        folder4x: directory containing the image files.

    Returns:
        uint8 numpy array stacking all images, each loaded with
        target_size=(512, 512).
    """
    # The directory comes from the parameter; the previous body referred to
    # an undefined name `path`. The unused file_mapping lookup is gone too,
    # since it raised KeyError for any image without a label entry.
    photos = [
        img_to_array(load_img(os.path.join(folder4x, filename), target_size=(512,512)))
        for filename in listdir(folder4x)
    ]
    return np.asarray(photos, dtype = 'uint8')

In [None]:
X = load_dataset3(folder4x)

In [None]:
testresults = model.predict(X)

train_yhat = np.asarray([np.ones(trainY.shape[1]) for _ in range(trainY.shape[0])])
test_yhat = np.asarray([ones(testY.shape[1]) for _ in range(testY.shape[0])])

train_score = fbeta_score(trainY, train_yhat, 2, average='samples')
test_score = fbeta_score(testY, test_yhat, 2, average='samples')

print('All Ones: train=%.3f, test=%.3f' % (train_score, test_score))

In [None]:
n = 0

In [None]:
n = n+1
        print("picture number {}".format(n))