# Imports

In [1]:
from __future__ import print_function, division
import warnings
warnings.filterwarnings("ignore")

import os
import sys
import math
import operator
import numpy as np
import pandas as pd
import pickle as pkl
import tifffile as tif
import matplotlib.pyplot as plt
from multiprocessing import Pool
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from keras import layers
from keras import models
from keras.utils import Sequence
from keras.models import load_model, save_model, model_from_json
from livelossplot import PlotLossesKeras

os.chdir("../../")

Using TensorFlow backend.


# Data Preprocessing

In [2]:
class Data_Preprocess():
    
    def init_load(self, root_dir, csv_file):
        self.df = pd.read_csv(csv_file, low_memory=False)
        self.path = root_dir
        self.all_encoded = {}
        
    def train_test_data_loading(self):
        self.x_train, self.x_test, self.y_train, self.y_test = [], [], [], []
        for i,cls in enumerate(self.df['class'].unique()):
            self.all_encoded[cls] = i
            for j,order in enumerate(self.df[self.df['class']==cls]['order'].unique()):
                self.all_encoded[order] = j
                for k,family in enumerate(self.df[(self.df['class']==cls) & (self.df['order']==order)]['family'].unique()):
                    self.all_encoded[family] = k
                    for l,genus in enumerate(self.df[(self.df['class']==cls) & (self.df['order']==order) & (self.df['family']==family)]['genus'].unique()):
                        self.all_encoded[genus] = l
                        for m,species in enumerate(self.df[(self.df['class']==cls) & (self.df['order']==order) & (self.df['family']==family) & (self.df['genus']==genus)]['species_glc_id'].unique()):
                            path = self.path+"train/"+cls+"/"+order+"/"+family+"/"+genus+"/"+str(species)+"/"
                            self.x_train.extend([path+i for i in os.listdir(path)])
                            path = self.path+"test/"+cls+"/"+order+"/"+family+"/"+genus+"/"+str(species)+"/"
                            self.x_test.extend([path+i for i in os.listdir(path)])
                            self.all_encoded[str(species)] = m
        
        np.random.shuffle(self.x_train)
        np.random.shuffle(self.x_test)
        
        tmp = np.array(list(data.all_encoded.values())).reshape(-1,1)
        ohe = OneHotEncoder(sparse=False)
        tmp = ohe.fit_transform(tmp.reshape(-1,1))
        
        for im in self.x_train:
            l = im.split("/")
            c, o, f, g, s = tmp[self.all_encoded[l[3]]], tmp[self.all_encoded[l[4]]], tmp[self.all_encoded[l[5]]], tmp[self.all_encoded[l[6]]], tmp[self.all_encoded[l[7]]]
            self.y_train.append([c,o,f,g,s])
            
        for im in self.x_test:
            l = im.split("/")
            c, o, f, g, s = tmp[self.all_encoded[l[3]]], tmp[self.all_encoded[l[4]]], tmp[self.all_encoded[l[5]]], tmp[self.all_encoded[l[6]]], tmp[self.all_encoded[l[7]]]
            self.y_test.append([c,o,f,g,s])
        
    def ordered_call(self, root_dir, csv_file):
        print("Creating the data preprocessing object and loading csv")
        self.init_load(root_dir, csv_file)
        print("Done!")
        print("Loading test and train image paths and corresponding labels")
        self.train_test_data_loading()
        print("Done!")

In [3]:
data = Data_Preprocess()

In [4]:
data.ordered_call(root_dir="Data/Hierarchial Data/", csv_file="occurrences_train.csv")

Creating the data preprocessing object and loading csv
Done!
Loading test and train image paths and corresponding labels
Done!


In [5]:
np.array(data.x_train).shape, np.array(data.y_train).shape, np.array(data.x_test).shape, np.array(data.y_test).shape

((152980,), (152980, 5, 123), (65563,), (65563, 5, 123))

In [16]:
for i in range(140000,141000,1):
    for j in range(5):
        print(np.argmax(data.y_train[i][j]))
    print("\n\n")

0
3
0
10
0



0
4
0
0
2



0
1
0
3
0



0
1
0
1
0



0
2
0
2
3



0
0
0
0
0



0
0
0
1
0



0
0
0
0
0



0
10
0
0
0



0
5
0
0
0



0
2
0
6
8



0
0
0
0
2



0
1
0
0
0



0
3
0
8
0



0
2
0
1
2



0
10
0
8
0



0
3
0
0
0



0
0
0
0
0



0
1
0
10
1



0
8
0
0
0



0
4
0
0
9



0
3
0
0
0



0
8
0
0
0



0
0
0
0
0



0
0
0
0
0



0
10
0
2
0



0
6
0
0
17



0
6
0
0
0



0
0
0
23
0



0
2
0
6
0



0
10
0
10
0



0
1
0
5
0



0
5
0
0
0



0
2
0
0
1



0
1
0
0
0



0
3
0
10
0



0
0
0
0
0



0
2
0
10
0



0
10
0
0
0



0
0
0
2
0



0
0
0
0
0



0
1
0
0
0



0
2
0
1
0



0
1
0
1
2



0
0
0
0
0



0
8
0
2
0



0
6
0
0
0



0
2
0
6
7



0
1
0
0
0



0
10
0
1
0



0
0
0
7
0



0
0
0
0
7



0
6
0
0
1



0
10
0
3
0



0
1
0
5
0



0
0
0
1
0



0
0
0
0
13



0
3
0
7
0



0
2
0
10
1



0
3
0
6
0



0
7
0
0
0



0
6
0
0
1



0
6
0
0
0



0
10
0
0
0



0
1
0
0
5



0
9
0
0
0



0
1
1
0
3



0
10
0
3
0



0
4
0
0
0



0
3
0
0
0



0
0
0
9
0



0
0
0
0
2



0
1
0
1
0



0
2
0
1
0



0
1
0
0
0



0
1
0
0

14
0



0
0
0
0
0



0
0
0
0
0



0
0
0
0
0



0
0
0
0
0



0
1
0
4
0



0
1
0
6
0



0
11
0
0
0



0
0
0
0
3



0
0
0
0
5



0
6
0
0
9



0
8
0
6
0



0
0
0
0
0



0
10
0
2
4



0
0
0
7
0



0
10
0
0
0



0
0
1
0
0



0
1
0
7
0



0
1
0
4
0



0
0
0
1
0



0
10
0
0
0



0
10
0
2
0



0
1
0
0
0



0
0
0
0
0



0
0
0
0
0



0
10
0
8
0



0
12
0
0
0



0
0
0
0
0



0
8
0
6
0



0
10
0
3
0



0
0
0
0
0



0
2
0
2
3



0
2
0
5
0



0
9
0
0
0



0
1
0
0
0



0
6
0
6
0



0
1
0
6
0



0
3
0
0
0



0
10
0
0
0



0
1
0
0
0



0
1
0
4
0



0
2
0
2
5



0
1
0
6
0



0
6
0
0
0



0
1
0
1
0



0
0
1
0
0



0
0
0
0
1



0
1
0
3
0



0
0
0
2
0



0
2
0
6
7



0
0
0
0
0



0
8
0
6
0



0
4
0
0
0



0
2
0
2
3



0
10
0
8
0



0
1
0
3
3



0
0
0
5
0



0
0
0
0
0



0
5
0
0
0



0
0
0
2
0



0
0
0
0
0



0
10
0
3
0



0
1
0
0
0



0
1
0
0
0



0
2
0
2
0



0
10
0
4
0



0
8
0
0
0



0
0
0
0
0



0
2
0
11
0



0
0
0
0
0



0
2
0
3
0



0
0
0
4
0



0
0
0
0
0



0
0
0
0
0



0
10
0
0
0



0
10
0
8
0



0


# Model

In [17]:
class CNN_Model:
    
    def __init__(self, data_object):
        self.img_height = 64
        self.img_width = 64
        self.img_channels = 53
        self.cardinality = 32
        self.data_object = data_object
        self.num_classes = 123
    
    def residual_network(self, x):
        """
        ResNeXt by default. For ResNet set `cardinality` = 1 above.

        """
        def add_common_layers(y):
            y = layers.BatchNormalization()(y)
            y = layers.LeakyReLU()(y)

            return y

        def grouped_convolution(y, nb_channels, _strides):
            # when `cardinality` == 1 this is just a standard convolution
            if self.cardinality == 1:
                return layers.Conv2D(nb_channels, kernel_size=(3, 3), strides=_strides, padding='same')(y)

            assert not nb_channels % self.cardinality
            _d = nb_channels // self.cardinality

            # in a grouped convolution layer, input and output channels are divided into `cardinality` groups,
            # and convolutions are separately performed within each group
            groups = []
            for j in range(self.cardinality):
                group = layers.Lambda(lambda z: z[:, :, :, j * _d:j * _d + _d])(y)
                groups.append(layers.Conv2D(_d, kernel_size=(3, 3), strides=_strides, padding='same')(group))

            # the grouped convolutional layer concatenates them as the outputs of the layer
            y = layers.concatenate(groups)

            return y

        def residual_block(y, nb_channels_in, nb_channels_out, _strides=(1, 1), _project_shortcut=False):
            """
            Our network consists of a stack of residual blocks. These blocks have the same topology,
            and are subject to two simple rules:
            - If producing spatial maps of the same size, the blocks share the same hyper-parameters (width and filter sizes).
            - Each time the spatial map is down-sampled by a factor of 2, the width of the blocks is multiplied by a factor of 2.
            """
            shortcut = y

            # we modify the residual building block as a bottleneck design to make the network more economical
            y = layers.Conv2D(nb_channels_in, kernel_size=(1, 1), strides=(1, 1), padding='same')(y)
            y = add_common_layers(y)

            # ResNeXt (identical to ResNet when `cardinality` == 1)
            y = grouped_convolution(y, nb_channels_in, _strides=_strides)
            y = add_common_layers(y)

            y = layers.Conv2D(nb_channels_out, kernel_size=(1, 1), strides=(1, 1), padding='same')(y)
            # batch normalization is employed after aggregating the transformations and before adding to the shortcut
            y = layers.BatchNormalization()(y)

            # identity shortcuts used directly when the input and output are of the same dimensions
            if _project_shortcut or _strides != (1, 1):
                # when the dimensions increase projection shortcut is used to match dimensions (done by 1×1 convolutions)
                # when the shortcuts go across feature maps of two sizes, they are performed with a stride of 2
                shortcut = layers.Conv2D(nb_channels_out, kernel_size=(1, 1), strides=_strides, padding='same')(shortcut)
                shortcut = layers.BatchNormalization()(shortcut)

            y = layers.add([shortcut, y])

            # relu is performed right after each batch normalization,
            # expect for the output of the block where relu is performed after the adding to the shortcut
            y = layers.LeakyReLU()(y)

            return y

        # conv1
        x = layers.Conv2D(64, kernel_size=(7, 7), strides=(2, 2), padding='same')(x)
        x = add_common_layers(x)

        # conv2
        x = layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)
        for i in range(3):
            project_shortcut = True if i == 0 else False
            x = residual_block(x, 128, 256, _project_shortcut=project_shortcut)

        # conv3
        for i in range(4):
            # down-sampling is performed by conv3_1, conv4_1, and conv5_1 with a stride of 2
            strides = (2, 2) if i == 0 else (1, 1)
            x = residual_block(x, 256, 512, _strides=strides)

        # conv4
        for i in range(6):
            strides = (2, 2) if i == 0 else (1, 1)
            x = residual_block(x, 512, 1024, _strides=strides)

        # conv5
        for i in range(3):
            strides = (2, 2) if i == 0 else (1, 1)
            x = residual_block(x, 1024, 2048, _strides=strides)

        x = layers.GlobalAveragePooling2D()(x)
        
        x = layers.RepeatVector(5)(x)
        x = layers.LSTM(256, recurrent_dropout=0.3, return_sequences=True)(x)
        x = layers.LSTM(256, recurrent_dropout=0.3, return_sequences=True)(x)
        x = layers.LSTM(256, recurrent_dropout=0.3, return_sequences=True)(x)
        x = layers.TimeDistributed(layers.Dense(256))(x)
        x = layers.TimeDistributed(layers.Dense(256))(x)
        x = layers.TimeDistributed(layers.Dense(256))(x)
        out = layers.TimeDistributed(layers.Dense(123, activation='softmax'))(x)

        return out

    def model_create(self, time_steps, batch_size):
        image_tensor = layers.Input(shape=(self.img_height, self.img_width, self.img_channels))
        network_output = self.residual_network(image_tensor)  
        model = models.Model(inputs=[image_tensor], outputs=[network_output])
        print(model.summary())
        # Compiling the CNN
        model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy','mae'])
        return model
    
    def fit_generator(self, num_epochs=10, batch_size=32, crop_size=16, time_steps=5):        
        try:
            classifier = load_model("Code/Models/CNN_LSTM_Reduced_ResNext.h5")
        except:
            print("Training")
            classifier = self.model_create(time_steps=time_steps, batch_size=batch_size)
            train_data = ImageDataGenerator(self.data_object.x_train, self.data_object.y_train, batch_size, crop_size)
            #test_data = ImageDataGenerator(self.data_object.x_test, self.data_object.y_test, batch_size, crop_size)
            history = classifier.fit_generator(train_data, epochs=num_epochs, use_multiprocessing=True,shuffle=True)
            return classifier
            #Error saving the file.
            classifier.save("Code/Models/CNN_LSTM_Embedded_ResNext2.h5")
        print("Testing")
        test_data = ImageDataGenerator(self.data_object.x_test, self.data_object.y_test, batch_size, crop_size)
        scores = classifier.evaluate_generator(test_data, use_multiprocessing=True)
        print("Loss : ", scores[0])
        print("Metrics : ", scores[1:])
        return classifier

In [18]:
class ImageDataGenerator(Sequence):
    
    def __init__(self, x_metadata, y_metadata, batch_size, crop_size):
        self.x = x_metadata
        self.y = y_metadata
        self.batch_size = batch_size
        self.cp = crop_size
        self.dic = {0:[0,120,165,210],1:[35,62,85],2:[7,22,50],3:[0,1,2,3,4],4:[20,60,140],5:[60,100],6:[0,1,2,3,4],7:[1,2,4,8],8:[1,2],9:[0,1,2,3,4]}
        self.conv_dic = {0:[133,1176],1:[-10.00984,18.36730],2:[7.846126,20.94560],3:[41.182110,59.95573],4:[302.772980,777.74048],5:[6.182446,36.54550],6:[-28.248663,5.33183],7:[16.744829,41.94211],8:[-14.122952,22.96798],9:[-17.672335,26.44534],10:[-2.738379,26.44534],11:[-17.672335,11.73241],12:[318.297485,2543.30225],13:[43.063732,285.43790],14:[3.022581,135.58406],15:[8.283675,57.78888],16:[121.616867,855.52594],17:[19.868601,421.27750],18:[19.868601,851.60620],19:[60.590000,520.31244],20:[-187.999999,4672.000000]}
    
    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))
    
    def binarization(self,image,un):
        img = np.zeros((64,64,len(un)))
        for i in range(len(un)):
            img[:,:,i] = (image.copy())
            img[:,:,i][img[:,:,i] != un[i]] = 0
        return img   
    
    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        
        x = []
        for i in range(len(batch_x)):
            tempf = tif.imread(batch_x[i])[:21,:,:]
            for k in range(21):
                tempf[k] = self.conv_dic[k][0] + (self.conv_dic[k][1] - self.conv_dic[k][0]) * ((tempf[k]/255.0) - 0.1) / 0.8
            tempf = np.transpose(tempf,(1,2,0))
            
            l = []
            temp = tif.imread(batch_x[i])[21:,:,:]
            for k in range(10):
                un = np.array(self.dic[k])
                un = un[un != 0]
                img=np.transpose(self.binarization(temp[k],un),(2,0,1)).tolist()
                l.extend(img)
            x.append(np.concatenate((tempf,np.transpose(np.array(l),(1,2,0))), axis=2))
        return np.array(x), np.array(batch_y)

In [19]:
model_object = CNN_Model(data)

In [None]:
classifier = model_object.fit_generator(num_epochs=1, batch_size=32, crop_size=32)

Training


In [None]:
print("Testing")
test_data = ImageDataGenerator(model_object.data_object.x_test, model_object.data_object.y_test, 32, 32)
scores = classifier.evaluate_generator(test_data, use_multiprocessing=True)
print("Loss : ", scores[0])
print("Metrics : ", scores[1:])

# MRR calculation

In [24]:
test_data = ImageDataGenerator(model_object.data_object.x_test, model_object.data_object.y_test, 32, 32)
predictions = loaded_model.predict_generator(test_data, use_multiprocessing=True)

In [25]:
res = 0
res0 = 0
for i in range(len(predictions)):
    cnt = 0
    for j in range(5):
        if(np.argmax(predictions[i][j]) == np.argmax(model_object.data_object.y_test[i][j])):
            cnt+=1
        else:
            break
    if(cnt == 5):
        res+=1.
        res0+=1.
    else:
        res+=1/3336.
        res0+=0

In [26]:
print(res/len(predictions), res0/len(predictions))

0.17000925592953436 0.16976038314293124


In [30]:
pd.DataFrame([res/len(predictions),res0/len(predictions)]).to_csv("Test_Result1.csv", sep=',',index=False)

In [21]:
len(predictions)

65563

In [23]:
# serialize model to JSON
model_json = classifier.to_json()
with open("Code/Models/Reduced_Taxonomy_Classifier.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
classifier.save_weights("Code/Models/Reduced_Taxonomy_Classifier.h5")
print("Saved model to disk")

Saved model to disk


In [20]:
json_file = open('Code/Models/Reduced_Taxonomy_Classifier.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("Code/Models/Reduced_Taxonomy_Classifier.h5")
print("Loaded model from disk")

Loaded model from disk
