In [1]:
import os
import sys
import math
import operator
import numpy as np
import pandas as pd
import pickle as pkl
import keras.backend as K
import tifffile as tif
from keras.layers import Dense
from keras.layers import Conv2D
from multiprocessing import Pool
from keras.utils import Sequence
from keras.layers import Flatten,Layer
from keras.layers import MaxPool2D,Dropout
from keras.models import Sequential 
from keras.layers import Reshape
from collections import OrderedDict
from keras.layers import TimeDistributed
from keras.layers import LSTM
from keras.layers import Permute
from keras.models import load_model, save_model
os.chdir("../../")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
 class Round(Layer):

    def __init__(self, **kwargs):
        super(Round, self).__init__(**kwargs)

    def get_output(self, train=False):
        X = self.get_input(train)
        return K.round(X)

    def get_config(self):
        config = {"name": self.__class__.__name__}
        base_config = super(Round, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [13]:
class ImageDataGenerator(Sequence):
    
    def __init__(self, x_metadata, y_metadata, batch_size, crop_size):
        self.x = x_metadata
        self.y = y_metadata
        self.batch_size = batch_size
        self.cp = crop_size
    
    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))
    
    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        
        arr = []
        for file_name in batch_x:
            arr.append(np.array(tif.imread(file_name), dtype=int)[:,self.cp:-self.cp,self.cp:-self.cp]/255.0)
            
        return np.array(arr), np.array(batch_y)         

class CNN_Model:
    
    def __init__(self, directory):
        
        self.path = directory
        
        df = pd.read_csv("occurrences_train.csv",low_memory=False)
        ll = list(df['class'].unique()) + list(df['order'].unique()) + list(df['family'].unique()) + list(df['genus'].unique()) + list(df['species_glc_id'].unique())
        mapper = {}
        counter = 0
        for i in (ll):
            mapper[str(i)] = counter
            counter += 1

        df['class'] = df['class'].map(mapper)
        df['order'] = df.order.map(mapper)
        df['family'] = df.family.map(mapper)
        df['genus'] = df.genus.map(mapper)
        df['species_glc_id'] = df.species.map(mapper)
        
        with open("Data/hierarchy_data.pkl","rb") as f:
            hd = pkl.load(f)
        with open("Data/class_encoding.pkl","rb") as f:
            self.classes = pkl.load(f)
        with open("Data/order_encoding.pkl","rb") as f:
            self.orders = pkl.load(f)
        with open("Data/family_encoding.pkl","rb") as f:
            self.families = pkl.load(f)
        with open("Data/genus_encoding.pkl","rb") as f:
            self.genuses = pkl.load(f)
        with open("Data/specie_encoding.pkl","rb") as f:
            self.species = pkl.load(f)

        self.train_pathdata_x = []
        self.train_seq_y = []
        self.test_pathdata_x = []
        self.test_seq_y = []
        
        for cls in hd.keys():
            for order in hd[cls].keys():
                for family in hd[cls][order].keys():
                    for genus in hd[cls][order][family].keys():
                        for specie in hd[cls][order][family][genus]:
                            for im in os.listdir(self.path+"train/"+str(self.classes[cls])+"/"+str(self.orders[order])
                                                 +"/"+str(self.families[family])+"/"+str(self.genuses[genus])+"/"+str(specie)):
                                self.train_pathdata_x.append(self.path+"train/"+str(self.classes[cls])+"/"+str(self.orders[order])
                                                             +"/"+str(self.families[family])+"/"+str(self.genuses[genus])+"/"+str(specie)+"/"
                                                             +im)
                            
        for cls in hd.keys():
            for order in hd[cls].keys():
                for family in hd[cls][order].keys():
                    for genus in hd[cls][order][family].keys():
                        for specie in hd[cls][order][family][genus]:
                            for im in os.listdir(self.path+"test/"+str(self.classes[cls])+"/"+str(self.orders[order])
                                                 +"/"+str(self.families[family])+"/"+str(self.genuses[genus])+"/"+str(specie)):
                                self.test_pathdata_x.append(self.path+"test/"+str(self.classes[cls])+"/"+str(self.orders[order])
                                                             +"/"+str(self.families[family])+"/"+str(self.genuses[genus])+"/"+str(specie)+"/"
                                                             +im)
        
        np.random.shuffle(self.train_pathdata_x)
        np.random.shuffle(self.test_pathdata_x)
        
        for p in self.train_pathdata_x:
            y = p.split("/")
            c,o,f,g,s = 0,0,0,0,0
            for k,v in self.classes.items():
                if(v == int(y[3])):
                    c = mapper[k]
            for k,v in self.orders.items():
                if(v == int(y[4])):
                    o = mapper[k]
            for k,v in self.families.items():
                if(v == int(y[5])):
                    f = mapper[k]
            for k,v in self.genuses.items():
                if(v == int(y[6])):
                    g = mapper[k]
            s = mapper[y[7]]
            self.train_seq_y.append([o,f,g,s])
            
        for p in self.test_pathdata_x:
            y = p.split("/")
            c,o,f,g,s = 0,0,0,0,0
            for k,v in self.classes.items():
                if(v == int(y[3])):
                    c = mapper[k]
            for k,v in self.orders.items():
                if(v == int(y[4])):
                    o = mapper[k]
            for k,v in self.families.items():
                if(v == int(y[5])):
                    f = mapper[k]
            for k,v in self.genuses.items():
                if(v == int(y[6])):
                    g = mapper[k]
            s = mapper[y[7]]
            self.test_seq_y.append([o,f,g,s])

            
    def model_create(self, time_steps=5, batch_size=32):
        
        model = Sequential()
        model.add(Conv2D(filters=32,kernel_size=(2,2),activation='relu',input_shape=(33,32,32)))
        model.add(Conv2D(filters=64,kernel_size=(4,4),activation='relu'))
        model.add(Conv2D(filters=128,kernel_size=(6,6),activation='relu'))
        model.add(Conv2D(filters=256,kernel_size=(10,10),activation='relu'))
        model.add(MaxPool2D(pool_size=(2,2)))
        model.add(Reshape((7*7,256)))
        model.add(Dropout(0.2))
        model.add(LSTM(4))
        model.add(Dropout(0.2))
        model.add(Dense(4))
        model.add(Round())
        model.compile(loss='mae',optimizer='Adam',metrics=['mae','accuracy','categorical_accuracy'])
        model.summary()
        return model
    
    def fit_generator(self, num_epochs=10, batch_size=32, crop_size=16, time_steps=5):        
        try:
            classifier = load_model("Code/Models/CNN-RNN_1.h5")
        except:
            print("Training")
            classifier = self.model_create(time_steps=time_steps, batch_size=batch_size)
            train_data = ImageDataGenerator(self.train_pathdata_x[:50000], self.train_seq_y[:50000], batch_size, crop_size)
            history = classifier.fit_generator(train_data, epochs=num_epochs, use_multiprocessing=True,shuffle=True)
            classifier.save("Code/Models/CNN-RNN_1.h5")
            
        
        print("Testing")
        test_data = ImageDataGenerator(self.test_pathdata_x[:20000], self.test_seq_y[:20000], batch_size, crop_size)
        scores = classifier.evaluate_generator(test_data, use_multiprocessing=True)
        print("Loss : ", scores[0])
        print("Metrics : ", scores[1:])
        return classifier

In [14]:
ob = CNN_Model("Data/Hierarchial Data/")

In [15]:
len(ob.train_pathdata_x), len(ob.train_seq_y), len(ob.test_pathdata_x), len(ob.test_seq_y)

(152980, 152980, 65563, 65563)

In [16]:
model = ob.fit_generator(num_epochs=1, batch_size=64, time_steps=5)

Testing
Loss :  658.7006265625
Metrics :  [0.0, 0.0]


In [12]:
test_data = ImageDataGenerator(ob.train_pathdata_x[:100], ob.train_seq_y[:100], batch_size=100, crop_size=16)
ypred = model.predict_generator(test_data)
ytest = ob.train_seq_y[:100]
for i in range(len(ypred)):
    print(ypred[i], ytest[i])

[3.5556173 4.806175  4.822678  4.8115134] [17, 73, 292, 2074]
[3.5556173 4.806175  4.822678  4.8115134] [26, 88, 286, 1307]
[3.5556173 4.806175  4.822678  4.8115134] [11, 67, 550, 1810]
[3.5556173 4.806175  4.822678  4.8115134] [20, 94, 744, 2349]
[3.5556173 4.806175  4.822678  4.8115134] [10, 66, 760, 2393]
[3.5556173 4.806175  4.822678  4.8115134] [10, 66, 287, 1309]
[3.5556173 4.806175  4.822678  4.8115134] [32, 101, 492, 1684]
[3.5556173 4.806175  4.822678  4.8115134] [16, 72, 414, 1526]
[3.5556173 4.806175  4.822678  4.8115134] [18, 102, 401, 1506]
[3.5556173 4.806175  4.822678  4.8115134] [10, 75, 663, 2047]
[3.5556173 4.806175  4.822678  4.8115134] [16, 72, 236, 1718]
[3.5556173 4.806175  4.822678  4.8115134] [11, 67, 631, 2220]
[3.5556173 4.806175  4.822678  4.8115134] [26, 88, 429, 1549]
[3.5556173 4.806175  4.822678  4.8115134] [22, 89, 311, 1911]
[3.5556173 4.806175  4.822678  4.8115134] [32, 101, 852, 2686]
[3.5556173 4.806175  4.822678  4.8115134] [16, 98, 838, 2919]
[3.55