# Imports

In [5]:
from __future__ import print_function, division
import warnings
warnings.filterwarnings("ignore")

import os
import sys
import math
import operator
import numpy as np
import pandas as pd
import pickle as pkl
import tifffile as tif
import matplotlib.pyplot as plt
from multiprocessing import Pool
from sklearn.utils import shuffle
from keras import layers
from keras import models
from keras.utils import Sequence
from keras.models import load_model, save_model
#from livelossplot import PlotLossesKeras

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F
# from torchvision import transforms, utils
# from torch.utils.data import Dataset, DataLoader
# Every path used below ("Data/...", "Code/Models/...") is relative to the
# directory two levels above the notebook, so move there once.
# The guard keeps the cell idempotent: without it, re-running this cell in a
# live kernel would climb two more directories each time.
if "_PROJECT_ROOT" not in globals():
    os.chdir("../../")
    _PROJECT_ROOT = os.getcwd()

# Data Preprocessing

### Class Declaration

In [8]:
class ImageDataGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of three single-channel rasters.

    Each item of ``x_metadata`` is a path to a TIFF file whose first three
    planes are read, rescaled with the ranges in ``conv_dic`` and returned as
    three separate model inputs.

    Parameters
    ----------
    x_metadata : sequence of str
        Paths of the TIFF files.
    y_metadata : sequence
        Labels aligned with ``x_metadata``; returned unchanged per batch.
    batch_size : int
        Number of samples per batch.
    crop_size : int
        Stored as ``self.cp``; no cropping is applied by ``__getitem__``.
    """

    def __init__(self, x_metadata, y_metadata, batch_size, crop_size):
        self.x = x_metadata
        self.y = y_metadata
        self.batch_size = batch_size
        self.cp = crop_size
        # Per-plane [low, high] target range used by the rescaling in __getitem__.
        # NOTE(review): presumably the physical min/max of each raster layer - confirm.
        self.conv_dic = {0:[133,1176],1:[-10.00984,18.36730],2:[7.846126,20.94560],3:[41.182110,59.95573],4:[302.772980,777.74048],5:[6.182446,36.54550],6:[-28.248663,5.33183],7:[16.744829,41.94211],8:[-14.122952,22.96798],9:[-17.672335,26.44534],10:[-2.738379,26.44534],11:[-17.672335,11.73241],12:[318.297485,2543.30225],13:[43.063732,285.43790],14:[3.022581,135.58406],15:[8.283675,57.78888],16:[121.616867,855.52594],17:[19.868601,421.27750],18:[19.868601,851.60620],19:[60.590000,520.31244],20:[-187.999999,4672.000000]}

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``([x1, x2, x3], y)``.

        ``x1``/``x2``/``x3`` hold planes 0, 1 and 2 of every image in the
        batch, each sample laid out channels-last.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_x = self.x[start:stop]
        batch_y = self.y[start:stop]

        planes = ([], [], [])
        for path in batch_x:
            raster = tif.imread(path)[:3, :, :]
            # Rescaled values are floats well outside the uint8 range; writing
            # them back into an integer array would truncate/overflow them.
            if not np.issubdtype(raster.dtype, np.floating):
                raster = raster.astype(np.float32)
            for k in range(3):
                low, high = self.conv_dic[k]
                raster[k] = low + (high - low) * ((raster[k] / 255.0) - 0.1) / 0.8
                plane = raster[k]
                # A 2-D plane has no channel axis, so transposing it with three
                # axes raises ValueError; add the axis first so the result is
                # (H, W, 1).
                if plane.ndim == 2:
                    plane = plane[np.newaxis]
                planes[k].append(np.transpose(plane, (1, 2, 0)))
        return [np.array(p) for p in planes], np.array(batch_y)

In [4]:
class Data_Preprocess():
    """Load the occurrences table, encode taxonomy labels and list image files.

    Typical use is ``ordered_call(root_dir, csv_file)``, which fills
    ``all_encoded`` / ``all_rev_encoded`` and ``x_train``, ``y_train``,
    ``x_test``, ``y_test``.
    """

    # Taxonomy columns ordered from the coarsest to the finest level.
    LEVEL_COLUMNS = ['class', 'order', 'family', 'genus', 'species_glc_id']

    def init_load(self, root_dir, csv_file):
        """Read ``csv_file`` into ``self.df`` and remember the image root directory."""
        self.df = pd.read_csv(csv_file, low_memory=False)
        self.path = root_dir

    def create_mappings_for_unique_labels(self):
        """Build ``all_encoded`` (name -> id) and ``all_rev_encoded`` (id -> name).

        Ids are positions in the concatenation of the sorted unique values of
        every taxonomy column. A name that appears at more than one level keeps
        only the id of its last occurrence in ``all_encoded``.
        """
        # getting all unique names from csv file
        self.classes = list(sorted(self.df['class'].unique()))
        self.orders = list(sorted(self.df['order'].unique()))
        self.family = list(sorted(self.df['family'].unique()))
        self.genus = list(sorted(self.df['genus'].unique()))
        self.species = list(sorted(self.df['species_glc_id'].unique()))
        self.all_names = self.classes + self.orders + self.family + self.genus + self.species
        # creating map for one hot encoding / embedding
        self.all_encoded = {}
        self.all_rev_encoded = {}

        for i, name in enumerate(self.all_names):
            self.all_encoded[str(name)] = i
            self.all_rev_encoded[int(i)] = str(name)

    @staticmethod
    def _load_pickle(path):
        """Return the object pickled at ``path``, or ``None`` if it cannot be read."""
        # NOTE(review): pickle executes arbitrary code on load; only use cache
        # files produced by this notebook.
        try:
            with open(path, "rb") as handle:
                return pkl.load(handle)
        except (OSError, EOFError, pkl.UnpicklingError):
            return None

    @staticmethod
    def _save_pickle(obj, path):
        """Pickle ``obj`` to ``path``, closing the file afterwards."""
        with open(path, "wb") as handle:
            pkl.dump(obj, handle)

    @staticmethod
    def _fit_embedding(x, y, out_units, activation, epochs, batch_size):
        """Train a small Embedding -> Dense regressor and return the embedding matrix."""
        model = models.Sequential()
        model.add(layers.Embedding(input_dim=int(np.max(x)) + 1, output_dim=10, input_length=1, name="Embed"))
        model.add(layers.Flatten())
        model.add(layers.Dense(out_units, activation=activation))
        model.compile(optimizer='nadam', loss='logcosh', metrics=['mae', 'accuracy'])
        model.summary()
        model.fit(x, y, epochs=epochs, batch_size=batch_size)
        return np.array(model.get_layer("Embed").get_weights()[0])

    # embedding all the names
    def create_embedding(self):
        """Learn (or load cached) embeddings for every taxonomy level.

        ``embed_vectors1`` maps each of the first four level names to an
        embedding matrix trained to predict the next finer level's id;
        ``embed_vectors2`` holds the species embedding trained to predict the
        genus embedding. Results are cached in ``Data/Embed1.pkl`` and
        ``Data/Embed2.pkl``. The rows of ``self.df`` are shuffled as a side effect.
        """
        print("Done")
        columns = self.LEVEL_COLUMNS
        self.df = pd.DataFrame(shuffle(self.df.values), columns=self.df.columns)

        self.embed_vectors1 = self._load_pickle("Data/Embed1.pkl")
        if self.embed_vectors1 is None:
            self.embed_vectors1 = {}
            for col_idx in range(len(columns) - 1):
                print("Collecting " + columns[col_idx] + "," + columns[col_idx + 1])
                x = np.array([self.all_encoded[str(i)] for i in self.df[columns[col_idx]]])
                y = np.array([self.all_encoded[str(i)] for i in self.df[columns[col_idx + 1]]])
                print(x.shape, y.shape)
                print(np.max(x))
                self.embed_vectors1[columns[col_idx]] = self._fit_embedding(
                    x, y, out_units=1, activation='relu', epochs=30, batch_size=100)
            self._save_pickle(self.embed_vectors1, "Data/Embed1.pkl")

        self.embed_vectors2 = self._load_pickle("Data/Embed2.pkl")
        if self.embed_vectors2 is None:
            self.embed_vectors2 = {}
            genus_vectors = self.embed_vectors1[columns[-2]]
            x = np.array([self.all_encoded[str(i)] for i in self.df[columns[-1]]])
            y = np.array([genus_vectors[self.all_encoded[str(i)]] for i in self.df[columns[-2]]])
            print(x.shape, y.shape)
            self.embed_vectors2[columns[-1]] = self._fit_embedding(
                x, y, out_units=10, activation=None, epochs=50, batch_size=200)
            self._save_pickle(self.embed_vectors2, "Data/Embed2.pkl")

    @staticmethod
    def _shuffled_together(paths, labels):
        """Return ``paths`` and ``labels`` reordered by one shared random permutation."""
        order = np.random.permutation(len(paths))
        return [paths[i] for i in order], [labels[i] for i in order]

    def train_test_data_loading(self):
        """Collect image paths and labels for the train and test splits.

        For every distinct (class, order, family, genus, species) row of
        ``self.df`` the folders ``<root>/train/<class>/<order>/<family>/<genus>/<species>/``
        and the matching ``test`` folder are listed. Each file contributes its
        path and a label ``[[class_id], [order_id], [family_id], [genus_id], [species_id]]``.
        A missing folder raises the ``os.listdir`` error.
        """
        root = self.path if self.path.endswith("/") else self.path + "/"
        # One row per distinct taxonomy path instead of five nested filters.
        taxa = self.df[self.LEVEL_COLUMNS].drop_duplicates()

        train_paths, train_labels, test_paths, test_labels = [], [], [], []
        for row in taxa.itertuples(index=False, name=None):
            names = [str(value) for value in row]
            # Labels come from the taxonomy row itself, so they do not depend
            # on how many components root_dir has.
            codes = [self.all_encoded[name] for name in names]
            relative = "/".join(names) + "/"
            for split, paths, labels in (("train", train_paths, train_labels),
                                         ("test", test_paths, test_labels)):
                folder = root + split + "/" + relative
                files = os.listdir(folder)
                paths.extend(folder + fname for fname in files)
                labels.extend([[code] for code in codes] for _ in files)

        self.x_train, self.y_train = self._shuffled_together(train_paths, train_labels)
        self.x_test, self.y_test = self._shuffled_together(test_paths, test_labels)

    def ordered_call(self, root_dir, csv_file):
        """Run loading, label mapping and path collection in order, printing progress."""
        print("Creating the data preprocessing object and loading csv")
        self.init_load(root_dir, csv_file)
        print("Done!")
        print("Creating unique mappings for labels")
        self.create_mappings_for_unique_labels()
        # The embedding step (create_embedding) is intentionally not run here.
        print("Done!")
        print("Loading test and train image paths and corresponding labels")
        self.train_test_data_loading()
        print("Done!")

### Implement data preprocessing

In [5]:
# Create the preprocessing helper. The class defines no __init__, so nothing
# is loaded until ordered_call() runs in the next cell.
data = Data_Preprocess()

In [5]:
# Read the occurrences CSV, build the label <-> id maps and collect the
# train/test image paths with their labels. Both paths are relative to the
# working directory set by os.chdir() in the imports cell.
data.ordered_call(root_dir="Data/Hierarchial Data/", csv_file="occurrences_train.csv")

Creating the data preprocessing object and loading csv
Done!
Creating unique mappings for labels
Done!
Loading test and train image paths and corresponding labels
Done!


In [6]:
# Turn the path and label lists produced by the preprocessing step into NumPy
# arrays so the batch generator can slice them directly.
data.x_train = np.array(data.x_train)
data.y_train = np.array(data.y_train)
data.x_test = np.array(data.x_test)
data.y_test = np.array(data.y_test)

# Model

### Class Declaration

In [9]:
class CNN_Model:
    """Three-branch CNN followed by an LSTM decoder over the taxonomy levels.

    ``data_object`` must expose ``all_encoded`` (label name -> id) and the
    ``x_train`` / ``y_train`` / ``x_test`` / ``y_test`` collections used by
    ``fit_generator``.
    """

    # Where the trained model is cached between runs.
    MODEL_PATH = "Code/Models/RCNN_ResNext1.h5"

    def __init__(self, data_object):
        self.img_height = 64
        self.img_width = 64
        self.img_channels = 1
        self.cardinality = 32
        self.data_object = data_object
        # One output class per encoded label, across all taxonomy levels.
        self.num_classes = len(data_object.all_encoded.keys())

    @staticmethod
    def _conv_branch(image_tensor):
        """Apply three Conv -> MaxPool -> BatchNorm stages and flatten the result."""
        y = image_tensor
        for filters, kernel_size, pool_size in ((96, 3, 2), (256, 5, 3), (256, 7, 3)):
            y = layers.Conv2D(filters=filters, kernel_size=kernel_size, activation='relu')(y)
            y = layers.MaxPool2D(pool_size)(y)
            y = layers.BatchNormalization()(y)
        return layers.Flatten()(y)

    def three_cnn(self, x1, x2, x3, time_steps=5):
        """Build the network graph on three image tensors.

        Each input goes through its own convolutional branch (weights are not
        shared). The concatenated features are repeated ``time_steps`` times
        and decoded by two LSTMs into one softmax over ``num_classes`` per step.

        Returns the output tensor of shape ``(batch, time_steps, num_classes)``.
        """
        # Branches are built in input order, so layer creation order matches
        # the hand-unrolled version this replaces.
        merged = layers.Concatenate()([self._conv_branch(x) for x in (x1, x2, x3)])

        x = layers.RepeatVector(time_steps)(merged)
        x = layers.LSTM(128, return_sequences=True)(x)
        x = layers.LSTM(64, return_sequences=True)(x)
        x = layers.TimeDistributed(layers.Dense(128, activation='relu'))(x)
        x = layers.TimeDistributed(layers.Dense(self.num_classes, activation='softmax'))(x)

        return x

    def model_create(self, time_steps, batch_size):
        """Create and compile the three-input model.

        ``time_steps`` is the number of output steps (one per label level).
        ``batch_size`` is accepted for interface compatibility and not used.
        """
        input_shape = (self.img_height, self.img_width, self.img_channels)
        image_tensor1 = layers.Input(shape=input_shape)
        image_tensor2 = layers.Input(shape=input_shape)
        image_tensor3 = layers.Input(shape=input_shape)
        network_output = self.three_cnn(image_tensor1, image_tensor2, image_tensor3, time_steps=time_steps)
        model = models.Model(inputs=[image_tensor1, image_tensor2, image_tensor3], outputs=[network_output])
        # summary() prints by itself; wrapping it in print() only added "None".
        model.summary()
        # Compiling the CNN
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy', 'mae'])
        return model

    def fit_generator(self, num_epochs=10, batch_size=32, crop_size=16, time_steps=5):
        """Load the cached model or train a new one, then evaluate on the test split.

        If ``MODEL_PATH`` exists and loads, training is skipped. Otherwise a
        model is trained for ``num_epochs`` and saved on a best-effort basis
        (a failed save is reported, not raised). The test loss and metrics are
        printed in both cases.

        Returns the Keras model.
        """
        classifier = None
        if os.path.isfile(self.MODEL_PATH):
            try:
                classifier = load_model(self.MODEL_PATH)
            except Exception as err:
                print("Could not load saved model, training instead: ", err)

        if classifier is None:
            print("Training")
            classifier = self.model_create(time_steps=time_steps, batch_size=batch_size)
            train_data = ImageDataGenerator(self.data_object.x_train, self.data_object.y_train, batch_size, crop_size)
            classifier.fit_generator(train_data, epochs=num_epochs, use_multiprocessing=True, shuffle=True)
            # Saving used to be skipped by an early return because it errored;
            # keep it non-fatal so the trained model is still returned.
            try:
                classifier.save(self.MODEL_PATH)
            except Exception as err:
                print("Could not save model: ", err)

        print("Testing")
        test_data = ImageDataGenerator(self.data_object.x_test, self.data_object.y_test, batch_size, crop_size)
        scores = classifier.evaluate_generator(test_data, use_multiprocessing=True)
        print("Loss : ", scores[0])
        print("Metrics : ", scores[1:])
        return classifier

### Model Run

In [7]:
# Wrap the prepared dataset in the model helper. CNN_Model reads
# data.all_encoded, so data.ordered_call(...) must have run first.
# NOTE(review): the saved AttributeError output suggests this cell was once
# run against a `data` object that lacked it - re-run top to bottom.
model_object = CNN_Model(data)

AttributeError: 'Data_Preprocess' object has no attribute 'all_encoded'

In [9]:
# Load the saved model if one can be loaded, otherwise train for a single
# epoch; the resulting Keras model is kept for the evaluation cells below.
classifier = model_object.fit_generator(num_epochs=1, batch_size=32, crop_size=32)

Training
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 64, 64, 21)   0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 62, 62, 96)   18240       input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 31, 31, 96)   0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 31, 31, 96)   384         max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
c

In [11]:
# Score the model on the test split, batching the test images 32 at a time.
test_data = ImageDataGenerator(
    x_metadata=data.x_test,
    y_metadata=data.y_test,
    batch_size=32,
    crop_size=32,
)
scores = classifier.evaluate_generator(test_data, use_multiprocessing=True)

In [13]:
 # Persist the model to the same path that CNN_Model.fit_generator tries to
 # load from on the next run.
 classifier.save("Code/Models/RCNN_ResNext1.h5")

# Try new metric