In [1]:
from __future__ import print_function, division
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import tifffile as tif
import matplotlib.pyplot as plt
from skimage import io, transform
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
from keras.models import Sequential
from keras.layers import Dense, Layer, Conv2D, MaxPool2D, Flatten, Embedding, LSTM

os.chdir("../../")

Using TensorFlow backend.


In [None]:
class Image_Generator(Dataset):
    
    def __init__(self, image_paths, labels):
        self.image_paths = image_paths
        self.labels = labels
        
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        image = torch.from_numpy(np.array(tif.imread(self.image_paths[idx])[:,16:-16,16:-16], dtype=np.float32)).detach()
        label = self.labels[idx].detach()
        sample = {'image':image, 'label':label}
        return sample

In [32]:
class Data_Preprocess():
    
    def init_load(self, root_dir, csv_file):
        self.df = pd.read_csv(csv_file, low_memory=False)
        self.path = root_dir
    
    def create_mappings_for_unique_labels(self):
        # getting all unique names from csv file
        self.classes = list(sorted(self.df['class'].unique()))
        self.orders = list(sorted(self.df['order'].unique()))
        self.family = list(sorted(self.df['family'].unique()))
        self.genus = list(sorted(self.df['genus'].unique()))
        self.species = list(sorted(self.df['species_glc_id'].unique()))
        
        # creting map for one hot encoding / embedding
        self.class_encoding = {}
        self.order_encoding = {}
        self.family_encoding = {}
        self.genus_encoding = {}
        
        self.class_rev_encoding = {}
        self.order_rev_encoding = {}
        self.family_rev_encoding = {}
        self.genus_rev_encoding = {}
        
        for i, name in enumerate(self.classes):
            self.class_encoding[name] = i
            self.class_rev_encoding[i] = name
        for i, name in enumerate(self.orders):
            self.order_encoding[name] = i
            self.order_rev_encoding[i] = name
        for i, name in enumerate(self.family):
            self.family_encoding[name] = i
            self.family_rev_encoding[i] = name
        for i, name in enumerate(self.genus):
            self.genus_encoding[name] = i
            self.genus_rev_encoding[i] = name
        
    # embedding all the names
    def create_embedding(self):
        
            
    def train_test_data_loading(self):
        self.x_train, self.x_test, self.y_train, self.y_test = [], [], [], []
        for cls in self.df['class'].unique():
            for order in self.df[self.df['class']==cls]['order'].unique():
                for family in self.df[(self.df['class']==cls) & (self.df['order']==order)]['family'].unique():
                    for genus in self.df[(self.df['class']==cls) & (self.df['order']==order) & (self.df['family']==family)]['genus'].unique():
                        for species in self.df[(self.df['class']==cls) & (self.df['order']==order) & (self.df['family']==family) & (self.df['genus']==genus)]['species_glc_id'].unique():
                            path = self.path+"train/"+cls+"/"+order+"/"+family+"/"+genus+"/"+str(species)+"/"
                            self.x_train.extend([path+i for i in os.listdir(path)])
                            path = self.path+"test/"+cls+"/"+order+"/"+family+"/"+genus+"/"+str(species)+"/"
                            self.x_test.extend([path+i for i in os.listdir(path)])
        
        np.random.shuffle(self.x_train)
        np.random.shuffle(self.x_test)
        
        for im in self.x_train:
            l = im.split("/")
            c, o, f, g, s = self.master_dictionary[l[3]], self.master_dictionary[l[4]], self.master_dictionary[l[5]], self.master_dictionary[l[6]], self.master_dictionary[l[7]]
            self.y_train.append(s)
            
        for im in self.x_test:
            l = im.split("/")
            c, o, f, g, s = self.master_dictionary[l[3]], self.master_dictionary[l[4]], self.master_dictionary[l[5]], self.master_dictionary[l[6]], self.master_dictionary[l[7]]
            self.y_test.append(s)
        
    def ordered_call(self, root_dir, csv_file):
        print("Creating the data preprocessing object and loading csv")
        self.init_load(root_dir, csv_file)
        print("Done!")
        print("Creating unique mappings for labels")
        self.create_mappings_for_unique_labels()
        print("Done!")
        print("Creating embeddings for all the names")
        out_dim = self.create_embedding()
        print("Done!")
        print("Loading test and train image paths and corresponding labels")
        self.train_test_data_loading()
        print("Done!")
        return out_dim

In [33]:
class Network(nn.Module):
    
    def __init__(self, out_dim):
        super(Network, self).__init__()
        self.conv1 = nn.Conv2d(33, 66, 2)
        w = (32-2)/2
        self.conv2 = nn.Conv2d(66, 96, 2)
        w = (w-2)/2
        self.conv3 = nn.Conv2d(96, 256, 5)
        w = (w-5)/2
        self.conv4 = nn.Conv2d(256, 512, 5)
        w = (w-5)/2
        self.fc1 = nn.Linear(247808, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, out_dim)
        
    def num_features(self, x):
        s = 1
        for i in x.size()[1:]:
            s*=i
        return s
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = x.view(-1, 1, self.num_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [34]:
data = Data_Preprocess()
out_dims = data.ordered_call(root_dir="Data/Hierarchial Data/", csv_file="occurrences_train.csv")

Creating the data preprocessing object and loading csv
Done!
Creating unique mappings for labels
Done!
Creating embeddings for all the names
(178767, 1) (178767, 1, 1)
Epoch 1/20


ValueError: could not convert string to float: 'Magnoliopsida'

In [None]:
len(data.x_train)

In [None]:
train_dataset = Image_Generator(image_paths=data.x_train, labels=data.y_train)
test_dataset = Image_Generator(image_paths=data.x_test, labels=data.y_test)
train_loader = DataLoader(train_dataset, batch_size=30, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=30, shuffle=True)

In [None]:
net = Network(out_dims)
criterion = nn.MSELoss()
optimizer = optim.Adadelta(net.parameters(), lr=0.001)

In [None]:
for epoch in range(2):  # loop over the dataset multiple times
    
    running_loss = 0.0
    for i, sample in enumerate(train_loader, 0):
        # get the inputs
        inputs, labels = sample['image'], sample['label']
        
        print(inputs.size(), labels.size())
        
        images = Variable(images)
        labels = Variable(labels)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        print(outputs.size())
        # print statistics
        running_loss += loss.item()
        if i % 500 == 499:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 500))
            running_loss = 0.0

print('Finished Training')