In [1]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import os
import pandas as pd
import tifffile as tif
import matplotlib.pyplot as plt
import cv2
from torch.utils.data import Dataset, DataLoader
import numpy as np
from PIL import Image

os.chdir("../../")

In [2]:
def load_images_from_folder(folder):
    images = []
    shape=[]
    dic = dict((c, i) for i, c in enumerate(os.listdir(folder)))
    for filename in os.listdir(folder):
        for image in os.listdir(folder+str(filename)):
            #img = Image.open(os.path.join(folder+str(filename)+'/'+str(image))).convert('LA')
            img = cv2.imread(os.path.join(folder+str(filename),image))
            if img is not None:
                images.append(img.transpose((2, 0, 1)) )
                shape.append(dic[str(filename)])
                break
    return images,shape    

In [42]:
class Data_Preprocess():
    
    def init_load(self, root_dir, csv_file):
        self.df = pd.read_csv(csv_file, low_memory=False)
        self.path = root_dir
        
    def create_mappings_for_unique_labels(self):
        # getting all unique names from csv file
        self.classes = list(sorted(self.df['class'].unique()))
        self.orders = list(sorted(self.df['order'].unique()))
        self.family = list(sorted(self.df['family'].unique()))
        self.genus = list(sorted(self.df['genus'].unique()))
        self.species = list(sorted(self.df['species_glc_id'].unique()))
        
        # creting map for one hot encoding / embedding
        self.class_encoding = {}
        self.order_encoding = {}
        self.family_encoding = {}
        self.genus_encoding = {}
        self.species_encoding = {}
        for i, name in enumerate(self.classes):
            self.class_encoding[name] = i
        for i, name in enumerate(self.orders):
            self.order_encoding[name] = i
        for i, name in enumerate(self.family):
            self.family_encoding[name] = i
        for i, name in enumerate(self.genus):
            self.genus_encoding[name] = i
        
    # embedding all the names
    def create_embedding(self):
        self.master_dictionary = {}
        all_names = self.classes + self.orders + self.family + self.genus + self.species
        embed_dim = int(np.ceil(np.sqrt(np.sqrt(len(all_names)))))
        embed = nn.Embedding(len(all_names)+1, embed_dim)
        for i in all_names:
            self.master_dictionary[str(i)] = embed(torch.LongTensor([all_names.index(i)]).detach_())
        return embed_dim
            
    def train_test_data_loading(self):
        self.x_train, self.x_test, self.y_train, self.y_test = [], [], [], []
        for cls in self.df['class'].unique():
            for order in self.df[self.df['class']==cls]['order'].unique():
                for family in self.df[(self.df['class']==cls) & (self.df['order']==order)]['family'].unique():
                    for genus in self.df[(self.df['class']==cls) & (self.df['order']==order) & (self.df['family']==family)]['genus'].unique():
                        for species in self.df[(self.df['class']==cls) & (self.df['order']==order) & (self.df['family']==family) & (self.df['genus']==genus)]['species_glc_id'].unique():
                            path = self.path+"train/"+cls+"/"+order+"/"+family+"/"+genus+"/"+str(species)+"/"
                            self.x_train.extend([path+i for i in os.listdir(path)])
                            path = self.path+"test/"+cls+"/"+order+"/"+family+"/"+genus+"/"+str(species)+"/"
                            self.x_test.extend([path+i for i in os.listdir(path)])
        
        np.random.shuffle(self.x_train)
        np.random.shuffle(self.x_test)
        
        for im in self.x_train:
            l = im.split("/")
            #c, o, f, g, s = self.master_dictionary[l[3]], self.master_dictionary[l[4]], self.master_dictionary[l[5]], self.master_dictionary[l[6]], self.master_dictionary[l[7]]
            c = self.orders.index(l[4])
            self.y_train.append(int(c))
            
        for im in self.x_test:
            l = im.split("/")
            #c, o, f, g, s = self.master_dictionary[l[3]], self.master_dictionary[l[4]], self.master_dictionary[l[5]], self.master_dictionary[l[6]], self.master_dictionary[l[7]]
            c = self.orders.index(l[4])
            self.y_test.append(int(c))
        
    def ordered_call(self, root_dir, csv_file):
        print("Creating the data preprocessing object and loading csv")
        self.init_load(root_dir, csv_file)
        print("Done!")
        print("Creating unique mappings for labels")
        self.create_mappings_for_unique_labels()
        print("Done!")
        print("Creating embeddings for all the names")
        out_dim = self.create_embedding()
        print("Done!")
        print("Loading test and train image paths and corresponding labels")
        self.train_test_data_loading()
        print("Done!")
        return out_dim

In [43]:
class ShapesDataset(Dataset):
    
    def __init__(self, image_paths, labels):
        self.image_paths = image_paths
        self.labels = labels
        
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        image = torch.from_numpy(np.array(tif.imread(self.image_paths[idx])[:,17:-17,17:-17], dtype=np.float32)).detach()
        label = self.labels[idx]
        sample = {'image':image, 'label':label}
        return sample

In [44]:
data = Data_Preprocess()
out_dims = data.ordered_call(root_dir="Data/Hierarchial Data/", csv_file="occurrences_train.csv")

Creating the data preprocessing object and loading csv
Done!
Creating unique mappings for labels
Done!
Creating embeddings for all the names
Done!
Loading test and train image paths and corresponding labels
Done!


In [46]:
# Training settings
batch_size = 100

# Dataset
train_dataset = ShapesDataset(image_paths=data.x_train, labels=data.y_train)

test_dataset = ShapesDataset(image_paths=data.x_test, labels=data.y_test)

# Data Loader (Input Pipeline)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

In [47]:
len(train_dataset)

152980

In [48]:
#batch_size = 100
n_iters = 3000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

In [None]:
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        
        # Convolution 1
        self.cnn1 = nn.Conv2d(in_channels=33, out_channels=48, kernel_size=5)
        self.relu1 = nn.ReLU()
        
        # Max pool 1
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
     
        # Convolution 2
        self.cnn2 = nn.Conv2d(in_channels=48, out_channels=128, kernel_size=3)
        self.relu2 = nn.ReLU()
        
        # Max pool 2
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)
        
        # Convolution 3
        self.cnn3 = nn.Conv2d(in_channels=128, out_channels=192, kernel_size=3)
        self.relu3 = nn.ReLU()
        
        # Convolution 4
        self.cnn4 = nn.Conv2d(in_channels=192, out_channels=192, kernel_size=3)
        self.relu4 = nn.ReLU()
        
        # Convolution 5
        self.cnn5 = nn.Conv2d(in_channels=192, out_channels=128, kernel_size=3)
        self.relu5 = nn.ReLU()
        
        # Max pool 2
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)
        
        # Fully connected 1 (readout)
        self.fc1 = nn.Linear(132*4*4,max(data.y_train)+1) 
        self.fc2 = nn.Linear(3800,max(data.y_train)) 
    
    def forward(self, x):
        # Convolution 1
        out = self.cnn1(x)
        out = self.relu1(out)
        
        # Max pool 1
        out = self.maxpool1(out)
        
        # Convolution 2 
        out = self.cnn2(out)
        out = self.relu2(out)
        
#         # Convolution 3 
#         out = self.cnn3(out)
#         out = self.relu3(out)
        
        # Max pool 2 
        out = self.maxpool2(out)
        
        # Resize
        # Original size: (100, 32, 7, 7)
        # out.size(0): 100
        # New out size: (100, 32*7*7)
        out = out.view(out.size(0), -1)

        # Linear function (readout)
        out = self.fc1(out)
        #out = self.fc2(out)
        
        return out


In [60]:
model = CNNModel()

# #######################
# #  USE GPU FOR MODEL  #
# #######################

# if torch.cuda.is_available():
#     model.cuda()

In [61]:
criterion = nn.CrossEntropyLoss()

In [62]:
learning_rate = 0.01

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
#iter = 0
for epoch in range(5):
    running_loss = 0.0
    for i, sample in enumerate(train_loader):
        images=sample['image'].float()
#         #######################
#         #  USE GPU FOR MODEL  #
#         #######################
#         if torch.cuda.is_available():
#             images = Variable(images.cuda())
#             labels = Variable(sample['label'].cuda())
#         else:
        images = Variable(images)
        labels = Variable(sample['label'])
        
        
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        
        # Forward pass to get output/logits
        outputs = model(images)
        
        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)
        
        # Getting gradients w.r.t. parameters
        loss.backward()
        
        # Updating parameters
        optimizer.step()
        
        if i % 5== 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            #for sample in test_loader:
                #######################
                #  USE GPU FOR MODEL  #
                #######################
                #if torch.cuda.is_available():
                    #images = Variable(sample['image'].cuda())
                #else:
                    #images = Variable(sample['image'])
                
            # Forward pass only to get logits/output
            outputs = model(images)
                
            # Get predictions from the maximum value
            _, predicted = torch.max(outputs.data, 1)
                
            # Total number of labels
            total += sample['label'].size(0)
                
#                 #######################
#                 #  USE GPU FOR MODEL  #
#                 #######################
#                 # Total correct predictions
#             if torch.cuda.is_available():
#                 correct += (predicted.cpu() == labels.cpu()).sum()
#             else:
            correct += (predicted == labels).sum()
            
            accuracy = float(100 * correct / total)
            
            # Print Loss
            print('Batch: {}, Loss: {}, Accuracy: {}'.format(i, loss.data[0], accuracy))



Batch: 0, Loss: 28.538368225097656, Accuracy: 16.0
Batch: 5, Loss: 7187507.0, Accuracy: 4.0
Batch: 10, Loss: 3.978882074356079, Accuracy: 7.0
Batch: 15, Loss: 4.015989303588867, Accuracy: 14.0
Batch: 20, Loss: 4.011097431182861, Accuracy: 16.0
Batch: 25, Loss: 8.707825660705566, Accuracy: 18.0
Batch: 30, Loss: 4.007197380065918, Accuracy: 16.0
Batch: 35, Loss: 3.999967336654663, Accuracy: 15.0
Batch: 40, Loss: 4.001761436462402, Accuracy: 17.0
Batch: 45, Loss: 3916904.25, Accuracy: 13.0
Batch: 50, Loss: 3.9987430572509766, Accuracy: 15.0
Batch: 55, Loss: 3.9959049224853516, Accuracy: 14.0
Batch: 60, Loss: 3.986060857772827, Accuracy: 16.0
Batch: 65, Loss: 3.9871528148651123, Accuracy: 17.0
Batch: 70, Loss: 3.984562635421753, Accuracy: 17.0
Batch: 75, Loss: 93.69624328613281, Accuracy: 11.0
Batch: 80, Loss: 3.9767649173736572, Accuracy: 11.0
Batch: 85, Loss: 3.9799492359161377, Accuracy: 15.0
Batch: 90, Loss: 3.9741482734680176, Accuracy: 17.0
Batch: 95, Loss: 3.973705768585205, Accurac