# Plant Seedlings Classification / First experiment

#Setting up the environment

Mounting drive

In [90]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Some linux commands to arrange files and directories if it is required

In [95]:
! ls "/content/gdrive/My Drive/"

0021e90e4.png  33748968f.png  65e97117e.png  98d819587.png  cb496f36e.png
003d61042.png  338c7e907.png  664194d19.png  98da6ef4e.png  cb76a7766.png
007b3da8b.png  34dd57ca9.png  6680836dd.png  99036c51d.png  cbba27d89.png
0086a6340.png  3526b05cc.png  668c1007c.png  99569b224.png  cbe761896.png
00c47e980.png  35a90f8d0.png  66ab0e8d0.png  995c7ab1e.png  cc3d2a59a.png
00d090cde.png  35cf9fa01.png  675ec1b0b.png  9a3f20121.png  cc74feadc.png
00ef713a8.png  35ebe165c.png  67ce3eaa6.png  9aa5587fe.png  cd5f0db1c.png
01291174f.png  36839d5e9.png  67e185673.png  9b4800b42.png  cd6adba97.png
026716f9b.png  36d62bf36.png  686dc7ec8.png  9b9911f20.png  ce15eee52.png
02cfeb38d.png  36ed4f215.png  6908fb540.png  9baf94467.png  ce3d280eb.png
03566743d.png  37297a64c.png  6982a9d30.png  9c0c5b731.png  ce42adffb.png
03a2ee656.png  37714071b.png  699d3c707.png  9c32a797e.png  cec5bf198.png
03e322a29.png  377283a21.png  69d1669f8.png  9c777333d.png  cf3a8b2fd.png
03ef36742.png  37c3108d6.png  6a41bf95

Check if GPU is running

In [3]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


Import libraries ( more details about in the report and the notebook of second experiment)

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler

import torchvision
from torchvision import transforms

import os
import time
import random

torch.manual_seed(23)
random.seed(23)

In [0]:
import os, shutil
from random import shuffle

Set some constant to access a file and directories

In [0]:
# path to original dataset
original_dataset_dir = '/content/gdrive/My Drive/train'
# directory to store the smaller dataset
base_dir = '/content/gdrive/My Drive/' 

#os.mkdir(base_dir)

In [0]:
# make training, validation and test set directories
train_dir = os.path.join(base_dir, 'train')
#os.mkdir(train_dir)
validation_dir = os.path.join(base_dir, 'validation')
#os.mkdir(validation_dir)

#Exploratory data analysis

In [0]:
classes = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',
           'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']

In [0]:
for directory in [train_dir, validation_dir]:
    for clas in classes:
        clas_dir = os.path.join(directory, clas)
        #os.mkdir(clas_dir)

In [10]:
for clas in classes:
    print(clas, ": ", len(os.listdir(os.path.join(original_dataset_dir, clas))))

Black-grass :  263
Charlock :  390
Cleavers :  287
Common Chickweed :  611
Common wheat :  221
Fat Hen :  475
Loose Silky-bent :  654
Maize :  221
Scentless Mayweed :  516
Shepherds Purse :  231
Small-flowered Cranesbill :  496
Sugar beet :  385


#Dataset preparation

Applyng several data augmentation techniques, image resizing, center crop and random horizontal flip

In [0]:
def get_transforms(target_size=100, normalize=False):
    t = transforms.Compose([
        transforms.Resize(target_size),
        transforms.CenterCrop(target_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
        ])
    return t

In [0]:
path = '/content/gdrive/My Drive/'

Apply data augmentation in the training dataset

In [14]:
full_dataset = torchvision.datasets.ImageFolder('/content/gdrive/My Drive/train', transform=get_transforms())
print('This dataset has:')
print('  {} elements'.format(len(full_dataset)))
print('  {} classes'.format(len(full_dataset.classes)))
print(full_dataset)

This dataset has:
  4750 elements
  12 classes
Dataset ImageFolder
    Number of datapoints: 4750
    Root location: /content/gdrive/My Drive/train
    StandardTransform
Transform: Compose(
               Resize(size=100, interpolation=PIL.Image.BILINEAR)
               CenterCrop(size=(100, 100))
               RandomHorizontalFlip(p=0.5)
               ToTensor()
           )


Splitting the data set

Training set = 70%
Validation set=30%

In [15]:
train_len = int(0.7 * len(full_dataset))
validate_len = len(full_dataset) - train_len
train_dataset, validate_dataset = torch.utils.data.random_split(full_dataset, (train_len, validate_len))
print('Training dataset contains {} elements'.format(len(train_dataset)))
print('Validation dataset contains {} elements'.format(len(validate_dataset)))

Training dataset contains 3325 elements
Validation dataset contains 1425 elements


Set train and validation set as dataloader 

In [0]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
validate_loader = torch.utils.data.DataLoader(validate_dataset)

#Building model

Using a version of LeNet-5, modified to take 3-channel color images (instead of 1-channel images as it was originally defined).

The __init__() method defines two convolutional layers and three linear layers

The forward() method composes these layers and some important functions into a computation graph that takes in a 3x32x32 tensor representing a 3-color image

In [0]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 12)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

skeleton of the image classifier (model architecture).Start with 3 color inputs and finish with 12 inputs

In [0]:
class SeedlingModelV1(nn.Module):
    def __init__(self):
        super(SeedlingModelV1, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 22 * 22, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 12)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

Instantiates the model.Extracts an instance from the dataset
feeds the instance to the model for processing

In [19]:
model = SeedlingModelV1()
image, label = train_dataset[0]
output = model(torch.unsqueeze(image, 0))
print(output)

tensor([[ 0.0380, -0.0902,  0.0075,  0.1225, -0.0666,  0.0214,  0.0193, -0.1222,
         -0.0223,  0.0353, -0.0416, -0.0931]], grad_fn=<AddmmBackward>)


Training the Model

Setting some constants for tunning them 

In [0]:
N_EPOCHS = 20 # number of passes over the training dataset
LR = 0.01 # learning rate
MOMENTUM = 0.5 # for SGD

BATCH_SIZE = 4 # number of instances per batch served by dataloader
NUM_WORKERS = 2 # number of I/O threads used by dataloader

MODEL_DIR = 'models' # save models here
MODEL_SAVEFILE = 'seedling'

Create folder for store models

In [21]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


Registering information about the training and save the model

In [0]:
def tlog(msg):
    print('{}   {}'.format(time.asctime(), msg))

    
def save_model(model, epoch):
    tlog('Saving model')
    savefile = "{}-e{}-{}.pt".format(MODEL_SAVEFILE, epoch, int(time.time()))
    path = os.path.join(MODEL_DIR, savefile)
    # recommended way from https://pytorch.org/docs/stable/notes/serialization.html
    torch.save(model.state_dict(), path)
    return savefile

Checking if the GPU is running

In [23]:
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print('*** GPU not available - running on CPU. ***')
else:
    device = torch.device('cuda')
    print('GPU ready to go!')

GPU ready to go!


Recreate the key components

In [0]:
full_dataset = torchvision.datasets.ImageFolder('/content/gdrive/My Drive/train', transform=get_transforms())
train_len = int(0.8 * len(full_dataset))
validate_len = len(full_dataset) - train_len
train_dataset, validate_dataset = torch.utils.data.random_split(full_dataset, (train_len, validate_len))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True)
validate_loader = torch.utils.data.DataLoader(validate_dataset, batch_size=1)

model = SeedlingModelV1()

 Training loop function (details comment inside) 

In [0]:
def train(model, epochs=N_EPOCHS):
    tlog('Training the model...')
    tlog('working on {}'.format(device))
    
    best_accuracy = 0. # determines whether we save a copy of the model
    saved_model_filename = None
    
    model = model.to(device) # move to GPU if available
    loss_fn = nn.CrossEntropyLoss() # combines nn.LogSoftmax() and nn.NLLLoss() for classification tasks
    optimizer = optim.SGD(model.parameters(), lr=LR, momentum=MOMENTUM)
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
    
    for epoch in range(epochs):
        tlog('BEGIN EPOCH {} of {}'.format(epoch + 1, epochs))
        running_loss = 0. # bookkeeping
        
        tlog('Train:')
        for i, data in enumerate(train_loader):
            instances, labels = data[0], data[1]
            instances, labels = instances.to(device), labels.to(device) # move to GPU if available
            
            optimizer.zero_grad()
            guesses = model(instances)
            loss = loss_fn(guesses, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            if (i + 1) % 200 == 0: # log every 200 batches
                tlog('  batch {}   avg loss: {}'.format(i + 1, running_loss / (200)))
                running_loss = 0.
        
        tlog('Validate:')
        with torch.no_grad(): # no need to do expensive gradient computation for validation
            total_loss = 0.
            correct = 0
            
            for i, data in enumerate(validate_loader):
                instance, label = data[0], data[1]
                instance, label = instance.to(device), label.to(device) # move to GPU if available
                
                guess = model(instance)
                loss = loss_fn(guess, label)
                total_loss += loss.item()
                
                prediction = torch.argmax(guess, 1)
                if prediction.item() == label.item(): # assuming batch size of 1
                    correct += 1

            avg_loss = total_loss / len(validate_loader)
            accuracy = correct / len(validate_loader)
            tlog('  Avg loss for epoch: {}   accuracy: {}'.format(avg_loss, accuracy))
            
            if accuracy >= best_accuracy:
                tlog( '  New accuracy peak, saving model')
                best_accuracy = accuracy
                saved_model_filename = save_model(model, epoch + 1)
                
    return (saved_model_filename, best_accuracy)
                


Print avg loss and accuracy for each epoch

In [26]:
best_model_filename, accuracy  = train(model)
print('The best model is saved at {} with accuracy {}'.format(best_model_filename, accuracy))

Sun Oct 27 00:32:19 2019   Training the model...
Sun Oct 27 00:32:19 2019   working on cuda
Sun Oct 27 00:32:21 2019   BEGIN EPOCH 1 of 20
Sun Oct 27 00:32:21 2019   Train:
Sun Oct 27 00:35:00 2019     batch 200   avg loss: 2.4632503211498262
Sun Oct 27 00:37:49 2019     batch 400   avg loss: 2.4362348598241805
Sun Oct 27 00:40:41 2019     batch 600   avg loss: 2.4246667897701264
Sun Oct 27 00:43:30 2019     batch 800   avg loss: 2.4216341572999953
Sun Oct 27 00:45:40 2019   Validate:
Sun Oct 27 00:52:18 2019     Avg loss for epoch: 2.4198215765702096   accuracy: 0.13473684210526315
Sun Oct 27 00:52:18 2019     New accuracy peak, saving model
Sun Oct 27 00:52:18 2019   Saving model
Sun Oct 27 00:52:18 2019   BEGIN EPOCH 2 of 20
Sun Oct 27 00:52:18 2019   Train:
Sun Oct 27 00:52:29 2019     batch 200   avg loss: 2.41332747399807
Sun Oct 27 00:52:39 2019     batch 400   avg loss: 2.4425433552265168
Sun Oct 27 00:52:49 2019     batch 600   avg loss: 2.411971354484558
Sun Oct 27 00:52:58 2

Load the model was trained and feed each of test instances and store the predictions into csv file in order to upload the csv on Kaggle. 

In [34]:
# load the model
path = os.path.join(MODEL_DIR, '***')
model_data = torch.load('/content/models/seedling-e14-1572138363.pt', map_location=torch.device('cpu'))
trained_model = SeedlingModelV1()
trained_model.load_state_dict(model_data)
print(trained_model)

# sanity check
image, label = train_dataset[0]
output = trained_model(torch.unsqueeze(image, 0))
print(output)

SeedlingModelV1(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=7744, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=12, bias=True)
)
tensor([[-4.5522,  4.5844,  2.0647, -0.8136, -3.1244, -1.3706, -4.2427, -0.7328,
          5.1257,  4.9860, -0.4386, -1.2220]], grad_fn=<AddmmBackward>)


In [0]:
!ls '/content/gdrive'

#Model Evaluation

Load the test images

In [0]:
from imageio import imread
from PIL import Image
from io import BytesIO
from os import listdir
from os.path import isfile, isdir, join


def get_test_transforms(target_size=100, normalize=False):
    t = transforms.Compose([
        transforms.Resize(target_size),
        transforms.CenterCrop(target_size),
        transforms.ToTensor()
        ])
   
    return t


class SeedlingTestDataset(torch.utils.data.Dataset):

    def __init__(self, path_to_test_data="/content/gdrive/My Drive/curated/test/test", transform=None):
        self.transform = transform
        self.data, self.datasize = self.build_dataset_from_path(path_to_test_data)
        self.filenames = sorted(self.data.keys())

    def build_dataset_from_path(self, test_data_path):
        data = {}
        for item in listdir(test_data_path):
            file_path = join(test_data_path, item)
            if isfile(file_path) and 'png' in file_path:
                data[item] = file_path
        return data, len(data)

    def __len__(self):
        return self.datasize

    def __getitem__(self, index):
        key = self.filenames[index]
        full_path = self.data[key]

        with open(full_path, 'rb') as f:
            img = Image.open(BytesIO(f.read()))
        if self.transform is not None:
            img = self.transform(img)

        return img, key

Instantiate the test dataset and wrap it in a dataloader

In [97]:
test_dataset = SeedlingTestDataset(transform=get_transforms())
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, num_workers=1, shuffle=False)

# sanity check
image, filename = test_dataset.__getitem__(0)
output = trained_model(torch.unsqueeze(image, 0))
print(output)

# translate to a guess about class
classes = full_dataset.classes
score, pred = torch.max(output, 1)
print('{} (score: {})'.format(classes[pred.item()], score.item()))

tensor([[-5.0286,  2.4747,  2.2483, -0.7138, -5.1260,  1.7360, -4.1350, -1.7517,
         -5.1252,  0.8126, 13.3818, -0.8629]], grad_fn=<AddmmBackward>)
Small-flowered Cranesbill (score: 13.381834983825684)


Making file submission

In [0]:
with open('submission.csv', 'w') as outfile:
    outfile.write('file,species\n') # required header row
    
    # Some models have layers that are only active during training,
    # so always call model.eval() before inference
    model.eval()
    with torch.no_grad():
        for _, (data, filename) in enumerate(test_loader):
            data.to(device)
            output = trained_model(data)
            score, pred = torch.max(output, 1)
            outfile.write('{}, {}\n'.format(filename[0], classes[pred.item()]))

Showing the first instances of the file submission

In [99]:
!head submission.csv

file,species
0021e90e4.png, Small-flowered Cranesbill
003d61042.png, Fat Hen
007b3da8b.png, Fat Hen
0086a6340.png, Common Chickweed
00c47e980.png, Sugar beet
00d090cde.png, Scentless Mayweed
00ef713a8.png, Common Chickweed
01291174f.png, Fat Hen
026716f9b.png, Loose Silky-bent


In [0]:
!ls '/content/gdrive'

Downloading the csv file to the computer. Ready to uploaded to Kaggle

In [0]:
from google.colab import files
files.download('submission.csv') 