In [None]:
import os
import numpy as np
import torch

import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt

from PIL import Image
%matplotlib inline

In [None]:
# Detect once whether a CUDA device is usable; later cells branch on this flag
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

In [None]:
# Dataset root and the train / test image folders. Example files:
#   ../input/sorghum-id-fgvc-9/train_images/2017-06-01__10-26-27-479.png
#   ../input/sorghum-id-fgvc-9/test/1000135300.png
# The trailing slashes are significant: later cells build full image paths by
# plain string concatenation onto train_dir / test_dir.

data_dir = '../input/sorghum-id-fgvc-9/'
train_dir, test_dir = (
    os.path.join(data_dir, subfolder) for subfolder in ('train_images/', 'test/')
)

In [None]:
import pandas as pd
# Training metadata CSV. The path is built from data_dir so that pointing
# data_dir at another location retargets this read as well, instead of
# repeating the dataset root here.
inputDF = pd.read_csv(os.path.join(data_dir, 'train_cultivar_mapping.csv'))
# classes = inputDF['cultivar'].unique().tolist()

In [None]:
# Preview the first two rows of inputDF
inputDF.head(2)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit an integer encoding over the cultivar names, then append two columns:
# the encoded label and the full image path. They end up as the last two
# columns of the frame, in that order.
le = LabelEncoder()
label_encoded = le.fit_transform(inputDF['cultivar'])
inputDF = inputDF.assign(
    label_encoded=label_encoded,
    # train_dir ends with '/', so concatenation gives the complete file path
    imagePath=train_dir + inputDF['image'],
)

# Class names known to the fitted encoder
label_names = le.classes_
inputDF.head()

In [None]:
# Indices of inputDF rows that are removed from the training set in the next
# cell. The commented-out scan below shows how such a list can be produced: it
# calls Image.open on the last column of every row and records the positional
# index of each row for which opening raises.
# NOTE(review): the hard-coded values are presumably the saved output of that
# scan, kept inline so the scan need not be repeated on every run -- confirm
# they still match the current version of the dataset.
# errors = []
# for idx in range(len(inputDF)):
#     try:
#         img_name = inputDF.iloc[idx, -1]
#         image = Image.open(img_name)
#     except:
#         errors.append(idx)
errors = [235,
 363,
 1829,
 1890,
 1896,
 1906,
 1923,
 1926,
 1936,
 1966,
 1970,
 1972,
 2175,
 2194,
 2367,
 2387,
 2439,
 2472,
 2529,
 2547,
 2579,
 2605,
 2668,
 2696,
 2720,
 2782,
 2794,
 2892,
 3210,
 3271,
 3363,
 3982,
 4047,
 4068,
 4082,
 4145,
 4529,
 4543,
 4553,
 4576,
 5038,
 5130,
 5207,
 5439,
 5459,
 5478,
 5496,
 5521,
 5538,
 5543,
 5553,
 5563,
 5604,
 5607,
 5609,
 5614,
 5621,
 5663,
 5688,
 5696,
 5717,
 5733,
 5755,
 5781,
 5795,
 5840,
 5909,
 5973,
 6037,
 6085,
 6107,
 6117,
 6152,
 6412,
 6419,
 6474,
 6480,
 6483,
 6598,
 6909,
 6931,
 6975,
 7050,
 7095,
 7137,
 7140,
 7237,
 7292,
 7337,
 7407,
 7422,
 7490,
 7856,
 8192,
 8230,
 8253,
 8462,
 8471,
 8489,
 8504,
 8536,
 8609,
 8625,
 8971,
 9173,
 9240,
 9247,
 9260,
 9272,
 9375,
 9497,
 9602,
 9653,
 9786,
 9792,
 10550,
 10651,
 11204,
 11289,
 11571,
 11638,
 11654,
 11685,
 11791,
 11848,
 11861,
 11891,
 11918,
 11947,
 11957,
 11978,
 12001,
 12016,
 12024,
 12032,
 12037,
 12046,
 12059,
 12085,
 12100,
 12112,
 12116,
 12117,
 12132,
 12144,
 12148,
 12228,
 12376,
 12446,
 12460,
 12474,
 12487,
 12489,
 12493,
 12498,
 12511,
 12551,
 12577,
 12605,
 12613,
 12654,
 12669,
 12671,
 12769,
 12794,
 12814,
 12898,
 12906,
 12912,
 12913,
 12917,
 12920,
 12930,
 12948,
 12949,
 12952,
 12956,
 12964,
 12966,
 12968,
 12970,
 12979,
 12982,
 12985,
 12998,
 13001,
 13002,
 13005,
 13011,
 13015,
 13017,
 13026,
 13034,
 13035,
 13040,
 13046,
 13050,
 13054,
 13065,
 13066,
 13068,
 13075,
 13103,
 13109,
 13132,
 13133,
 13134,
 13148,
 13150,
 13151,
 13152,
 13161,
 13166,
 13169,
 13170,
 13174,
 13183,
 13190,
 13196,
 13567,
 13783,
 13880,
 13883,
 13957,
 14036,
 14186,
 14203,
 14237,
 14242,
 14261,
 14275,
 14286,
 14289,
 14309,
 14339,
 14594,
 14599,
 14604,
 14615,
 14650,
 14651,
 14678,
 14722,
 14762,
 14777,
 14800,
 14804,
 14810,
 14815,
 14829,
 14836,
 14846,
 14847,
 14853,
 14873,
 14874,
 14879,
 14888,
 14889,
 14897,
 14902,
 14911,
 14918,
 14923,
 14925,
 14927,
 14929,
 14935,
 14937,
 14948,
 14953,
 14955,
 14956,
 14960,
 14967,
 14969,
 14970,
 14973,
 14978,
 14981,
 14984,
 14998,
 15007,
 15012,
 15015,
 15019,
 15021,
 15023,
 15025,
 15027,
 15035,
 15039,
 15041,
 15050,
 15057,
 15063,
 15086,
 15087,
 15088,
 15099,
 15103,
 15115,
 15120,
 15121,
 15181,
 15256,
 15264,
 15324,
 15347,
 15378,
 15396,
 15409,
 15427,
 15440,
 15457,
 15460,
 15469,
 15474,
 15476,
 15477,
 15484,
 15490,
 15505,
 15533,
 15536,
 15570,
 15590,
 15595,
 15618,
 15652,
 15661,
 15684,
 15693,
 15700,
 15708,
 15743,
 15762,
 15763,
 15772,
 15773,
 15827,
 15841,
 15851,
 15854,
 15963,
 16288,
 16319,
 16336,
 16341,
 16349,
 16442,
 16476,
 16478,
 16479,
 16507,
 16508,
 16549,
 16589,
 17484,
 17991,
 18065,
 18113,
 18119,
 18241,
 18888,
 18897,
 18913,
 18927,
 18954,
 19008,
 19035,
 19037,
 19091,
 19111,
 19130,
 19132,
 19153,
 19171,
 19271,
 19326,
 19482,
 19644,
 19675,
 19676,
 19677,
 19691,
 19703,
 19707,
 19734,
 19741,
 19752,
 19755,
 19780,
 19806,
 19815,
 19816,
 19857,
 19876,
 20155,
 20174,
 20197,
 20224,
 20229,
 20243,
 20256,
 20309,
 20322,
 20368,
 20581,
 20591,
 20625,
 20641,
 20745,
 20747,
 20748,
 20778,
 20847,
 21011,
 21032,
 21333,
 21395,
 21448,
 21461,
 21510,
 21522,
 21525,
 21537,
 21570,
 21595,
 21598,
 21613,
 21621,
 21627,
 21633,
 21646,
 21731,
 22091,
 22189,
 22213,
 22346,
 22376,
 22504]

In [None]:
# Remove the rows listed in `errors` from the training frame, printing the row
# count before and after (and the number of listed rows) as a sanity check.
print(len(inputDF))
print(len(errors))
# errors='ignore' keeps this cell re-runnable: a plain drop() raises KeyError
# on a second execution because the listed labels are already gone.
inputDF = inputDF.drop(index=errors, errors='ignore')
print(len(inputDF))

In [None]:
# Preview the first two rows of inputDF after the rows in `errors` were dropped
inputDF.head(2)

In [None]:
from torch.utils.data import Dataset

class SorghumDataset(Dataset):
    """Map-style dataset that yields ``(image, label)`` pairs listed in a DataFrame.

    Columns are read by position: the LAST column must hold the path of the
    image file and the SECOND-TO-LAST column the label returned with it.

    Args:
        data_frame: DataFrame with one row per sample (column layout as above).
        root_dir: Directory the images come from. Stored but not used for
            loading, because the path column is opened as-is.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, data_frame, root_dir, transform=None):
        self.data_frame = data_frame
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            A tuple ``(image, label)``; ``image`` is an RGB PIL image, or
            whatever ``transform`` returns for it when a transform is set.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = self.data_frame.iloc[idx, -1]
        # Image.open is lazy and keeps the file handle open. Converting inside
        # the context manager forces the pixel data to be read, closes the
        # file, and guarantees three channels even for greyscale / RGBA /
        # palette PNGs.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert("RGB")
        label = self.data_frame.iloc[idx, -2]

        if self.transform:
            image = self.transform(image)

        return (image, label)

In [None]:
# from torch.utils.data import Dataset

# class CustomDataSet(Dataset):
#     def __init__(self, main_dir, transform):
#         self.main_dir = main_dir
#         self.transform = transform
#         all_imgs = os.listdir(main_dir)
#         self.total_imgs = natsort.natsorted(all_imgs)

#     def __len__(self):
#         return len(self.total_imgs)

#     def __getitem__(self, idx):
#         img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
#         image = Image.open(img_loc).convert("RGB")
#         tensor_image = self.transform(image)
#         return tensor_image

In [None]:
# Preprocessing applied to every image before it reaches the network.
#
# VGG-16 takes 224x224 images as input, so each image is randomly cropped and
# resized to that size. The pretrained torchvision weights expect inputs
# normalised with the ImageNet channel mean / std, so that normalisation is
# applied after the tensor conversion.
data_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# ImageFolder variant, kept for reference only (not used by this notebook):
# train_data = datasets.ImageFolder(train_dir, transform=data_transform)
# test_data = datasets.ImageFolder(test_dir, transform=data_transform)

# # print out some data stats
# print('Num training images: ', len(train_data))
# print('Num test images: ', len(test_data))

In [None]:
from torch.utils.data import DataLoader

# Wrap the training frame in a Dataset and batch it for the training loop
train_data = SorghumDataset(inputDF, train_dir, transform=data_transform)
train_loader = DataLoader(
    dataset=train_data,
    batch_size=20,
    shuffle=True,
    num_workers=0,
    drop_last=True,
)

# test_data  = CustomDataSet(test_dir, transform=data_transform)
# test_loader = data.DataLoader(test_data , batch_size=20, shuffle=True, 
#                                num_workers=0, drop_last=True)


In [None]:
# Load the pretrained model from pytorch
# (torchvision's VGG-16, requested with its pretrained weights)
vgg16 = models.vgg16(pretrained=True)

# print out the model structure
print(vgg16)

In [None]:
# Input and output width of the last classifier layer (index 6), one per line
print(vgg16.classifier[6].in_features, vgg16.classifier[6].out_features, sep='\n')

In [None]:
# Exclude every parameter under vgg16.features from gradient computation, so
# only the classifier part of the network can be updated
for frozen_param in vgg16.features.parameters():
    frozen_param.requires_grad_(False)

In [None]:
import torch.nn as nn

# Replace the last classifier layer with a fresh linear layer that has one
# output per cultivar class (the input width stays 4096)
n_classes = len(label_names)
vgg16.classifier[6] = nn.Linear(4096, n_classes, bias=True)


# With the new head in place, move the whole model to the GPU when available
if train_on_gpu:
    vgg16 = vgg16.cuda()

In [None]:
import torch.optim as optim

# Loss: categorical cross-entropy
criterion = nn.CrossEntropyLoss()

# Optimizer: stochastic gradient descent with learning rate 0.001, handed only
# the parameters of vgg16.classifier
optimizer = optim.SGD(params=vgg16.classifier.parameters(), lr=1e-3)

In [None]:
# number of epochs to train the model
n_epochs = 2
# print the average training loss every this many mini-batches
print_every = 20

# Put the model in training mode explicitly instead of relying on the state it
# happens to be in: this keeps the cell correct when it is re-run after the
# model was switched to eval mode elsewhere in the notebook.
vgg16.train()

for epoch in range(1, n_epochs+1):

    # running sum of batch losses since the last printout
    train_loss = 0.0

    ###################
    # train the model #
    ###################
    for batch_i, (data, target) in enumerate(train_loader):
        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = vgg16(data)
        # calculate the batch loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update training loss
        train_loss += loss.item()

        # report the mean loss of the last `print_every` batches, then reset
        if batch_i % print_every == print_every - 1:
            print('Epoch %d, Batch %d loss: %.16f' %
                  (epoch, batch_i + 1, train_loss / print_every))
            train_loss = 0.0

In [None]:
# Save the model's state_dict (not the model object itself) to the file './PATH'
torch.save(vgg16.state_dict(), './PATH')

In [None]:
# Rebuild the architecture whose weights were saved above, then restore them.
# A state_dict holds only tensors, so the network has to be constructed first
# with the same shape: a VGG-16 whose last classifier layer has one output per
# cultivar class.
model = models.vgg16()
model.classifier[6] = nn.Linear(model.classifier[6].in_features, len(label_names))
# map_location='cpu' lets the checkpoint load on machines without a GPU even
# when it was written from one; the model is moved to the GPU afterwards.
model.load_state_dict(torch.load('./PATH', map_location='cpu'))
if train_on_gpu:
    model = model.cuda()
# switch to inference mode
model.eval()

In [None]:
# The sample submission lists the test image filenames. The path is built from
# data_dir rather than repeating the dataset root.
testDF = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
testDF.head(2)

In [None]:
# Give the test frame the same trailing columns as the training frame:
# encoded label second to last, full image path last.
# NOTE(review): this encodes whatever the 'cultivar' column of the sample
# submission contains -- presumably placeholders; confirm every value is a
# class the encoder was fitted on.
label_encoded = le.transform(testDF['cultivar'])
testDF = testDF.assign(
    label_encoded=label_encoded,
    imagePath=test_dir + testDF['filename'],
)
testDF.head(2)

In [None]:
# Number of rows in testDF
len(testDF)

In [None]:
# Inference should be deterministic, so the random crop that opens
# data_transform is replaced by a fixed resize + centre crop to 224x224.
# Every step after that first one is reused unchanged, so test images get the
# same tensor conversion (and any later steps) as the training images.
eval_transform = transforms.Compose(
    [transforms.Resize(256), transforms.CenterCrop(224)]
    + list(data_transform.transforms[1:])
)

test_data  = SorghumDataset(root_dir = test_dir, data_frame = testDF, transform=eval_transform)
# shuffle=False keeps predictions in the same order as the rows of testDF
test_loader = DataLoader(test_data , batch_size=500, shuffle=False)

In [None]:
import sys

# Predict a class index for every test image. `out` collects one tensor of
# predicted indices per batch; `i` counts the images processed so far.
out = []
i = 0

# Inference mode: eval() disables dropout and no_grad() stops autograd from
# recording the forward pass, which would otherwise hold on to activations
# for each 500-image batch.
vgg16.eval()
with torch.no_grad():
    for batch_i, (data, target) in enumerate(test_loader):
        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        output = vgg16(data)
        # index of the highest score in each row = predicted class
        index = torch.max(output, 1)[1]
        i = i + len(index)
        percentageDone = i/len(testDF)
        # '\r' rewrites one progress line instead of concatenating raw fractions
        sys.stdout.write('\r%.1f%% done' % (100 * percentageDone))
        out.append(index)

In [None]:
# Flatten the per-batch prediction tensors into one list of single predictions
flat_list = []
for batch_predictions in out:
    flat_list.extend(batch_predictions)
len(flat_list)

In [None]:
# Call .cpu() on every individual prediction so that each value is held in
# host memory before it is handed to the label encoder
cpuList = [item.cpu() for item in flat_list]

In [None]:
# Map the predicted class indices back to cultivar names with the encoder `le`
outColumn = list(le.inverse_transform(cpuList))

In [None]:
# Test image filenames in submission order. The path is built from data_dir
# rather than repeating the dataset root.
filenames = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))['filename']

In [None]:
# Column names of the sample submission. The path is built from data_dir
# rather than repeating the dataset root.
pd.read_csv(os.path.join(data_dir, 'sample_submission.csv')).columns

In [None]:
# Pair each test filename with its predicted cultivar name, one row per image.
# zip stops at the shorter of the two inputs.
submission_rows = list(zip(filenames, outColumn))
submissionCSV = pd.DataFrame(submission_rows, columns=['filename', 'cultivar'])

In [None]:
# Write the submission to 'output.csv' without the DataFrame index column
submissionCSV.to_csv('output.csv', index= False)