In [None]:
#To fetch data from kaggle
!pip install -U kaggle-cli
!kg download -u yourusername -p yourpassword -c plant-seedlings-classification  #no single quotes in the username, password and competition name.
!unzand ip train.zip
!unzip test.zip
!unzip sample_submission.csv.zip

In [1]:
import torch
import os
import copy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from PIL import Image
from torch.autograd import Variable
from torchvision import datasets, transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

In [2]:
#list of transforms goes here

# Only converts the PIL image to a tensor.
Basic = transforms.Compose([
    transforms.ToTensor(),
])

# Rescale to 128, take the central 128-pixel crop, then convert to a tensor.
BasicScale = transforms.Compose([
    transforms.Scale(128),
    transforms.CenterCrop(128),
    transforms.ToTensor(),
])

# Rescale to 224, then a random 128-pixel crop and a random horizontal flip.
# NOTE(review): there is no ToTensor() step here, so this pipeline yields PIL
# images -- presumably it is meant to be chained with another pipeline; confirm
# before using it directly as a dataset transform.
BasicAugment = transforms.Compose([
    transforms.Scale(224),
    transforms.RandomCrop(128),
    transforms.RandomHorizontalFlip(),
])

# Per-channel mean / std used for normalisation.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Tensor conversion followed by the per-channel normalisation above.
Normalize = transforms.Compose([
    transforms.ToTensor(),
    normalize,
])

In [3]:
VAL_SIZE = 0.1    # fraction of the training images held out for validation
B = 16            # batch size

# Tensor type, number of loader workers and pin-memory flag depend on whether a GPU is present.
if torch.cuda.is_available():
    dtype, NUM_W, PIN = torch.cuda.FloatTensor, 1, True
else:
    dtype, NUM_W, PIN = torch.FloatTensor, 4, False

# The same folder is opened twice so that validation can later use a different
# transform from training.
train_dataset = ImageFolder('train', transform=BasicScale)
val_dataset = ImageFolder('train', transform=BasicScale)

print('Classes')
print(train_dataset.classes)

# Shuffle all sample indices with a fixed seed, then carve off the first
# `split` of them for validation; the remainder is used for training.
indices = list(range(len(train_dataset)))
split = int(np.floor(VAL_SIZE * len(train_dataset)))

np.random.seed(0)
np.random.shuffle(indices)

val_idx = indices[:split]
train_idx = indices[split:]
train_len, val_len = len(train_idx), len(val_idx)

train_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(val_idx)

# Options shared by both loaders.
loader_options = dict(batch_size=B, num_workers=NUM_W, pin_memory=PIN)
train_loader = DataLoader(train_dataset, sampler=train_sampler, **loader_options)
val_loader = DataLoader(val_dataset, sampler=val_sampler, **loader_options)

Classes
['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent', 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']


In [None]:
#Run this cell to get an idea of the probability distribution over all classes

# Labels come straight from the dataset's (path, class_index) listing, so no
# image has to be opened or transformed just to count classes.
ys = [class_ix for _, class_ix in train_dataset.imgs]

num_classes = len(train_dataset.classes)
counts = np.bincount(np.asarray(ys, dtype=np.int64), minlength=num_classes)
# max(..., 1) keeps an empty dataset from dividing by zero.
fractions = counts / float(max(counts.sum(), 1))

# One bar per class. A plain plt.hist would use 10 bins, which cannot separate
# 12 distinct class ids and therefore merges neighbouring classes.
class_ids = np.arange(num_classes)
plt.bar(class_ids, fractions, align='center', linewidth=1.2, edgecolor='black')
plt.xticks(class_ids)

for t in range(num_classes):
    print('{} occurs {} times'.format(train_dataset.classes[t], counts[t]))

In [4]:
class SimpleCNN(nn.Module):
    """Six conv -> ReLU -> batch-norm stages followed by one linear classifier.

    Sub-modules are registered as ``conv1``/``bn1`` ... ``conv6``/``bn6`` and
    ``linear``.  ``linear`` takes 512 features, so the last feature map must
    flatten to 512 values per sample; a 3x128x128 input satisfies this.
    The network outputs 12 scores per sample.
    """

    # (in_channels, out_channels, stride) of stages 1..6; every stage uses a 3x3 kernel.
    _STAGES = (
        (3, 16, 2),
        (16, 32, 2),
        (32, 64, 2),
        (64, 128, 2),
        (128, 256, 2),
        (256, 512, 1),
    )

    def __init__(self):
        super(SimpleCNN, self).__init__()

        # Register conv<N> then bn<N> for each stage, in stage order.
        for number, (n_in, n_out, step) in enumerate(self._STAGES, 1):
            setattr(self, 'conv{}'.format(number),
                    nn.Conv2d(n_in, n_out, kernel_size=3, stride=step))
            setattr(self, 'bn{}'.format(number), nn.BatchNorm2d(n_out))
        self.linear = nn.Linear(512, 12)

    def forward(self, input):
        """Return the class scores, one row of 12 values per input sample."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
            (self.conv5, self.bn5),
            (self.conv6, self.bn6),
        )
        features = input
        for conv, bn in stages:
            # ReLU is applied to the convolution output before batch-norm.
            features = bn(F.relu(conv(features), inplace=True))

        # Flatten everything except the batch dimension.
        flat = features.view(features.size(0), -1)
        return self.linear(flat)

In [5]:
# Build the network and the loss, cast both to the tensor type chosen in the
# configuration cell, and optimise the network with Adam at its default settings.
BasicModel = SimpleCNN()
BasicModel = BasicModel.type(dtype)

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.type(dtype)

optimizer = optim.Adam(BasicModel.parameters())

In [6]:
def check_val_loss(model):
    """Evaluate ``model`` on the validation loader and return its accuracy.

    Switches the model to evaluation mode (and leaves it there), prints the
    loss summed over all validation batches and the number of correct
    predictions, then returns the fraction of evaluated samples that were
    classified correctly (0.0 if the loader yields nothing).

    Uses the notebook globals ``val_loader``, ``loss_fn`` and ``dtype``.
    """
    # Label tensors must live on the same device as the inputs; choose the
    # type the same way `dtype` is chosen so the CPU path works too.
    long_dtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor

    model.eval()
    running_loss = 0.0
    correct = 0.0
    seen = 0
    for (x, y) in val_loader:
        x, y = Variable(x).type(dtype), Variable(y).type(long_dtype)
        scores = model(x)
        preds = torch.max(scores, dim=1)[1]

        # float(... .data.sum()) gives a plain number whether the reduction
        # produced a Python scalar, a 1-element tensor or a 0-dim tensor.
        correct += float((preds == y).data.sum())
        seen += y.size(0)
        running_loss += float(loss_fn(scores, y).data.sum())

    print('Validation loss is: {}'.format(running_loss))
    print('Got {} / {} correct.'.format(correct, seen))
    return correct / seen if seen else 0.0

def save_model(model, file_name='SimpleCNN.pt'):
    """Write ``model``'s ``state_dict`` to ``file_name``.

    ``file_name`` defaults to ``'SimpleCNN.pt'`` so existing one-argument
    calls keep writing to the same file.
    """
    torch.save(model.state_dict(), file_name)
    
def load_model(model, file_name):
    """Load the ``state_dict`` stored in ``file_name`` into ``model`` in place.

    The checkpoint is first mapped onto CPU storage so that a file saved on a
    GPU machine can also be loaded where no GPU is available;
    ``load_state_dict`` then copies the values into ``model``'s own
    parameters.

    NOTE: ``torch.load`` unpickles the file -- only load checkpoints from a
    source you trust.
    """
    state = torch.load(file_name, map_location=lambda storage, loc: storage)
    model.load_state_dict(state)

In [7]:
def train(model, num_epochs, loss_fn, optimizer, graph=False):
    """Train ``model`` for ``num_epochs`` epochs and return the best snapshot.

    Each epoch runs one pass over the global ``train_loader``, prints the loss
    summed over the batches, and then evaluates with ``check_val_loss``.  A deep
    copy of the model is taken whenever the validation accuracy improves.

    Args:
        model: network to optimise; it is updated in place.
        num_epochs: number of passes over ``train_loader``.
        loss_fn: callable ``loss_fn(scores, labels)`` returning the loss.
        optimizer: optimiser already bound to ``model``'s parameters.
        graph: placeholder flag; currently only prints a reminder.

    Returns:
        A deep copy of the model from the epoch with the highest validation
        accuracy (a copy of ``model`` as-is if no epoch was run).
    """
    # Label tensors must live on the same device as the inputs; choose the
    # type the same way `dtype` is chosen so the CPU path works too.
    long_dtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor

    # Start below any reachable accuracy so the first epoch is always recorded.
    best_model, best_val_acc = None, float('-inf')
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for (x, y) in train_loader:
            x, y = Variable(x).type(dtype), Variable(y).type(long_dtype)

            scores = model(x)
            optimizer.zero_grad()

            loss = loss_fn(scores, y)
            # float(... .data.sum()) gives a plain number whether the loss is
            # a 1-element tensor or a 0-dim tensor.
            running_loss += float(loss.data.sum())
            loss.backward()
            optimizer.step()
        print('Loss for epoch {}: {}'.format(epoch, running_loss))

        val_acc = check_val_loss(model)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = copy.deepcopy(model)

    if graph:
        print('Implement Graphing of loss here')

    if best_model is None:
        # No epoch produced a snapshot (e.g. num_epochs == 0).
        best_model = copy.deepcopy(model)
    return best_model

In [17]:
# Train the basic model and keep the snapshot returned by train().
NUM_EPOCHS = 10
best_model = train(
    model=BasicModel,
    num_epochs=NUM_EPOCHS,
    loss_fn=loss_fn,
    optimizer=optimizer,
    graph=False,
)

Loss for epoch 0: 34.2786741555
Validation loss is: 25.7608733475
Got 393.0 / 475 correct.
Loss for epoch 1: 11.1851177464
Validation loss is: 23.8405536711
Got 407.0 / 475 correct.
Loss for epoch 2: 6.85196298361
Validation loss is: 20.3838922996
Got 418.0 / 475 correct.
Loss for epoch 3: 4.2698225677
Validation loss is: 21.9932197575
Got 413.0 / 475 correct.
Loss for epoch 4: 5.31247013807
Validation loss is: 42.1211785376
Got 354.0 / 475 correct.
Loss for epoch 5: 30.7577182353
Validation loss is: 24.6806575954
Got 392.0 / 475 correct.
Loss for epoch 6: 9.86445197463
Validation loss is: 23.622019738
Got 391.0 / 475 correct.
Loss for epoch 7: 14.9490696266
Validation loss is: 21.1600424647
Got 406.0 / 475 correct.
Loss for epoch 8: 7.11894881725
Validation loss is: 27.0791056454
Got 400.0 / 475 correct.
Loss for epoch 9: 6.49124410748
Validation loss is: 21.2499969155
Got 406.0 / 475 correct.


In [18]:
# Our best model performs like so
check_val_loss(best_model)

Validation loss is: 21.2005843818
Got 418.0 / 475 correct.


0.88

In [15]:
# Submission template; predictions are later written into it.
test_df = pd.read_csv('sample_submission.csv')

# Maps a predicted class index to its species name.
ix_to_class = dict(enumerate([
    'Black-grass',
    'Charlock',
    'Cleavers',
    'Common Chickweed',
    'Common wheat',
    'Fat Hen',
    'Loose Silky-bent',
    'Maize',
    'Scentless Mayweed',
    'Shepherds Purse',
    'Small-flowered Cranesbill',
    'Sugar beet',
]))

def custom_image(model, file_name):
    """Predict the species name for one image in the ``test`` directory.

    Args:
        model: trained network; it is switched to evaluation mode here.
        file_name: image file name relative to the ``test`` directory.

    Returns:
        The species name looked up in ``ix_to_class`` for the top-scoring class.
    """
    # Make sure batch-norm uses its running statistics for this single-image batch.
    model.eval()

    # Image.open is lazy; convert() reads the pixels while the file is still
    # open and yields 3 channels regardless of the PNG's mode (RGBA,
    # greyscale, ...), which is what conv1 expects.
    with Image.open(os.path.join('test', file_name)) as raw:
        img = raw.convert('RGB')

    # NOTE(review): this squashes the image to 128x128, whereas the training
    # pipeline rescales and centre-crops -- identical for square images only;
    # confirm whether the two should match.
    img = img.resize((128, 128), Image.BILINEAR)

    img_var = Variable(transforms.ToTensor()(img).unsqueeze(0)).type(dtype)
    scores = model(img_var)
    pred = int(torch.max(scores, dim=1)[1].data.cpu().numpy()[0])
    return ix_to_class[pred]

In [20]:
test_df['species'] = test_df['file'].apply(lambda x: custom_image(best_model, x))   #using best_model

print test_df.head(5)
print "Alright everything looks good here. Let's save this output to a .csv file"

test_df.to_csv('submit.csv', index=False)

print 'Submitting this. Fingers crossed'
!kg submit submit.csv -u yourusername -p yourpassword -c plant-seedlings-classification -m "fingers crossed."

            file                    species
0  0021e90e4.png  Small-flowered Cranesbill
1  003d61042.png                    Fat Hen
2  007b3da8b.png                 Sugar beet
3  0086a6340.png           Common Chickweed
4  00c47e980.png                 Sugar beet
Alright everything looks good here. Let's save this output to a .csv file
Submitting this. Fingers crossed
0.86775
