In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import cv2
import PIL.Image
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import matplotlib.pyplot as plt
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder
import datetime
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

In [None]:
# Probe once for a CUDA device; later cells branch on this flag.
gpu_status = torch.cuda.is_available()

# Report which device the rest of the notebook will run on.
print("Using GPU" if gpu_status else "No GPU, Using CPU")

In [None]:
train_df = pd.read_csv('../input/plant-pathology-2020-fgvc7/train.csv') 

In [None]:
train_df.head(5)

In [None]:
train_df.describe(include='all')

In [None]:
def get_label(row):
    """Return the name of the first column, after the leading ID column, whose value is 1.

    Args:
        row: a pandas Series whose first entry is the image identifier and
            whose remaining entries are indicator columns.

    Returns:
        The matching column name, or None when no column equals 1.
    """
    # Walk the row's own index instead of a global DataFrame so the function
    # does not depend on notebook state; position 0 (the ID) is skipped.
    for column in row.index[1:]:
        if row[column] == 1:
            return column
    return None
        
train_df_copy = train_df.copy()

train_df_copy['label'] = train_df_copy.apply(get_label, axis=1)

In [None]:
train_df_copy.head(5)

In [None]:
train_df_copy.drop(['healthy', 'multiple_diseases', 'rust', 'scab'], axis=1, inplace=True)

In [None]:
train_df_copy.head(5)

In [None]:
sample_img = train_df.iloc[1,0]
sample_labels = train_df.iloc[1,:]
sample_labels = np.asarray(sample_labels)

print("Image Name:{}".format(sample_img))
print("Image Labels:{}".format(sample_labels))

In [None]:
print(len(train_df))
print(train_df.shape[1])

In [None]:
class LeafDataset(Dataset):
    """Leaf images listed in a DataFrame, optionally paired with a class index.

    Every row of ``df`` needs an ``image_id`` column naming a ``.jpg`` file
    inside ``image_dir``.  When ``df`` also contains all four one-hot label
    columns (see ``LABEL_COLUMNS``), items are ``(image, label)`` pairs where
    ``label`` is a 0-dim ``torch.long`` tensor holding the position of the
    active column; otherwise items are just the image.

    Args:
        df: DataFrame describing the samples.
        transform: optional callable applied to the RGB PIL image.
        image_dir: directory that holds the ``<image_id>.jpg`` files.
    """

    # Order defines the class index returned for each sample.
    LABEL_COLUMNS = ['healthy', 'multiple_diseases', 'rust', 'scab']

    def __init__(self, df, transform=None,
                 image_dir='../input/plant-pathology-2020-fgvc7/images'):
        self.df = df
        self.transform = transform
        self.image_dir = image_dir

    def __len__(self):
        """Number of rows (samples) in the backing DataFrame."""
        return self.df.shape[0]

    def _has_labels(self):
        """True when every one-hot label column is present in ``self.df``."""
        return all(column in self.df.columns for column in self.LABEL_COLUMNS)

    def _class_index(self, row):
        """Convert a row's one-hot label columns into a scalar long tensor.

        Raises:
            ValueError: if none of the label columns equals 1.
        """
        one_hot = row[self.LABEL_COLUMNS].to_numpy().astype(np.int64)
        active = np.flatnonzero(one_hot == 1)
        if active.size == 0:
            raise ValueError('no label column is set to 1 for this row')
        # Explicit int64 so CrossEntropyLoss accepts the target on every platform.
        return torch.tensor(int(active[0]), dtype=torch.long)

    def __getitem__(self, idx):
        """Load sample ``idx`` (a 0-based position, as supplied by samplers)."""
        # Positional lookup: samplers yield positions, not index labels.
        row = self.df.iloc[idx]
        img_src = os.path.join(self.image_dir, row['image_id'] + '.jpg')
        # convert() loads the pixel data, so the file can be closed right away.
        with PIL.Image.open(img_src) as source:
            image = source.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        if self._has_labels():
            return image, self._class_index(row)

        return image

In [None]:
# No transform here, so each item's image is the raw RGB PIL image.
leaf_sample_dataset = LeafDataset(df=train_df, transform=None)
fig, ax = plt.subplots(1,3)

# Draw the first three samples, one per axis, and log type, size and label.
for i, axis in enumerate(ax):
    img, label = leaf_sample_dataset[i]

    axis.imshow(img)
    print(type(img), img.size,label)

In [None]:
# Training-time preprocessing: resize, centre crop, random affine/flip
# augmentation, then convert to a tensor and normalise each channel with
# mean 0.5 / std 0.5 (so pixel values end up in [-1, 1]).
leaf_transform = transforms.Compose([transforms.Resize((512,512)),
                                     transforms.CenterCrop((256,256)),
                                     transforms.RandomAffine(degrees=15),
                                     transforms.RandomHorizontalFlip(p=0.4),
                                     transforms.RandomVerticalFlip(p=0.3),
                                     transforms.ToTensor(),
                                    transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))])
leaf_train_dataset = LeafDataset(df=train_df, transform=leaf_transform)
leaf_train_loader = DataLoader(leaf_train_dataset, shuffle=True, batch_size=16)

# Pull one batch to sanity-check labels, batch size and image content.
images, labels = next(iter(leaf_train_loader))
print(labels[0])
print(len(images))
# imshow expects floats in [0, 1]; undo the normalisation (x * std + mean)
# first, otherwise the negative half of the range is clipped to black.
preview = (images[0] * 0.5 + 0.5).clamp(0, 1)
plt.imshow(preview.numpy().transpose((1, 2, 0)))

In [None]:
print(len(leaf_train_dataset))

In [None]:
# Hold out 20% of the training rows for validation.
dataset_size = len(leaf_train_dataset)
# Seeded generator so the same rows land in the validation split on every
# Restart & Run All; an unseeded shuffle made runs incomparable.
split_rng = np.random.default_rng(42)
indices = split_rng.permutation(dataset_size).tolist()
split = int(np.floor(0.2*dataset_size))
train_idx, val_idx = indices[split:], indices[:split]

print(split)
print(len(train_idx), len(val_idx))

# Each sampler restricts its loader to one side of the split.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(val_idx)

In [None]:
# Both loaders wrap the same dataset; the samplers decide which rows each sees.
loader_batch_size = 64
leaf_train_loader = DataLoader(dataset=leaf_train_dataset, batch_size=loader_batch_size, sampler=train_sampler)
leaf_valid_loader = DataLoader(dataset=leaf_train_dataset, batch_size=loader_batch_size, sampler=valid_sampler)

In [None]:
test_df = pd.read_csv('../input/plant-pathology-2020-fgvc7/test.csv')

In [None]:
test_df.head(5)

In [None]:
# Inference must be deterministic: keep the resize / crop / normalisation used
# for training but drop the random affine and flip augmentations.
leaf_test_transform = transforms.Compose([transforms.Resize((512,512)),
                                          transforms.CenterCrop((256,256)),
                                          transforms.ToTensor(),
                                          transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))])
leaf_test_dataset = LeafDataset(df=test_df, transform=leaf_test_transform)
leaf_test_loader = DataLoader(leaf_test_dataset, batch_size=64)

# The test frame has no label columns, so each batch is just an image tensor.
test_images = next(iter(leaf_test_loader))
print(len(test_images))
print(test_images[0].shape)
# Undo the normalisation (x * std + mean) so imshow receives values in [0, 1].
plt.imshow((test_images[2] * 0.5 + 0.5).clamp(0, 1).numpy().transpose((1, 2, 0)))

In [None]:
print(len(leaf_test_dataset))

In [None]:
diagnosis = ["healthy","multiple_diseases", "rust", "scab"]

In [None]:
# Preview the first eight images of a training batch with their class names.
train_images, train_labels = next(iter(leaf_train_loader))

fig = plt.figure(figsize=(25,4))
for idx in np.arange(8):
    # Grid dimensions must be integers: 16/2 is a float, so use floor division.
    ax = fig.add_subplot(2, 16//2, idx+1, xticks=[], yticks=[])
    # Undo the normalisation (x * std + mean) so imshow receives values in [0, 1].
    ax.imshow((train_images[idx] * 0.5 + 0.5).clamp(0, 1).numpy().transpose(1,2,0))
    # Title from THIS batch's labels (previously the stale `labels` of an
    # earlier batch was used, so titles did not match the pictures).
    ax.set_title(diagnosis[train_labels[idx].item()])

In [None]:
class Net(nn.Module):
    """Eight-convolution CNN that maps an RGB image batch to 4 raw class scores.

    The fully connected head is sized for 3x256x256 inputs: with the kernel,
    padding and pooling choices below, such an input is reduced to a
    1024 x 6 x 6 feature map before flattening.  The output is unnormalised
    logits (no softmax), as expected by ``nn.CrossEntropyLoss``.
    """

    def __init__(self):
        super().__init__()

        # Convolution stack: channel width doubles at every layer.
        self.conv1 = nn.Conv2d(3,8,6,padding=1)
        self.conv2 = nn.Conv2d(8,16,6,padding=1)
        self.conv3 = nn.Conv2d(16,32,5,padding=1)
        self.conv4 = nn.Conv2d(32,64,5,padding=1)
        self.conv5 = nn.Conv2d(64,128,3)
        self.conv6 = nn.Conv2d(128,256,3)
        self.conv7 = nn.Conv2d(256,512,2)
        self.conv8 = nn.Conv2d(512,1024,2)

        # Shared 2x2 max-pool that halves the spatial size.
        self.pool2 = nn.MaxPool2d(2,2)

        # Classifier head.
        self.fc1 = nn.Linear(6*6*1024, 2048)
        self.fc2 = nn.Linear(2048, 4)

        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Run the network.

        Args:
            x: float tensor of shape (batch, 3, 256, 256).

        Returns:
            Tensor of shape (batch, 4) with one raw score per class.
        """
        # Pooling follows conv2, conv4, conv6, conv7 and conv8 only.
        x = F.relu(self.conv1(x))
        x = self.pool2(F.relu(self.conv2(x)))
        x = F.relu(self.conv3(x))
        x = self.pool2(F.relu(self.conv4(x)))
        x = F.relu(self.conv5(x))
        x = self.pool2(F.relu(self.conv6(x)))
        x = self.pool2(F.relu(self.conv7(x)))
        x = self.pool2(F.relu(self.conv8(x)))

        # Flatten per sample.  Keeping the batch dimension fixed means an
        # unexpected input size fails loudly in fc1 instead of being silently
        # reshaped into a different number of "samples".
        x = x.view(x.size(0), -1)
        x = self.dropout(x)

        x = F.relu(self.fc1(x))
        x = self.dropout(x)

        return self.fc2(x)
    
# Build the network and show its layer summary.
model = Net()
print(model)

# Move the parameters onto the GPU when one was detected earlier.
if gpu_status:
    model = model.cuda()

In [None]:
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
no_epochs = 40
# np.Inf was removed in NumPy 2.0; np.inf is the supported spelling.
valid_loss_min = np.inf
curr_time = datetime.datetime.now()
# Timestamp reused by later cells to name the checkpoint and submission files.
curr_timestamp = str(datetime.datetime.now())

# Both loaders wrap the full dataset but only serve their sampler's subset,
# so per-sample averages must be taken over the sampler sizes.
n_train_samples = len(leaf_train_loader.sampler)
n_valid_samples = len(leaf_valid_loader.sampler)

for epoch in range(1, no_epochs+1):
    train_loss = 0.0
    valid_loss = 0.0

    # ---- training pass ----
    model.train()

    for data, target in leaf_train_loader:
        if gpu_status:
            data = data.cuda()
            target = target.cuda()

        optimizer.zero_grad()

        output = model(data)

        loss = criterion(output, target)
        loss.backward()

        optimizer.step()

        # criterion returns the batch mean; weight by batch size to get a sum.
        train_loss += loss.item()*data.size(0)

    # ---- validation pass (no gradients needed) ----
    model.eval()
    with torch.no_grad():
        for data, target in leaf_valid_loader:
            if gpu_status:
                data = data.cuda()
                target = target.cuda()

            output = model(data)

            loss = criterion(output, target)

            valid_loss += loss.item()*data.size(0)

    train_loss = train_loss/n_train_samples
    valid_loss = valid_loss/n_valid_samples

    # Elapsed wall-clock time since training started.
    print(datetime.datetime.now()-curr_time)
    print("Epoch {}: Training Loss : {:.4f} Validation Loss : {:.4f}".format(epoch, train_loss, valid_loss))

    # Checkpoint whenever validation loss reaches a new minimum.
    if valid_loss < valid_loss_min:
        print("Validation loss decreased {:.6f} -> {:.6f}, Saving model...".format(valid_loss_min, valid_loss))
        torch.save(model.state_dict(), 'Kaggle_kernel_model_apple_leaf'+curr_timestamp+'.pt')
        valid_loss_min = valid_loss

In [None]:
test_df.head(5)

In [None]:
test_df['diagnosis'] = "None"

In [None]:
test_df.head(5)

In [None]:
file_name = 'Kaggle_kernel_model_apple_leaf'+ str(curr_timestamp)
model.load_state_dict(torch.load(file_name +'.pt'))

In [None]:
from IPython.display import FileLink
FileLink(file_name + '.pt')

In [None]:
with open(file_name + '.txt', 'a') as op_file:
    op_file.write(str(model))

In [None]:
torch.cuda.empty_cache()

In [None]:
data = data.cpu()
del loss, optimizer, data, target, leaf_train_loader

In [None]:
# Predict a class index for every test image, one array per batch.
model.eval()
predList = []

# Inference only: disabling autograd avoids building graphs and saves memory.
with torch.no_grad():
    for data in leaf_test_loader:
        if gpu_status:
            data = data.cuda()

        output = model(data)

        # Index of the highest score along the class dimension.
        _, pred = torch.max(output, 1)

        # Keep the array 1-D.  Squeezing would turn a batch of one into a 0-d
        # array, which cannot be iterated when the batches are flattened later.
        pred = pred.cpu().numpy()

        print(pred)

        predList.append(pred)

print(len(predList))

In [None]:
# Write through a single indexer: `test_df.iloc[0].diagnosis = ...` assigns to
# a temporary row copy and leaves the DataFrame unchanged.
test_df.iloc[0, test_df.columns.get_loc('diagnosis')] = 'No'

In [None]:
test_df.iloc[0]

In [None]:
# Flatten the per-batch prediction arrays and map each class index to its name.
diagnosis_list = [diagnosis[class_idx] for batch_preds in predList for class_idx in batch_preds]

In [None]:
diagnosis_series = pd.Series(diagnosis_list)

In [None]:
diagnosis_series.head(5)

In [None]:
test_df.drop(['diagnosis'], inplace=True, axis=1)

In [None]:
test_df.head(5)

In [None]:
test_df['diagnosis'] = diagnosis_series

In [None]:
test_df.head(5)

In [None]:
encoder = OneHotEncoder()
submission_df = encoder.fit_transform(test_df['diagnosis'].values.reshape(-1,1))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old installations that lack it.
if hasattr(encoder, 'get_feature_names_out'):
    feature_names = encoder.get_feature_names_out()
else:
    feature_names = encoder.get_feature_names()
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
submission_df = pd.DataFrame(submission_df.toarray(), columns=feature_names)
submission_df.head(5)

In [None]:
test_df = test_df.join(submission_df)

In [None]:
test_df.head(5)

In [None]:
cols = test_df.columns
# Strip the encoder's "x0_" prefix so the columns carry the class names.
# The "multiple_diseases" key was previously misspelled, so that column was
# never renamed and ended up zero-filled in the submission.
test_df_final = test_df.rename(columns={"x0_healthy":"healthy", "x0_rust":"rust", "x0_scab":"scab",
                                        "x0_multiple_diseases":"multiple_diseases"})

In [None]:
test_df_final.head(5)

In [None]:
test_df_final.drop(['diagnosis'], inplace=True, axis=1)

In [None]:
test_df_final.head(5)

In [None]:
# Column order for the submission file.
cols = ['image_id', 'healthy', 'multiple_diseases', 'rust', 'scab']
for i in cols:
    if i in test_df_final.columns:
        continue
    # Column absent from the frame: add it filled with zeros.
    test_df_final[i] = 0.0
submission_final = test_df_final.loc[:, cols]

In [None]:
submission_final.head(5)

In [None]:
submission_final.to_csv("Kaggle_kernel_submission_"+ curr_timestamp + ".csv", index=False)