In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Report how many files sit in each folder below the training image directory.
for folder, _subfolders, files_here in os.walk('/kaggle/working/train_i/'):
    image_count = len(files_here)
    print(f"Dir = {folder} No. Of Images = {image_count}")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise CPU.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if not has_gpu:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Imports here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import cv2

import torch
import torchvision
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models

from torch.optim import lr_scheduler
from torch.utils.data import WeightedRandomSampler, DataLoader

from shutil import copyfile
import os

# Loading the data and splitting it

In [None]:
df = pd.read_csv("/kaggle/input/cassava-leaf-disease-classification/train.csv")

In [None]:
df.head()

We hold out 10% of the data for testing and keep 90% for model development. That 90% is then split again: 80% for training and 20% for validation.

In [None]:
# Two-stage split of the labelled images (fixed seed so the split is repeatable)
from sklearn.model_selection import train_test_split

# Stage 1: hold out 10% of all images as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df["image_id"],
    df["label"],
    test_size=0.10,
    random_state=42,
)

# Stage 2: carve 20% of the remaining images off as the validation set.
train_data, val_data, train_label, val_label = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=42,
)

# Creating the per-class folder layout that PyTorch's ImageFolder reads

In [None]:
# Build one folder per split, each with one sub-folder per class label, and
# copy every image into the sub-folder matching its label.
# Folders are created with os.makedirs(..., exist_ok=True) on absolute paths so
# that (a) re-running the cell does not fail or print "File exists" errors and
# (b) the folders are always the same ones the copy destinations point at,
# regardless of the current working directory.
WORK_DIR = "/kaggle/working"
SPLIT_DIRS = ("train_i", "val_i", "test_i")

for split_dir in SPLIT_DIRS:
    for label in range(5):
        os.makedirs(os.path.join(WORK_DIR, split_dir, str(label)), exist_ok=True)

path = "/kaggle/input/cassava-leaf-disease-classification/train_images/"


def _copy_split(image_ids, labels, split_dir):
    """Copy each image of one split into ``WORK_DIR/<split_dir>/<label>/``.

    image_ids and labels are iterated in lockstep; image_ids are file names
    relative to ``path``.
    """
    for image_id, label in zip(image_ids, labels):
        class_dir = os.path.join(WORK_DIR, split_dir, str(label))
        # Covers labels outside range(5), should any appear in the data.
        os.makedirs(class_dir, exist_ok=True)
        copyfile(src=path + image_id, dst=os.path.join(class_dir, image_id))


_copy_split(train_data, train_label, "train_i")
_copy_split(val_data, val_label, "val_i")
_copy_split(X_test, y_test, "test_i")

# Preprocessing

In [None]:
# As our sample data is imbalance so we need to upsample it 
def get_WeightedSamplerDataLoader(training_dataset, batch_size, path=None, num_workers=4):
    """Build a DataLoader that oversamples rare classes.

    Each sample is weighted by the inverse of its class frequency and drawn
    with replacement, so every class is sampled with roughly equal probability.

    Args:
        training_dataset: dataset exposing ``imgs`` as a sequence of
            ``(file_path, class_index)`` pairs (as the ImageFolder datasets
            used in this notebook do).
        batch_size: number of samples per batch.
        path: kept for backward compatibility and ignored. Class counts are
            taken from ``training_dataset.imgs`` instead of walking the
            directory, so the result no longer depends on how many ``/``
            components the path has or on folder names matching class indices.
        num_workers: worker processes used by the DataLoader.

    Returns:
        A DataLoader over ``training_dataset`` driven by a
        WeightedRandomSampler that draws ``len(training_dataset)`` samples
        per epoch.
    """
    from collections import Counter

    labels = [label for _, label in training_dataset.imgs]
    class_counts = Counter(labels)

    # Inverse-frequency weight per sample; only the relative scale matters to
    # the sampler.
    sample_weights = [1.0 / class_counts[label] for label in labels]

    sampler = WeightedRandomSampler(
        sample_weights, num_samples=len(sample_weights), replacement=True
    )

    loader = DataLoader(
        training_dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
    )

    return loader


To increase the variety of training examples and to help prevent the model from overfitting, we apply several random transformations to the training images.

In [None]:
# Per-channel mean and standard deviation used to normalise every split.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


def _eval_transforms():
    """Deterministic pipeline shared by validation and test: resize, centre-crop, normalise."""
    return transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])


# Training pipeline: random augmentation followed by the same normalisation.
training_transforms = transforms.Compose([
    transforms.RandomRotation(30),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.10, contrast=0.1, saturation=0.1, hue=0.00),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])
validation_transforms = _eval_transforms()
testing_transforms = _eval_transforms()

# One ImageFolder dataset per split directory.
training_dataset = datasets.ImageFolder("/kaggle/working/train_i/", transform=training_transforms)
validation_dataset = datasets.ImageFolder("/kaggle/working/val_i/", transform=validation_transforms)
testing_dataset = datasets.ImageFolder("/kaggle/working/test_i/", transform=testing_transforms)

# Training batches come from the class-balancing weighted sampler; validation
# and test batches are read in dataset order.
train_loader = get_WeightedSamplerDataLoader(training_dataset, 16, "/kaggle/working/train_i")
validate_loader = DataLoader(validation_dataset, batch_size=32)
test_loader = DataLoader(testing_dataset, batch_size=32)

# Model EfficientNet_b0


In [None]:
# Install the efficientnet_pytorch package, which the next cell imports.
# NOTE(review): the version is not pinned — consider pinning it for reproducible runs.
!pip install efficientnet_pytorch

In [None]:
from efficientnet_pytorch import EfficientNet
# Load EfficientNet-B0 with pretrained weights (advprop=True requests the AdvProp checkpoint).
# NOTE(review): the efficientnet_pytorch README states that AdvProp weights expect inputs
# scaled as `img * 2.0 - 1.0` rather than ImageNet mean/std normalisation — confirm that
# the transforms used for the datasets match the chosen checkpoint.
model = EfficientNet.from_pretrained("efficientnet-b0",advprop=True)

In [None]:
# Function for the validation pass
def validation(model, validateloader, criterion):
    """Run one pass over a validation loader and accumulate loss and accuracy.

    Args:
        model: network mapping a batch of images to per-class scores.
        validateloader: iterable yielding ``(images, labels)`` batches.
        criterion: loss function called as ``criterion(output, labels)``.

    Returns:
        ``(val_loss, accuracy)`` where ``val_loss`` is the sum of per-batch
        losses and ``accuracy`` is the sum of per-batch accuracies. Callers
        divide both by the number of batches to obtain averages.

    The caller is responsible for putting the model in eval mode.
    """
    # Follow the model's own device instead of hard-coding 'cuda', so the
    # function also works on CPU-only machines.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")

    val_loss = 0.0
    accuracy = 0.0

    # No gradients are needed for evaluation, even if the caller forgot to
    # disable them.
    with torch.no_grad():
        for images, labels in validateloader:
            images, labels = images.to(device), labels.to(device)

            output = model(images)
            val_loss += criterion(output, labels).item()

            # The arg-max of the raw scores is the predicted class; applying
            # exp() first would not change it.
            predictions = output.argmax(dim=1)
            accuracy += (predictions == labels).float().mean().item()

    return val_loss, accuracy

In [None]:
print(model)

In [None]:
# Freeze the pretrained backbone. The optimizer below only updates the new
# classification head, so leaving requires_grad=True on the backbone would
# compute (and keep accumulating) gradients that are never applied.
# To fine-tune the whole network instead, set requires_grad=True here and pass
# model.parameters() to the optimizer.
for parameter in model.parameters():
    parameter.requires_grad = False

# Replace the final layer with a 5-class head. Parameters of a freshly created
# layer require gradients by default, so the head stays trainable. Reading
# in_features from the existing layer keeps this cell re-runnable.
model._fc = nn.Linear(model._fc.in_features, 5)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model._fc.parameters(), lr=1e-4,
                             weight_decay=1e-6)

In [None]:
# Train the classifier

#from workspace_utils import active_session

def train_classifier(epochs=8, print_every=40):
    """Train the notebook-level ``model`` and periodically report validation metrics.

    Uses the notebook globals ``model``, ``train_loader``, ``validate_loader``,
    ``criterion`` and ``optimizer``.

    Args:
        epochs: number of passes over ``train_loader``.
        print_every: run validation and print metrics every this many
            training steps (steps are counted across epochs).
    """
    # Move the model and every batch to the same device; previously batches
    # were always sent to 'cuda', which crashes on CPU-only machines.
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(run_device)

    steps = 0

    for e in range(epochs):

        model.train()

        running_loss = 0
        # Steps accumulated into running_loss since the last report. This can
        # be smaller than print_every right after an epoch boundary, because
        # running_loss is reset at the start of every epoch.
        steps_in_window = 0

        for images, labels in train_loader:

            steps += 1
            steps_in_window += 1

            images, labels = images.to(run_device), labels.to(run_device)

            optimizer.zero_grad()

            output = model(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            if steps % print_every == 0:

                model.eval()

                # Turn off gradients for validation, saves memory and computations
                with torch.no_grad():
                    validation_loss, accuracy = validation(model, validate_loader, criterion)

                print("Epoch: {}/{}.. ".format(e+1, epochs),
                      "Training Loss: {:.3f}.. ".format(running_loss/steps_in_window),
                      "Validation Loss: {:.3f}.. ".format(validation_loss/len(validate_loader)),
                      "Validation Accuracy: {:.3f}".format(accuracy/len(validate_loader)))

                running_loss = 0
                steps_in_window = 0
                # Back to training mode after the evaluation pass.
                model.train()
                  
train_classifier()

In [None]:
def test_accuracy(model, test_loader):

    # Do validation on the test set
    model.eval()
    model.to('cuda')

    with torch.no_grad():
    
        accuracy = 0
    
        for images, labels in iter(test_loader):
    
            images, labels = images.to('cuda'), labels.to('cuda')
    
            output = model.forward(images)

            probabilities = torch.exp(output)
        
            equality = (labels.data == probabilities.max(dim=1)[1])
        
            accuracy += equality.type(torch.FloatTensor).mean()
        
        print("Test Accuracy: {}".format(accuracy/len(test_loader)))    
        
        
test_accuracy(model, test_loader)

# Submission

In [None]:
from PIL import Image

# Predict a label for every image listed in the sample submission, instead of
# a single hard-coded file, so the submission stays valid whatever the size of
# the test set.
# NOTE(review): assumes sample_submission.csv has an `image_id` column holding
# file names inside test_images/ — confirm against the competition data.
test_image_dir = "/kaggle/input/cassava-leaf-disease-classification/test_images/"
df = pd.read_csv("/kaggle/input/cassava-leaf-disease-classification/sample_submission.csv")

# Preview the first test image, when there is one.
if len(df) > 0:
    with Image.open(os.path.join(test_image_dir, df["image_id"].iloc[0])) as preview_img:
        plt.imshow(np.array(preview_img))

# Run inference on the model's own device, in eval mode and without autograd.
infer_device = next(model.parameters()).device
model.eval()

predicted_labels = []
with torch.no_grad():
    for image_id in df["image_id"]:
        with Image.open(os.path.join(test_image_dir, image_id)) as test_img:
            # Force 3-channel RGB so the normalisation in testing_transforms applies.
            batch = testing_transforms(test_img.convert("RGB"))[None].to(infer_device)
        output = model(batch)
        # The arg-max of the raw scores is the predicted class index.
        predicted_labels.append(int(output.argmax(dim=1).item()))

df["label"] = predicted_labels

In [None]:
df.to_csv("submission.csv",index=False)