## 1. Configurations

### 1. Libraries

In [None]:
# General
import numpy as np
import pandas as pd
import glob
import os
import PIL
from PIL import Image
import cv2
from collections import Counter

# Encoding
from sklearn import preprocessing


# Visualizations
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Neural Network
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

### 2. Notebook Configurations

In [None]:
# Report the PyTorch build in use and whether a CUDA device is visible.
print('Using:')
print('\nPyTorch version:', torch.__version__)
if torch.cuda.is_available():
    print('\n Running on GPU')
else:
    print('GPU device not found. Running on CPU')

In [None]:
# Pick the compute device. `is_available` has to be *called*: the bare
# function object is always truthy, which would select 'cuda' even on a
# machine without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


### 3. Classes

In [None]:
class PlantPathology(Dataset):
    """Dataset that pairs image files on disk with their labels.

    Args:
        root_dir: Directory containing the image files.
        image_names: Indexable collection of file names inside ``root_dir``.
        labels: Indexable collection of labels, aligned with ``image_names``.
        transform: Optional callable applied to each opened image.
    """

    def __init__(self, root_dir, image_names, labels, transform=None):
        self.root_dir = root_dir
        self.image_names = image_names
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair stored at ``index``."""
        # The label is looked up first, so a bad index fails before any
        # file is opened.
        target = self.labels[index]
        file_path = os.path.join(self.root_dir, self.image_names[index])
        sample = Image.open(file_path)

        if self.transform is None:
            return sample, target
        return self.transform(sample), target

### 4. Functions

In [None]:
def data_details(label_path, img_path):
    '''
    Input: Path to the label CSV (must contain a 'labels' column), path to
           the image directory
    Action: Summarise the label file (shape, column info, description,
            class shares, duplicate rows, sample rows) and the image
            directory (file count, dimensions of up to the first 100
            readable images)
    Output: None; all details are printed
    '''
    # Label Details
    df = pd.read_csv(label_path)
    n_rows = df.shape[0]
    n_classes = df['labels'].nunique()
    describe = df.describe()
    # Share of each class, as a whole-number percentage of all rows.
    classes = df['labels'].value_counts().to_frame()
    classes = classes.apply(lambda x: round((x / n_rows) * 100, 0))
    duplicates = df[df.duplicated()]

    # Image Details: list the directory once and reuse the result for both
    # the file count and the dimension sample.
    images = [name for name in os.listdir(img_path)
              if os.path.isfile(os.path.join(img_path, name))]
    num_images = len(images)

    image_dimensions = []
    for name in images[:100]:
        # Read from the directory that was passed in, not a fixed location.
        img = cv2.imread(os.path.join(img_path, name))
        if img is None:
            # cv2.imread signals an unreadable/non-image file by returning
            # None instead of raising; skip such files.
            continue
        image_dimensions.append((img.shape[1], img.shape[0]))  # (width, height)

    # One Counter supplies both the distinct dimensions and their counts,
    # which guarantees the two views are in matching order.
    dimension_counts = Counter(image_dimensions)
    dimensions = dimension_counts.keys()
    dimensions_frequency = dimension_counts.values()

    print('Train Data Details')
    print('\n')
    print('\nShape of the label file: ', df.shape)
    # DataFrame.info() prints its report itself and returns None, so it is
    # called on its own line rather than passed to print().
    print('\nData types of the columns:')
    df.info()
    print('\nData Description:', describe)
    print('\nTotal number of classes',n_classes)
    print('\nClasses:')
    print('\n',classes)
    print('\nRow duplicates:',duplicates)
    print('\nSample rows:')
    print('\n',df.head())
    print('\nIMAGE DETAILS')
    print('\nNumber of images in the training folder:',num_images)
    print('\nUnique dimensions from a sample of 100 images:',dimensions)
    print('Frequencies of dimensions from a sample of 100 images:',dimensions_frequency)

In [None]:
def data_loader_exploration(data_loader):
    '''
    Input: Data loader (any iterable yielding (images, labels) batches)
    Action: Take the first batch, print its details and draw its images
            as a grid
    Output: None; batch details are printed and the grid is plotted
    '''
    # Use the loader that was passed in rather than a notebook-level
    # global, so the function works for any loader.
    batch = next(iter(data_loader))
    images, labels = batch

    print('\nNumber of components in the batch:',len(batch))
    print('Type of batch:',type(batch))
    print('Shape of a batch:',images.shape)
    print('Length of the batch:',len(images))
    print('\n')
    grid = torchvision.utils.make_grid(images, nrow=10)
    plt.figure(figsize=(15,15))
    # Reorder the grid's axes from (0, 1, 2) to (1, 2, 0) before plotting;
    # presumably channel-first to channel-last for imshow -- confirm.
    plt.imshow(np.transpose(grid, (1,2,0)))
    print('\nlabels:', labels)

## 2. Raw Data Exploration

In [None]:
# Dataset locations, given relative to the notebook's working directory.
img_path = '../input/plant-pathology-2021-fgvc8/train_images'
label_path = '../input/plant-pathology-2021-fgvc8/train.csv'
# Print a summary of the label file and of the files found under img_path.
data_details(label_path,img_path)

## 3. Data Preparation

### 1. Label Encoding

In [None]:
# Load the label file and map each distinct value in 'labels' to an
# integer id stored in a new 'label_id' column.
train_df = pd.read_csv(label_path)
label_encoder = preprocessing.LabelEncoder()
train_df['label_id'] = label_encoder.fit_transform(train_df['labels'])

# Plain arrays of file names and encoded labels, row-aligned with train_df.
image_names = train_df['image'].values
labels = train_df['label_id'].values

### 2. Data Transform

In [None]:
# Per-image preprocessing: convert to a tensor, resize to 128x128, then
# normalise each of the three channels with the given mean and std.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((128, 128)),
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

### 3. Define Dataset

In [None]:
# Build the training dataset from the image directory, the file names and
# the encoded labels; `transform` is applied to each image when it is read.
train_set = PlantPathology(root_dir=img_path,
                           image_names=image_names,
                           labels = labels,
                           transform=transform)

### 4. Define Data Loader

In [None]:
# Serve the training set in batches of 64 samples, with shuffling enabled.
train_loader = DataLoader(train_set,
                          batch_size=64,
                          shuffle=True)

### 5. Loader Exploration

In [None]:
# Print the details of one batch and plot its images as a grid.
data_loader_exploration(train_loader)