In [None]:
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed, including deprecation notices from the libraries imported later.
warnings.filterwarnings('ignore')

In [None]:
import torch_xla
import torch_xla.core.xla_model as xm

import os
import time
import random
import numpy as np
import pandas as pd
import cv2
import albumentations
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models

from typing import Tuple, Callable
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import hamming_loss
from sklearn.model_selection import train_test_split
from albumentations.pytorch import ToTensorV2 as AT


# System constants
SEED = 42  # shared seed for RNG seeding, KFold and train_test_split
DEFAULT_TENSOR_TYPE = 'torch.FloatTensor'
# Passed to DataLoader as num_workers. With 0 the batches are loaded in the
# main process (no worker subprocesses are started).
WORKERS = 0

# Path constants
MAIN_FOLDER = '../input/plant-pathology-2021-fgvc8'
# Path (relative to the working directory) to the folder with the resized train images
DATA_FOLDER = '../input/plant-pathology-2021-224x224/train_imgs'
DATAFRAME_PATH = os.path.join(MAIN_FOLDER, 'train.csv')
MODEL_FILE = 'model.pt'  # file the best model state dict is saved to

# DataFrame related constants
IMAGES_COL_NAME = 'image'
LABELS_COL_NAME = 'labels'

# Image parameters
IMG_SIZE = (224, 224)  # unpacked into albumentations.Resize

# Data manipulating constants
N_EPOCHS = 1
BATCH_SIZE = 1024
K = 5  # number of cross-validation folds
TEST_SIZE = 0.25  # fraction of the data held out as the test set

# Optimization constants
LR = 1e-2  # learning rate of the SGD optimizer
FACTOR = 0.805  # `factor` argument of ReduceLROnPlateau
SCHEDULER_PATIENCE = 2  # `patience` argument of ReduceLROnPlateau


# Type aliases used in the annotations below
DataColumns = Tuple[pd.Series, pd.Series]
DataSample = Tuple[torch.Tensor, torch.Tensor]  # (image, label) pair returned by the dataset
ModelPrediction = Tuple[np.ndarray, np.ndarray]  # (predictions, labels) arrays
DataFramesPair = Tuple[pd.DataFrame, pd.DataFrame]  # (train, test) frames
DataloadersPair = Tuple[DataLoader, DataLoader]  # (train, validation) loaders
LossFn = nn.Module
ScheduleFn = ReduceLROnPlateau
ScoreFn = Callable[[np.ndarray, np.ndarray], float]

# Training pipeline: resize, random augmentations, normalization, random
# region dropout and finally conversion to a tensor.
train_transforms = albumentations.Compose([
    albumentations.Resize(*IMG_SIZE),
    albumentations.Flip(p=0.5),
    albumentations.RandomBrightnessContrast(p=0.5),
    albumentations.ShiftScaleRotate(p=0.5),
    albumentations.Normalize(),  # library-default mean/std
    # NOTE(review): CoarseDropout and Cutout look like overlapping
    # augmentations - confirm that applying both is intended.
    albumentations.CoarseDropout(p=0.5),
    albumentations.Cutout(p=0.5),
    AT()  # ToTensorV2 (see the import alias)
])

# Validation pipeline: deterministic steps only (resize, normalize, to tensor).
# The name is misspelled ("validaton") but it is referenced elsewhere in the
# file, so any rename has to be done everywhere at once.
validaton_transforms = albumentations.Compose([
    albumentations.Resize(*IMG_SIZE),
    albumentations.Normalize(),
    AT()
])

# The test set reuses the very same pipeline object as validation.
test_transforms = validaton_transforms

  
class PathologyDataset(Dataset):
    '''
    Dataset of Plant Pathology Challenge images and their label vectors.
    Image file names and labels are read from the IMAGES_COL_NAME and
    LABELS_COL_NAME columns of the given data frame.
    '''

    def __init__(self, df: pd.DataFrame, images_path: str, transforms=None):
        '''
        df: frame holding the image file names and the labels
        images_path: folder the image file names are resolved against
        transforms: optional callable invoked as transforms(image=...) that
                    returns a mapping with an 'image' entry
        '''
        self.images_names = df[IMAGES_COL_NAME].tolist()
        self.labels = df[LABELS_COL_NAME].tolist()
        self.images_path = images_path
        self.transforms = transforms

    def __len__(self) -> int:
        return len(self.images_names)

    def __getitem__(self, idx: int) -> DataSample:
        '''
        Load the idx-th image as RGB, apply the transforms and return it
        together with its label converted to a tensor.
        Raises FileNotFoundError when the image file cannot be read.
        '''
        image_file = os.path.join(self.images_path, self.images_names[idx])

        # cv2.imread does not raise on a missing or unreadable file, it returns
        # None; fail here with the offending path instead of inside cvtColor.
        image = cv2.imread(image_file)
        if image is None:
            raise FileNotFoundError('Cannot read image file: {}'.format(image_file))

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = PathologyDataset.__transform_image(image, self.transforms)
        label = torch.tensor(self.labels[idx])

        return image, label

    @staticmethod
    def __transform_image(image: np.ndarray, transforms) -> torch.Tensor:
        '''
        Method that encapsulates augmentation library API.
        When no transforms are given the image is returned unchanged, i.e. as
        the array that was passed in rather than as a tensor.
        '''
        if transforms is None:
            return image
        return transforms(image=image)['image']
    
    
class TrainEngine:
    '''
    Static helpers that train a network with K-fold cross-validation on the XLA device
    '''
    # Device all batches are moved to; resolved once, when the class is created.
    __device = xm.xla_device()

    @staticmethod
    def train_model(model: nn.Module, df: pd.DataFrame, criterion: nn.Module, optimizer: optim.Optimizer,
                scheduler: ScheduleFn, n_epochs: int) -> None:
        '''
        Run n_epochs epochs; every epoch walks through the K folds of df, training on
        the train part of a fold and validating (without gradients) on the rest.
        The same model instance is used for all folds. The fold scores are averaged
        per epoch and the state dict is saved to MODEL_FILE whenever that mean beats
        the best mean seen so far. Progress is reported with print().
        '''
        best_score = -np.inf
        splitter = KFold(n_splits=K, shuffle=True, random_state=SEED)

        for epoch in range(1, n_epochs + 1):
            print('Epoch: #{}'.format(epoch))
            score_sum = 0

            folds = splitter.split(df[IMAGES_COL_NAME], df[LABELS_COL_NAME])
            for fold_number, (train_idx, valid_idx) in enumerate(folds, start=1):
                trainloader, validloader = TrainEngine.__prepare_dataloaders(df, train_idx, valid_idx)
                TrainEngine.__train_one_fold(model, criterion, optimizer, trainloader, fold_number)

                with torch.no_grad():
                    fold_score = TrainEngine.__validate_one_fold(
                        model, criterion, optimizer, scheduler, validloader)
                score_sum += fold_score
                del trainloader, validloader

            # end of epoch
            epoch_score = score_sum / K
            print('Mean validation score: {:.4f}'.format(epoch_score))

            if epoch_score > best_score:
                print('Saving model...')
                best_score = epoch_score
                xm.save(model.state_dict(), MODEL_FILE)

    @staticmethod
    def get_device_type() -> torch.device:
        '''
        Return the device object the engine works on
        '''
        return TrainEngine.__device

    @staticmethod
    def __prepare_dataloaders(df: pd.DataFrame, train_indices: np.ndarray, valid_indices: np.ndarray) -> DataloadersPair:
        '''
        Slice df by position into train/validation parts and wrap each part into a
        dataset (with its own transforms) and a DataLoader.
        '''
        train_part = df.iloc[train_indices, :].reset_index(drop=True)
        valid_part = df.iloc[valid_indices, :].reset_index(drop=True)

        train_ds = PathologyDataset(train_part, DATA_FOLDER, train_transforms)
        valid_ds = PathologyDataset(valid_part, DATA_FOLDER, validaton_transforms)

        # Settings shared by both loaders
        shared = dict(batch_size=BATCH_SIZE, pin_memory=False, num_workers=WORKERS)
        trainloader = DataLoader(train_ds, drop_last=False, **shared)
        validloader = DataLoader(valid_ds, shuffle=False, **shared)

        return trainloader, validloader

    @staticmethod
    def __train_one_fold(model: nn.Module, criterion: LossFn, optimizer: optim.Optimizer,
                         dataloader: DataLoader, fold: int) -> None:
        '''
        One pass over dataloader in train mode; prints the sum of the batch losses.
        '''
        print(time.ctime(), 'Fold:', fold)

        device = TrainEngine.__device
        model.train()
        loss_sum = 0.0

        for batch_images, batch_labels in dataloader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()

            # The criterion receives sigmoid outputs, not raw model outputs
            probabilities = torch.sigmoid(model(batch_images))
            batch_loss = criterion(probabilities, batch_labels)
            batch_loss.backward()
            xm.optimizer_step(optimizer, barrier=True)

            loss_sum += batch_loss.item()

        print('Training loss: {:.4f}'.format(loss_sum))

    @staticmethod
    def __validate_one_fold(model: nn.Module, criterion: LossFn, optimizer: optim.Optimizer,
                            scheduler: ScheduleFn, dataloader: DataLoader) -> float:
        '''
        One pass over dataloader in eval mode. Steps the scheduler with the summed
        validation loss, prints and returns 1 - Hamming loss of the rounded outputs.
        The optimizer argument is accepted but not used.
        '''
        device = TrainEngine.__device
        model.eval()

        loss_sum = 0.0
        predicted_batches = []
        label_batches = []

        for batch_images, batch_labels in dataloader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            probabilities = torch.sigmoid(model(batch_images))
            batch_loss = criterion(probabilities, batch_labels)

            loss_sum += batch_loss.item()
            predicted_batches.append(torch.round(probabilities).cpu().detach().numpy())
            label_batches.append(batch_labels.cpu().detach().numpy())

        scheduler.step(loss_sum)

        all_predictions = np.concatenate(predicted_batches)
        all_labels = np.concatenate(label_batches)
        accuracy = 1 - hamming_loss(all_predictions, all_labels)
        print('Validation multi-label Hamming accuracy: {:.4f}'.format(accuracy))

        return accuracy

    
class TestEngine:
    '''
    Static helpers that evaluate a trained network on a held-out data frame
    '''
    @staticmethod
    def test_model(model: nn.Module, df: pd.DataFrame) -> None:
        '''
        Predict on every row of df and print 1 - Hamming loss of the predictions.
        '''
        loader = TestEngine.__prepare_dataloader(df)
        predictions, targets = TestEngine.__get_predictions(model, loader)

        accuracy = 1 - hamming_loss(np.round(predictions), targets)
        print('Test multi-label Hamming accuracy: {:.4f}'.format(accuracy))

    @staticmethod
    def __prepare_dataloader(df: pd.DataFrame) -> DataLoader:
        '''
        Wrap df into a dataset with the test transforms and a non-shuffling loader.
        '''
        return DataLoader(
            PathologyDataset(df, DATA_FOLDER, test_transforms),
            batch_size=BATCH_SIZE, pin_memory=False, shuffle=False, num_workers=WORKERS,
        )

    @staticmethod
    def __get_predictions(model: nn.Module, dataloader: DataLoader) -> ModelPrediction:
        '''
        Run the model in eval mode over dataloader and return two arrays: the rounded
        sigmoid outputs and the labels, both concatenated over all batches.
        '''
        device = TrainEngine.get_device_type()
        model.eval()

        predicted_batches = []
        label_batches = []

        for batch_images, batch_labels in dataloader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            probabilities = torch.sigmoid(model(batch_images))

            predicted_batches.append(torch.round(probabilities).cpu().detach().numpy())
            label_batches.append(batch_labels.cpu().detach().numpy())

        return np.concatenate(predicted_batches), np.concatenate(label_batches)


def seed_everything(seed: int) -> None:
    '''
    Seed the Python, NumPy and PyTorch random generators with the same value,
    export it as PYTHONHASHSEED and switch cuDNN to deterministic mode.
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # The flag is spelled `deterministic`; assigning to a misspelled attribute
    # is accepted silently and leaves the real flag untouched.
    torch.backends.cudnn.deterministic = True


def read_data(df_path: str) -> DataFramesPair:
    '''
    Load the training CSV, binarize its labels and split it into train and test frames.

    Image names are kept exactly as listed in the CSV so that every label row
    stays attached to its own image; they are only checked against the content
    of DATA_FOLDER. Labels are split on whitespace and encoded as float32
    multi-hot lists.

    df_path: path to the CSV with the IMAGES_COL_NAME and LABELS_COL_NAME columns
    Returns (train_df, test_df). Both frames also carry an 'index' column with
    the original row labels, produced by reset_index().
    Raises FileNotFoundError when a listed image is absent from DATA_FOLDER.
    '''
    df = pd.read_csv(df_path)

    # os.listdir() returns names in arbitrary order, so it must never be used to
    # fill the image column - it is used here only for an existence check.
    available = set(os.listdir(DATA_FOLDER))
    missing = df.loc[~df[IMAGES_COL_NAME].isin(available), IMAGES_COL_NAME]
    if not missing.empty:
        raise FileNotFoundError(
            '{} image(s) listed in {} are missing from {} (first: {})'.format(
                len(missing), df_path, DATA_FOLDER, missing.iloc[0]))

    df[LABELS_COL_NAME] = df[LABELS_COL_NAME].apply(str.split).apply(tuple)
    df[LABELS_COL_NAME] = MultiLabelBinarizer().fit_transform(df[LABELS_COL_NAME]).astype('float32').tolist()

    train_images, test_images, train_labels, test_labels = train_test_split(
        df[IMAGES_COL_NAME], df[LABELS_COL_NAME], test_size=TEST_SIZE,
        shuffle=True, random_state=SEED
    )
    train_df = pd.DataFrame({IMAGES_COL_NAME: train_images, LABELS_COL_NAME: train_labels}).reset_index()
    test_df = pd.DataFrame({IMAGES_COL_NAME: test_images, LABELS_COL_NAME: test_labels}).reset_index()

    return train_df, test_df


def get_n_classes(df: pd.DataFrame) -> int:
    '''
    Return the length of the label vector stored in the first row of df.
    The row is picked by position, so the frame's index does not need to
    contain the label 0.
    '''
    return len(df[LABELS_COL_NAME].iloc[0])


def create_model(n_classes: int) -> nn.Module:
    '''
    Build a pretrained ResNet-18 with frozen weights and a fresh linear head
    of n_classes outputs, moved to the engine's device.
    '''
    backbone = models.resnet18(pretrained=True)

    # Freeze every pretrained weight; the head assigned below is created
    # afterwards and therefore keeps its gradients enabled.
    for weight in backbone.parameters():
        weight.requires_grad = False

    backbone.fc = nn.Linear(512, n_classes)

    return backbone.to(TrainEngine.get_device_type())


def begin_session() -> None:
    '''
    Print a run of blank lines that visually separates the session output
    '''
    separator = '\n' * 5
    print(separator)


def main():
    '''
    Full pipeline: seed the generators, read and split the data, build the model
    with its loss, optimizer and scheduler, train, then report the test score.
    '''
    seed_everything(SEED)
    torch.set_default_tensor_type(DEFAULT_TENSOR_TYPE)

    train_df, test_df = read_data(DATAFRAME_PATH)
    model = create_model(get_n_classes(train_df))

    device = TrainEngine.get_device_type()
    criterion = nn.BCELoss().to(device)
    # Only parameters that still require gradients are handed to the optimizer
    trainable = (param for param in model.parameters() if param.requires_grad)
    optimizer = torch.optim.SGD(trainable, lr=LR)
    scheduler = ReduceLROnPlateau(optimizer, factor=FACTOR, patience=SCHEDULER_PATIENCE)

    begin_session()
    TrainEngine.train_model(model, train_df, criterion, optimizer, scheduler, N_EPOCHS)
    print('Training complited.', end='\n\n')

    print('Using model with test set...')
    with torch.no_grad():
        TestEngine.test_model(model, test_df)

    # Drop the references to the training objects before leaving
    del model, criterion, optimizer, scheduler
    torch.cuda.empty_cache()
    
    
# Start the pipeline only when this file is the entry module, not when it is imported.
if __name__ == '__main__':
    main()