In [None]:
import os
import pandas as pd
import numpy as np
import sklearn as sk
import cv2 
from skimage.transform import resize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torch.nn.functional as F
import torchvision
import torchvision.datasets as dataset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
import os
import gc
import copy
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt


import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

from pytorch_lightning import metrics
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.callbacks import LearningRateMonitor

from transformers import AdamW, get_cosine_schedule_with_warmup

from tqdm.notebook import tqdm

from PIL import Image

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [None]:
# Run switches for the notebook; later cells are guarded by these flags.
TRAIN=False  # when True, the optimizer is built and the training / loss-plot cells run
NUMBER_EPOCHS=8  # epoch count handed to run_train_validation_epochs and used for the loss plot
SUBMIT=True  # when True, the validation report and the submission cell run
LOAD=True  # when True, model weights are restored from LOAD_CHECK_PATH
EVALUATE_TRESHOLD=True  # when True, the threshold sweep over the validation loader runs
LOAD_CHECK_PATH='../input/train-checkpoints-effb5/checkpoints/efficientb5/epoch2.pb'  # checkpoint file read when LOAD is True

## get train csv

In [None]:
# Load the training annotations; later cells read its 'image' and 'labels' columns.
train_csv='/kaggle/input/plant-pathology-2021-fgvc8/train.csv'
train_df=pd.read_csv(train_csv)
train_df.head()

## Possible label Extraction

In [None]:
# Tally how often every individual label word occurs. Each entry of the
# 'labels' column is split on single spaces into its label words.
labels = train_df['labels'].to_numpy()
count_labels = {}
for label in labels:
  for word in label.split(' '):
    count_labels[word] = count_labels.get(word, 0) + 1

# The class index of a label word is the position at which it was first seen.
possible_labels = count_labels.keys()
map_dictionary = dict(zip(possible_labels, range(len(count_labels))))
map_dictionary.keys()

## label to tensor casting

In [None]:
def get_label_from_image_name(img_name:str)->str:
  """Return the 'labels' value of the first train_df row whose 'image' equals img_name.

  Every call filters the whole frame. Raises IndexError when no row matches.
  """
  matching_rows = train_df[train_df['image'] == img_name]
  return matching_rows['labels'].to_numpy()[0]

def label_to_tensor(label : str):
  """Encode a space-separated label string as a multi-hot float tensor of length 6.

  Every word is looked up in map_dictionary to find the position that is set
  to 1; a word missing from map_dictionary raises KeyError.
  """
  hot_positions = [map_dictionary[name] for name in label.split(' ')]
  encoded = torch.zeros([6], dtype=torch.float)
  for position in hot_positions:
    encoded[position] = 1
  return encoded

# Smoke check: encode an example string made of three label words.
st='scab frog_eye_leaf_spot complex'
label_to_tensor(st)

## Data Sources

In [None]:
# Directory listed to build the train/validation split.
base_image_dir='../input/resized-plant2021/img_sz_512'
# Directory listed to build the test loader for the submission.
base_test_dir='/kaggle/input/plant-pathology-2021-fgvc8/test_images'

## Configuration
   

In [None]:
class CONFIG:
    """Notebook-wide settings, read as plain class attributes (e.g. CONFIG.device).

    The class body runs once at definition time; that is when the device is
    picked and the 'using gpu' / 'using cpu' line is printed.
    """
    # Seed handed to set_seed().
    seed = 42

    # NOTE(review): model_name, img_size, epochs, min_lr, T_max, scheduler,
    # n_accumulate, n_fold, target_size and valid_batch_size are not read
    # anywhere else in the visible notebook - confirm before relying on them.
    model_name = 'tf_efficientnetv2_m_in21k'
    img_size = (256,256)
    target_size = 6

    # Batch sizes (the data loaders are built with train_batch_size).
    train_batch_size = 8
    valid_batch_size = 32

    # Optimisation settings (learning_rate and weight_decay feed the Adam optimizer).
    epochs = 5
    learning_rate = 1e-4
    min_lr = 1e-6
    weight_decay = 1e-6
    T_max = 5
    scheduler = None
    n_accumulate = 1
    n_fold = 5

    # Prefer the GPU whenever CUDA is available, and report the choice.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('using gpu' if device.type == 'cuda' else 'using cpu')

def set_seed(seed = 42):
    '''Seed every random number generator the notebook uses, for REPRODUCIBILITY.

    Seeds NumPy, Python's `random`, and PyTorch (CPU and every visible CUDA
    device), and switches cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: integer seed applied to all generators.
    '''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current one, so multi-GPU runs are
    # covered as well; both calls are silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed.
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # presumably only affects processes spawned after this call - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed all generators once, before any dataset, loader or model is created.
set_seed(CONFIG.seed)

# Dataset classes for train and test

In [None]:
class PlantDataset(Dataset):
    """Labelled image dataset yielding (image_tensor, multi_hot_label) pairs.

    Args:
        base_dir: directory that contains the image files.
        paths: sequence of file names relative to base_dir.
        transform: callable invoked as transform(image=array) that returns a
            mapping with an 'image' entry. When None, a 428x428 resize followed
            by ToTensorV2 is used.
    """

    def __init__(self,base_dir, paths ,transform=None ):
        super().__init__()
        self.base_dir = base_dir
        self.paths = paths
        # Fall back to a plain resize + tensor conversion when no pipeline is given.
        self.transform = transform if transform is not None else A.Compose([
            A.Resize(428, 428),
            ToTensorV2(),
        ])

    def __len__(self):
        """Number of file names in the dataset."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Load image idx as RGB, transform it and pair it with its encoded label."""
        file_name = self.paths[idx]
        pixels = np.array(Image.open(os.path.join(self.base_dir, file_name)).convert("RGB"))
        image_tensor = self.transform(image=pixels)['image']
        # The label string is looked up by file name, then multi-hot encoded.
        target = label_to_tensor(get_label_from_image_name(file_name))
        return (image_tensor, target)
    
    
class PlantDatasetTest(Dataset):
    """Unlabelled image dataset yielding (image_tensor, file_name) pairs.

    Args:
        base_dir: directory that contains the image files.
        paths: sequence of file names relative to base_dir.
        transform: callable invoked as transform(image=array) that returns a
            mapping with an 'image' entry. When None, a 428x428 resize followed
            by ToTensorV2 is used.
    """

    def __init__(self,base_dir, paths ,transform=None ):
        super().__init__()
        self.base_dir = base_dir
        self.paths = paths
        # Fall back to a plain resize + tensor conversion when no pipeline is given.
        self.transform = transform if transform is not None else A.Compose([
            A.Resize(428, 428),
            ToTensorV2(),
        ])

    def __len__(self):
        """Number of file names in the dataset."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Load image idx as RGB, transform it and return it with its file name."""
        file_name = self.paths[idx]
        pixels = np.array(Image.open(os.path.join(self.base_dir, file_name)).convert("RGB"))
        image_tensor = self.transform(image=pixels)['image']
        # There is no ground truth here: the file name is returned in the label slot.
        return (image_tensor, file_name)
    

## Creating Dataloader for train and validation

In [None]:
# Number of trailing file names held out for validation.
VALIDATION_SIZE = 1840

# NOTE(review): os.listdir returns entries in arbitrary order, so which files
# land in the validation split depends on the filesystem. The order is left
# untouched here so the split stays whatever it was when LOAD_CHECK_PATH was
# trained - confirm before sorting or shuffling this list.
paths=os.listdir(base_image_dir)
train_paths=paths[:-VALIDATION_SIZE]
validation_paths=paths[-VALIDATION_SIZE:]

# Resize + tensor conversion only; augmentation and normalisation happen
# inside the model (see Preprocess).
# NOTE: this name shadows the `torchvision.transforms` module imported in the
# first cell.
transforms=A.Compose([
                A.Resize(428, 428),
                ToTensorV2()
        ])

# Training batches are shuffled every epoch so that gradient steps do not see
# the files in the same fixed directory order each time.
train_loader = DataLoader(PlantDataset(base_image_dir,train_paths,transforms), shuffle=True,num_workers=2,batch_size=CONFIG.train_batch_size)
# Validation keeps a fixed order and, as before, the training batch size.
validation_loader = DataLoader(PlantDataset(base_image_dir,validation_paths,transforms), shuffle=False,num_workers=2,batch_size=CONFIG.train_batch_size)
    

## Creating preprocess class

In [None]:
from torch import nn
from torchvision import transforms as T

class Preprocess(nn.Module):
    """Scaling, random augmentation and normalisation applied inside the model.

    forward() divides the input by 255, runs it through `augment`, then through
    `normalize`. forward() does not look at `self.training`, so the random
    augmentation also runs when the module is in eval mode.
    """

    def __init__(self):
        super().__init__()

        # Random crop to 400x400, a centre crop to 385x385 half of the time,
        # random flips and a mild colour jitter.
        augment_steps = [
            T.RandomCrop((400, 400)),
            T.RandomApply(nn.ModuleList([T.CenterCrop((385,385))]),p=.5),
            T.RandomHorizontalFlip(.5),
            T.RandomVerticalFlip(.5),
            T.ColorJitter(brightness=.1, hue=.1,contrast=.1),
        ]
        self.augment = nn.Sequential(*augment_steps)

        # Per-channel mean / std normalisation.
        self.normalize = nn.Sequential(
            T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        )

    def forward(self,x):
        """Return the scaled, augmented and normalised version of x."""
        scaled = x / 255  # move to the [0, 1] interval
        return self.normalize(self.augment(scaled))

In [None]:
! pip install ../input/efficientnet-pytorch-offline-pack/EfficientNet-PyTorch


## Creating Efficientnet Model

In [None]:

from efficientnet_pytorch import EfficientNet

class Model(LightningModule):
    """EfficientNet-B5 backbone with a two-layer head producing 6 sigmoid outputs.

    forward() runs Preprocess on the input, feeds the result to the network and
    returns sigmoid probabilities, which is why the loss (`cost`) is nn.BCELoss.

    Args:
        lr, betas, weight_decay: stored by save_hyperparameters(); this class
            itself does not read them.
        drop1: dropout probability of the backbone's `_dropout` layer.
        drop2: dropout probability between the two linear layers of the head.
    """

    def __init__(self, lr=5e-3,betas=(.8, .9),weight_decay=10.,drop1=.3,drop2=.1):
        super().__init__()

        self.save_hyperparameters()

        self.cost = nn.BCELoss()

        self.preprocess = Preprocess()

        self.effnet = self._load_effnet(5,
                                  '../input/efficientnet-pytorch-offline-pack/efficientnet-b5-b6417697.pth',6)

    def _load_effnet(self, i, path, out):
        """Build efficientnet-b<i>, load the weights at `path`, and swap in a head with `out` outputs."""
        effnet = EfficientNet.from_name('efficientnet-b'+str(i))
        # map_location='cpu' lets the weights load on machines without a GPU
        # regardless of the device they were saved from; load_state_dict copies
        # the values into the freshly built module.
        # NOTE(review): torch.load unpickles the file - only point this at a
        # trusted weights file.
        effnet.load_state_dict(torch.load(path, map_location='cpu'))
        n_features = effnet._fc.in_features
        effnet._dropout = nn.Dropout(self.hparams.drop1)
        # The layout of this Sequential determines the state-dict keys of saved
        # checkpoints, so it must not be reordered.
        effnet._fc = nn.Sequential(
                        nn.Linear(n_features,1000),nn.ReLU(),
                        nn.Dropout(self.hparams.drop2),
                        nn.Linear(1000,out)
        )
        return effnet

    def forward(self,x):
        """Return per-class probabilities for the batch x."""
        x = self.preprocess(x)
        x = self.effnet(x)
        return torch.sigmoid(x)

    
# Build the model with its default hyperparameters; construction loads the
# EfficientNet-B5 weights file from disk.
model = Model()

# getting optimizer parameters

In [None]:
# Collect the parameters handed to the optimizer. With feature_extract enabled
# only parameters that require gradients are kept, and their names are listed.
feature_extract = True
params_to_update = model.parameters()
print("Params to learn:")
if feature_extract:
    trainable = [(name, param) for name, param in model.named_parameters()
                 if param.requires_grad == True]
    for name, _ in trainable:
        print("\t",name)
    params_to_update = [param for _, param in trainable]

In [None]:
# The optimizer is only created for training runs; when TRAIN is False this
# cell leaves the name `optimizer` undefined.
if TRAIN:
    optimizer = optim.Adam(params_to_update, lr=CONFIG.learning_rate, weight_decay=CONFIG.weight_decay)

## loading model to device

In [None]:
# Move the model to the device selected in CONFIG.
model.to(CONFIG.device)

### creating folders to store checkpoints

In [None]:
# Directory into which run_train_validation_epochs saves one checkpoint per epoch.
os.makedirs('./checkpoints/efficientb5/',exist_ok=True)

## training structure

In [None]:
import copy

def train_epoch(model,train_loader,optimizer,epoch):
    """Run one optimisation pass over train_loader and return the mean loss per sample.

    Args:
        model: module exposing `cost(outputs, labels)`; switched to train mode.
        train_loader: iterable of (inputs, labels) batches.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress output (printed as given).

    Returns:
        Sum over batches of (batch loss * batch size) divided by the number of
        samples seen; 0.0 when the loader yields no batch.
    """
    model.train()
    print('training epoch', epoch)
    window_loss = 0.0   # sum of batch losses since the last progress line
    total_loss = 0.0    # batch losses weighted by the real batch size
    samples_seen = 0
    for i, (inputs, labels) in enumerate(train_loader, 0):
        inputs=inputs.to(CONFIG.device)
        labels=labels.to(CONFIG.device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = model.cost(outputs, labels)
        loss.backward()
        optimizer.step()

        # Every batch contributes to the epoch total, weighted by its actual
        # size (the last batch may be smaller than the configured batch size).
        batch_loss = loss.item()
        batch_size = inputs.size(0)
        total_loss += batch_loss * batch_size
        samples_seen += batch_size

        # print statistics: mean batch loss over the last 100 mini-batches
        window_loss += batch_loss
        if i % 100 == 99:
            print('[%d, %5d] loss: %.3f' %
                  (epoch, i + 1, window_loss / 100))
            window_loss = 0.0

    return total_loss / samples_seen if samples_seen else 0.0

In [None]:
@torch.no_grad()
def valid_epoch(model,validation_loader,optimizer,epoch):
    """Evaluate the model on validation_loader and return the mean loss per sample.

    Runs under torch.no_grad() with the model in eval mode.

    Args:
        model: module exposing `cost(outputs, labels)`.
        validation_loader: iterable of (inputs, labels) batches.
        optimizer: accepted for signature compatibility; not used.
        epoch: epoch number, used only in the progress output (printed as given).

    Returns:
        Sum over batches of (batch loss * batch size) divided by the number of
        samples seen; 0.0 when the loader yields no batch.
    """
    model.eval()
    print('validation epoch', epoch)
    window_loss = 0.0   # sum of batch losses since the last progress line
    total_loss = 0.0    # batch losses weighted by the real batch size
    samples_seen = 0
    for i, (inputs, labels) in enumerate(validation_loader, 0):
        inputs=inputs.to(CONFIG.device)
        labels=labels.to(CONFIG.device)

        # forward only
        outputs = model(inputs)
        loss = model.cost(outputs, labels)

        # Every batch contributes to the epoch total, weighted by its actual
        # size (the last batch may be smaller than the configured batch size).
        batch_loss = loss.item()
        batch_size = inputs.size(0)
        total_loss += batch_loss * batch_size
        samples_seen += batch_size

        # print statistics: mean batch loss over the last 100 mini-batches
        window_loss += batch_loss
        if i % 100 == 99:
            print('[%d, %5d] loss: %.3f' %
                  (epoch, i + 1, window_loss / 100))
            window_loss = 0.0

    return total_loss / samples_seen if samples_seen else 0.0

In [None]:
import time
import gc
def run_train_validation_epochs(model, optimizer,train_loader,valid_loader,num_epochs):
    """Alternate training and validation for num_epochs epochs (numbered from 1).

    After every epoch both losses are appended to the history, printed, and the
    model's state dict is saved under ./checkpoints/efficientb5/.

    Returns:
        (model, history), where history maps 'Train Loss' and 'Valid Loss' to
        lists holding one value per epoch.
    """
    history = defaultdict(list)
    start = time.time()

    for epoch in range(1, num_epochs + 1):
        gc.collect()

        train_epoch_loss = train_epoch(model,train_loader=train_loader,optimizer=optimizer,epoch=epoch)
        history['Train Loss'].append(train_epoch_loss)

        valid_epoch_loss = valid_epoch(model, optimizer=optimizer,validation_loader=valid_loader,epoch=epoch)
        history['Valid Loss'].append(valid_epoch_loss)

        print(f'loss in epoch {epoch} train={train_epoch_loss} valid={valid_epoch_loss}')

        # One checkpoint file per epoch.
        PATH = "./checkpoints/efficientb5/epoch{:.0f}.pb".format(epoch)
        torch.save(model.state_dict(), PATH)
        print("Model Saved")

    # Break the elapsed wall-clock time into hours / minutes / seconds.
    time_elapsed = time.time() - start
    hours, remainder = divmod(time_elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        hours, minutes, seconds))

    return model, history

## RUN TRAIN

In [None]:
# Train for NUMBER_EPOCHS epochs; `history` is only defined when TRAIN is True.
if TRAIN:
    model,history=run_train_validation_epochs(model=model,optimizer=optimizer, train_loader=train_loader,valid_loader=validation_loader,num_epochs=NUMBER_EPOCHS)

## EVALUATE LOSS x EPOCH

In [None]:
if TRAIN:
    # Print, then plot, the per-epoch losses recorded during training.
    train_loss = history['Train Loss']
    print(train_loss)
    valid_loss = history['Valid Loss']
    print(valid_loss)

    total_epochs = NUMBER_EPOCHS
    epochs = list(range(1, total_epochs + 1))
    for curve, curve_name in ((train_loss, "Train"), (valid_loss, "Validation")):
        plt.plot(epochs, curve, label = curve_name)
    plt.title('Loss over epochs')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.show()

## LOAD A TRAINED CHECKPOINT

In [None]:
# Restore the weights saved at LOAD_CHECK_PATH, mapping tensors onto CONFIG.device.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
if LOAD:
    model.load_state_dict(torch.load(LOAD_CHECK_PATH,map_location=CONFIG.device))

## THRESHOLD EVALUATION

In [None]:
from sklearn.metrics import f1_score,accuracy_score,classification_report
def get_df_for_f1_in_validation(model,labels,dataloader):
    """Predict class probabilities for every sample of a labelled dataloader.

    Each batch is passed through the model N=4 times and the outputs are
    averaged.

    Args:
        model: callable returning an array-like of shape (batch, 6) per batch.
        labels: iterable of the label names, in output-column order.
        dataloader: iterable of (images, targets) batches.

    Returns:
        (df, ground_truth): df has an 'image' column ('sample_<k>', with k a
        running index over the whole loader) plus one probability column per
        label; ground_truth is a list with one target array per sample, in the
        same order as the rows of df.
    """
    labels = list(labels)

    model.eval()
    N = 4
    records = []
    ground_truth = []
    # Inference only: skip building autograd graphs for the repeated passes.
    with torch.no_grad():
        for img, y in dataloader:
            y_hat = np.zeros((img.size(0),6))
            img = img.to(CONFIG.device)
            for _ in range(N):
                y_hat += model(img).detach().cpu().numpy()
            y_hat /= N
            for tensor in y:
                ground_truth.append(tensor.cpu().numpy())
            for lbls in y_hat:
                # Rows are gathered in a list and turned into a frame once;
                # growing a DataFrame row by row is quadratic, and
                # DataFrame.append no longer exists in pandas 2.x.
                records.append({'image': f'sample_{len(records)}', **dict(zip(labels, lbls))})
    df = pd.DataFrame(records, columns=['image', *labels])
    return df,ground_truth

def binarize_predictions(df,labels,tresholds):
    """Add a 0/1 column '<label>_' for every label to df and return them as an array.

    A probability counts as positive unless it is below `tresholds[label]`.
    The returned array has one row per row of df and one column per label.
    """
    extract_labels=[]
    for label in labels:
        cutoff=tresholds[label]
        df[label+'_']=df[label].apply(lambda x, cutoff=cutoff: 0 if x<cutoff else 1)
        extract_labels.append(label+'_')
    return df[extract_labels].to_numpy()

def get_score_from_treshold(df,ground_truth,labels,tresholds):
    """Return the micro-averaged F1 of the thresholded predictions in df.

    Args:
        df: frame with one probability column per label; '<label>_' columns
            holding the binarised values are added to it.
        ground_truth: one multi-hot target per row of df.
        labels: iterable of label names.
        tresholds: mapping from label name to its cut-off.
    """
    # The cut-offs come from the `tresholds` argument, not from a module-level
    # variable that happens to carry a similar name.
    pred=binarize_predictions(df,labels,tresholds)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score=f1_score(np.asarray(ground_truth),pred,average='micro')
    return score

def get_score_and_accuracy_from_treshold(df,ground_truth,labels,tresholds):
    """Print a per-class report and return (micro F1, subset accuracy).

    Takes the same arguments as get_score_from_treshold.
    """
    pred=binarize_predictions(df,labels,tresholds)
    truth=np.asarray(ground_truth)

    # (y_true, y_pred) order matters for the report: swapping the two would
    # exchange precision with recall and take the support from the predictions.
    score_f1=f1_score(truth,pred,average='micro')
    score_acc=accuracy_score(truth,pred)
    print(classification_report(truth,pred,target_names=list(labels)))
    return score_f1,score_acc

In [None]:
# Cut-off applied to every class; stays at 0.5 when the sweep below is skipped.
treshold_value=0.5

if EVALUATE_TRESHOLD:
    labels=map_dictionary.keys()
    df_f1,ground_truth=get_df_for_f1_in_validation(model,labels,validation_loader)
    treshold_values=[0.4,0.5,0.6,0.7,0.8]
    scores_f1=[]
    best_f1=0
    for value in treshold_values:
        # The same cut-off is used for every label.
        thresholds=dict.fromkeys(labels, value)
        score=get_score_from_treshold(df_f1,ground_truth,labels,thresholds)
        scores_f1.append(score)
        # Keep the first cut-off that reaches the best F1 seen so far.
        if(score > best_f1):
            best_f1=score
            treshold_value=value

    # The curve carries a label so that plt.legend() has an entry to show.
    plt.plot(treshold_values,scores_f1,label='micro F1')
    plt.xlabel('treshold')
    plt.ylabel('f1 score')
    plt.title('F1 in validation x treshold')
    plt.legend()
    plt.show()

treshold_value

In [None]:
if SUBMIT:
    # Score the validation split once more at the selected cut-off and print
    # the per-class report together with F1 and accuracy.
    labels = map_dictionary.keys()
    thresholds = dict(zip(labels, [treshold_value] * 6))
    df_f1, ground_truth = get_df_for_f1_in_validation(model, labels, validation_loader)
    score_f1, score_acc = get_score_and_accuracy_from_treshold(
        df_f1, ground_truth, labels, thresholds)
    print(f"validation f1_score={score_f1} accuracy={score_acc}")

## CREATE SUBMISSION

In [None]:
if SUBMIT:
    # Predict the test images and write submission.csv with a 'labels' and an
    # 'image' column.
    labels=list(map_dictionary.keys())
    thresholds=dict.fromkeys(labels, treshold_value)
    paths=os.listdir(base_test_dir)
    test_loader = DataLoader(PlantDatasetTest(base_test_dir,paths), shuffle=False,num_workers=2,batch_size=CONFIG.train_batch_size)

    model.eval()
    N = 4  # forward passes averaged per batch
    records=[]
    # Inference only: skip building autograd graphs for the repeated passes.
    with torch.no_grad():
        for img, y in test_loader:
            img=img.to(CONFIG.device)
            y_hat = np.zeros((img.size(0),6))
            for _ in range(N):
                y_hat += model(img).detach().cpu().numpy()
            y_hat /= N
            # y holds the file names of the batch (see PlantDatasetTest).
            for image,lbls in zip(y,y_hat):
                records.append({'image':image,**dict(zip(labels,lbls))})
    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    df=pd.DataFrame(records,columns=['image',*labels])

    def _to_label_string(record):
        """Join the labels at or above their cut-off; fall back to 'healthy'."""
        picked=[label for label in labels if not (record[label]<thresholds[label])]
        joined=' '.join(picked)  # no trailing space, unlike label+' ' concatenation
        # No label selected, or 'healthy' among the selected ones -> 'healthy' alone.
        if joined=='' or 'healthy' in joined:
            return 'healthy'
        return joined

    df['labels']=[_to_label_string(record) for record in records]
    df[['labels','image']].to_csv('submission.csv',index=False)


    
    