## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import json
import torch
import random
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
sys.path.append('../input/repvgg')
from repvgg import repvgg_model_convert, create_RepVGG_B1
import torch.nn as nn
import torchvision 
from torchvision import models,transforms
from PIL import Image
from torch.utils.data import Dataset , DataLoader 

%matplotlib inline
# Kaggle input locations of the competition data; the image folders are
# derived from BASE_DIR so the dataset root only has to be changed in one place.
BASE_DIR = "../input/plant-pathology-2021-fgvc8"
BASE_TRAIN_IMAGES_DIR = f"{BASE_DIR}/train_images"
BASE_TEST_IMAGES_DIR = f"{BASE_DIR}/test_images"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing at the first `.to(DEVICE)`.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Set Seed

In [None]:
def seed_it(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and asks cuDNN to use deterministic algorithms.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # PYTHONHASHSEED is the variable the interpreter actually recognises; it is
    # read at interpreter start-up, so it only influences Python processes
    # launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    # Seeds the CPU generator as well, which the CUDA-only calls do not touch
    # (DataLoader shuffling and weight initialisation draw from it).
    torch.manual_seed(seed)
    # Safe to call without a GPU: it is ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


seed_it(31)

In [None]:
# Let cuDNN benchmark its convolution algorithms and keep the fastest one.
# NOTE(review): seed_it() sets cudnn.deterministic = True; the autotuner
# enabled here may select different kernels from run to run — confirm that
# trading reproducibility for speed is intended.
torch.backends.cudnn.benchmark = True


In [None]:
# Load the training annotations and take a first look at them.
train_csv_path = os.path.join(BASE_DIR, 'train.csv')
train_df = pd.read_csv(train_csv_path)
train_df.info()
train_df.head(n=5)

## Encode Label
Reference: [this notebook](https://www.kaggle.com/ateplyuk/plant-2021-starter)

In [None]:
from sklearn import preprocessing

# Give every distinct label string an integer id and store it next to the
# original label in a new 'labels_id' column.
le = preprocessing.LabelEncoder()
train_df['labels_id'] = le.fit_transform(train_df['labels'])
train_df

In [None]:
# Lookup table {label id -> label string}, built from the (id, label) pairs of
# every row and ordered by id.
id_label_pairs = train_df[['labels_id', 'labels']].values.tolist()
id_label_pairs.sort()
CLASS_NAMES = dict(id_label_pairs)
CLASS_NAMES

In [None]:
# Horizontal bar chart of how many training rows carry each label,
# most frequent label at the top.
label_counts = train_df["labels"].value_counts()
label_counts_names = list(label_counts.index)
label_counts = label_counts.to_numpy()

fig, axis = plt.subplots()
axis.barh(label_counts_names, label_counts, align='center')
# barh draws the first entry at the bottom; flip so it ends up on top
axis.invert_yaxis()
fig.show()


## Check Image Size

In [None]:
def checkimagesize(paths):
    """Print how many training images there are of each pixel size.

    Args:
        paths: Iterable of image file names relative to BASE_TRAIN_IMAGES_DIR.

    Prints a dict mapping ``str(img.size)`` to the number of images with
    that size.
    """
    sizes = {}
    for p in paths:
        # Only the size is needed, so close each file straight away instead of
        # leaving one open handle per image behind.
        with Image.open(os.path.join(BASE_TRAIN_IMAGES_DIR, p)) as img:
            key = str(img.size)
        sizes[key] = sizes.get(key, 0) + 1
    print(sizes)
#checkimagesize(train_df['image'])
'''
{'(4000, 2672)': 16485,
'(4000, 3000)': 665,
'(2592, 1728)': 1027,
'(4608, 3456)': 123,
'(5184, 3456)': 193,
'(4032, 3024)': 132,
'(3024, 4032)': 3,
'(3024, 3024)': 3,
'(4000, 2248)': 1}
'''

## Define Dataset

In [None]:
class PlantDataset(Dataset):
    """Dataset yielding ``(image, label_id)`` pairs from an annotation frame.

    Args:
        df: DataFrame with an ``image`` column (file name inside
            BASE_TRAIN_IMAGES_DIR) and a ``labels_id`` column.
        transforms: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, transforms=None):
        super().__init__()
        self.dataframe = df
        self.transforms = transforms

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index: int):
        # Positional lookup, done once and reused for both columns.
        row = self.dataframe.iloc[index]
        label = row['labels_id']

        imgpath = os.path.join(BASE_TRAIN_IMAGES_DIR, row["image"])
        # Decode inside the context manager so the file handle is released
        # immediately, and force three channels so the 3-channel Normalize in
        # the transform pipelines never sees a grayscale/RGBA image.
        with Image.open(imgpath) as raw:
            img = raw.convert("RGB")
        if self.transforms:
            img = self.transforms(img)
        return img, label

In [None]:
def splitData(dataframe,p=0.8):
    """Randomly partition `dataframe` into a training and a validation part.

    One uniform number is drawn per row from NumPy's global generator; rows
    whose draw is below `p` go to the training part, all others to the
    validation part. Both sizes are printed.

    Returns:
        Tuple ``(train_dataframe, valid_dataframe)``.
    """
    keep_for_training = np.random.rand(len(dataframe)) < p
    train_part = dataframe.loc[keep_for_training]
    valid_part = dataframe.loc[~keep_for_training]
    for title, part in (("train", train_part), ("valid", valid_part)):
        print(f"{title} {len(part)}")
    return train_part, valid_part

# Random train/validation split of the annotations using splitData's default
# p=0.8 (each row lands in the training part with probability 0.8).
trainDataframe,validDataframe=splitData(train_df)
    

## Define Weighted Sampler for Balanced Data

In [None]:
def get_weight_for_balance(dataframe,numclass):
    """Compute one inverse-frequency sampling weight per row of `dataframe`.

    A class with ``c`` rows gets the weight ``len(dataframe) / c`` (0 for a
    class with no rows); every row receives the weight of its class. The
    per-class weights are printed.

    Args:
        dataframe: DataFrame with an integer ``labels_id`` column whose values
            lie in ``range(numclass)``.
        numclass: Number of classes.

    Returns:
        A float64 tensor with ``len(dataframe)`` weights, in row order.
    """
    numdata = len(dataframe)
    labels = torch.tensor(dataframe["labels_id"].tolist(), dtype=torch.long)
    # One pass over the labels instead of filtering the frame once per class.
    counts = torch.bincount(labels, minlength=numclass)[:numclass].tolist()
    weights_per_classes = [0 if c == 0 else numdata / c for c in counts]
    print(weights_per_classes)
    class_weights = torch.tensor(weights_per_classes, dtype=torch.float64)
    # Gather each row's class weight in one indexing step rather than one
    # DataFrame.iloc lookup per row.
    return class_weights[labels]

## Create Dataloader

In [None]:
# WeightedRandomSampler is not part of the notebook's top-level imports, so the
# Weighted=True branch below needs it brought into scope here.
from torch.utils.data import WeightedRandomSampler

# Set to True to draw training batches through the class-balancing sampler.
Weighted = False

BATCH_SIZE = 64

HEIGHT, WIDTH = 224, 224

numclasses = len(CLASS_NAMES)

# Channel statistics used for input normalisation
# (presumably the ImageNet mean/std matching the pretrained weights — confirm).
NORMALIZE_MEAN = (0.485, 0.456, 0.406)
NORMALIZE_STD = (0.229, 0.224, 0.225)

train_transform = transforms.Compose([
                                # Resize expects (height, width)
                                transforms.Resize((HEIGHT, WIDTH)),
                                #transforms.RandomCrop(400,300),
                                transforms.RandomHorizontalFlip(),
                                transforms.RandomVerticalFlip(),
                                #transforms.RandomRotation(90),
                                #transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.1, hue=0.1),
                                transforms.ToTensor(),
                                #transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))])
                                transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD)])

# Validation uses the same resize/normalisation but no random augmentation.
valid_transform = transforms.Compose([transforms.Resize((HEIGHT, WIDTH)),
                                    transforms.ToTensor(),
                                    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD)])

trainDataset = PlantDataset(trainDataframe, train_transform)
validDataset = PlantDataset(validDataframe, valid_transform)

if Weighted:
    # get_weight_for_balance already returns a double tensor, so it is passed
    # to the sampler as is.
    weights = get_weight_for_balance(trainDataframe, numclasses)
    weightsampler = WeightedRandomSampler(weights, num_samples=8000, replacement=True)
    # A sampler and shuffle=True are mutually exclusive in DataLoader.
    trainDataLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, num_workers=4, pin_memory=True, shuffle=False, sampler=weightsampler)
else:
    trainDataLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, num_workers=4, pin_memory=True, shuffle=True)

validDataLoader = DataLoader(validDataset, batch_size=BATCH_SIZE, num_workers=4, pin_memory=False)

## Create Model

In [None]:
# Build RepVGG-B1 (deploy=False), load the checkpoint into it, then replace the
# final linear layer with one sized for this task's classes.
PRETRAINED_WEIGHTS_PATH = "../input/repvggpretrainedweights/drive-download-20210121T111115Z-001/RepVGG-B1-train.pth"

model = create_RepVGG_B1(deploy=False)
# map_location="cpu" makes loading independent of the device the checkpoint
# was saved from; the whole model is moved to DEVICE afterwards.
model.load_state_dict(torch.load(PRETRAINED_WEIGHTS_PATH, map_location="cpu"))
in_features = model.linear.in_features
model.linear = nn.Linear(in_features, numclasses, bias=True)
print(model)
model = model.to(DEVICE)

## Warmup Learning Rate Scheduler

In [None]:
import math


def lrfn(epoch):
    """Return the learning rate for the given zero-based `epoch`.

    The schedule has three phases driven by module-level constants:

    * warm-up (``epoch < LR_RAMPUP_EPOCHS``): power-law ramp (exponent 2.5)
      starting at LR_START and heading towards LR_MAX;
    * sustain (the next LR_SUSTAIN_EPOCHS epochs): constant LR_MAX;
    * decay: half-cosine from LR_MAX down to LR_FINAL over DECAY_EPOCHS epochs.
    """
    if epoch < LR_RAMPUP_EPOCHS:
        # Interpolate between the two ends: the span is LR_MAX - LR_START, so
        # the ramp would reach exactly LR_MAX at the end of the warm-up.
        return LR_START + (LR_MAX - LR_START) * (epoch / LR_RAMPUP_EPOCHS) ** 2.5
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    epoch_diff = epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS
    # Clamp progress to 1 so the cosine cannot swing back up for epochs beyond
    # the schedule, and treat an empty decay span as already finished instead
    # of dividing by zero.
    progress = min(epoch_diff / DECAY_EPOCHS, 1.0) if DECAY_EPOCHS > 0 else 1.0
    decay_factor = (math.cos(progress * math.pi) + 1) / 2
    return LR_FINAL + (LR_MAX - LR_FINAL) * decay_factor

In [None]:
# Training length and the constants read by lrfn().
EPOCH = 20
LR_START = 1e-6
LR_MAX = 2e-4
LR_FINAL = 1e-6
LR_RAMPUP_EPOCHS = 2
LR_SUSTAIN_EPOCHS = 0
# Number of epochs over which the cosine phase goes from LR_MAX to LR_FINAL.
DECAY_EPOCHS = EPOCH - (LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS + 1)
# Per-epoch factor of an exponential decay over the same span
# (lrfn itself uses the cosine decay, not this factor).
LR_EXP_DECAY = (LR_FINAL / LR_MAX) ** (1 / DECAY_EPOCHS)

def show_lr_schedule(epochs):
    """Print the first, highest and last learning rate of the schedule and
    plot lrfn() for epochs ``0 .. epochs-1``.

    The x axis is labelled 1..epochs so the ticks read as epoch numbers.
    """
    epoch_ids = list(range(epochs))
    lrs = [lrfn(e) for e in epoch_ids]
    tick_positions = np.arange(epochs)
    tick_labels = [str(n) for n in np.arange(1, epochs + 1)]
    print('init lr {:.1e} to {:.1e} final {:.1e}'.format(lrs[0], max(lrs), lrs[-1]))

    plt.figure(figsize=(30, 10))
    # one tick per epoch, labelled starting at 1
    plt.xticks(tick_positions, tick_labels, fontsize=16)
    plt.yticks(fontsize=16)
    plt.plot(epoch_ids, lrs)
    plt.grid()
    plt.show()


show_lr_schedule(EPOCH)

In [None]:
def change_lr(op,epoch):
    """Set the learning rates of optimizer `op` for the given `epoch`.

    Args:
        op: Optimizer with at least two parameter groups.
        epoch: Zero-based epoch index passed to lrfn().

    Parameter group 0 receives half of the scheduled rate and group 1 the
    full rate.
    """
    newlr = lrfn(epoch)
    # Update the optimizer that was passed in rather than whatever the
    # module-level name `optimizer` happens to be bound to.
    op.param_groups[0]['lr'] = newlr*0.5
    op.param_groups[1]['lr'] = newlr

## Create Loss function and Optimizer

In [None]:

# Two parameter groups: everything except the `linear` head (group 0, half the
# base rate) and the `linear` head itself (group 1, full base rate).
# change_lr() addresses the groups by these positions.
topparams = [weights for pname, weights in model.named_parameters() if "linear" not in pname]
params = [
    {'params': topparams, 'lr': LR_START * 0.5},
    {'params': model.linear.parameters(), 'lr': LR_START},
]
optimizer = torch.optim.AdamW(params=params, lr=LR_START)
lossfunction = torch.nn.CrossEntropyLoss()
#scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.25, patience=5)

## Define Macro F1-Score Function

In [None]:
def calcaulateMacroF1(allpred,allans,allpredacc,nclasses):
    """Compute per-class recall/precision and a macro F1 score.

    Args:
        allpred: Per-class count of predictions made for that class.
        allans: Per-class count of ground-truth samples of that class.
        allpredacc: Per-class count of correct predictions.
        nclasses: Number of classes; each sequence must have at least this
            many entries.

    Returns:
        ``(recalls, precisions, macro_f1)`` where the first two are lists of
        ``"%.2f"``-formatted strings (one per class) and ``macro_f1`` is the
        harmonic mean of the class-averaged recall and precision. A class
        with a zero denominator contributes 0.
    """
    if nclasses <= 0:
        # nothing to average over; avoids dividing by zero below
        return [], [], 0
    # Iterate over every class: stopping at a fixed 5 would drop the remaining
    # classes from the sums while still dividing by nclasses.
    recalls = [0 if allans[i] == 0 else round(allpredacc[i]/allans[i],2) for i in range(nclasses)]
    precisions = [0 if allpred[i] == 0 else round(allpredacc[i]/allpred[i],2) for i in range(nclasses)]
    avg_recalls = sum(recalls) / nclasses
    avg_precisions = sum(precisions) / nclasses
    denominator = avg_recalls + avg_precisions
    macro_f1 = 0 if denominator == 0 else 2*(avg_recalls*avg_precisions)/denominator
    recalls = ["%.2f"%i for i in recalls]
    precisions = ["%.2f"%i for i in precisions]
    return recalls ,precisions ,macro_f1

## Train Model

In [None]:
# Lowest mean validation loss seen so far; evaluate() saves a checkpoint each
# time it is beaten. Starting at +inf guarantees the first evaluated epoch is
# saved even if the loss never drops below a fixed threshold.
minLoss = float("inf")

def train_one_epoch(dataloader , model):
    """Train `model` for one pass over `dataloader`, printing running metrics.

    Uses the module-level `optimizer`, `lossfunction`, `DEVICE` and
    `numclasses`. After every batch a progress line with the running mean
    loss, accuracy, macro F1 and per-class recall/precision is rewritten in
    place.

    Args:
        dataloader: Iterable of ``(images, labels)`` batches.
        model: Network to optimise; switched to training mode.
    """
    model.train()
    total_loss = 0.0
    total_acc = torch.zeros(numclasses, dtype=torch.long)  # samples per true class
    pred_acc = torch.zeros(numclasses, dtype=torch.long)   # correct predictions per class
    allpred = torch.zeros(numclasses, dtype=torch.long)    # predictions per class
    total_iter = len(dataloader)

    for iter_count, (imgs, labels) in enumerate(dataloader, start=1):
        imgs = imgs.to(DEVICE)
        labels = labels.to(DEVICE)

        # Clear the previous batch's gradients, back-propagate, then apply the
        # update; without zero_grad()/step() the weights would never change.
        optimizer.zero_grad()
        pred = model(imgs)
        loss = lossfunction(pred, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Per-class tallies for the whole batch at once (one device-to-host
        # transfer instead of one per sample).
        pred_labels = pred.detach().argmax(dim=1).cpu()
        true_labels = labels.cpu()
        allpred += torch.bincount(pred_labels, minlength=numclasses)
        total_acc += torch.bincount(true_labels, minlength=numclasses)
        pred_acc += torch.bincount(true_labels[pred_labels == true_labels], minlength=numclasses)

        recalls, precisions, macro_f1 = calcaulateMacroF1(allpred.tolist(), total_acc.tolist(), pred_acc.tolist(), numclasses)
        avg_loss = total_loss / iter_count
        accuracy = pred_acc.sum().item() / total_acc.sum().item()
        print("\rTrain {}/{} Loss:{} Acc:{} F1:{} Recall{} Precisions{}".format(iter_count,total_iter,"%.3f"%avg_loss,"%.3f"%accuracy,"%.2f"%macro_f1,recalls,precisions),end='',flush=True)
    print('')
    
    
def evaluate(dataloader,model,epoch):
    """Evaluate `model` on `dataloader` and checkpoint it when the loss improves.

    Uses the module-level `lossfunction`, `DEVICE` and `numclasses`. After
    every batch a progress line with the running mean loss, accuracy, macro
    F1 and per-class recall/precision is rewritten in place. When the final
    mean loss is lower than the module-level `minLoss`, `minLoss` is updated
    and the weights are written through savemodel().

    Args:
        dataloader: Iterable of ``(images, labels)`` batches.
        model: Network to evaluate; switched to eval mode.
        epoch: Epoch index, used only in the checkpoint file name.
    """
    global minLoss
    model.eval()
    total_loss = 0.0
    total_acc = torch.zeros(numclasses, dtype=torch.long)  # samples per true class
    pred_acc = torch.zeros(numclasses, dtype=torch.long)   # correct predictions per class
    allpred = torch.zeros(numclasses, dtype=torch.long)    # predictions per class
    total_iter = len(dataloader)
    avg_loss = None  # stays None when the loader yields no batch

    with torch.no_grad():
        for iter_count, (imgs, labels) in enumerate(dataloader, start=1):
            imgs = imgs.to(DEVICE)
            labels = labels.to(DEVICE)

            pred = model(imgs)
            # .item() keeps the running loss (and therefore minLoss) a plain
            # Python float rather than a device tensor.
            total_loss += lossfunction(pred, labels).item()

            # Per-class tallies for the whole batch at once.
            pred_labels = pred.argmax(dim=1).cpu()
            true_labels = labels.cpu()
            allpred += torch.bincount(pred_labels, minlength=numclasses)
            total_acc += torch.bincount(true_labels, minlength=numclasses)
            pred_acc += torch.bincount(true_labels[pred_labels == true_labels], minlength=numclasses)

            recalls, precisions, macro_f1 = calcaulateMacroF1(allpred.tolist(), total_acc.tolist(), pred_acc.tolist(), numclasses)
            avg_loss = total_loss / iter_count
            accuracy = pred_acc.sum().item() / total_acc.sum().item()
            print("\rValid {}/{} Loss:{} Acc:{} F1:{} Recall{} Precisions{}".format(iter_count,total_iter,"%.3f"%avg_loss,"%.3f"%accuracy,"%.2f"%macro_f1,recalls,precisions),end='',flush=True)

        #scheduler.step(avg_loss)
        # Skip the comparison entirely for an empty loader instead of failing
        # on an undefined avg_loss.
        if avg_loss is not None and avg_loss < minLoss:
            minLoss = avg_loss
            savemodel(model,f"modelweight_{avg_loss}_{epoch}.pkl")

    print('')
    
def savemodel(model,filepath,model_dir="/kaggle/working"):
    """Write the state dict of `model` to ``model_dir/filepath``.

    Args:
        model: Module whose ``state_dict()`` is saved.
        filepath: File name (relative to `model_dir`) of the checkpoint.
        model_dir: Output directory; created if it does not exist yet.

    Returns:
        The full path of the written checkpoint.
    """
    os.makedirs(model_dir, exist_ok=True)
    target = os.path.join(model_dir, filepath)
    torch.save(model.state_dict(), target)
    return target

In [None]:
# Main loop: apply this epoch's learning rates, train, then validate
# (evaluate() also takes care of checkpointing).
for epoch in range(EPOCH):
    change_lr(optimizer, epoch)
    print(f"EPOCH:{epoch + 1}")
    train_one_epoch(trainDataLoader, model)
    evaluate(validDataLoader, model, epoch)
    