In [None]:
import numpy as np 
import pandas as pd 
import glob
import json
import os
import seaborn as sns
import cv2
import matplotlib.pyplot as plt

<!-- The training set images are organized in subfolders **h22-train/images/subfolder1/subfolder2/image_id.jpg**, where subfolder1 and subfolder2 come from the **first three and the last two digits of the image_id**. **The image_id is formed by combining the category_id with a unique number that distinguishes images within a plant taxon.** -->

In [None]:
# Resolve every competition input by its explicit name rather than by its
# position in a glob() listing: glob returns entries in arbitrary,
# filesystem-dependent order, so positional indexing can silently bind the
# wrong file to a variable.
# NOTE(review): the file/folder names below follow the published layout of the
# herbarium-2022-fgvc9 dataset — confirm against the mounted input directory.
INPUT_DIR = '../input/herbarium-2022-fgvc9'
INPUT_BASE_FILES = sorted(glob.glob(os.path.join(INPUT_DIR, '*')))

train_metadata_json = os.path.join(INPUT_DIR, 'train_metadata.json')
sample_submission_csv = os.path.join(INPUT_DIR, 'sample_submission.csv')
test_metadata_json = os.path.join(INPUT_DIR, 'test_metadata.json')
train_images_folder = os.path.join(INPUT_DIR, 'train_images')
test_images_folder = os.path.join(INPUT_DIR, 'test_images')


# EDA

In [None]:
# Parse both metadata files. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's locale default.
with open(train_metadata_json, encoding='utf-8') as json_file:
    train_metadata = json.load(json_file)

with open(test_metadata_json, encoding='utf-8') as json_file:
    test_metadata = json.load(json_file)

In [None]:
# List the top-level sections of the training metadata.
print(train_metadata.keys()) # A dictionary

In [None]:
# Peek at the first two test-metadata records.
print(test_metadata[:2]) # A list

In [None]:
# Report how many entries each top-level metadata section holds.
for section_name, section_records in train_metadata.items():
    print(f'| Key : {section_name}   >>  Total values  : {len(section_records)} ')

In [None]:
# Lookup table genus_id -> genus name, built from the 'genera' records.
gen = train_metadata.get('genera')
genera_dict = {entry.get('genus_id'): entry.get('genus') for entry in gen}

In [None]:
print('Sample Values of each keys .. \n')

# (header, metadata key) pairs; each header string is kept verbatim because
# the sections do not all use the same spacing.
_sample_sections = [
    ('[+] Images ---\n', 'images'),
    ('[+] Annotations ---\n', 'annotations'),
    ('[+] Categories ---\n', 'categories'),
    ('[+] Genera --- \n ', 'genera'),
    ('[+] Distances --- \n ', 'distances'),
    ('[+] Institutions --- \n ', 'institutions'),
    ('[+] License --- \n ', 'license'),
]

for _position, (_header, _key) in enumerate(_sample_sections):
    if _position:
        # Separator between consecutive sections (none after the last one).
        print('\n')
    print(_header)
    # Show the first record of the section.
    print(train_metadata.get(_key)[0])

In [None]:
# Image information: one entry per annotation that has a matching image record.
file_names = []
image_ids = []
genus_ids = []
genus_names = []
category_ids = []
institution_ids = []
image_paths = []

# Index the image records by id so every annotation is paired with its own
# image regardless of the relative order (or length) of the two lists.
images_by_id = {image.get('image_id'): image for image in train_metadata.get('images')}

for annotation in train_metadata.get('annotations'):
    image_id = annotation.get('image_id')
    image = images_by_id.get(image_id)
    if image is None:
        # Annotation without an image record: there is no file to point at.
        continue

    file_name = image.get('file_name')
    genus_id = annotation.get('genus_id')
    category_id = annotation.get('category_id')
    institution_id = annotation.get('institution_id')

    file_names.append(file_name)
    image_ids.append(image_id)
    genus_ids.append(genus_id)
    # Translate the numeric genus id to its name via the lookup table.
    genus_names.append(genera_dict.get(genus_id))
    category_ids.append(category_id)
    institution_ids.append(institution_id)
    image_paths.append(os.path.join(train_images_folder, file_name))

In [None]:
# Assemble the per-image lists collected above into a single table.
training_images_df = pd.DataFrame({
    'FileNames': file_names,
    'ImageID': image_ids,
    'GenusID': genus_ids,
    'GenusNames': genus_names,
    'CategoryID': category_ids,
    'InstitutionID': institution_ids,
    'ImagePath': image_paths,
})

In [None]:
# Persist the table to the working directory (the row index is written too,
# since to_csv is called with its defaults).
training_images_df.to_csv('training_images_df.csv')

In [None]:
# Eyeball five randomly chosen rows.
training_images_df.sample(5)

In [None]:
# Pairwise correlation of the three identifier columns.
id_columns = ['CategoryID', 'GenusID', 'InstitutionID']
corr_cgi = training_images_df.loc[:, id_columns].corr()
corr_cgi

In [None]:
# Number of distinct values in each identifier column.
training_images_df[['CategoryID','GenusID','InstitutionID']].nunique()

In [None]:
# The four rows with the largest CategoryID values.
training_images_df.sort_values(by=['CategoryID'],ascending=False).head(4)

## Brief data information

In [None]:
print('Genus ID information')
# Count images per genus. The unique values are deliberately not bound to
# `id`, which would shadow the builtin for the rest of the kernel session.
genus_id_values, genus_id_counts = np.unique(genus_ids, return_counts=True)
genus_count_df = pd.DataFrame.from_dict(
    {'Genus ID': genus_id_values, 'Count': genus_id_counts}
).sort_values(by=['Count'], ascending=False)
# Distribution of "images per genus".
genus_count_df['Count'].hist(bins=100, figsize=(18, 6), grid=True)
plt.title('Histogram of Genus ID counts')
plt.show()

In [None]:
print('Category ID Information')
# Count images per category. The unique values are deliberately not bound to
# `id`, which would shadow the builtin for the rest of the kernel session.
category_id_values, category_id_counts = np.unique(category_ids, return_counts=True)
category_id_df = pd.DataFrame.from_dict(
    {'Category ID': category_id_values, 'Count': category_id_counts}
).sort_values(by=['Count'], ascending=False)
# Distribution of "images per category".
category_id_df['Count'].hist(bins=100, figsize=(18, 6), grid=True)
plt.title('Histogram of Category ID counts')
plt.show()

In [None]:
print('Institution ID information')
# Count images per institution. The unique values are deliberately not bound
# to `id`, which would shadow the builtin for the rest of the kernel session.
institution_id_values, institution_id_counts = np.unique(institution_ids, return_counts=True)
institution_id_df = pd.DataFrame.from_dict(
    {'Institution ID': institution_id_values, 'Count': institution_id_counts}
).sort_values(by=['Count'], ascending=False)
# Distribution of "images per institution".
institution_id_df['Count'].hist(bins=100, figsize=(18, 6), grid=True)
plt.title('Histogram of Institution ID counts')
plt.show()

## Image Visualization

In [None]:
def visualize_data(df,show_by='Random',genus_name = None):
    """Plot up to 10 randomly sampled images from ``df`` on a 2x5 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'ImagePath', 'GenusNames', 'CategoryID'
        and 'InstitutionID'.
    show_by : str
        'Genus' restricts sampling to rows whose 'GenusNames' equals
        ``genus_name``; any other value samples from all rows.
    genus_name : str, optional
        Genus to filter on; only used when ``show_by == 'Genus'``.

    Raises
    ------
    ValueError
        If there are no rows left to sample from.
    """
    if show_by == 'Genus':
        df = df[df['GenusNames']==genus_name]

    if len(df) == 0:
        raise ValueError(
            f'No images to visualize (show_by={show_by!r}, genus_name={genus_name!r}).'
        )

    # Sampling more rows than exist raises inside pandas, so cap the sample
    # size at the number of available rows.
    data = df.sample(min(10, len(df)))

    image_paths = data['ImagePath'].to_list()
    genus_names = data['GenusNames'].to_list()
    category_ids = data['CategoryID'].to_list()
    institution_ids = data['InstitutionID'].to_list()

    plt.figure(figsize=(13,13))

    for indx,im in enumerate(image_paths):
        plt.subplot(2,5,indx+1)
        image = cv2.imread(im)
        if image is None:
            # cv2.imread returns None for a missing/unreadable file; label the
            # empty cell instead of failing on the channel flip below.
            plt.title(f'Unreadable image:\n{im}')
            plt.axis('off')
            continue
        # OpenCV loads BGR; reverse the channel axis to get RGB for matplotlib.
        plt.imshow(image[:,:,::-1])
        plt.title(f'GenusName : {genus_names[indx]},\nCategoryID : {category_ids[indx]},\nInstitutionID : {institution_ids[indx]}')
        plt.axis('off')
    plt.tight_layout()

In [None]:
# Show 10 images sampled at random from the whole training table.
visualize_data(training_images_df,show_by='Random')

In [None]:
# Show 10 images sampled at random from a single genus ('Asimina').
visualize_data(training_images_df,show_by='Genus',genus_name='Asimina')

# MODELLING

In [None]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import Dataset
import torch
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torch
from torch.nn import functional as F
from torch import nn
import pandas as pd
import torchvision
from pytorch_lightning.core.lightning import LightningModule

In [None]:
data = training_images_df #pd.read_csv('./training_images_df.csv',index_col=0,dtype=str)

# Model inputs are image paths; targets are the category ids.
X = data['ImagePath'].tolist()
y = data['CategoryID'].tolist()
#y = [i-1 for i in y]

# Hold out 33% of the samples for validation, stratified on the label, with a
# fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=41,
    stratify=y,
)

# Wrap each split in a two-column frame.
train_df = pd.DataFrame({'ImagePath': X_train, 'CategoryID': y_train})
val_df = pd.DataFrame({'ImagePath': X_val, 'CategoryID': y_val})

In [None]:
#training_images_df.to_csv('data_info.csv')

In [None]:
# Largest and smallest label in the full set, then in the train and
# validation splits.
for _split_labels in (y, y_train, y_val):
    print(max(_split_labels), min(_split_labels))

In [None]:
# Training-time augmentation pipeline; the transforms are handed to Compose in
# this order, each with its own application probability p.
train_transform = A.Compose([
    A.HorizontalFlip(p=0.3),
    A.RandomBrightnessContrast(p=0.1),
    A.Normalize(p=1),
    # NOTE(review): the rotation is listed after normalisation, so it operates
    # on the already-normalised image — confirm this ordering is intended.
    A.Rotate(limit=30,p=0.2)
])

# Validation uses normalisation only, with no random augmentation.
val_transform = A.Compose([
    A.Normalize(p=1),
])

In [None]:
class CDataset(Dataset):
    """Image-classification dataset backed by a DataFrame of paths and labels.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns 'ImagePath' (file path read with OpenCV) and
        'CategoryID' (integer class label).
    transform : callable, optional
        Called as ``transform(image=<HWC RGB array>)`` and expected to return
        a mapping whose 'image' entry is the transformed HWC array.
    target_size : tuple
        Size passed to ``cv2.resize`` before any transform is applied.

    Each item is an ``(image, label)`` pair of tensors, the image in
    channel-first (C, H, W) layout.
    """
    def __init__(self, dataframe, transform=None,target_size=(512,512)):
        self.transform = transform
        self.dataframe = dataframe
        # Re-base both columns on 0..len-1 so the positional indices supplied
        # by a sampler always resolve, even if `dataframe` carries a filtered
        # or shuffled index.
        self.image_paths = dataframe['ImagePath'].reset_index(drop=True)
        self.labels = dataframe['CategoryID'].reset_index(drop=True)
        self.target_size = target_size

    def __len__(self):
        """Number of samples (rows of the backing frame)."""
        return len(self.dataframe)

    def __getitem__(self, indx):
        """Load, resize and (optionally) transform the sample at ``indx``.

        Raises
        ------
        FileNotFoundError
            If the image file cannot be read.
        """
        if torch.is_tensor(indx):
            indx = indx.tolist()

        img_name = self.image_paths[indx]
        image = cv2.imread(img_name)
        if image is None:
            # cv2.imread reports failure by returning None rather than raising.
            raise FileNotFoundError(f'Could not read image: {img_name}')
        # OpenCV decodes to BGR; the rest of the pipeline works on RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, self.target_size)
        label = torch.from_numpy(np.array(self.labels[indx]).astype(int))
        if self.transform:
            image = self.transform(image=image)['image']
        # HWC numpy array -> CHW tensor; done on both paths so the return type
        # is the same with or without a transform.
        image = torch.from_numpy(image).permute(2, 0, 1)
        return (image,label)

In [None]:
# Both splits are resized to 224x224; only the transform pipeline differs.
training_data = CDataset(
    dataframe=train_df,
    transform=train_transform,
    target_size=(224, 224),
)
validation_data = CDataset(
    dataframe=val_df,
    transform=val_transform,
    target_size=(224, 224),
)

In [None]:
train_batch_size = 128
# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_dataloader = DataLoader(
    dataset=training_data,
    batch_size=train_batch_size,
    shuffle=True,
)
validation_dataloader = DataLoader(
    dataset=validation_data,
    batch_size=128,
    shuffle=False,
)

In [None]:
# Length of each loader, i.e. the number of batches it yields per epoch.
print(len(train_dataloader))
print(len(validation_dataloader))

In [None]:
def visualize_from_dataloader(dl):
    """Show the first image (and its label) of the first batch drawn from ``dl``.

    Parameters
    ----------
    dl : iterable
        Yields ``(features, labels)`` batches; ``features`` is indexed as
        (batch, channel, height, width) and is expected to have been
        normalised with the mean/std hard-coded below.
    """
    # Draw from the loader that was passed in, not from a notebook global.
    features, labels = next(iter(dl))
    print(type(features))
    print(type(labels))
    print(f"Feature batch shape: {features.size()}")
    print(f"Labels batch shape: {labels.size()}")
    # CHW tensor -> HWC array, the layout matplotlib expects.
    img = features[0].numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    # Undo the mean/std normalisation, then clip: imshow expects float RGB
    # data in [0, 1] and augmentation can push values slightly outside it.
    img = np.clip(img * std + mean, 0.0, 1.0)
    label = labels[0]
    plt.imshow(img)
    plt.title(f"Label: {label}")
    plt.show()

In [None]:
# Sanity-check one training sample after the transform pipeline.
visualize_from_dataloader(train_dataloader)

In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel.
%pip install efficientnet_pytorch
%pip install torchsummary

In [None]:
from efficientnet_pytorch import EfficientNet
from efficientnet_pytorch.utils import MemoryEfficientSwish
from torchsummary import summary
from torch import nn
import torch.nn.functional as F
import time
import copy
from tqdm.autonotebook import tqdm

In [None]:
class Net(nn.Module):
    """EfficientNet-B0 feature extractor followed by a two-layer classifier head.

    Parameters
    ----------
    num_classes : int
        Size of the output layer (number of target categories).

    NOTE(review): the backbone is created with ``EfficientNet.from_name``,
    which builds the architecture only — confirm whether pretrained weights
    were meant to be loaded.
    """
    def __init__(self, num_classes=15505):
        super(Net, self).__init__()
        self.base_model = EfficientNet.from_name("efficientnet-b0",include_top=False, in_channels=3)
        # Element-wise dropout on the flattened features. p=0.5 is what the
        # forward pass has always applied; keeping it in a module makes the
        # rate visible in one place.
        self.drop = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(1280, 1280//2)
        self.fc2 = nn.Linear(1280//2, num_classes)

    def forward(self, x):
        """Return raw class scores (logits) for a batch of images."""
        x = self.base_model(x)
        # Flatten the backbone output to (batch, 1280) for the linear head.
        x = x.view(-1,1280)
        x = F.relu(self.fc1(x))
        # Active only in training mode, as with any nn.Dropout module.
        x = self.drop(x)
        x = self.fc2(x)
        return x

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
model = Net()
model = model.to(device)
# Freeze every parameter tensor whose 1-based position in named_parameters()
# is below 210; the remaining tensors keep their current requires_grad.
# NOTE(review): Net builds its backbone with EfficientNet.from_name — confirm
# the frozen tensors hold the weights that are meant to stay fixed.
for layer, (name, param) in enumerate(model.named_parameters(), start=1):
    #print(f'layer : {layer}, name : {name}')
    if layer < 210:
        param.requires_grad = False
summary(model,input_size=(3,224,224))

In [None]:
# def get_preds(model):
#     model.eval()
#     images, labels = next(iter(train_dataloader))
#     images = images.to(device)
#     labels = labels.to(device)
#     outputs = model(images)
#     _, preds = torch.max(outputs, 1)
#     print(preds)
#     return preds
#get_preds(model)

In [None]:
# Phase name -> loader mapping consumed by the training loop.
dataloaders = {'train' : train_dataloader, 'val' : validation_dataloader}
# Adam over all model parameters with learning rate 3e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
# Multi-class classification loss computed on the model's raw outputs.
criterion = nn.CrossEntropyLoss()

In [None]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25,
                checkpoint_path='best_model.pth', device=None):
    """Train ``model`` and keep the weights with the best validation accuracy.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is updated in place.
    dataloaders : dict
        Maps 'train' and 'val' to iterables yielding ``(inputs, labels)``.
    criterion : callable
        Loss called as ``criterion(outputs, labels)``.
    optimizer : torch.optim.Optimizer
        Optimiser stepped once per training batch.
    num_epochs : int
        Number of train+validation passes.
    checkpoint_path : str
        File the best state dict is written to whenever validation accuracy
        improves.
    device : torch.device, optional
        Where batches are moved before the forward pass. Defaults to the
        device of the model's first parameter (CPU if it has none).

    Returns
    -------
    tuple
        ``(model, val_acc_history)``: the model loaded with its best weights
        and the list of per-epoch validation accuracies.
    """
    if device is None:
        # Batches must live on the same device as the model's parameters.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    since = time.time()
    val_acc_history = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0
            # Pre-bind so the cleanup below is valid even for an empty loader.
            inputs = labels = None
            # Iterate over data.
            with tqdm(dataloaders[phase],unit="batch") as dl:
                for inputs, labels in dl:
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    batch_size = inputs.size(0)
                    optimizer.zero_grad()
                    # Track gradients only while training.
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                        _, preds = torch.max(outputs, 1)
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()
                    # criterion returns a batch mean; weight it by batch size
                    # so the running total is a per-sample sum.
                    running_loss += loss.item() * batch_size
                    running_corrects += torch.sum(preds == labels).item()
                    samples_seen += batch_size
                    # Average over the samples processed so far, so the
                    # progress bar is meaningful before the epoch ends.
                    dl.set_postfix(loss=running_loss / samples_seen,
                                   accuracy=running_corrects / samples_seen)

                # Guard against an empty loader (no samples seen).
                denominator = max(samples_seen, 1)
                epoch_loss = running_loss / denominator
                epoch_acc = running_corrects / denominator

                print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

                # deep copy the model
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                    torch.save(best_model_wts, checkpoint_path)

                if phase == 'val':
                    val_acc_history.append(epoch_acc)

                # Drop the last batch references before releasing cached memory.
                del inputs, labels
                torch.cuda.empty_cache()
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [None]:
# Run 5 epochs; `model` is rebound to the network returned by train_model.
model, val_acc_history = train_model(model, dataloaders, criterion, optimizer, num_epochs=5)

In [None]:
#torch.save(model,'best_model.pth')