In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# (this notebook reads its data from "../input/herbarium-2021-fgvc8/"; no directory listing is performed here)

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

import os
import json

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root folders of the train and test splits. The trailing slash matters:
# later cells build the metadata path by plain string concatenation.
TRAIN_DIR, TEST_DIR = (
    '../input/herbarium-2021-fgvc8/train/',
    '../input/herbarium-2021-fgvc8/test/',
)

In [None]:
# Parse the training split's metadata.json (decoded as ISO-8859-1) into a dict.
# NOTE(review): the name `train` is rebound further down by `def train(...)`,
# so this cell has to be re-run before the DataFrame cell is re-run.
train_meta_path = TRAIN_DIR + 'metadata.json'
with open(train_meta_path, mode="r", encoding="ISO-8859-1") as meta_fh:
    train = json.load(meta_fh)

In [None]:
# Build one row per training image together with its annotation columns.
# An annotation refers to its image through `image_id`; its own `id` is the
# annotation's key. Joining on the annotation's `id` (as before) is only right
# when the two happen to coincide, so the join key is `image_id`, renamed to
# `id` so that the merged frame keeps exactly the same columns as before.
train_img = pd.DataFrame(train['images'])
train_ann = (
    pd.DataFrame(train['annotations'])
    .drop(columns='id')
    .rename(columns={'image_id': 'id'})
)
train_df = train_img.merge(train_ann, on='id')

print(len(train_df))
train_df.head()

In [None]:
# Training hyper-parameters.
BATCH, EPOCHS = 128, 5  # mini-batch size / number of epochs

# NOTE(review): LR is not referenced by any visible cell (the optimizer is
# created without an explicit learning rate) — confirm whether it should be.
LR = 0.01
IM_SIZE = 224           # images are resized to IM_SIZE x IM_SIZE

In [None]:
# Keep only the first 20,000 rows of the training table (presumably to keep
# runs short) and show how many rows that leaves.
tr_df = train_df.iloc[:20000]
len(tr_df)

In [None]:
# Image file names (inputs) and category ids (targets) of the training subset,
# as plain NumPy arrays.
X_Train = tr_df['file_name'].values
Y_Train = tr_df['category_id'].values

In [None]:
from PIL import Image

In [None]:
# Preprocessing applied to every image: PIL image -> float tensor, resize to
# IM_SIZE x IM_SIZE, then per-channel normalisation with the given mean / std
# (these look like the usual ImageNet statistics).
Transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((IM_SIZE, IM_SIZE)),
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

In [None]:
class GetData(Dataset):
    """Dataset that loads images from disk by file name.

    Args:
        Dir: Root directory that the entries of ``FNames`` are relative to.
        FNames: Sequence of image file names.
        Labels: Sequence of targets aligned with ``FNames``; may be ``None``
            when there are no targets (e.g. test data).
        Transform: Callable applied to each loaded RGB image.

    Each item is ``(transformed_image, label)`` when ``"train"`` occurs in
    ``Dir`` and ``(transformed_image, file_name)`` when ``"test"`` does. For
    any other directory the label is returned if labels were supplied,
    otherwise the file name.
    """

    def __init__(self, Dir, FNames, Labels, Transform):
        self.dir = Dir
        self.fnames = FNames
        self.transform = Transform
        self.labels = Labels

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.fnames)

    def __getitem__(self, index):
        """Load, convert and transform the image at ``index``."""
        path = os.path.join(self.dir, self.fnames[index])
        # The context manager releases the file handle as soon as the pixels
        # are read. convert("RGB") guarantees three channels, which a
        # three-value Normalize needs; grayscale / RGBA / CMYK files would
        # otherwise fail inside the transform.
        with Image.open(path) as img:
            x = self.transform(img.convert("RGB"))

        if "train" in self.dir:
            return x, self.labels[index]
        if "test" in self.dir:
            return x, self.fnames[index]
        # Neither marker in the path: previously this fell through and returned
        # None. Decide by whether labels were supplied instead.
        if self.labels is not None:
            return x, self.labels[index]
        return x, self.fnames[index]

In [None]:
# Wrap the training subset in a Dataset and serve it in shuffled mini-batches
# of BATCH images.
trainset = GetData(Dir=TRAIN_DIR, FNames=X_Train, Labels=Y_Train, Transform=Transform)
trainloader = DataLoader(dataset=trainset, batch_size=BATCH, shuffle=True)

In [None]:
# Number of distinct category ids in the full training table (not just the
# 20,000-row subset); used as the width of the new classifier head.
# NOTE(review): category ids are fed to the loss as class indices, which
# assumes they run 0..NUM_CL-1 — confirm against the metadata.
NUM_CL = train_df['category_id'].nunique()
NUM_CL

In [None]:
# Sanity check: pull a single batch and display the shape of its image tensor.
sample_batch = next(iter(trainloader))
sample_batch[0].shape

In [None]:
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
device

In [None]:
# DenseNet-169 from torchvision, initialised with its pretrained weights; the
# classifier head is replaced in a later cell.
model = torchvision.models.densenet169(pretrained=True)

In [None]:
# Show the input and output width of the model's current classifier layer.
for dim_name in ('in_features', 'out_features'):
    print(getattr(model.classifier, dim_name))

In [None]:
# Freeze every existing parameter so that no gradients are computed for them;
# only layers created after this point will be trainable.
for weight in model.parameters():
    weight.requires_grad_(False)

In [None]:
# Replace the classifier with a fresh linear head that has one output per
# category. A newly created layer is trainable by default, unlike the
# parameters frozen earlier.
n_inputs = model.classifier.in_features
last_layer = nn.Linear(n_inputs, NUM_CL)
model.classifier = last_layer
# Move the model to the same `device` the batches are sent to, instead of a
# separate hard-coded .cuda() call, so model and data cannot end up apart.
model = model.to(device)
print(model.classifier.out_features)

In [None]:
# Cross-entropy loss on the model's raw scores, and Adam over the classifier
# head's parameters only.
# NOTE(review): LR from the hyper-parameter cell is not passed here, so Adam
# runs with its default learning rate — confirm this is intended.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.classifier.parameters())

In [None]:
# Containers for per-epoch metrics: one list per metric, one dict per split.
training_history = dict(accuracy=[], loss=[])
validation_history = dict(accuracy=[], loss=[])

In [None]:
from tqdm import tqdm

In [None]:
## Normal Training
def train(trainloader, model, criterion, optimizer, scaler, device=torch.device("cpu")):
  """Run one training epoch and return the mean batch accuracy and loss.

  Args:
    trainloader: Sized iterable yielding ``(images, labels)`` tensor batches.
    model: Network mapping an image batch to per-class scores.
    criterion: Loss callable taking ``(output, labels)``.
    optimizer: Optimizer stepped through ``scaler``.
    scaler: Gradient scaler providing ``scale`` / ``step`` / ``update``.
    device: Device the batches are moved to before the forward pass.

  Returns:
    ``(accuracy, loss)`` as Python floats, each the unweighted mean over the
    batches of ``trainloader``.

  Raises:
    ZeroDivisionError: If ``trainloader`` yields no batches.
  """
  # Make sure dropout / batch-norm layers are in training mode, even if an
  # earlier evaluation or inference step switched the model to eval mode.
  model.train()
  train_acc = 0.0
  train_loss = 0.0
  for images, labels in tqdm(trainloader):
    images = images.to(device)
    labels = labels.to(device)
    optimizer.zero_grad()
    # The forward pass runs without autocast; the scaler is still used for
    # backward/step so the call pattern stays the same.
    output = model(images)
    loss = criterion(output, labels)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    # .item() copies the scalars out as plain floats. Summing the tensors
    # themselves would chain every batch's autograd history onto the running
    # total and keep it alive for the whole epoch.
    train_acc += (output.argmax(dim=1) == labels).float().mean().item()
    train_loss += loss.item()
  return train_acc/len(trainloader), train_loss/len(trainloader)

In [None]:
## Normal Evaluation
def evaluate(testloader, model, criterion, device=torch.device("cpu")):
  """Score ``model`` on ``testloader`` without updating any weights.

  Args:
    testloader: Sized iterable yielding ``(images, labels)`` tensor batches.
    model: Network mapping an image batch to per-class scores.
    criterion: Loss callable taking ``(output, labels)``.
    device: Device the batches are moved to before the forward pass.

  Returns:
    ``(accuracy, loss)`` as Python floats, each the unweighted mean over the
    batches of ``testloader``.

  Raises:
    ZeroDivisionError: If ``testloader`` yields no batches.
  """
  # Evaluate in eval mode so dropout is disabled and batch norm uses its
  # running statistics. The previous mode is restored afterwards, so calling
  # this between training epochs does not change how training behaves.
  was_training = model.training
  model.eval()
  eval_acc = 0.0
  eval_loss = 0.0
  try:
    with torch.no_grad():
      for images, labels in tqdm(testloader):
        images = images.to(device)
        labels = labels.to(device)
        output = model(images)
        loss = criterion(output, labels)
        # Accumulate plain floats rather than device tensors.
        eval_acc += (output.argmax(dim=1) == labels).float().mean().item()
        eval_loss += loss.item()
  finally:
    model.train(was_training)

  return eval_acc/len(testloader), eval_loss/len(testloader)


In [None]:
%%time
## Normal Training
# Gradient scaling only does real work on CUDA, so enable it only there
# (this avoids the "CUDA is not available" warning on CPU runs).
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
for epoch in range(EPOCHS):
  train_acc, train_loss = train(trainloader, model, criterion, optimizer, scaler, device=device)
  # float() accepts both Python numbers and 0-dim tensors, so the log lines
  # show plain numbers and the history holds floats, not device tensors.
  train_acc, train_loss = float(train_acc), float(train_loss)
  training_history['accuracy'].append(train_acc)
  training_history['loss'].append(train_loss)
  # TODO: no validation loader is defined in the visible cells; once one
  # exists, call evaluate(...) here and record validation_history the same way.
  print("")
  print(f"Epoch {epoch + 1} | Train Acc: {train_acc*100} | Train Loss: {train_loss}")
  print("===="*8)

In [None]:
# Save only the model's state_dict (not the whole module object) to
# "model.pth" in the current working directory.
torch.save(model.state_dict(), "model.pth")

In [None]:
%%time

# Parse the test split's metadata.json (decoded as ISO-8859-1) into a dict.
test_meta_path = TEST_DIR + 'metadata.json'
with open(test_meta_path, mode="r", encoding="ISO-8859-1") as meta_fh:
    test = json.load(meta_fh)

In [None]:
# One row per test image; the file names are what the test Dataset will load.
test_df = pd.DataFrame(test['images'])
X_Test = test_df['file_name'].to_numpy()
print(test_df.shape[0])
test_df.head()

In [None]:
testset = GetData(TEST_DIR, X_Test, None, Transform)
testloader = DataLoader(testset, batch_size=1, shuffle=False)

In [None]:
%%time

s_ls = []

with torch.no_grad():
    model.eval()
    for image, fname in testloader: 
        image = image.to(device)
        
        logits = model(image)        
        ps = torch.exp(logits)        
        _, top_class = ps.topk(1, dim=1)
        
        for pred in top_class:
            s_ls.append([fname[0].split('/')[-1][:-4], pred.item()])

In [None]:
# Assemble the submission table: one (Id, Predicted) row per collected prediction.
submission_columns = ['Id', 'Predicted']
sub = pd.DataFrame(s_ls, columns=submission_columns)
sub.head()

In [None]:
# Write the predictions to "submission.csv" without the DataFrame index, so
# the file contains only the Id and Predicted columns.
sub.to_csv("submission.csv", index=False)