In [3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

from transformers import ViTFeatureExtractor, ViTModel, ViTConfig, AutoConfig

from PIL import Image
#from torchsummary import summary

%matplotlib inline



In [5]:
# Load Training data
df = pd.read_excel('/kaggle/input/cowimages/cowimagelow/train.xlsx')
df["file_path"] = '/kaggle/input/cowimages/cowimagelow/train/'+df['file']
df["label"] = df["class"]

In [6]:
# Define function to add data/model in to GPU (cuda)
def get_default_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
def to_device(data, device):
    # if data is list or tuple, move each of them to device
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    def __init__(self, dl, device) -> None:
        self.dl = dl
        self.device = device

    def __iter__(self):
        for b in self.dl:
            # yield only execuate when the function is called
            yield to_device(b, self. device)

    def __len__(self):
        return len(self.dl)

In [7]:
# Define training dataset
class cattleDataset(Dataset):
    def __init__(self, dataframe, trans_transform=None, res_transform=None):
        self.labels = dataframe["label"]
        self.images = dataframe["file_path"]
        self.trans_transform = trans_transform
        self.res_transform = res_transform

    def __len__ (self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        img_path = self.images[idx]
        image = Image.open(img_path)

        image_trans = self.trans_transform(np.array(image), return_tensors='pt')
        image_trans = image_trans['pixel_values'].squeeze()

        image_res = self.res_transform(image)

        label = self.labels[idx]

        return image_trans, image_res, label

trans_transform = ViTFeatureExtractor.from_pretrained('google/vit-large-patch16-224')
res_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


train_ds = cattleDataset(df, trans_transform=trans_transform, res_transform=res_transform)
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [9]:
# Load Training data
df_test = pd.read_excel("/kaggle/input/cowimages/cowimagelow/test.xlsx")
df_test["file_path"] = '/kaggle/input/cowimages/cowimagelow/test/'+df_test["file"]# Create image path
df_test["label"] = df_test["class"]

In [10]:
test_ds = cattleDataset(df_test, trans_transform=trans_transform, res_transform=res_transform)
val_dl = DataLoader(test_ds, batch_size=16, shuffle=False)

In [11]:
# Modify the model - ResNet
model_Res = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=False)

# Remove the last layer of the model Res
layers_Res = list(model_Res.children())
model_Res = nn.Sequential(*layers_Res[:-1])

# Set the top layers to be not trainable
count = 0
for child in model_Res.children():
    count += 1
    if count < 8:
        for param in child.parameters():
            param.requires_grad = False
# Modify the model - ViT model

from transformers import ViTModel, ViTConfig
# Load the pre-trained ViT model
config = ViTConfig()
model_trans = ViTModel(config)
count = 0
for child in model_trans.children():
    count += 1
    if count < 4:
        for param in child.parameters():
            param.requires_grad = False

layers_trans = list(model_trans.children()) # Get all the layers from the Transformer model
model_trans_top = nn.Sequential(*layers_trans[:-2]) # Remove the normalization layer and pooler layer
trans_layer_norm = list(model_trans.children())[2] # Get the normalization layer

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip


In [12]:
class model_final(nn.Module):
    def __init__(self, model_trans_top, model_Res, dp_rate = 0.3):
        super().__init__()
        # All the trans model layers
        self.model_trans_top = model_trans_top
        #self.trans_layer_norm = trans_layer_norm
        self.trans_flatten = nn.Flatten()
        self.trans_linear = nn.Linear(150528, 2048)

        # All the ResNet model
        self.model_Res = model_Res

        # Merge the result and pass 
        self.dropout = nn.Dropout(dp_rate)
        self.linear1 = nn.Linear(2048, 25)
    
        self.self_attention = nn.MultiheadAttention(embed_dim=2048, num_heads=8)
     


    def forward(self, trans_b, res_b):
        # Get intermediate outputs using hidden layer
        result_trans = self.model_trans_top(trans_b)
        patch_state = result_trans.last_hidden_state[:,1:,:] # Remove the classification token and get the last hidden state of all patchs
        #result_trans = self.trans_layer_norm(patch_state)
        result_trans = self.trans_flatten(patch_state)
        result_trans = self.dropout(result_trans)
        result_trans = self.trans_linear(result_trans)

       # print('\n ViT features  shape', result_trans.shape)

        result_res = self.model_Res(res_b)

        # result_res = result_res.squeeze() # Batch size cannot be 1
        result_res = torch.reshape(result_res, (result_res.shape[0], result_res.shape[1]))

        x = result_trans.unsqueeze(0)
        y = result_res.unsqueeze(0)
        x, _ = self.self_attention(y,x,y)
        result_merge = x.squeeze()
        
     

        result_merge = self.dropout(result_merge)
        result_merge = self.linear1(result_merge)




        return result_merge


In [13]:
model = model_final(model_trans_top, model_Res, dp_rate=0.3)

In [14]:

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
train_dl = DeviceDataLoader(train_dl, device)
val_dl=DeviceDataLoader(val_dl, device)
model = to_device(model, device)

In [15]:
# Define optimizer and learning_rate scheduler
params = [param for param in list(model.parameters()) if param.requires_grad]
optimizer = torch.optim.Adam(params, lr=1e-4)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    mode='min', 
    factor=0.1, 
    patience=2, 
    verbose=True)

def fit(epochs, model, train_dl, val_dl, patience=5):
    device = next(model.parameters()).device
    opt = optimizer
    sched = lr_scheduler
    loss_func = nn.CrossEntropyLoss()
    
    best_val_accuracy = 0
    no_improvement_count = 0
    
    for epoch in range(epochs):
        model.train()
        batch_num = 1
        total_loss = 0
        for x_trans, x_res, yb in train_dl:
            x_trans = x_trans.to(device)
            x_res = x_res.to(device)
            yb = yb.to(device)
            
            preds = model(x_trans, x_res)
            loss = loss_func(preds.squeeze(), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()
            print('\r', f'batch #{batch_num}: {loss}', end='')
            batch_num += 1
            total_loss += loss.item()
        
        sched.step(total_loss)

        model.eval()
        num_correct = 0
        num_total = 0
        with torch.no_grad():
            for x_trans, x_res, yb in val_dl:
                x_trans = x_trans.to(device)
                x_res = x_res.to(device)
                yb = yb.to(device)
                
                preds = model(x_trans, x_res)
                _, predicted = torch.max(preds.data, 1)
                num_correct += (predicted == yb).sum().item()
                num_total += yb.size(0)
        
        val_accuracy = 100 * num_correct / num_total
        print('\n', f'Epoch: ({epoch+1}/{epochs}) Loss = {total_loss}, Validation Accuracy = {val_accuracy}%')

        # Early stopping check
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            no_improvement_count = 0
        else:
            no_improvement_count += 1
        
        if no_improvement_count >= patience:
            print(f'\nEarly stopping after {patience} epochs without improvement.')
            break   

In [16]:
fit(50, model, train_dl, val_dl,10)


 batch #157: 3.1004095077514656
 Epoch: (1/50) Loss = 506.90197944641113, Validation Accuracy = 5.12%
 batch #157: 3.1499962806701665
 Epoch: (2/50) Loss = 502.3433105945587, Validation Accuracy = 7.04%
 batch #157: 3.1510057449340822
 Epoch: (3/50) Loss = 498.84123373031616, Validation Accuracy = 8.64%
 batch #157: 3.0552377700805664
 Epoch: (4/50) Loss = 495.77550530433655, Validation Accuracy = 10.24%
 batch #157: 3.1232292652130127
 Epoch: (5/50) Loss = 493.42453145980835, Validation Accuracy = 9.28%
 batch #157: 3.2011275291442875
 Epoch: (6/50) Loss = 491.0006353855133, Validation Accuracy = 9.76%
 batch #157: 3.2438364028930664
 Epoch: (7/50) Loss = 489.94416189193726, Validation Accuracy = 11.84%
 batch #157: 3.2377071380615234
 Epoch: (8/50) Loss = 486.92253279685974, Validation Accuracy = 13.44%
 batch #157: 3.1728820800781252
 Epoch: (9/50) Loss = 485.3731861114502, Validation Accuracy = 13.6%
 batch #157: 3.2546203136444095
 Epoch: (10/50) Loss = 485.03262662887573, Validat