In [None]:
import numpy as np
import os
import torch
import pandas as pd
import cv2
from torch import nn
from torchvision import transforms
from sklearn.model_selection import train_test_split

# Sanity Check

In this notebook, we run a few sanity checks to test whether the data actually contains meaningful information about the pawpularity score.

In [None]:
# Load the competition metadata; the cells below read the `Id` and
# `Pawpularity` columns from it.
petdf = pd.read_csv('../input/petfinder-pawpularity-score/train.csv')
# Optional experiment: keep only the clearly unpopular / clearly popular pets.
# petdf = petdf[(petdf.Pawpularity < 25) | (petdf.Pawpularity > 75)]

# Fix the seed so the 80/20 split (and therefore every accuracy printed below)
# is reproducible under Restart Kernel -> Run All.
SPLIT_SEED = 42
train_df, test_df = train_test_split(petdf, test_size=0.2, random_state=SPLIT_SEED)
# Re-number the rows 0..n-1 so the datasets can look rows up by integer
# position; the previous row labels are kept in an `index` column.
train_df, test_df = train_df.reset_index(), test_df.reset_index()

In [None]:
class PawpularDataset:
    """Map-style dataset pairing a stored image embedding with its pawpularity.

    Args:
        image_ids: Indexable collection of image id strings; each id names a
            ``<id>.npy`` file inside ``embedding_dir``.
        pawpularity: Indexable collection of scores, aligned with ``image_ids``
            (item ``i`` of both belongs to the same pet).
        embedding_dir: Directory holding the ``.npy`` embedding files. Defaults
            to the Kaggle input path this notebook was written against.
    """

    def __init__(self, image_ids, pawpularity,
                 embedding_dir='../input/petfinder-msvision-features/embeddings/ms_vision'):
        # No debug printing here: dumping the whole id collection on every
        # construction only floods the notebook output.
        self.image_ids = image_ids
        self.pawpularity = pawpularity
        self.embedding_dir = embedding_dir

    def __len__(self):
        """Return the number of samples (one per image id)."""
        return len(self.image_ids)

    def __getitem__(self, item):
        """Load the embedding for sample ``item`` from disk.

        Returns:
            dict with keys ``"embedding"`` (the array read by ``np.load``) and
            ``"pawpularity"`` (the matching score).
        """
        embedding_path = os.path.join(self.embedding_dir, self.image_ids[item] + '.npy')
        embedding = np.load(embedding_path)
        return {
            "embedding": embedding,
            "pawpularity": self.pawpularity[item],
        }

In [None]:
# Each dataset must take its ids AND its labels from the same split; otherwise
# embeddings get paired with scores that belong to different pets.
train_data = PawpularDataset(train_df.Id, train_df.Pawpularity)
test_data = PawpularDataset(test_df.Id, test_df.Pawpularity)
# Shuffle only the training batches; the test loader must wrap the held-out
# data so the reported accuracy is not measured on the training set.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=128, shuffle=False)

**First question:** is it possible to train a simple model that, given two pets, can tell which of them has the higher **pawpularity** score (i.e. a binary classifier over pairs)?

In [None]:
# Pairwise classifier: a plain MLP over the concatenation of two embeddings
# (hence the 4096-wide input), ending in 2 logits.
model = nn.Sequential(*[
    nn.Linear(4096, 1024), nn.ReLU(),
    nn.Linear(1024, 512), nn.ReLU(),
    nn.Linear(512, 512), nn.ReLU(),
    nn.Linear(512, 2),
])

In [None]:
def _pairwise_batch(pawbatch, device):
    """Build (inputs, targets) for the pairwise task from one loader batch.

    Every sample is paired with the sample one position earlier in the batch
    (cyclically, via ``torch.roll``). The input is the two embeddings
    concatenated along the last dimension; the target is 1 when the rolled
    ("right") sample has the strictly higher pawpularity, otherwise 0.
    """
    left_emb = pawbatch['embedding'].to(device)
    left_pawpul = pawbatch['pawpularity'].to(device)
    # Rolling a tensor keeps it on the same device, so no second .to() is needed.
    right_emb = torch.roll(left_emb, 1, 0)
    right_pawpul = torch.roll(left_pawpul, 1, 0)
    # .long() keeps the target on `device`; .type(torch.LongTensor) would
    # silently move it to the CPU and break training on any other device.
    target = (left_pawpul < right_pawpul).long()
    nn_input = torch.cat([left_emb, right_emb], dim=-1)
    return nn_input, target


def train_epoch(model, train_loader, test_loader, epoch, device='cpu'):
    """Train the pairwise classifier for one epoch, then evaluate it.

    Note: a later cell in this notebook defines another ``train_epoch`` that
    shadows this one once that cell has run.

    Args:
        model: Module mapping two concatenated embeddings to 2 logits. It is
            not moved by this function, so it must already live on ``device``.
        train_loader: Iterable of batches; each batch is a dict with the keys
            ``'embedding'`` and ``'pawpularity'`` holding tensors.
        test_loader: Same batch format, used for evaluation only.
        epoch: Zero-based epoch number; only used in the progress printout.
        device: Device the batch tensors are moved to.

    Returns:
        dict with ``'test_acc'`` (model accuracy), ``'guess0_acc'`` and
        ``'guess1_acc'`` (accuracy of always predicting class 0 / class 1),
        each a float in [0, 1]. The same values are printed.
    """
    criterion = nn.CrossEntropyLoss()
    # NOTE: the optimizer is rebuilt on every call, so SGD momentum buffers do
    # not carry over from one epoch to the next.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    running_loss = 0.0
    model.train()

    # Train model
    for batch_idx, pawbatch in enumerate(train_loader):
        nn_input, target = _pairwise_batch(pawbatch, device)

        optimizer.zero_grad()

        outputs = model(nn_input)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if batch_idx % 20 == 19:    # print every 20 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, batch_idx + 1, running_loss / 20))
            running_loss = 0.0

    # Eval model
    model.eval()
    n_correct = 0
    n_zero = 0
    n_one = 0
    n_seen = 0
    with torch.no_grad():
        for pawbatch in test_loader:
            nn_input, target = _pairwise_batch(pawbatch, device)
            outputs = model(nn_input)
            n_correct += (torch.argmax(outputs, 1) == target).sum().item()
            n_zero += (target == 0).sum().item()
            n_one += (target == 1).sum().item()
            # Count the samples actually seen instead of assuming every batch
            # holds exactly 128 of them (the last batch is usually smaller).
            n_seen += target.numel()

    denom = max(n_seen, 1)  # avoid dividing by zero on an empty test loader
    metrics = {
        'test_acc': n_correct / denom,
        'guess0_acc': n_zero / denom,
        'guess1_acc': n_one / denom,
    }
    print('---------------------------------')
    print('Test acc   :', metrics['test_acc'])
    print('Guess 0 acc:', metrics['guess0_acc'])
    print('Guess 1 acc:', metrics['guess1_acc'])
    print('---------------------------------')
    return metrics

In [None]:
# Five train-then-evaluate passes for the pairwise model; each call prints the
# running loss and the accuracies on the test loader.
for epoch in range(0, 5):
    train_epoch(model, train_loader, test_loader, epoch)

**Second question:** can we predict whether a pet belongs to the upper half of the distribution, i.e. has a pawpularity at or above the median (≈33)?

In [None]:
# Upper-half classifier: same MLP shape as before, but fed a single embedding
# (2048-wide input) and ending in 2 logits.
model = nn.Sequential(*[
    nn.Linear(2048, 1024), nn.ReLU(),
    nn.Linear(1024, 512), nn.ReLU(),
    nn.Linear(512, 512), nn.ReLU(),
    nn.Linear(512, 2),
])

In [None]:
def train_epoch(model, train_loader, test_loader, epoch, device='cpu', threshold=None):
    """Train the upper-half classifier for one epoch, then evaluate it.

    A sample's target is 1 when its pawpularity is at or above ``threshold``
    and 0 otherwise. The same threshold is used for training and evaluation.

    Note: this definition shadows the pairwise ``train_epoch`` from the
    earlier cell of this notebook.

    Args:
        model: Module mapping one embedding to 2 logits. It is not moved by
            this function, so it must already live on ``device``.
        train_loader: Iterable of batches; each batch is a dict with the keys
            ``'embedding'`` and ``'pawpularity'`` holding tensors.
        test_loader: Same batch format, used for evaluation only.
        epoch: Zero-based epoch number; only used in the progress printout.
        device: Device the batch tensors are moved to.
        threshold: Score separating the two classes. ``None`` (default) uses
            the median of ``petdf.Pawpularity`` from the notebook's globals.

    Returns:
        dict with ``'test_acc'`` (model accuracy), ``'guess0_acc'`` and
        ``'guess1_acc'`` (accuracy of always predicting class 0 / class 1),
        each a float in [0, 1]. The same values are printed.
    """
    if threshold is None:
        # Computed once per call rather than once per batch. Training used to
        # threshold at the mean while evaluation used the median, so the model
        # was fitted to different labels than it was scored on.
        threshold = float(np.median(petdf.Pawpularity))

    criterion = nn.CrossEntropyLoss()
    # NOTE: the optimizer is rebuilt on every call, so SGD momentum buffers do
    # not carry over from one epoch to the next.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    running_loss = 0.0
    model.train()

    # Train model
    for batch_idx, pawbatch in enumerate(train_loader):
        emb = pawbatch['embedding'].to(device)
        pawpul = pawbatch['pawpularity'].to(device)
        # .long() keeps the target on `device`; .type(torch.LongTensor) would
        # silently move it to the CPU and break training on any other device.
        target = (pawpul >= threshold).long()

        optimizer.zero_grad()

        outputs = model(emb)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if batch_idx % 20 == 19:    # print every 20 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, batch_idx + 1, running_loss / 20))
            running_loss = 0.0

    # Eval model
    model.eval()
    n_correct = 0
    n_zero = 0
    n_one = 0
    n_seen = 0
    with torch.no_grad():
        for pawbatch in test_loader:
            emb = pawbatch['embedding'].to(device)
            pawpul = pawbatch['pawpularity'].to(device)
            target = (pawpul >= threshold).long()
            outputs = model(emb)
            n_correct += (torch.argmax(outputs, 1) == target).sum().item()
            n_zero += (target == 0).sum().item()
            n_one += (target == 1).sum().item()
            # Count the samples actually seen instead of assuming every batch
            # holds exactly 128 of them (the last batch is usually smaller).
            n_seen += target.numel()

    denom = max(n_seen, 1)  # avoid dividing by zero on an empty test loader
    metrics = {
        'test_acc': n_correct / denom,
        'guess0_acc': n_zero / denom,
        'guess1_acc': n_one / denom,
    }
    print('---------------------------------')
    print('Test acc   :', metrics['test_acc'])
    print('Guess 0 acc:', metrics['guess0_acc'])
    print('Guess 1 acc:', metrics['guess1_acc'])
    print('---------------------------------')
    return metrics

In [None]:
# Five train-then-evaluate passes for the upper-half model; each call prints
# the running loss and the accuracies on the test loader.
for epoch in range(0, 5):
    train_epoch(model, train_loader, test_loader, epoch)