In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
import seaborn as sns
import cv2
import torch.hub
import torchvision
import torch.nn as nn
from torch.utils.data import (
    Dataset,
    DataLoader,
)  
from skimage import io
from torchvision.transforms import transforms
from torch import optim
from tqdm import tqdm

TODO
1. ~Number of images in all folders~
2. ~analyse the train.csv~
3. ~Split the data into training and test set ( Try with different methods of splitting )~
4. Analyse the data (somewhat done)
5. ~make a model for analyzing the species~
6. Add better performance metric in the model

I am using the resized HappyWhale dataset, courtesy of @RDizzl3:
https://www.kaggle.com/rdizzl3/jpeg-happywhale-128x128

In [None]:
# Image folders: both point into the resized jpeg-happywhale-128x128 input.
train_images_path = '../input/jpeg-happywhale-128x128/train_images-128-128/train_images-128-128'
test_images_path = '../input/jpeg-happywhale-128x128/test_images-128-128/test_images-128-128'
# The label file (train.csv) is read from the happy-whale-and-dolphin input folder.
train_df_path = '../input/happy-whale-and-dolphin/train.csv'

Number of images in folders

In [None]:
# Report how many files each image folder contains.
for split_name, folder in (('training', train_images_path), ('test', test_images_path)):
    image_count = len(os.listdir(folder))
    print(f'Number of {split_name} images - {image_count}')

Analysing the train df

In [None]:
# Load the training labels into a DataFrame.
df = pd.read_csv(train_df_path)

In [None]:
# Preview the first rows of the label table.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

There are no null values, that's a relief.

I just read that there are duplicates in the species names, let's check that ourselves!

In [None]:
# Sorted list of the distinct species labels, to make near-duplicate spellings easy to spot.
sorted(df['species'].unique())


The following labels refer to the same species:
1. bottlenose_dolphin and bottlenose_dolpin
2. kiler_whale and killer_whale
3. globis and pilot_whale are the same as short_finned_pilot_whale
                          

*(commersons and common dolphin are different)*

In [None]:
# Courtesy of Aleksey Alekseev
# Collapse misspelled / alias species labels onto one canonical name each.
species_fixes = {"globis": "short_finned_pilot_whale",
                 "pilot_whale": "short_finned_pilot_whale",
                 "kiler_whale": "killer_whale",
                 "bottlenose_dolpin": "bottlenose_dolphin"}
# Assign the result back to the column: `df.species.replace(..., inplace=True)` is a
# chained in-place update, which pandas warns about and which leaves `df` unchanged
# when copy-on-write is enabled.
df["species"] = df["species"].replace(species_fixes)

In [None]:
# Uniqueness summary. The image and individual_id figures are fractions of the row count;
# the species figure is an absolute count.
n_rows = df.shape[0]
print(f'Ratio of unique images - {df.image.nunique(dropna=False)/n_rows}')
print(f'Number of unique species - {df.species.nunique(dropna=False)}')
# The value printed here is a ratio, so the label says "Ratio" (it previously said "Number").
print(f'Ratio of unique individual_id - {df.individual_id.nunique(dropna=False)/n_rows}')

So it looks like:
1. We have unique images
2. Some images share the same individual id
3. There are 26 unique species

In [None]:
# Number of images per individual_id.
df['individual_id'].value_counts()

In [None]:
# Encode each species name as an integer id, numbered in order of first appearance.
mapping = {species_name: species_id
           for species_id, species_name in enumerate(df["species"].unique())}
df["species_idx"] = [mapping[species_name] for species_name in df["species"]]

In [None]:
# Preview again to check the new species_idx column.
df.head()

Before going further, let us try to split our data into training and test set

In [None]:
def split_data(df,method = 'random_split',split_ratio = .5,verbose = False):
    """Split ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    method : str
        Splitting strategy. Only ``'random_split'`` is implemented.
    split_ratio : float
        Forwarded to ``train_test_split`` as ``test_size``.
    verbose : bool
        When true, print the shapes of both resulting frames.

    Returns
    -------
    tuple
        ``(df_train, df_test)``, each with its index reset to 0..n-1.

    Raises
    ------
    ValueError
        If ``method`` is not a supported strategy (previously this surfaced as an
        UnboundLocalError further down).
    """
    if method != 'random_split':
        raise ValueError(f"Unknown split method: {method!r} (supported: 'random_split')")

    # One array is enough; the fixed random_state keeps the split reproducible.
    df_train, df_test = train_test_split(df, test_size=split_ratio, random_state=5)

    if verbose:
        print(f'df_train : {df_train.shape}')
        print(f'df_test : {df_test.shape}')

    # Return re-indexed copies instead of mutating the split results in place.
    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    return df_train, df_test
        
# Hold out 30% of the labelled rows as a local test set and print both shapes.
df_train, df_test = split_data(df, method='random_split', split_ratio=.3, verbose=True)

Analysis

In [None]:
print("Percent Distribution of species")
# Share of training rows per species, expressed as a percentage.
species_counts = df_train.species.value_counts()
(species_counts / df_train.shape[0]) * 100

In [None]:
# Horizontal histogram of the species labels in the training split.
figure_size = (11.7, 8.27)
sns.set(rc={'figure.figsize': figure_size})
sns.histplot(data=df_train, y='species')
plt.show()

From what I infer, we already have a bunch of dolphins/whales identified by their ids, and there are multiple images with the same id: basically, each animal has some photos taken at different instants of time.

The test set also contains these animals; some are new, however, and we cannot assign an id to them.


In [None]:
def draw_image(path):
    """Display the image stored at ``path`` with matplotlib, with the axes hidden.

    Parameters
    ----------
    path : str
        Location of the image file.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read an image from ``path``.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None for a missing or unreadable file instead of raising;
        # fail here with a clear message rather than inside cvtColor.
        raise FileNotFoundError(f'Could not read image: {path}')
    # OpenCV loads channels as BGR; matplotlib expects RGB.
    img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    plt.axis('off')
    plt.imshow(img)
    plt.show()

In [None]:
# Draw every training image of the first individual that has at least three photos.
images_per_individual = df_train.groupby(by='individual_id')['image'].apply(list)
example_files = next(
    (names for names in images_per_individual if len(names) >= 3),
    [],
)
for file_name in example_files:
    draw_image(os.path.join(train_images_path, file_name))

## Model To Classify the species

In [None]:
# Making Dataset class for loader
class WhaleDataset(Dataset):
    """Image/label dataset backed by a DataFrame.

    Each item is ``(image, label)``: the image is read from ``root_dir`` using the row's
    ``image`` column, and the label is the row's ``species_idx`` wrapped in a tensor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``image`` column (file name) and a ``species_idx`` column
        (integer class id).
    root_dir : str
        Folder that contains the image files.
    transforms : callable or iterable of callables, optional
        Applied to the image in order. A single callable (for example a composed
        transform) is accepted as well as a list of callables.
    """

    def __init__(self,df,root_dir,transforms = None):
        self.df = df
        self.root_dir = root_dir
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Positional lookup, so the dataset also works when df's index is not 0..n-1.
        file_name = self.df["image"].iloc[index]
        img = io.imread(os.path.join(self.root_dir, file_name))

        # A grayscale file is read as a 2-D array; repeat it over three channels so every
        # sample has the same H x W x 3 layout.
        if img.ndim == 2:
            img = np.stack([img] * 3, axis=-1)

        y_label = torch.tensor(int(self.df["species_idx"].iloc[index]))

        if self.transforms:
            # Accept either one callable or a sequence of callables.
            steps = [self.transforms] if callable(self.transforms) else self.transforms
            for transform in steps:
                img = transform(img)

        return (img,y_label)
        

In [None]:
# Both splits come from the labelled training rows, so both read images from
# train_images_path (the same folder that was previously spelled out as a literal here).
train_dataset = WhaleDataset(df = df_train,
                             root_dir = train_images_path,
                             transforms = [transforms.ToTensor()])
test_dataset = WhaleDataset(df = df_test,
                            root_dir = train_images_path,
                            transforms = [transforms.ToTensor()])

In [None]:
# Check accuracy on training & test to see how good our model is
from sklearn.metrics import f1_score

def check_accuracy(loader, model):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Parameters
    ----------
    loader : iterable
        Yields ``(inputs, labels)`` batches of tensors.
    model : torch.nn.Module
        Classifier whose output has one score per class along dimension 1.

    Returns
    -------
    torch.Tensor
        Zero-dimensional tensor holding ``correct / total``.

    Raises
    ------
    ValueError
        If ``loader`` yields no samples (previously a ZeroDivisionError).
    """
    # Run on whatever device the model lives on instead of relying on a notebook-level
    # ``device`` variable that is only defined in a later cell.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    num_correct = 0
    num_samples = 0
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device=run_device)
                y = y.to(device=run_device)
                scores = model(x)
                _, predictions = scores.max(1)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
    finally:
        # Put the model back in the mode the caller had it in, even if a batch failed.
        model.train(was_training)

    if num_samples == 0:
        raise ValueError("check_accuracy received a loader with no samples")
    return num_correct/num_samples


def check_f1_score(loader, model):
    """Return the weighted F1 score of ``model`` over every batch in ``loader``.

    Parameters
    ----------
    loader : iterable
        Yields ``(inputs, labels)`` batches of tensors.
    model : torch.nn.Module
        Classifier whose output has one score per class along dimension 1.

    Returns
    -------
    The value of ``f1_score(y_true, y_pred, average="weighted")``.
    """
    # Run on whatever device the model lives on instead of relying on a notebook-level
    # ``device`` variable that is only defined in a later cell.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    true_batches = []
    pred_batches = []
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                scores = model(x.to(device=run_device))
                _, predictions = scores.max(1)
                pred_batches.append(predictions.cpu().numpy())
                # Labels are only compared on the host, so they never need to visit the device.
                true_batches.append(y.cpu().numpy())
    finally:
        # Put the model back in the mode the caller had it in, even if a batch failed.
        model.train(was_training)

    # Concatenate once at the end instead of re-copying the growing arrays every batch.
    y_true = np.concatenate(true_batches) if true_batches else np.array([])
    y_pred = np.concatenate(pred_batches) if pred_batches else np.array([])
    return f1_score(y_true,y_pred,average = "weighted")



In [None]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
in_channels = 3  # not referenced by any cell visible here
num_classes = 26
learning_rate = 0.001
batch_size = 64
num_epochs = 50

# Training batches are reshuffled every epoch. The evaluation loader keeps a fixed order:
# the metrics computed from it do not depend on batch order, so shuffling it only adds work.
train_loader = DataLoader(dataset = train_dataset,batch_size = batch_size,shuffle = True)
test_loader = DataLoader(dataset = test_dataset,batch_size = batch_size,shuffle = False)

# A model for predicting the species
model = torchvision.models.vgg16(pretrained = True)

# Freeze the first 25 parameter tensors; everything after them stays trainable.
# NOTE(review): an odd cutoff presumably separates one layer's weight from its bias -
# confirm whether 24 or 26 was intended.
for frozen_param in list(model.parameters())[:25]:
    frozen_param.requires_grad = False

class Identity(nn.Module):
    """Pass-through module: ``forward`` hands back its input untouched."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x
    
# Replace VGG's pooling stage with a pass-through and swap in a smaller classifier head.
model.avgpool = Identity()
# 8192 input features - presumably 512 channels x 4 x 4 for 128x128 inputs; confirm if
# the image size changes.
model.classifier = nn.Sequential(nn.Linear(8192,512),
                                 nn.ReLU(),
                                 nn.Linear(512,256),
                                 nn.ReLU(),
                                 # Output width follows the num_classes hyperparameter
                                 # instead of repeating the literal 26.
                                 nn.Linear(256,num_classes))

model.to(device)

# F1 history: one entry per evaluated epoch.
train_f1_score = []
test_f1_score = []

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train Network
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
        # Move the batch to the training device
        data = data.to(device=device)
        targets = targets.to(device=device)

        # forward pass
        scores = model(data)
        loss = criterion(scores, targets)

        # backward pass followed by the optimizer update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Metrics are only computed on even-numbered epochs.
    if epoch % 2 != 0:
        continue

    train_f1_score.append(check_f1_score(train_loader, model))
    test_f1_score.append(check_f1_score(test_loader, model))

    # Checkpoint on the first evaluation, and afterwards whenever the newest test score
    # strictly beats every earlier one.
    is_first_evaluation = len(test_f1_score) == 1
    if is_first_evaluation or test_f1_score[-1] > max(test_f1_score[:-1]):
        torch.save(model, 'best-model.pt')
        torch.save(model.state_dict(), 'best-model-parameters.pt')

In [None]:
# F1 history for both splits. Each point is one evaluation (the training loop evaluates on
# every second epoch), so the x axis counts evaluations, not epochs.
plt.plot(train_f1_score,c = 'b')
plt.plot(test_f1_score,c = 'r')
plt.xlabel("Evaluation number (one every 2 epochs)")
plt.ylabel("Weighted F1 score")
plt.title("Species classifier: F1 score during training")
plt.legend(["Train f1 score","Test f1 score"])
plt.show()


In [None]:
# Show the recorded training F1 values.
train_f1_score