In [None]:
!pip install imagesize

In [None]:
import os
import sys
import path

import numpy as np
import pandas as pd
import cv2 as cv
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import albumentations as A
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold

In [None]:
# All competition files live under one Kaggle input directory.
DATA_DIR = "../input/happy-whale-and-dolphin"

train_path = f"{DATA_DIR}/train_images"
test_path = f"{DATA_DIR}/test_images"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
sample_sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Full path of every training image: "<train_path>/<image file name>".
image_dir_prefix = train_path + "/"
train["filepaths"] = image_dir_prefix + train["image"]

In [None]:
# Preview the table again, now including the "filepaths" column.
train.head()

In [None]:
# Read the image stored in the row labelled 0 and render it.
first_image = plt.imread(train["filepaths"][0])
plt.imshow(first_image)

In [None]:
# (rows, columns) of the training table.
train_shape = train.shape
print("total row in Train", train_shape)

In [None]:
# Summary statistics of the training table.
train.describe()

In [None]:
# Column labels currently present in the training table.
train.columns

In [None]:
# Per-column dtype and non-null count.
train.info()

In [None]:
# True if at least one column has a non-zero count of missing values.
train.isnull().sum().any()

In [None]:
# Distinct species labels. Note "kiler_whale" is a misspelling of "killer_whale" and has to be replaced.
train["species"].unique()

In [None]:
# Distinct individual ids; a single animal can appear in many photos.
train["individual_id"].unique()

In [None]:
def get_path(path):
    """Collect the full path of every file found under ``path``.

    The tree rooted at ``path`` is traversed with ``os.walk``, so files in
    nested sub-directories are included as well.

    Args:
        path: Root directory to scan.

    Returns:
        list: One string per file, built by joining the containing directory
        with the file name. Empty when no file is found.
    """
    return [
        os.path.join(folder, file_name)
        for folder, _, file_names in os.walk(path)
        for file_name in file_names
    ]

In [None]:
train_img_path = get_path(train_path)
test_img_path = get_path(test_path)

# Number of train images, then number of test images.
for image_paths in (train_img_path, test_img_path):
    print(len(image_paths))

In [None]:
def show_images(path, rows, cols, title):
    """Display the images listed in ``path`` on a ``rows`` x ``cols`` grid.

    Images fill the grid row by row. When there are more grid cells than
    paths the remaining cells are left blank; when there are more paths than
    cells the surplus paths are ignored.

    Args:
        path: Sequence of image file paths to display, in order.
        rows: Number of grid rows.
        cols: Number of grid columns.
        title: Figure-level title.
    """
    # squeeze=False keeps ``axes`` a 2-D array even for a single row/column.
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(12, 8), squeeze=False)
    fig.suptitle(title, fontsize=22)

    cells = axes.ravel()
    image_paths = list(path)[:len(cells)]
    for ax, image_path in zip(cells, image_paths):
        # Read from the argument, not from a notebook-level global.
        ax.imshow(plt.imread(image_path))
        # Remove x ticks on this axes (plt.xticks only touches the current axes).
        ax.set_xticks([])

    # Blank out grid cells that received no image.
    for ax in cells[len(image_paths):]:
        ax.axis("off")

    fig.tight_layout()
    plt.show()

In [None]:
# The grid has 5 x 4 = 20 cells, so hand over 20 paths (one per cell).
show_images(train_img_path[:20], 5, 4, "Train")

In [None]:
#show_images(test_img_path[:10], 5, 4, "Test") 

In [None]:
# Horizontal bar chart of image counts per species, most frequent species first.
species_order = train["species"].value_counts().index

plt.figure(figsize=(20, 20))
plt.yticks(fontsize=16)
sns.countplot(data=train, y="species", order=species_order, linewidth=3)
plt.title("Species Distribution", font="Serif", size=20)
plt.show()

In [None]:
# Map misspelled or alternative species labels onto the names used elsewhere.
species_fixes = {
    "bottlenose_dolpin": "bottlenose_dolphin",
    "kiler_whale": "killer_whale",
    "beluga": "beluga_whale",
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale",
}
train["species"] = train["species"].replace(species_fixes)

# The class is the last "_"-separated token of the species name.
train["class"] = train["species"].map(lambda name: name.rsplit("_", 1)[-1])

In [None]:
# Preview the first values of the derived "class" column.
train["class"].head()

In [None]:
# Distinct values of the derived "class" column.
train["class"].unique()

In [None]:
def show_values_on_bars(axs, h_v="v", space=0.4):
    """Write each bar's integer value next to it as a text label.

    Args:
        axs: A single axes object, or a NumPy array of axes objects; every
            axes in the array is annotated.
        h_v: ``"v"`` labels each bar with its height, centred on the bar's top
            edge. ``"h"`` labels each bar with its width, placed to the right
            of the bar. Any other value adds no labels.
        space: Offset added to the label's x position in ``"h"`` mode; unused
            in ``"v"`` mode.
    """
    vertical = h_v == "v"
    horizontal = h_v == "h"

    def _annotate(ax):
        if not (vertical or horizontal):
            return
        for bar in ax.patches:
            left, bottom = bar.get_x(), bar.get_y()
            width, height = bar.get_width(), bar.get_height()
            if vertical:
                # Values are truncated to int and printed with thousands separators.
                ax.text(left + width / 2, bottom + height, format(int(height), ','), ha="center")
            else:
                ax.text(left + width + float(space), bottom + height, format(int(width), ','), ha="left")

    targets = axs.flat if isinstance(axs, np.ndarray) else [axs]
    for ax in targets:
        _annotate(ax)

In [None]:
# Frequency tables with a fixed schema: the category goes to a column named
# "index" and the count to a column named after the counted field.
# rename_axis(...) + reset_index(name=...) pins these names explicitly, because
# the names produced by a bare value_counts().reset_index() changed in pandas 2.0.

# Images per species.
species_sample = (
    train["species"].value_counts().rename_axis("index").reset_index(name="species")
)

# Individuals per species (each (individual_id, species) pair counted once).
species = (
    train.groupby(by=["individual_id", "species"]).count()
    .reset_index()["species"]
    .value_counts().rename_axis("index").reset_index(name="species")
)

# Individuals per class (each (individual_id, class) pair counted once).
classes = (
    train.groupby(by=["individual_id", "class"]).count()
    .reset_index()["class"]
    .value_counts().rename_axis("index").reset_index(name="class")
)

fig = plt.figure(figsize=(20, 15))
ax1 = fig.add_subplot(1, 2, 1)

fig.suptitle('Classes', size = 15, color = "blue", weight='bold')

# x holds the class name ("index" column), y the number of individuals ("class" column).
sns.barplot(data=classes, y="class", x="index", ax=ax1, palette="tab10")
show_values_on_bars(ax1, h_v="v", space=0.4)
ax1.set_title("Class Frequency", size = 15, weight='bold')
ax1.set_ylabel("Frequency", size = 13,  weight='bold')
ax1.set_xlabel("Class", size = 13, color = "blue", weight='bold')

In [None]:
# Observation from the class plot above: roughly 66.7% whales and 33% dolphins.
def show_species(species_name, sample_size):
    """Plot a random sample of images of one species in a single row.

    Args:
        species_name: Value of the ``species`` column used to filter ``train``.
        sample_size: Number of images requested (also the number of subplot
            columns). If the species has fewer rows, all of them are shown and
            the remaining subplots stay blank.
    """
    species_rows = train[train["species"] == species_name]
    # DataFrame.sample raises when asked for more rows than exist, so cap the request.
    n_shown = min(sample_size, len(species_rows))
    data = species_rows.sample(n_shown, random_state=24)
    image_nr = data["image"].to_list()
    image_path = data["filepaths"].to_list()

    # squeeze=False always returns a 2-D array of axes, so flatten() also
    # works when sample_size == 1 (otherwise a bare Axes object is returned).
    fig, axs = plt.subplots(1, sample_size, figsize=(23, 4), squeeze=False)
    axs = axs.flatten()
    for k, ax in enumerate(axs):
        if k < len(image_path):
            ax.set_title(f"{k+1}.{species_name}-{image_nr[k]}", fontsize=13, color="blue", weight="bold")
            ax.imshow(plt.imread(image_path[k]))
        # Hide the frame and ticks, including on subplots left without an image.
        ax.axis("off")
    plt.tight_layout()
    plt.show()

In [None]:
# Show four sample images for every distinct species.
species_names = train["species"].unique().tolist()
for species_name in species_names:
    show_species(species_name, sample_size=4)

In [None]:
import imagesize

# Width and height of every training image, as returned by imagesize.get().
widths, heights = [], []

# The loop variable is deliberately not called ``path``: at notebook level it
# would overwrite the ``path`` module imported in the first cell.
for img_path in tqdm(train["filepaths"]):
    img_width, img_height = imagesize.get(img_path)
    widths.append(img_width)
    heights.append(img_height)

train["width"] = widths
train["height"] = heights
# Pixel count per image (column name kept as-is because other cells use it).
train["dimention"] = train["width"] * train["height"]

In [None]:
# Image code = file name up to (not including) its first ".".
train["image_code"] = train["image"].map(lambda file_name: file_name.split(".")[0])

# Target column: for every row, the array of all image codes that belong to
# the same individual.
tmp = train.groupby("individual_id")["image_code"].unique().to_dict()
train["target"] = train["individual_id"].map(tmp)

In [None]:
# Map every individual_id to a unique integer key. value_counts() sorts by
# frequency, so key 0 is the individual with the most images.
# The ids are read from the value_counts index directly: the column names
# produced by value_counts().reset_index() differ between pandas versions, and
# dropping "individual_id" from that frame would drop the ids on pandas >= 2.0.
id_counts = train["individual_id"].value_counts()
individual_mapping = pd.DataFrame({"individual_id": id_counts.index})
individual_mapping["individual_key"] = np.arange(start=0, stop=len(individual_mapping), step=1)

# Drop a key column left over from a previous run of this cell; otherwise the
# merge would create individual_key_x / individual_key_y.
train = pd.merge(
    train.drop(columns=["individual_key"], errors="ignore"),
    individual_mapping,
    on="individual_id",
)

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Validation folds: stratify on the individual key so each fold receives a
# share of every individual's images where possible.
skf = StratifiedKFold(n_splits=5)
skf_splits = skf.split(X=train.drop(columns="individual_key"), y=train["individual_key"])

# -1 marks "not assigned"; an integer placeholder keeps the column integer-typed.
train["kfold"] = -1
kfold_col = train.columns.get_loc("kfold")
for fold, (train_index, valid_index) in enumerate(skf_splits):
    # split() yields positional indices, hence iloc. ``fold`` is already a
    # Python int (np.int was removed from NumPy and is not needed).
    train.iloc[valid_index, kfold_col] = fold
train["kfold"] = train["kfold"].astype(int)

train.head(3)

In [None]:
TEST_PATH = "../input/happy-whale-and-dolphin/test_images"

# Test table: file name, full path, image code, and image dimensions.
# os.listdir returns entries in arbitrary order; sorting makes the row order
# reproducible between runs.
test = pd.DataFrame({"image": sorted(os.listdir(TEST_PATH))})
test["path"] = TEST_PATH + "/" + test["image"]
test["image_code"] = test["image"].apply(lambda x: x.split(".")[0])

widths, heights = [], []

# The loop variable is deliberately not called ``path``: at notebook level it
# would overwrite the ``path`` module imported in the first cell.
for img_path in tqdm(test["path"]):
    img_width, img_height = imagesize.get(img_path)
    widths.append(img_width)
    heights.append(img_height)

test["width"] = widths
test["height"] = heights
# Pixel count per image (same column name as in the training table).
test["dimention"] = test["width"] * test["height"]

In [None]:
# Preview the first three rows of the test table.
test.head(3)

In [None]:
!pip install -q efficientnet_pytorch
!pip install scipy

In [None]:
import albumentations
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from efficientnet_pytorch import EfficientNet
from numpy import dot, sqrt
from scipy import spatial

from transformers import *

In [None]:
# Settings for the embedding experiment.
STATE = 24              # NOTE(review): presumably the random seed; not referenced in the visible cells
KEYS = list(range(10))  # individual_key values kept when sampling the training table
IMG_SIZE = 256          # images are resized to IMG_SIZE x IMG_SIZE
BATCH_SIZE = 16         # batch size of the DataLoader

In [None]:
# Sample from training: read the table that already carries an
# "individual_key" column and keep only the individuals listed in KEYS.
df = pd.read_csv("../input/happywhale-2022/train.csv")
keep_rows = df["individual_key"].isin(KEYS)
df = df.loc[keep_rows].reset_index(drop=True)
df["path"] = "../input/happy-whale-and-dolphin/train_images/" + df["image"]
df.head()

In [None]:
def get_transforms(img_size=256):
    """Build the image preprocessing pipeline.

    Args:
        img_size: Side length passed to ``Resize`` as both height and width.

    Returns:
        An ``albumentations.Compose`` that applies ``Resize(img_size, img_size)``
        followed by ``Normalize()`` with its default arguments.
    """
    steps = [
        albumentations.Resize(img_size, img_size),
        albumentations.Normalize(),
    ]
    return albumentations.Compose(steps)

In [None]:
class HappyWhale(Dataset):
    """Dataset yielding ``(image, target)`` pairs from a table of image records.

    Args:
        csv: DataFrame whose rows provide ``path`` (image file location) and
            ``individual_key`` (target value).
        transforms: Callable invoked as ``transforms(image=...)`` that returns
            a mapping with the processed array under ``"image"``.
    """

    def __init__(self, csv, transforms = get_transforms(img_size=256)):
        self.csv = csv
        self.transform = transforms

    def __len__(self):
        """Number of rows in the underlying table."""
        return self.csv.shape[0]

    def __getitem__(self, index):
        """Load, transform and return the sample at positional ``index``.

        Returns:
            tuple: ``(image, target)`` where ``image`` is a float32 tensor with
            the channel axis moved first and ``target`` is a tensor holding the
            row's ``individual_key``.

        Raises:
            FileNotFoundError: If the image at ``row.path`` cannot be read.
        """
        row = self.csv.iloc[index]

        image = cv.imread(row.path)
        if image is None:
            # cv.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with a message naming the file.
            raise FileNotFoundError(f"Could not read image: {row.path}")
        # OpenCV loads channels as BGR; convert to RGB.
        image = cv.cvtColor(image, cv.COLOR_BGR2RGB)

        transformed_img = self.transform(image=image)
        transformed_img = transformed_img['image'].astype(np.float32)
        # Move the last (channel) axis to the front.
        image = transformed_img.transpose(2, 0, 1)
        target = torch.tensor(row.individual_key)

        return torch.tensor(image), target
    
# Build the dataset and its loader. shuffle stays at its default (False), so
# batches follow the row order of df.
dataset = HappyWhale(csv=df, transforms=get_transforms(img_size=IMG_SIZE))
loader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#EfficientNet Model
class BackboneModel(nn.Module):
    """Thin wrapper that exposes a pretrained EfficientNet-B7 as ``self.backbone``."""

    def __init__(self):
        super().__init__()
        # Load the pre-trained "efficientnet-b7" weights.
        self.backbone = EfficientNet.from_pretrained("efficientnet-b7")

    def forward(self, img):
        """Return the backbone's output for the image batch ``img``."""
        return self.backbone(img)
    
# Instantiate the backbone, then move it to the selected device.
model = BackboneModel()
model = model.to(device)

In [None]:
#2nd notebook

In [None]:
# Retrieve the model output ("embedding") for every image, batch by batch.
all_embeddings = []
all_targets = []

# Switch to inference mode: torch.no_grad() only disables gradient tracking,
# it does not stop dropout or batch-norm from running in training mode.
model.eval()

with torch.no_grad():
    for img, target in tqdm(loader):
        img = img.to(device)
        img_embedding = model(img)
        # Bring the result back to host memory as a NumPy array.
        img_embedding = img_embedding.detach().cpu().numpy()
        all_embeddings.append(img_embedding)
        all_targets.append(target.numpy())

In [None]:
# Concatenate the per-batch arrays along the first axis.
image_embeddings = np.concatenate(all_embeddings)
image_targets = np.concatenate(all_targets)

print("shape of embed:", image_embeddings[0].shape)

# Persist both arrays next to the notebook.
for file_name, array in (
    ("efficientnet_image_embeddings.npy", image_embeddings),
    ("efficientnet_image_targets.npy", image_targets),
):
    np.save(file_name, array)

In [None]:
# Cosine similarity (1 - cosine distance)
def get_cosine_similarity(embeddings):
    """Compute the pairwise cosine similarity of a collection of vectors.

    Args:
        embeddings: Iterable of 1-D vectors.

    Returns:
        numpy.ndarray: float32 array whose entry ``[i, j]`` is
        ``1 - scipy.spatial.distance.cosine(embeddings[i], embeddings[j])``.
    """
    similarity_matrix = [
        [1 - spatial.distance.cosine(row_vec, col_vec) for col_vec in embeddings]
        for row_vec in embeddings
    ]
    return np.array(similarity_matrix, dtype="float32")

In [None]:
# Select five images of the same individual (key 1); index labels and paths
# are taken from one and the same sample.
examples = df[df["individual_key"] == 1].sample(5, random_state=24)
example_index = examples.index.tolist()
example_paths = examples["path"].tolist()
example_embeds = image_embeddings[example_index]

# Compute the pairwise similarity of the selected embeddings.
cos_matrix = get_cosine_similarity(example_embeds)

# Heatmap mask: 1 on and above the diagonal, 0 below.
mask = np.triu(np.ones_like(cos_matrix))

In [None]:
# Similarity plot: a 6x6 grid with the heatmap in the top-right 5x5 area, one
# thumbnail per heatmap row in the left column and one thumbnail per heatmap
# column in the bottom row.
grid_shape = (6, 6)
fig = plt.figure(figsize=(10, 10))

# Bottom row, columns 1..5: thumbnail k sits under heatmap column k.
h_axes = [plt.subplot2grid(shape=grid_shape, loc=(5, col), colspan=1) for col in range(1, 6)]
# Left column, rows 0..4: thumbnail k sits beside heatmap row k.
v_axes = [plt.subplot2grid(shape=grid_shape, loc=(row, 0), colspan=1) for row in range(5)]

ax11 = plt.subplot2grid(shape=grid_shape, loc=(0, 1), colspan=5, rowspan=5)

# NOTE(review): the matrix holds 1 - cosine distance, i.e. similarities; the
# title text is kept unchanged.
fig.suptitle("Cos Distance", size=21, color="blue", weight="bold")

# Pair axes and paths position by position so thumbnail k is example k
# (the former ``example_paths[k-1]`` shifted every image by one slot, and
# one axes was listed twice).
for axes_group in (h_axes, v_axes):
    for ax, example_path in zip(axes_group, example_paths):
        ax.imshow(plt.imread(example_path))
        ax.set_axis_off()

sns.heatmap(cos_matrix, ax= ax11, fmt=".5", cbar=False,
           annot= True, linewidth=0.5, mask= mask, square=True, cmap="winter_r")
plt.tight_layout()
plt.show()