# **PetFinder-Finding Duplicates With CNN**

Inspired by [schulta's](https://www.kaggle.com/schulta) [work on identifying duplicates](https://www.kaggle.com/schulta/petfinder-identify-duplicates-and-share-findings/notebook), I tried to find duplicate images with a pretrained CNN rather than image hashing. The reason is that CNN features are better at recognising the same image when one copy is rotated, translated, etc.



# Method

I used a pretrained EfficientNetB2 model with ImageNet weights as a feature extractor (I take the features right after the Global Average Pooling layer).

Then I calculated the cosine similarity between every pair of images and visualized the most similar pairs.

# Findings

* 37 images are exactly the same. This number is obtained when the cosine similarity threshold is set to 0.9
* For similarity values between 0.8 and 0.9, images of the same animal are captured. In this range the images are very similar, yet their paw scores differ considerably
* The above shows that scaling, rotating or applying similar augmentations may yield different paw scores. In my opinion, we should either discard one of these images or change our augmentation methods

In [None]:
!pip uninstall efficientnet
!pip install -U git+https://github.com/qubvel/efficientnet

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from tensorflow.keras.models import Model
from PIL import Image
import efficientnet.tfkeras as efn
from efficientnet.tfkeras import preprocess_input
import math
from tqdm import tqdm
import random
import matplotlib.pyplot as plt

In [None]:
def calculate_size(current_size, target_size):
    """
    Calculates the size of the image when it is resized to fit inside target_size
    while keeping the aspect ratio.

    The computation is done with exact integer arithmetic: a floating point
    round trip such as ``int(1 / 49 * 49)`` gives 0 instead of 1, which would
    shrink the result by one pixel or even produce an empty side.

    Params:
    current_size (tuple): Size of the current image; both entries must be non-zero
    target_size (tuple):  Desired size of the image, same axis order as current_size
    Returns:
    Tuple of ints: the largest size that fits in target_size with the aspect
    ratio kept (rounded down). Each side is at least 1 pixel.
    Raises:
    ZeroDivisionError: If the limiting side of current_size is 0.
    """
    cur_w, cur_h = current_size[0], current_size[1]
    tgt_w, tgt_h = target_size[0], target_size[1]

    # tgt_w / cur_w <= tgt_h / cur_h, compared by cross-multiplication so that
    # no rounding error can pick the wrong limiting side.
    if tgt_w * cur_h <= tgt_h * cur_w:
        # The first axis is the limiting one: it reaches the target exactly.
        new_w = tgt_w
        new_h = (cur_h * tgt_w) // cur_w
    else:
        # The second axis is the limiting one.
        new_h = tgt_h
        new_w = (cur_w * tgt_h) // cur_h

    # Very elongated images would otherwise collapse to a 0-pixel side.
    return (max(1, int(new_w)), max(1, int(new_h)))


def pad_to_n(arr, new_dim):
    """
    Zero-pads a 2d or 3d array at the bottom and right so that its first two
    axes match a new dimension. The input is placed in the top-left corner.
    Params:
    arr (ndarray): Array of shape (h, w) or (h, w, c) to be padded.
    new_dim (tuple): (rows, cols) of the result; the channel axis is kept as is.
    Returns:
    New zero-filled array (numpy default float dtype) containing arr.
    """
    rows, cols = new_dim[0], new_dim[1]

    if arr.ndim == 3:
        height, width, channels = arr.shape
        padded = np.zeros((rows, cols, channels))
        padded[:height, :width, :channels] = arr
        return padded

    # Anything else is treated as a plain 2d array.
    height, width = arr.shape
    padded = np.zeros((rows, cols))
    padded[:height, :width] = arr
    return padded

def load_img(img_path, preprocess=None, target_size=None, same_aspect=False):
    """
    Loads image file in given target size and applies preprocess to loaded file.
    Params:
    img_path: Path of the image file to be loaded.
    preprocess: Preprocess function to apply to the image. It always receives
    a numpy array. If passed as None then no preprocess will be applied.
    target_size: Size of the image to be loaded in (width, height). If passed
    as None then the image will be loaded in its original size.
    same_aspect: Whether to keep the same aspect ratio while resizing. The
    resized image is then zero-padded at the bottom/right up to target_size.
    Only has an effect when target_size is provided.
    Returns:
    The result of preprocess when it is given. Otherwise a numpy array when
    target_size is given, or the opened PIL image when it is not.
    """
    img = Image.open(img_path)

    if target_size is not None:
        if same_aspect:
            # When aspect ratio is kept, the image usually cannot fill the
            # whole target size, so it is resized to the closest fit and the
            # remainder is padded.
            possible_size = calculate_size(img.size, target_size)
            img = np.array(img.resize(possible_size))
            # target_size is (width, height) while arrays are (rows, cols).
            padded_shape = (target_size[1], target_size[0])
            img = pad_to_n(img, padded_shape).astype('uint8')
        else:
            img = np.array(img.resize(target_size))

    if preprocess is not None:
        # The resized branches hand an ndarray to preprocess; do the same when
        # no resizing happened and img is still a PIL image.
        if not isinstance(img, np.ndarray):
            img = np.array(img)
        img = preprocess(img)

    return img

# Model Creation

In [None]:
# EfficientNetB2 network initialised with the 'imagenet' weights.
model = efn.EfficientNetB2(weights='imagenet')
# Output of the 'avg_pool' layer is what is used as the feature vector.
feature_layer = model.get_layer('avg_pool')
# Sub-model mapping the network input to the 'avg_pool' output.
feature_extractor = Model(model.input, feature_layer.output)

In [None]:
# Directory holding the training images and the table that lists them.
train_imgs_dir = '/kaggle/input/petfinder-pawpularity-score/train/'
train_df = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv')
# Number of rows in the training table.
N_EXAMPLES = len(train_df)

In [None]:
# Accumulators that are filled while iterating over the training table.
features, paw_scores, img_paths, image_ids = [], [], [], []

# Rows handled per step, and the number of steps needed to cover every row
# (rounded up so that a partial last step is included).
batch_size = 24
iter_num = math.ceil(len(train_df) / batch_size)

In [None]:
for i in tqdm(range(iter_num)):
    # Rows of the training table that belong to this step.
    batch_df = train_df.iloc[i * batch_size:(i + 1) * batch_size]
    batch_ids = batch_df['Id'].tolist()
    batch_paws = batch_df['Pawpularity'].tolist()
    batch_paths = [os.path.join(train_imgs_dir, pid + '.jpg') for pid in batch_ids]

    # Load every image of the step as a 260x260 preprocessed array and stack them.
    batch_imgs = np.array([
        load_img(
            img_path,
            preprocess=preprocess_input,
            target_size=(260, 260),
            same_aspect=False,
        )
        for img_path in batch_paths
    ])

    # One feature array per step; the steps are joined after the loop.
    batch_features = feature_extractor.predict(batch_imgs)
    features.append(batch_features)

    # Per-image bookkeeping, kept in the same order as the features.
    paw_scores.extend(batch_paws)
    img_paths.extend(batch_paths)
    image_ids.extend(batch_ids)

In [None]:
# Every entry of `features` holds the feature rows of one batch. Joining them
# along the batch axis gives one row per image. Unlike stacking the batches
# into a 3d array and reshaping it, this also works when the last batch is
# shorter than the others.
np_features = np.concatenate(features, axis=0)
paw_scores = np.array(paw_scores)
img_paths = np.array(img_paths)

In [None]:
def plot_similar_pairs(xs, ys):
    """
    Shows each similar pair side by side, titled with the paw scores.
    Images and scores are looked up in the module level img_paths / paw_scores.
    Params:
        xs: Indices of the first image of every pair
        ys: Indices of the second image of every pair; ys[k] is the partner of xs[k]
    """
    for first_idx, second_idx in zip(xs, ys):
        pair = (first_idx, second_idx)

        # Original-size, unprocessed images and their scores.
        images = [
            load_img(img_paths[idx], preprocess=None, target_size=None, same_aspect=False)
            for idx in pair
        ]
        scores = [paw_scores[idx] for idx in pair]

        fig, axes = plt.subplots(1, 2)
        for axis, image, score in zip(axes, images, scores):
            axis.imshow(image)
            axis.set_title("Paw Score: {:.2f}".format(score))

        # Axis ticks carry no information for photos, so hide them.
        for axis in axes:
            axis.set_xticks([])
            axis.set_yticks([])

        plt.show()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Square matrix of cosine similarities between all rows of np_features,
# i.e. between every pair of images.
cos_similarity = cosine_similarity(np_features)

In [None]:
# Diagonal entries are similarity scores between same images
# Make them 0 to not include them later. fill_diagonal works in place and
# avoids building a full N x N boolean mask.
np.fill_diagonal(cos_similarity, 0)

# Samples With Similarity > 0.9

In [None]:
SIM_THRES = 0.9
# True wherever two images are more similar than the threshold.
sim_mask = cos_similarity > SIM_THRES
# The matrix is symmetric, so only its upper half is kept;
# otherwise each pair would be reported twice.
half_sim_mask = np.triu(sim_mask)
# Row / column indices of the remaining pairs.
xs, ys = np.where(half_sim_mask)

In [None]:
# Display every pair currently held in xs / ys.
plot_similar_pairs(xs, ys)

# Samples With Similarity Between 0.85-0.9

In [None]:
LOW_SIM_THRES = 0.85
UP_SIM_THRES = 0.9
# Pairs strictly above the lower bound ...
sim_mask_low = cos_similarity > LOW_SIM_THRES
# ... and strictly below the upper bound.
sim_mask_up = cos_similarity < UP_SIM_THRES
sim_mask = sim_mask_low & sim_mask_up
# The matrix is symmetric, so only its upper half is kept;
# otherwise each pair would be reported twice.
half_sim_mask = np.triu(sim_mask)
xs, ys = np.where(half_sim_mask)

In [None]:
# Display every pair currently held in xs / ys.
plot_similar_pairs(xs, ys)

In [None]:
LOW_SIM_THRES = 0.77
# The upper bound lies above 1, so it leaves the selection open at the top.
UP_SIM_THRES = 1.01
sim_mask_low = cos_similarity > LOW_SIM_THRES
sim_mask_up = cos_similarity < UP_SIM_THRES
sim_mask = sim_mask_low & sim_mask_up
# The matrix is symmetric, so only its upper half is kept;
# otherwise each pair would be reported twice.
half_sim_mask = np.triu(sim_mask)
xs, ys = np.where(half_sim_mask)

In [None]:
# Every index that occurs on either side of a selected pair, without repeats.
to_delete_indx = np.unique(np.concatenate((xs, ys)))
print("There are {} images to be deleted".format(to_delete_indx.size))
# Translate the positions into the image ids collected during feature extraction.
to_delete_img_ids = [image_ids[row] for row in to_delete_indx]

In [None]:
# Keep only the rows whose Id is not marked for deletion. isin is a
# vectorised membership test, replacing a per-row Python lookup in a list.
keep_mask = ~train_df['Id'].isin(to_delete_img_ids)
cleaned_df = train_df[keep_mask]
cleaned_df.to_csv('duplicate_removed_train.csv', index=False)