<a href="https://colab.research.google.com/github/sartajsehgal/Data-Science/blob/main/Kinship_Relationship_DSEvaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import cv2
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from imageio import imread
from skimage.transform import resize
from scipy.spatial import distance
from keras.models import load_model
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the competition CSV files (paths presumably point at a mounted Google Drive
# folder in Colab -- confirm the drive is mounted before running this cell).
# train_relationships.csv -> train_df; it is not referenced again in the cells shown here.
train_df = pd.read_csv("/content/drive/MyDrive/Kaggle/SMILE_dataset/train_relationships.csv")
# sample_submission.csv -> test_df; its `img_pair` column is read later to look up
# the two images of every pair that has to be scored.
test_df = pd.read_csv("/content/drive/MyDrive/Kaggle/SMILE_dataset/sample_submission.csv")

In [3]:
# Location of the saved Keras model file (facenet_keras.h5).
model_path = "/content/drive/MyDrive/Kaggle/SMILE_dataset/keras-facenet/keras-facenet/model/facenet_keras.h5"
# Load the model once; `calc_embs` reads this global and calls `model.predict_on_batch`.
model = load_model(model_path)



In [4]:
def prewhiten(x):
    """Standardize an array to zero mean and (floored) unit standard deviation.

    A 4-D input is standardized independently for each item along axis 0
    (statistics taken over axes 1-3); a 3-D input is standardized as a whole.
    The standard deviation is floored at ``1 / sqrt(number of elements)`` so
    that (near-)constant inputs do not cause a division by zero.

    Args:
        x: NumPy array with 3 or 4 dimensions.

    Returns:
        Array of the same shape as ``x`` holding the standardized values.

    Raises:
        ValueError: if ``x`` has neither 3 nor 4 dimensions.
    """
    rank = x.ndim
    if rank not in (3, 4):
        raise ValueError('Dimension should be 3 or 4')

    if rank == 4:
        # Batched input: per-item statistics, element count of a single item.
        reduce_axes, n_elems = (1, 2, 3), x[0].size
    else:
        # Single item: statistics over every element.
        reduce_axes, n_elems = (0, 1, 2), x.size

    centre = np.mean(x, axis=reduce_axes, keepdims=True)
    spread = np.std(x, axis=reduce_axes, keepdims=True)
    # Lower bound on the divisor keeps flat inputs finite.
    floor = 1.0/np.sqrt(n_elems)
    return (x - centre) / np.maximum(spread, floor)

def l2_normalize(x, axis=-1, epsilon=1e-10):
    """Scale ``x`` to unit Euclidean length along ``axis``.

    Args:
        x: NumPy array to normalize.
        axis: Axis along which the squared values are summed (default: last).
        epsilon: Lower bound applied to the squared norm before the square
            root, so all-zero vectors are divided by ``sqrt(epsilon)`` instead
            of zero.

    Returns:
        Array with the same shape as ``x``.
    """
    squared_norm = np.sum(np.square(x), axis=axis, keepdims=True)
    safe_norm = np.sqrt(np.maximum(squared_norm, epsilon))
    return x / safe_norm

def load_and_align_images(filepaths, margin,image_size = 160):
    """Read every image and resize it to ``image_size`` x ``image_size``.

    Args:
        filepaths: Iterable of image file paths, read with ``imread``.
        margin: Accepted for call compatibility; not used by this function.
        image_size: Side length in pixels of the square output images.

    Returns:
        ``np.array`` built from the resized images, in the order of
        ``filepaths``.
    """
    target_shape = (image_size, image_size)
    resized = [
        resize(imread(path), target_shape, mode='reflect')
        for path in filepaths
    ]
    return np.array(resized)

In [5]:
def calc_embs(filepaths, margin=10, batch_size=512):
    """Compute L2-normalized model embeddings for a list of image files.

    The files are processed in consecutive slices of ``batch_size``; every
    slice is loaded, prewhitened and passed to the global ``model`` via
    ``predict_on_batch``. The per-batch outputs are concatenated in input
    order and normalized with ``l2_normalize``.

    Args:
        filepaths: Sequence of image paths (must support ``len`` and slicing).
        margin: Forwarded to ``load_and_align_images``.
        batch_size: Number of images sent to the model per call.

    Returns:
        Array with one normalized embedding row per entry of ``filepaths``.
    """
    batch_outputs = []
    for offset in tqdm(range(0, len(filepaths), batch_size)):
        chunk = filepaths[offset:offset+batch_size]
        images = load_and_align_images(chunk, margin)
        whitened = prewhiten(images)
        batch_outputs.append(model.predict_on_batch(whitened))

    stacked = np.concatenate(batch_outputs)
    return l2_normalize(stacked)

In [6]:
# File names found in the test folder. Their order here fixes the row order of
# `test_embs`, so any later name -> row lookup must be built from this same list.
test_images = os.listdir("/content/drive/MyDrive/Kaggle/SMILE_dataset/test/")
# One embedding per test file (long-running: every image is read and passed through the model).
test_embs = calc_embs([os.path.join("/content/drive/MyDrive/Kaggle/SMILE_dataset/test/", f) for f in test_images])
# Optional: uncomment to save the embeddings to disk.
# np.save("test_embs.npy", test_embs)
# Display the embedding matrix as the cell output.
test_embs

100%|██████████| 13/13 [06:27<00:00, 29.79s/it]


array([[ 0.03484537,  0.12908678, -0.0181959 , ..., -0.2585555 ,
        -0.13148756, -0.01973889],
       [ 0.04109402,  0.04779609,  0.12647359, ..., -0.07085994,
         0.05687727,  0.044689  ],
       [-0.0494254 , -0.01584665,  0.08263745, ..., -0.05969948,
         0.10659206, -0.14784393],
       ...,
       [ 0.06496517,  0.02294946,  0.07013849, ..., -0.2266992 ,
        -0.05978282, -0.01152707],
       [ 0.05065063, -0.05268567, -0.08779337, ...,  0.04090406,
         0.03944483, -0.00751425],
       [-0.04110827,  0.01781067, -0.0139101 , ..., -0.03060862,
        -0.11479206,  0.02817542]], dtype=float32)

In [7]:
# Placeholder for the pair distances. Initialised with a float so the column is
# float64 from the start: filling an int64 column with float distances relies
# on silent upcasting, which pandas has deprecated (FutureWarning since 2.1).
test_df["distance"] = 0.0

# Map every test file name to its row index in `test_embs`; both follow the
# order of `test_images`.
img2idx = {img: idx for idx, img in enumerate(test_images)}

In [8]:
# Euclidean distance between the embeddings of the two images in every pair.
# `img_pair` holds the two file names joined by "-". The distances are collected
# in a list and written to the column in a single assignment, which avoids
# per-row `.loc` writes and gives the column a float dtype regardless of how it
# was initialised.
pair_distances = []
for img_pair in tqdm(test_df["img_pair"], total=len(test_df)):
    imgs = [test_embs[img2idx[img]] for img in img_pair.split("-")]
    pair_distances.append(distance.euclidean(*imgs))
test_df["distance"] = pair_distances

100%|██████████| 5310/5310 [00:02<00:00, 2544.36it/s]


In [9]:
# Pair distances as a NumPy array, and their total, which is used as the
# normalizer when the distances are turned into scores.
all_distances = test_df["distance"].values
sum_dist = all_distances.sum()

In [10]:
# Convert every distance into a score: one minus the share of the total
# distance contributed by pairs that are at most this far apart. Smaller
# distances therefore receive larger scores.
probs = []
for dist in tqdm(all_distances):
    not_farther = all_distances <= dist
    share = all_distances[not_farther].sum() / sum_dist
    probs.append(1 - share)

100%|██████████| 5310/5310 [00:00<00:00, 25455.82it/s]


In [11]:
# Build the submission from a fresh copy of the sample file.
sub_df = pd.read_csv("/content/drive/MyDrive/Kaggle/SMILE_dataset/sample_submission.csv")
# Bracket assignment always writes a DataFrame column. Attribute-style
# assignment (`sub_df.is_related = ...`) only does so when the column already
# exists; otherwise it sets a plain Python attribute and the scores would be
# silently missing from the CSV.
sub_df["is_related"] = probs
# Written to the current working directory, without the index column.
sub_df.to_csv("submission.csv", index=False)