# Identification rate calculation

This jupyter notebook provide identification rate calculation for cplfw with distractors from megaface dataset with transformed similarity function.

In [30]:
import numpy as np
from sklearn.preprocessing import normalize
from ir_class import IrBigData
import os
from tqdm.notebook import tqdm
import pandas as pd

IrBigData._print_info = False

Define models for which we calculate features. Then run bash scripts that build models for given methods, get features for cplfw and megaface in ./features folder .

In [31]:
methods = ['PCA', 'norms']

! python3 create_models.py --methods {' '.join(methods)}
! python3 calculate_features.py --methods {' '.join(methods)} --datasets cplfw megaface 

outliers: 100%|###################################| 3/3 [00:06<00:00,  2.28s/it]


Our basic similarity function is cosine distance, thus, for simplicity we normalize embeddings and use dot product of embeddings on a sphere.

In [32]:
cplfw_emb = normalize(np.load('image_embeddings/cplfw.npy'))
megaface_emb = normalize(np.load('image_embeddings/megaface.npy'))

with open('image_embeddings/labels/cplfw_labels.txt', encoding='utf-8') as txt_labels_file:
    lines = txt_labels_file.readlines()
cplfw_labels = np.array([i.rstrip('\n') for i in lines])


Define parameters we want to get, details are in IrBigData class documentaion. 

In [33]:
parameters_ir = {
    "similarity_type": "features",
    "fpr_threshold": 1e-5,
    "dist_type": "max_threshold",
    "protocol": "data_distractors_no_false_pairs",
    "N_distractors": 10000
}

Next for each method we run experiments with 2 similarity functions: basic 'cosine' and 'features'. Cosine similarity_type $s(x ,y)$ corresponds to dot product of embeddings x, y on a sphere, for features we use next function:

$$s_h(x, y) = \begin{cases}
s(x ,y), &\max(o(x)), o(y))) < \alpha, \\
0, &\max(o(x), o(y)) \geq \alpha,
\end{cases}$$

where $t$ is a threshold parameter and $o(x)$ is a feature of $x$ embedding. To calculate the best $\alpha$ we make grid of threshold parameter 'alpha' around 0.99 quantile of features, that corresponds to proportion of outliers in cplfw dataset, detected by eye. For each case we calculate identification and verification (identification with no outliers) rate.


In [63]:
# choose random distractors
indices_random = np.random.choice(len(megaface_emb), 
                                  size=parameters_ir['N_distractors'], 
                                  replace=False)
megaface_emb = megaface_emb[indices_random]


results_dict = {}
results_vr_dict = {}

pbar = tqdm(methods)
for method in pbar:
    pbar.set_description(method)        
    results_dict[method] = {'cosine':{}, 'features':{}}
#     results_vr_dict[method] = {}
    results_arr = []
#     results_vr_arr = []

    cplfw_features = np.load('features/cplfw/{}_dist.npy'.format(method))
    megaface_features = np.load('features/megaface/{}_dist.npy'.format(method))[indices_random]
    
    IR = IrBigData(cplfw_emb, cplfw_features, 
               cplfw_labels, parameters_ir, distractors=megaface_emb, 
               distractor_features=megaface_features)
    IR.params['similarity_type'] = 'features'
    
    quantiles_arr = [np.quantile(cplfw_features, i) for i in [0.984 + 0.001*i for i in range(11)]]
    for alpha in tqdm(quantiles_arr, leave=False):
        IR.params['alpha'] = alpha
        IR.main()
        results_arr.append(IR.CMT_)
        results_vr_arr.append(IR.VR_)
    
    results_dict[method]['features']['ir'] = max(results_arr)
    results_dict[method]['features']['vr'] = max(results_vr_arr)
#     results_vr_dict[method]['features'] = max(results_vr_arr)
    
    IR = IrBigData(cplfw_emb, None,
               cplfw_labels, parameters_ir, distractors=megaface_emb, 
               distractor_features=None)
    IR.params['similarity_type'] = 'cosine'
    IR.main()
    results_dict[method]['cosine']['ir'] = IR.CMT_
    results_dict[method]['cosine']['vr'] = IR.VR_
#     results_vr_dict[method]['cosine'] = IR.VR_
    

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [66]:
results_dict

{'PCA': {'cosine': {'ir': 0.664875021849327, 'vr': 0.6664918720503409},
  'features': {'ir': 0.6647002272330013, 'vr': 0.7443628736234924}},
 'norms': {'cosine': {'ir': 0.664875021849327, 'vr': 0.6664918720503409},
  'features': {'ir': 0.6647002272330013, 'vr': 0.7443628736234924}}}

In [67]:
a = pd.DataFrame.from_dict(results_dict)
a

Unnamed: 0,PCA,norms
cosine,"{'ir': 0.664875021849327, 'vr': 0.666491872050...","{'ir': 0.664875021849327, 'vr': 0.666491872050..."
features,"{'ir': 0.6647002272330013, 'vr': 0.74436287362...","{'ir': 0.6647002272330013, 'vr': 0.74436287362..."
