In [1]:
# import libaries

from PIL import Image
from tqdm.notebook import tqdm
import numpy as np
import random
import sys
import pandas as pd
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchinfo import summary
from torchvision import transforms
from torch import optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, ReLU, ReLU6, Sigmoid, Dropout2d, Dropout, AvgPool2d, MaxPool2d, AdaptiveAvgPool2d, Sequential, Module, Parameter
from statistics import mean

import datetime
import os

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix

from deepface import DeepFace

In [2]:
seed = 41
# Define custom dataset
class FaceDataset(Dataset):
    def __init__(self, df, train, num_sample = None, transform = None, num_img_pool = 10, ):
        # set random seed for FaceDataset
        np.random.seed(seed)
        random.seed(seed)
        # create constructors
        self.unique_img_name = None
        self.train = train
        self.data = dict()
        self.images = list()
        self.identities = list()
        # label to indices
        # self.label_to_indices = dict()
        self.labels = list()
        self.img_name = list()
        # read csv file
        self.df = df
        # set the transformation
        self.transform = transform
        # drop last n row from dataframe
        self.df = self.df.head(num_sample)
        #get the length of entire dataset
        self.len_ = len(self.df)
        if num_sample is None or num_sample > self.len_:
            num_sample = self.len_
        # load imgs
        self.load_imgs(self.df, num_imgs = num_img_pool, max = num_sample)

    def __len__(self):
        return self.len_

    def __getitem__(self, idx):
        if self.train:
            anchor_img = self.images[idx]
            anchor_label = self.labels[idx]
            pos_idx = np.random.choice(np.arange(len(self.images))[self.labels == anchor_label])
            neg_idx = np.random.choice(np.arange(len(self.images))[self.labels != anchor_label])

            pos_img = self.images[pos_idx]
            neg_img = self.images[neg_idx]

            pos_label = self.labels[pos_idx]
            neg_label = self.labels[neg_idx]

            if self.transform is None:
                img_to_tensor = transforms.ToTensor()
                anchor_img = img_to_tensor(anchor_img)
                pos_img = img_to_tensor(pos_img)
                neg_img = img_to_tensor(neg_img)
            else:
                anchor_img = self.transform(anchor_img)
                pos_img = self.transform(pos_img)
                neg_img = self.transform(neg_img)
            return anchor_img, pos_img, neg_img

        else:
            label = False
            anchor_img = self.images[idx]
            anchor_label = self.labels[idx]
            if idx % 2 == 0:
                test_idx = np.random.choice(np.arange(len(self.images))[self.labels == anchor_label])
                label = True

            else:
                test_idx = np.random.choice(np.arange(len(self.images))[self.labels != anchor_label])

            neg_idx = np.random.choice(np.arange(len(self.images))[self.labels != anchor_label])
            test_img = self.images[test_idx]
            neg_img = self.images[neg_idx]

#             if self.transform is None:
#                 img_to_tensor = transforms.ToTensor()
#                 anchor_img = img_to_tensor(anchor_img)
#                 test_img = img_to_tensor(test_img)
#                 neg_img = img_to_tensor(neg_img)

#             else:
#                 anchor_img = self.transform(anchor_img)
#                 test_img = self.transform(test_img)
#                 neg_img = self.transform(neg_img)

            return anchor_img, test_img, neg_img ,label, self.img_name[test_idx], self.identities[test_idx]

    # load imgs from pandas to memory and define the maximum number of images
    def load_imgs(self, df, num_imgs, max):
        # iterate thought each row
        for i, row in tqdm(df.iterrows(), total = max):
            # get identity of each row
            row_identity = row['identity']
            # append each identity to numberical value
            # self.label_to_indices[int(row_identity)] = i
            count_img = 0
            # loop imgs in each identity
            for img_name in row['path']:
                if count_img > num_imgs:
                    break
                # concatenate the directoru and image name
                # path_to_image = self.dir+img_name
                path_to_image = img_name
                # open image and convert to RGB
#                 img = Image.open(path_to_image).convert('RGB')
                self.labels.append(i)
                self.images.append(path_to_image)
                
                self.img_name.append(path_to_image)
                self.identities.append(str(row_identity))
                
                count_img += 1  # print('Added img '+ str(row_identity))
        self.labels = np.array(self.labels)

In [3]:
ds_df = pd.read_csv('./digiface_csv_files/digi_all.csv')
ds_df = ds_df.groupby('identity')['path'].apply(list).reset_index()
ds_df

Unnamed: 0,identity,path
0,0,"[digiFace1M\subjects_0-1999_72_imgs\0\20.png, ..."
1,1,"[digiFace1M\subjects_0-1999_72_imgs\1\66.png, ..."
2,2,"[digiFace1M\subjects_0-1999_72_imgs\2\29.png, ..."
3,3,"[digiFace1M\subjects_0-1999_72_imgs\3\42.png, ..."
4,4,"[digiFace1M\subjects_0-1999_72_imgs\4\33.png, ..."
...,...,...
72661,199994,[digiFace1M\subjects_166666-199998_5_imgs\1999...
72662,199995,[digiFace1M\subjects_166666-199998_5_imgs\1999...
72663,199996,[digiFace1M\subjects_166666-199998_5_imgs\1999...
72664,199997,[digiFace1M\subjects_166666-199998_5_imgs\1999...


In [4]:
from sklearn.model_selection import train_test_split
seed = 41
# splitting each dataset
train_df, eval_df = train_test_split(ds_df, test_size= 0.3, shuffle = True, random_state = seed)
val_df, test_df = train_test_split(eval_df, test_size = 0.4, shuffle = True, random_state = seed)

# print to check size of each dataset
print(f'Train Size: {len(train_df)}')
print(f'Val Size: {len(val_df)}')
print(f'Test Size: {len(test_df)}')

train_df.head(5)

Train Size: 50866
Val Size: 13080
Test Size: 8720


Unnamed: 0,identity,path
26344,120344,[digiFace1M\subjects_100000-133332_5_imgs\1203...
61833,189166,[digiFace1M\subjects_166666-199998_5_imgs\1891...
46921,174254,[digiFace1M\subjects_166666-199998_5_imgs\1742...
4873,8873,[digiFace1M\subjects_8000-9999_72_imgs\8873\54...
19875,113875,[digiFace1M\subjects_100000-133332_5_imgs\1138...


In [5]:
batch_size = 64
split = 5
k_fold = KFold(n_splits = split, shuffle = True, random_state=seed)

val_dataset = FaceDataset(df = test_df, num_sample = 10000, train= False)
val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = True, pin_memory=True)

  0%|          | 0/8720 [00:00<?, ?it/s]

In [6]:
metrics = ["cosine", "euclidean", "euclidean_l2"]
models = ["VGG-Face", "Facenet", "Facenet512", "OpenFace", "DeepFace", "DeepID", "ArcFace", "Dlib"]

result = DeepFace.verify(img1_path = val_df[val_df['identity'] == 100992]['path'].item()[0], img2_path = val_df[val_df['identity'] == 100992]['path'].item()[1], model_name = models[1], distance_metric = metrics[2], enforce_detection=False)


In [7]:
result

{'verified': False,
 'distance': 1.0183921163657763,
 'threshold': 0.8,
 'model': 'Facenet',
 'detector_backend': 'opencv',
 'similarity_metric': 'euclidean_l2',
 'facial_areas': {'img1': {'x': 12, 'y': 17, 'w': 88, 'h': 88},
  'img2': {'x': 6, 'y': 16, 'w': 78, 'h': 78}},
 'time': 19.03}

In [10]:
temp_pos_result = {
    'verified': list(),
    'distance': list(),
    'threshold': list(),
    'model': list(),
    'detector_backend': list(),
    'similarity_metric': list(),
    'identity1': list(),
    'identity2': list(),
    'label': list(),
    'time': list()
}

temp_pos_neg_result = {
    'verified': list(),
    'distance': list(),
    'threshold': list(),
    'model': list(),
    'detector_backend': list(),
    'similarity_metric': list(),
    'identity1': list(),
    'identity2': list(),
    'label': list(),
    'time': list()
}

fold_count = 0

test_config = {
    'skip_to_fold': 4,
    'model_name': models[1],
    'distance_matrix': metrics[2]
}

for fold, (train_idx, valid_idx) in tqdm(enumerate(k_fold.split(val_dataset)),total = split):
    
    valid_subsampler = torch.utils.data.Subset(val_dataset,valid_idx)

    if fold+1 != test_config['skip_to_fold']:
        print(f'skipping fold {fold+1}/{split}')
        continue
    print(f'starting fold {fold+1}/{split}')

    for anchor_img, test_img, _, label, _, _ in tqdm(valid_subsampler):

        pos_result = DeepFace.verify(img1_path = anchor_img, img2_path = test_img, model_name = test_config['model_name'], distance_metric = test_config['distance_matrix'], enforce_detection=False)
        del pos_result['facial_areas']
        pos_result['identity1'] = anchor_img.split('\\')[2]
        pos_result['identity2'] = test_img.split('\\')[2]
        pos_result['label'] = label
        
        for key in pos_result.keys():
            temp_pos_result[str(key)].append(pos_result[str(key)])

#     break

pos_df = pd.DataFrame(temp_pos_result)
# pos_neg_df = pd.DataFrame(temp_pos_neg_result)
pos_df.to_csv('./'+test_config['model_name']+ str(test_config['skip_to_fold']) +'_5_fold.csv')
# pos_neg_df.to_csv('./'+test_config['model_name']+'_true_neg'+ str(test_config['skip_to_fold']) +'_5_fold.csv')

  0%|          | 0/5 [00:00<?, ?it/s]

skipping fold 1/5
skipping fold 2/5
skipping fold 3/5
starting fold 4/5


  0%|          | 0/1744 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
pos_df.to_csv('./'+test_config['model_name']+'_pos_' + str(test_config['skip_to_fold']) +'_5_fold.csv')
pos_neg_df.to_csv('./'+test_config['model_name']+'_true_neg'+ str(test_config['skip_to_fold']) +'_5_fold.csv')

NameError: name 'pos_neg_df' is not defined