In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
from torchvision import models,transforms
import matplotlib.pyplot as plt
import pickle
from collections import OrderedDict
import csv
import collections
from  PIL import Image
from tqdm.notebook import tqdm_notebook
from scipy.spatial import distance
import warnings
warnings.filterwarnings('ignore')
import math
# Run on Apple's Metal (MPS) backend when it is actually usable, otherwise on CPU.
# torch.backends.mps.is_available() replaces the deprecated torch.has_mps attribute,
# which only reported whether PyTorch was built with MPS support.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
from itertools import product
import resnet50
random_state = 1

In [2]:
# Build the network and fill it with the weights stored in a pickled {name: ndarray} mapping.
model_scratch = resnet50.make_model()
fname = 'weights/resnet50_ft_weight.pkl'
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load weight files that come from a trusted source.
with open(fname, 'rb') as f:
    weights = pickle.load(f, encoding='latin1')

# The state_dict tensors share storage with the model, so copy_ writes straight into it.
own_state = model_scratch.state_dict()
for name, param in weights.items():
    if name not in own_state:
        raise KeyError('unexpected key "{}" in state_dict'.format(name))
    try:
        own_state[name].copy_(torch.from_numpy(param))
    except Exception as exc:
        # Chain the original error so the real cause (shape, dtype, ...) stays visible.
        raise RuntimeError('While copying the parameter named {}, whose dimensions in the model are {} and whose '\
                            'dimensions in the checkpoint are {}.'.format(name, own_state[name].size(), param.shape)) from exc
model_scratch = model_scratch.to(device)

In [12]:
### load data
# Locations of the FairFace label CSVs and of the train/val image folder.
# NOTE(review): later cells build RFW-style paths from this same `path`
# (e.g. path + 'Caucasian/Caucasian_images.txt') and pass `img_path` to the RFW
# dataset class -- confirm these FairFace locations are what those cells expect.
path = "data/FairFace_Paper/Labels/"
img_path = 'data/FairFace_Paper/Images/fairface-img-margin025-trainval/'

# train
train_images = pd.read_csv(path + 'fairface_label_train.csv')
# val
val_images = pd.read_csv(path + 'fairface_label_val.csv')
val_images

Unnamed: 0,file,age,gender,race,service_test
0,val/1.jpg,3-9,Male,East Asian,False
1,val/2.jpg,50-59,Female,East Asian,True
2,val/3.jpg,30-39,Male,White,True
3,val/4.jpg,20-29,Female,Latino_Hispanic,True
4,val/5.jpg,20-29,Male,Southeast Asian,False
...,...,...,...,...,...
10949,val/10950.jpg,30-39,Male,White,True
10950,val/10951.jpg,50-59,Male,White,False
10951,val/10952.jpg,60-69,Male,Latino_Hispanic,False
10952,val/10953.jpg,20-29,Female,East Asian,False


In [3]:



def _load_rfw_image_list(label_root, ethnicity):
    """Read one RFW per-ethnicity image list into a DataFrame.

    :param label_root: directory prefix (ending with a separator) that holds one sub-folder per ethnicity
    :param ethnicity: sub-folder name; also stored in the ``Ethnicity`` column
    :return: DataFrame with columns File, Label, identityID, faceID, Ethnicity
    """
    images = pd.read_csv(label_root + ethnicity + '/' + ethnicity + '_images.txt', sep="\t", header=None)
    images.columns = ['File', 'Label']
    # File names look like '<identityID>_<4-digit faceID>.jpg'.
    images['identityID'] = images['File'].str[:-9]
    images['faceID'] = images['File'].str[-8:-4]
    images['Ethnicity'] = ethnicity
    return images


# NOTE(review): `path` must point at the RFW label root here. The African and Asian lists are
# assumed to follow the same '<Ethnicity>/<Ethnicity>_images.txt' layout as the Caucasian and
# Indian ones -- confirm. Previously `african_images` was never defined and the Asian list was
# read from the FairFace CSV, so this cell could not run on a fresh kernel.
african_images = _load_rfw_image_list(path, 'African')
asian_images = _load_rfw_image_list(path, 'Asian')
caucasian_images = _load_rfw_image_list(path, 'Caucasian')
indian_images = _load_rfw_image_list(path, 'Indian')
all_images = pd.concat([african_images, caucasian_images])

# remove any identity that is listed under more than one ethnicity
ethnicities_per_identity = all_images.reset_index().groupby('identityID').Ethnicity.nunique()
dup = ethnicities_per_identity[ethnicities_per_identity > 1].index.tolist()
all_images = all_images[~all_images['identityID'].isin(dup)]

# the '_0001' image of each identity is its reference; every other image is a candidate
identities = np.array(all_images.identityID.unique().tolist()).astype(object)
first_images = identities + '_0001.jpg'

is_reference = all_images['File'].isin(first_images)
references = all_images[is_reference]
candidates = all_images[~is_reference]
references

Unnamed: 0,File,Label,identityID,faceID,Ethnicity
1,m.0c7mh2_0001.jpg,0,m.0c7mh2,0001,African
4,m.026tq86_0001.jpg,1,m.026tq86,0001,African
8,m.02wz3nc_0001.jpg,2,m.02wz3nc,0001,African
9,m.0c012t4_0001.jpg,3,m.0c012t4,0001,African
13,m.0p8s_gx_0001.jpg,4,m.0p8s_gx,0001,African
...,...,...,...,...,...
10179,m.0gchs9h_0001.jpg,2954,m.0gchs9h,0001,Caucasian
10182,m.0bf61__0001.jpg,2955,m.0bf61_,0001,Caucasian
10187,m.08pys0_0001.jpg,2956,m.08pys0,0001,Caucasian
10191,m.0d3dsv_0001.jpg,2957,m.0d3dsv,0001,Caucasian


In [4]:
# Preview the non-reference images (every file that is not one of the first images).
candidates

Unnamed: 0,File,Label,identityID,faceID,Ethnicity
0,m.0c7mh2_0003.jpg,0,m.0c7mh2,0003,African
2,m.0c7mh2_0002.jpg,0,m.0c7mh2,0002,African
3,m.026tq86_0003.jpg,1,m.026tq86,0003,African
5,m.026tq86_0002.jpg,1,m.026tq86,0002,African
6,m.02wz3nc_0002.jpg,2,m.02wz3nc,0002,African
...,...,...,...,...,...
10189,m.08pys0_0005.jpg,2956,m.08pys0,0005,Caucasian
10190,m.08pys0_0003.jpg,2956,m.08pys0,0003,Caucasian
10192,m.0d3dsv_0002.jpg,2957,m.0d3dsv,0002,Caucasian
10193,m.0d3dsv_0003.jpg,2957,m.0d3dsv,0003,Caucasian


In [5]:
# create dataset class for RFW
class resnetRFW(data.Dataset):
    """
    Dataset that loads RFW face images and preprocesses them for the resnet50 model.

    Each item is a ``(image_tensor, identityID, Ethnicity)`` triple, where the image is a
    float tensor in C x H x W layout, BGR channel order, with ``mean_bgr`` subtracted.
    """

    # Per-channel mean in BGR order, subtracted from every image.
    mean_bgr = np.array([91.4953, 103.8827, 131.0912])  # from resnet50_ft.prototxt

    def __init__(self, img_path, img_df):
        """
        :param img_path: dataset directory
        :param img_df: contains image file names and other information
                       (columns File, identityID, faceID, Ethnicity are read)
        """
        assert os.path.exists(img_path), "root: {} not found.".format(img_path)
        self.img_path = img_path
        self.img_df = img_df
        self.img_info = []

        for i, row in self.img_df.iterrows():
            self.img_info.append({
                # images are stored as <Ethnicity>/<identityID>/<File> below img_path
                'img_file': row.Ethnicity + '/' + row.identityID + '/' + row.File,
                'identityID': row.identityID,
                'Ethnicity': row.Ethnicity,
                'faceID': row.faceID,
            })
            if i % 5000 == 0:
                print("processing: {} images".format(i))

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.img_info)

    def __getitem__(self, index):
        """
        Load, resize (shorter side 256), centre-crop (224) and normalise one image.

        :param index: position in the dataset
        :return: (image tensor, identityID, Ethnicity)
        """
        info = self.img_info[index]
        # Close the file handle deterministically and force three colour channels so that
        # grayscale or alpha-channel files do not trip the shape check below.
        with Image.open(os.path.join(self.img_path, info['img_file'])) as raw:
            img = raw.convert('RGB')
        img = transforms.Resize(256)(img)
        img = transforms.CenterCrop(224)(img)
        img = np.array(img, dtype=np.uint8)
        assert len(img.shape) == 3  # H x W x C colour image

        return self.transform(img), info['identityID'], info['Ethnicity']

    def transform(self, img):
        """
        Convert an H x W x 3 RGB array into the tensor layout fed to the model.

        :param img: numpy array, H x W x 3, RGB
        :return: float tensor, 3 x H x W, BGR with ``mean_bgr`` subtracted
        """
        img = img[:, :, ::-1]  # RGB -> BGR
        img = img.astype(np.float32)
        img -= self.mean_bgr
        img = img.transpose(2, 0, 1)  # C x H x W
        img = torch.from_numpy(img).float()
        return img

    def untransform(self, img, lbl):
        """
        Invert :meth:`transform`: recover a displayable H x W x 3 RGB uint8 image.

        :param img: tensor produced by :meth:`transform` (3 x H x W)
        :param lbl: label, returned unchanged
        :return: (uint8 RGB array, lbl)
        """
        img = img.detach().cpu().numpy()
        img = img.transpose(1, 2, 0)
        # Add the mean back out of place, so the caller's tensor (which shares memory with
        # the numpy view) is not modified.
        img = img + self.mean_bgr
        # Round before the cast: float error would otherwise truncate e.g. 19.99999 to 19.
        img = np.clip(np.rint(img), 0, 255).astype(np.uint8)
        img = img[:, :, ::-1]  # BGR -> RGB
        return img, lbl

def apply_model(model, dataloader, device):
    """
    Run ``model`` in eval mode over every batch of ``dataloader`` and collect the results.

    :param model: callable network; its output is flattened to one row per image
    :param dataloader: yields ``(imgs, identityID, ethnicity)`` batches
    :param device: device the image batches are moved to before the forward pass
    :return: ``(outputs, identities, ethnicities)`` -- a 2-D tensor with one row per image and
             two 1-D numpy arrays aligned with its rows; empty containers if the loader is empty
    """
    model.eval()
    outputs = []
    identities = []
    ethnicities = []

    with torch.no_grad():
        for imgs, identityID, ethnicity in tqdm_notebook(dataloader, total=len(dataloader)):
            imgs = imgs.to(device)
            x = model(imgs)
            outputs.append(x.reshape(x.size(0), -1))
            identities.append(np.asarray(identityID))
            ethnicities.append(np.asarray(ethnicity))

    if not outputs:
        # Nothing was loaded: torch.cat / np.concatenate would raise on empty lists.
        return torch.empty(0), np.array([], dtype=str), np.array([], dtype=str)

    outputs = torch.cat(outputs)
    # Concatenate the per-batch arrays directly. Wrapping the list in np.array() first builds
    # a ragged array whenever the last batch is shorter, which NumPy >= 1.24 rejects.
    identities = np.concatenate(identities)
    ethnicities = np.concatenate(ethnicities)
    return outputs, identities, ethnicities


In [6]:
# Worker processes and pinned memory are only requested when CUDA is present.
kwargs = {}
if torch.cuda.is_available():
    kwargs = {'num_workers': 4, 'pin_memory': True}

# reference images: dataset + loader (order preserved so rows stay aligned with the frame)
reference_dataset = resnetRFW(img_path, references.reset_index(drop=True))
reference_loader = data.DataLoader(reference_dataset, batch_size=4, shuffle=False, **kwargs)
# candidate images: dataset + loader
candidate_dataset = resnetRFW(img_path, candidates.reset_index(drop=True))
candidate_loader = data.DataLoader(candidate_dataset, batch_size=4, shuffle=False, **kwargs)

# embed both sets with the same network
reference_outputs, reference_identities, reference_ethnicities = apply_model(
    model_scratch, reference_loader, device)
candidate_outputs, candidate_identities, candidate_ethnicities = apply_model(
    model_scratch, candidate_loader, device)

processing: 0 images
processing: 5000 images
processing: 0 images
processing: 5000 images
processing: 10000 images


  0%|          | 0/1489 [00:00<?, ?it/s]

  0%|          | 0/3665 [00:00<?, ?it/s]

In [7]:
# One embedding row per list entry, so each DataFrame cell holds a whole vector.
reference_outputs_list = list(reference_outputs.cpu().numpy())
candidate_outputs_list = list(candidate_outputs.cpu().numpy())

output_references = pd.DataFrame({
    'outputs': reference_outputs_list,
    'identity': reference_identities,
    'ethnicity': reference_ethnicities,
})

output_candidates = pd.DataFrame({
    'outputs': candidate_outputs_list,
    'identity': candidate_identities,
    'ethnicity': candidate_ethnicities,
})

In [8]:
# Every (reference, candidate) combination, with the reference varying slowest.
ids = np.array(list(product(output_references['identity'], output_candidates['identity'])))
ethnicities = np.array(list(product(output_references['ethnicity'], output_candidates['ethnicity'])))

# Column 0 of each product is the reference side, column 1 the candidate side.
logistic_df = pd.DataFrame({
    'reference_identity': ids[:, 0],
    'candidate_identity': ids[:, 1],
    'reference_ethnicity': ethnicities[:, 0],
    'candidate_ethnicity': ethnicities[:, 1],
})
logistic_df




Unnamed: 0,reference_identity,candidate_identity,reference_ethnicity,candidate_ethnicity
0,m.0c7mh2,m.0c7mh2,African,African
1,m.0c7mh2,m.0c7mh2,African,African
2,m.0c7mh2,m.026tq86,African,African
3,m.0c7mh2,m.026tq86,African,African
4,m.0c7mh2,m.02wz3nc,African,African
...,...,...,...,...
87259069,m.01g00c,m.08pys0,Caucasian,Caucasian
87259070,m.01g00c,m.08pys0,Caucasian,Caucasian
87259071,m.01g00c,m.0d3dsv,Caucasian,Caucasian
87259072,m.01g00c,m.0d3dsv,Caucasian,Caucasian


In [9]:
# Label 1 when both sides of the pair are the same identity, else 0.
logistic_df['labels'] = logistic_df['reference_identity'].eq(logistic_df['candidate_identity']).mul(1)
# Keep only the pairs whose two images share an ethnicity.
logistic_df2 = logistic_df.loc[logistic_df['reference_ethnicity'].eq(logistic_df['candidate_ethnicity'])]
logistic_df2


Unnamed: 0,reference_identity,candidate_identity,reference_ethnicity,candidate_ethnicity,labels
0,m.0c7mh2,m.0c7mh2,African,African,1
1,m.0c7mh2,m.0c7mh2,African,African,1
2,m.0c7mh2,m.026tq86,African,African,0
3,m.0c7mh2,m.026tq86,African,African,0
4,m.0c7mh2,m.02wz3nc,African,African,0
...,...,...,...,...,...
87259069,m.01g00c,m.08pys0,Caucasian,Caucasian,0
87259070,m.01g00c,m.08pys0,Caucasian,Caucasian,0
87259071,m.01g00c,m.0d3dsv,Caucasian,Caucasian,0
87259072,m.01g00c,m.0d3dsv,Caucasian,Caucasian,0


In [10]:
# Class balance of the same-ethnicity pairs.
labels = logistic_df2['labels']
labels.value_counts()

0    43618246
1       14658
Name: labels, dtype: int64

In [11]:
# Positions (not index labels) of the matching and non-matching rows within logistic_df2.
label_values = labels.to_numpy()
match_idx = np.flatnonzero(label_values == 1)
not_match_idx = np.flatnonzero(label_values == 0)


In [12]:
# Balance the classes: draw as many non-matching rows as there are matching ones,
# without replacement, from a seeded global RNG.
np.random.seed(random_state)
sampled_positions = np.random.choice(not_match_idx.size, size=match_idx.size, replace=False)
not_match_idx_sub = not_match_idx[sampled_positions]
print(not_match_idx_sub)

[16854868 26642054 17706011 ... 16602078 20463754  6485722]


In [13]:
# Preview the matching (label == 1) rows, selected by position.
logistic_df2.iloc[match_idx]

Unnamed: 0,reference_identity,candidate_identity,reference_ethnicity,candidate_ethnicity,labels
0,m.0c7mh2,m.0c7mh2,African,African,1
1,m.0c7mh2,m.0c7mh2,African,African,1
14660,m.026tq86,m.026tq86,African,African,1
14661,m.026tq86,m.026tq86,African,African,1
29320,m.02wz3nc,m.02wz3nc,African,African,1
...,...,...,...,...,...
87229753,m.08pys0,m.08pys0,Caucasian,Caucasian,1
87229754,m.08pys0,m.08pys0,Caucasian,Caucasian,1
87244413,m.0d3dsv,m.0d3dsv,Caucasian,Caucasian,1
87244414,m.0d3dsv,m.0d3dsv,Caucasian,Caucasian,1


In [14]:
# Shapes of the model outputs for the reference and candidate sets.
print(reference_outputs.shape,candidate_outputs.shape)

torch.Size([5953, 2048]) torch.Size([14658, 2048])


In [16]:
# Preview the sub-sampled non-matching rows, selected by position.
logistic_df2.iloc[not_match_idx_sub]

Unnamed: 0,reference_identity,candidate_identity,reference_ethnicity,candidate_ethnicity,labels
33292366,m.05l91f,m.0ndx0xb,African,African,0
52853484,m.01w0kw4,m.026f9wk,Caucasian,Caucasian,0
34975879,m.0g9wgx8,m.0b3szh,African,African,0
56955494,m.09d8lv,m.0chlsn,Caucasian,Caucasian,0
7744596,m.04rnxv,m.04gkvys,African,African,0
...,...,...,...,...,...
63845231,m.0402tg,m.01npnk3,Caucasian,Caucasian,0
58408197,m.05pbbnj,m.02rrb2n,Caucasian,Caucasian,0
32793484,m.09j6df,m.07kcsqd,African,African,0
40418920,m.0fhrbz,m.025zgjt,African,African,0


In [18]:
# Row indices into the reference and candidate output tensors.
array_1 = np.arange(reference_outputs.shape[0])
array_2 = np.arange(candidate_outputs.shape[0])
# meshgrid stacks to (2, n_candidates, n_references); transposing and flattening gives one
# (reference_row, candidate_row) pair per line, with the reference row varying slowest.
mesh = np.array(np.meshgrid(array_1, array_2))
all_index_pairs = mesh.T.reshape(-1, 2)
# Keep only the pairs that survived the same-ethnicity filter.
combinations = all_index_pairs[logistic_df2.index.values]
combinations.shape

(43632904, 2)

In [19]:
def _concat_pair_embeddings(pairs):
    """Return, for each (reference_row, candidate_row) pair, the two embeddings joined end to end."""
    joined = []
    for ref_row, cand_row in tqdm_notebook(pairs, total=len(pairs)):
        joined.append(torch.concat((reference_outputs[ref_row], candidate_outputs[cand_row])))
    return joined


match_pairs = combinations[match_idx]
not_match_pairs = combinations[not_match_idx_sub]

match_list = _concat_pair_embeddings(match_pairs)
not_match_list = _concat_pair_embeddings(not_match_pairs)

  0%|          | 0/14658 [00:00<?, ?it/s]

  0%|          | 0/14658 [00:00<?, ?it/s]

In [20]:
# Stack the per-pair vectors into one tensor per class, one row per pair.
match_tensor=torch.stack(match_list)
not_match_tensor=torch.stack(not_match_list)

In [21]:
# Identity and ethnicity of the reference image of every matching pair.
match_ref_ids, match_ref_eth = [], []

for ref_row, _cand_row in tqdm_notebook(match_pairs, total=len(match_pairs)):
    match_ref_ids.append(reference_identities[ref_row])
    match_ref_eth.append(reference_ethnicities[ref_row])

  0%|          | 0/14658 [00:00<?, ?it/s]

In [22]:
# Identity and ethnicity of both images of every sampled non-matching pair.
not_match_ref_ids, not_match_ref_eth = [], []
not_match_cand_ids, not_match_cand_eth = [], []

for ref_row, cand_row in tqdm_notebook(not_match_pairs, total=len(not_match_pairs)):
    not_match_ref_ids.append(reference_identities[ref_row])
    not_match_ref_eth.append(reference_ethnicities[ref_row])
    not_match_cand_ids.append(candidate_identities[cand_row])
    not_match_cand_eth.append(candidate_ethnicities[cand_row])

  0%|          | 0/14658 [00:00<?, ?it/s]

In [23]:
# torch.save does not create missing directories.
os.makedirs('inputs', exist_ok=True)

# Inputs: matching pairs first, then the sampled non-matching pairs.
all_inputs = torch.cat([match_tensor,not_match_tensor])
torch.save(all_inputs,'inputs/rfw_resnet50_face_embeddings.pt')

# Labels in the same order. Each class is sized from its own pair list, so labels stay
# aligned with the inputs even if the two classes ever differ in size.
match_labels = torch.ones(len(match_pairs))
not_match_labels = torch.zeros(len(not_match_pairs))
all_labels = torch.cat([match_labels,not_match_labels])
torch.save(all_labels,'inputs/rfw_resnet50_labels.pt')

In [24]:
# Metadata in the same order as the saved inputs: matching pairs first, then non-matching.
all_ref_ids = [*match_ref_ids, *not_match_ref_ids]
all_ref_eth = [*match_ref_eth, *not_match_ref_eth]
# Matching pairs have equal identity (label definition) and equal ethnicity (same-ethnicity
# filter), so the reference-side lists double as the candidate-side values for them.
all_cand_ids = [*match_ref_ids, *not_match_cand_ids]
all_cand_eth = [*match_ref_eth, *not_match_cand_eth]

In [25]:
# One row per saved pair: who is on each side, their ethnicities, and the match label.
all_df = pd.DataFrame({
    'reference_identity': all_ref_ids,
    'candidate_identity': all_cand_ids,
    'reference_ethnicity': all_ref_eth,
    'candidate_ethnicity': all_cand_eth,
    'labels': all_labels.cpu().numpy(),
})
all_df


Unnamed: 0,reference_identity,candidate_identity,reference_ethnicity,candidate_ethnicity,labels
0,m.0c7mh2,m.0c7mh2,African,African,1.0
1,m.0c7mh2,m.0c7mh2,African,African,1.0
2,m.026tq86,m.026tq86,African,African,1.0
3,m.026tq86,m.026tq86,African,African,1.0
4,m.02wz3nc,m.02wz3nc,African,African,1.0
...,...,...,...,...,...
29311,m.0402tg,m.01npnk3,Caucasian,Caucasian,0.0
29312,m.05pbbnj,m.02rrb2n,Caucasian,Caucasian,0.0
29313,m.09j6df,m.07kcsqd,African,African,0.0
29314,m.0fhrbz,m.025zgjt,African,African,0.0


In [26]:
# Write the pair metadata and labels to CSV, without the DataFrame index.
all_df.to_csv('inputs/rfw_resnet50_df.csv',index=False)