In [1]:
import time
import glob
import cv2
import pprint
import sklearn
import numpy as np
import pandas as pd
import mediapipe as mp
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans, DBSCAN
# import hdbscan

%matplotlib inline

In [2]:
class FaceMeshDetector():

    def __init__(self, staticMode=False, maxFaces=2, minDetectionCon=0.5, minTrackCon=0.5):

        self.staticMode = staticMode
        self.maxFaces = maxFaces
        self.minDetectionCon = minDetectionCon
        self.minTrackCon = minTrackCon

        self.mpDraw = mp.solutions.drawing_utils
        self.mpFaceMesh = mp.solutions.face_mesh
        self.faceMesh = self.mpFaceMesh.FaceMesh(self.staticMode, self.maxFaces,
                                                 self.minDetectionCon, self.minTrackCon)
        self.drawSpec = self.mpDraw.DrawingSpec(thickness=1, circle_radius=2)

    def findFaceMesh(self, img, draw=True, land_mark_mask_=None):
        
        self.imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        self.results = self.faceMesh.process(self.imgRGB)
        
        faces = []
        
        if self.results.multi_face_landmarks:
            for faceLms in self.results.multi_face_landmarks:
                if draw:
                    self.mpDraw.draw_landmarks(img, faceLms, self.mpFaceMesh.FACE_CONNECTIONS,
                                           self.drawSpec, self.drawSpec)

                        
                face = []
                
                if land_mark_mask_ is None :
                    land_mark_mask_ = np.ones_like(faceLms.landmark)
                    
                selected_land_marks = np.array(faceLms.landmark)[land_mark_mask_]
                id_s_ = np.arange(468)[land_mark_mask_]
                
                for id, lm in zip(id_s_, selected_land_marks):
                    #print(lm)
                    
                    ih, iw, ic = self.imgRGB.shape
                    x = int(np.clip(lm.x, 0, 1)*(iw-1))
                    y = int(np.clip(lm.y, 0, 1)*(ih-1))
                    
#                     cv2.putText(img, str(id), (x, y), cv2.FONT_HERSHEY_PLAIN,
#                                1.3, (0, 255, 255), 2)

                    # print(id,x,y)

                    face.append(self.imgRGB[y, x, :])
                faces.append(face)
        return img, faces

In [3]:
# source_path = "data/test/"

source_path_ = "../fairface-img-margin025-trainval/all/"
files_path = source_path_  + '*.jpg'

image_files = [file_ for file_ in glob.glob(files_path)]

# image_files 

filter_points = []
temp_ = np.arange(468)

forehead = [10,151,337,109]
cheek = [50,330,101,203,423,205,425,207,427]
others = [69,299,36,266,280]

for i_ in [forehead + cheek + others]:
    filter_i = np.array([element_ in i_ for element_ in temp_ ])
    filter_points.append(filter_i)

# filter_points, len(image_files), len(image_files)

In [4]:
plt.rcParams['figure.figsize'] = [48, 48]

# for colored printing
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


In [5]:
# def get_face_land_values(image_files, FaceMeshDetector):

detector =  FaceMeshDetector(staticMode=True, maxFaces=1, minDetectionCon=False)

pTime = time.time()

median_rgbs = []
median_rgbs_idx = []
median_rgbs_info = []

# for idx, file_ in enumerate(image_files):
#     
for idx_file_, file_ in enumerate(image_files):
    for idx, filter_ in enumerate(filter_points):
        
        img_ = cv2.imread(file_)

        _, faces = detector.findFaceMesh(img_, draw=False, land_mark_mask_=filter_)

    #     plt.imshow(img_with_landmark)
    #     plt.show()

        if len(faces) != 0:
            median_rgbs.append(np.median(faces[0], axis=0))
            
            median_rgbs_idx.append(f'{idx}_{idx_file_}')
            median_rgbs_info.append({'fullname': f'{file_}_{idx_file_}'})
    #         print(f"{idx} file {file_} done ")
    #          cv2.imwrite(source_path + "out/" + str(idx) + ".jpg", img_with_landmark, [int(cv2.IMWRITE_JPEG_QUALITY), 100])

        else:
            print(f"{bcolors.WARNING}Warning: {idx} face in file {file_} not found {bcolors.ENDC}")

#     median_rgbs.append([0,0,0])



# test exit
    # if idx//1 == 1:

    # break


# local fps 
    # cTime = time.time()
    # fps = 1 / (cTime - pTime)
    # pTime = cTime
    #  print(f'fps {fps:.1f}')


# global fps
cTime = time.time()
fps = len(image_files)//((cTime - pTime)+(1.e-20))
print(f'fps {fps:.1f}')

# np.median(faces[0], axis=0)


pprint.pprint(median_rgbs)


fps 0.0
[]


In [6]:
gen_data = list(zip(median_rgbs_idx, median_rgbs[0], median_rgbs[1], median_rgbs[2], median_rgbs_info))
# gen_data

IndexError: list index out of range

In [None]:
# # save data
# df = pd.DataFrame(gen_data,
#                columns =['ID', 'MEDIAN_R', 'MEDIAN_G', 'MEDIAN_B', 'info'])

# df.to_csv('data_face_colors.csv')
# df

In [None]:
# faces[0][0]

In [3]:
df_medians = pd.read_feather('./median_rgbs.feather')
# numpy array
median_rgbs = df_medians[['median_rgbs_r', 'median_rgbs_g', 'median_rgbs_b']].values
median_rgbs.shape

(88004, 3)

In [4]:
# (df_medians['median_rgbs_info']:
#            [10]['fullname'].split('\\')[1].split('_')[0])
file_names = []
for i_ in df_medians['median_rgbs_info']:
    if 'tn' in  i_['fullname'].split('\\')[1].split('_')[0]:
        file_names.append('train/' + i_['fullname'].split('\\')[1].split('_')[0].replace(')', '').split('(')[1])
    else:
        file_names.append('val/' + i_['fullname'].split('\\')[1].split('_')[0])

        

In [5]:
len(file_names)

88004

In [6]:
df_medians['file'] = file_names

df_medians

Unnamed: 0,index,median_rgbs_idx,median_rgbs_r,median_rgbs_g,median_rgbs_b,median_rgbs_info,file
0,88004,0_0,88.5,53.0,33.0,{'fullname': '../fairface-img-margin025-trainv...,val/1.jpg
1,88004,0_1,190.0,110.5,71.5,{'fullname': '../fairface-img-margin025-trainv...,val/10.jpg
2,88004,0_2,226.0,189.0,179.5,{'fullname': '../fairface-img-margin025-trainv...,val/100.jpg
3,88004,0_3,108.5,79.0,49.0,{'fullname': '../fairface-img-margin025-trainv...,val/1000.jpg
4,88004,0_4,240.5,195.5,164.5,{'fullname': '../fairface-img-margin025-trainv...,val/10000.jpg
...,...,...,...,...,...,...,...
87999,88004,0_97693,136.5,98.0,55.5,{'fullname': '../fairface-img-margin025-trainv...,train/9995.jpg
88000,88004,0_97694,78.0,33.5,18.5,{'fullname': '../fairface-img-margin025-trainv...,train/9996.jpg
88001,88004,0_97695,190.5,121.5,94.5,{'fullname': '../fairface-img-margin025-trainv...,train/9997.jpg
88002,88004,0_97696,196.5,117.5,73.0,{'fullname': '../fairface-img-margin025-trainv...,train/9998.jpg


In [7]:
df_label_train = pd.read_csv('../fairface_label_train.csv')
df_label_val = pd.read_csv('../fairface_label_val.csv')

df_label_all = pd.concat([df_label_train, df_label_val])
# df_label_val['file']
# df_medians['file']
df_fair_face = df_medians.merge(df_label_all,  left_on='file', right_on='file', )
df_fair_face

Unnamed: 0,index,median_rgbs_idx,median_rgbs_r,median_rgbs_g,median_rgbs_b,median_rgbs_info,file,age,gender,race,service_test
0,88004,0_0,88.5,53.0,33.0,{'fullname': '../fairface-img-margin025-trainv...,val/1.jpg,3-9,Male,East Asian,False
1,88004,0_1,190.0,110.5,71.5,{'fullname': '../fairface-img-margin025-trainv...,val/10.jpg,3-9,Male,Southeast Asian,False
2,88004,0_2,226.0,189.0,179.5,{'fullname': '../fairface-img-margin025-trainv...,val/100.jpg,20-29,Female,East Asian,True
3,88004,0_3,108.5,79.0,49.0,{'fullname': '../fairface-img-margin025-trainv...,val/1000.jpg,20-29,Male,Latino_Hispanic,True
4,88004,0_4,240.5,195.5,164.5,{'fullname': '../fairface-img-margin025-trainv...,val/10000.jpg,20-29,Female,East Asian,True
...,...,...,...,...,...,...,...,...,...,...,...
87999,88004,0_97693,136.5,98.0,55.5,{'fullname': '../fairface-img-margin025-trainv...,train/9995.jpg,20-29,Female,Latino_Hispanic,True
88000,88004,0_97694,78.0,33.5,18.5,{'fullname': '../fairface-img-margin025-trainv...,train/9996.jpg,30-39,Female,White,False
88001,88004,0_97695,190.5,121.5,94.5,{'fullname': '../fairface-img-margin025-trainv...,train/9997.jpg,20-29,Male,Black,True
88002,88004,0_97696,196.5,117.5,73.0,{'fullname': '../fairface-img-margin025-trainv...,train/9998.jpg,40-49,Male,Southeast Asian,False


In [8]:
df_fair_face['median_grays']  = 0.299*df_fair_face['median_rgbs_r'] + 0.587*df_fair_face['median_rgbs_g'] + 0.114*df_fair_face['median_rgbs_b']

In [106]:
df_fair_face.to_feather('FairFace_Main.feather')

In [107]:
df_fair_face[['file', 'gender', 'race', 'median_rgbs_r', 'median_rgbs_g', 'median_rgbs_b', 'median_grays']].to_feather('FairFace_compressed.feather')

In [10]:
median_rgbs.shape

(88004, 3)

In [11]:
# median_rgbs = np.array(median_rgbs)

# median_dataframe = pd.DataFrame({'index': len(median_rgbs_info),
#                                  'median_rgbs_idx': median_rgbs_idx,
#                                  'median_rgbs_r': median_rgbs[:,0],
#                                  'median_rgbs_g': median_rgbs[:,1],
#                                  'median_rgbs_b': median_rgbs[:,2],
#                                  'median_rgbs_info': median_rgbs_info})
# median_dataframe.to_csv('median_rgbs.csv')
# median_dataframe.to_feather('median_rgbs.feather')

In [12]:
k_means = sklearn.cluster.KMeans(n_clusters=3, random_state=2)
k_means.fit_predict(median_rgbs)

k_means.labels_
k_means.cluster_centers_

array([[165.74649634, 111.03312307,  87.87673203],
       [214.12933958, 166.00531013, 144.48576528],
       [104.13595019,  62.48233171,  47.27287493]])

In [13]:
# best_epsilon = 5.2
# # for epsilon in range(1, 1000, 100): 
# epsilon =  best_epsilon
# dbscan = sklearn.cluster.DBSCAN(eps=epsilon/10, min_samples=len(median_rgbs[:,0])//50)
# if max(dbscan.fit_predict(median_rgbs)) == 2:
#     best_epsilon = epsilon/10
#     print(best_epsilon)
#     break
    


In [84]:
epsilon = 1.73206
dbscan = sklearn.cluster.DBSCAN(eps=epsilon, min_samples=len(median_rgbs[:,0])//2000)
print(max(dbscan.fit_predict(median_rgbs)))
print(epsilon)

    


3
1.73206


In [85]:
median_grays = df_fair_face['median_grays']

In [91]:
np.quantile(median_grays, 0.33333), np.quantile(median_grays, 0.66666)

(101.07703998999999, 141.04807998)

In [94]:
sklearn.cluster.kmeans_plusplus(median_rgbs, n_clusters=3)

(array([[155. , 109.5,  83.5],
        [225. , 156. , 140.5],
        [200.5, 140. , 101.5]]),
 array([40277, 22337, 55554]))

In [104]:
centers_, clusters_, _ = sklearn.cluster.k_means(median_rgbs, n_clusters=3)
centers_, clusters_

(array([[104.09457897,  62.45446332,  47.24812071],
        [214.07895792, 165.95263861, 144.42736584],
        [165.69217772, 110.97453734,  87.82466538]]),
 array([0, 2, 1, ..., 2, 2, 1]))

In [108]:
df_fair_face['k-means'] = clusters_

In [113]:
kmean_cluster = sklearn.cluster.KMeans(n_clusters=3)
print(kmean_cluster.fit_predict(median_rgbs))
kmean_cluster.cluster_centers_
# df_fair_face['KMeans'] = kmean_cluster.fit_predict(median_rgbs)

[0 2 1 ... 2 2 1]


array([[104.27660377,  62.57962264,  47.35739443],
       [214.19680946, 166.12486557, 144.63136315],
       [165.91874653, 111.18150965,  87.99803238]])

In [121]:
from numpy import unique
from numpy import where
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture
from matplotlib import pyplot

In [119]:
# birch clustering
# define the model
model = Birch(threshold=0.01, n_clusters=3)
# fit the model
print(model.fit_predict(median_rgbs))

# # retrieve unique clusters
# clusters = unique(yhat)
# # create scatter plot for samples from each cluster
# for cluster in clusters:
# 	# get row indexes for samples with this cluster
# 	row_ix = where(yhat == cluster)
# 	# create scatter of these samples
# 	pyplot.scatter(X[row_ix, 0], X[row_ix, 1])


MemoryError: Unable to allocate 27.4 GiB for an array with shape (3671602278,) and data type float64

In [122]:
# define the model
model = GaussianMixture(n_components=3)
# fit the model
print(model.fit_predict(median_rgbs))
# # retrieve unique clusters
# clusters = unique(yhat)
# # create scatter plot for samples from each cluster
# for cluster in clusters:
# 	# get row indexes for samples with this cluster
# 	row_ix = where(yhat == cluster)
# 	# create scatter of these samples
# 	pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# # show the plot
# pyplot.show()

[2 0 0 ... 0 0 0]


In [123]:
df_fair_face['Gauss_mix'] = model.fit_predict(median_rgbs)

In [124]:
df_fair_face.to_excel('fair_face_kmeans_types.xlsx')

In [105]:
df_fair_face['file']

0             val/1.jpg
1            val/10.jpg
2           val/100.jpg
3          val/1000.jpg
4         val/10000.jpg
              ...      
87999    train/9995.jpg
88000    train/9996.jpg
88001    train/9997.jpg
88002    train/9998.jpg
88003    train/9999.jpg
Name: file, Length: 88004, dtype: object

In [None]:
dbscan = sklearn.cluster.DBSCAN(eps=best_epsilon, min_samples=len(image_files)//50)
# dbscan.fit_predict(median_rgbs)

# hdbscan 
pacmap

In [None]:
max(dbscan.labels_)

In [None]:
dbscan.labels_


In [None]:
dbscan.cluster_centers_

In [None]:
len(median_rgbs[dbscan.fit_predict(median_rgbs)==0]), len(median_rgbs[dbscan.fit_predict(median_rgbs)==1]), len(median_rgbs[dbscan.fit_predict(median_rgbs)==2])

In [None]:
# https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_