# This is a notebook for embedding of vectors and generation of corresponding dataframes

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

from sklearn.neighbors import NearestNeighbors

from sklearn.manifold import TSNE
import plotly.express as px

from deepface import DeepFace


### Main function

In [19]:
def produce_dataframes(folder_path = '../raw_data/output_imdb_top100/'):
    # Generate file list. The folder with images should conatin only files with image extension such as jpg jpeg png etc.
    # The file names should be like name_surname.jpg. Ideally without diacritics. 
    file_list = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, file))]
    
    # Print progress
    for n, i in enumerate(file_list): print(n, os.path.splitext(os.path.basename(i))[0])
    
    # Create array of embeddings
    deepface_vectors = np.array([np.array(DeepFace.represent(i, model_name='VGG-Face', enforce_detection=False)[0]['embedding']) for i in file_list])
    
    # Generate cosine similarity arrays
    knn_deepface = NearestNeighbors(n_neighbors=len(deepface_vectors), metric='cosine', n_jobs=-1)
    knn_deepface.fit(deepface_vectors)
    deepface_distances, deepface_indices = knn_deepface.kneighbors(deepface_vectors)
    
    # Generate dataframes
    deepface_df = pd.DataFrame([os.path.splitext(os.path.basename(i))[0] for i in file_list]).rename(columns={0:'name'})
    deepface_df = pd.concat([deepface_df, pd.DataFrame(deepface_indices[:,1:])], axis=1)
    deepface_df[deepface_df.columns[1:]] = deepface_df[deepface_df.columns[1:]].applymap(lambda i: deepface_df.name.iloc[i])
    
    names_df = deepface_df.set_index('name').T

    distances_df = pd.concat([pd.DataFrame([os.path.splitext(os.path.basename(i))[0] for i in file_list])\
        .rename(columns={0:'name'}), pd.DataFrame(deepface_distances[:,1:])], axis=1).set_index('name').T

    indices_df = pd.concat([pd.DataFrame([os.path.splitext(os.path.basename(i))[0] for i in file_list])\
            .rename(columns={0:'name'}), pd.DataFrame(deepface_indices[:,1:])], axis=1).set_index('name').T

    scoring_df = distances_df.copy()
    
    tsne_1d = TSNE(n_components=1, random_state=0, perplexity=5) # used for plotting. Perplexity = 5 as the dataset has not so many values
    projections = tsne_1d.fit_transform(deepface_vectors, )

    projections_df = indices_df.T.reset_index().iloc[:,1:].applymap(lambda x: projections[x][0]).T
    projections_df.columns = scoring_df.columns
    projections_df
    
    return names_df, scoring_df, projections_df, indices_df

### Just provide a path to a folder where you have images with your celebrities

In [42]:
names_df, scoring_df, projections_df, indices_df = produce_dataframes(folder_path='../raw_data/output_imdb_top100')


0 ellen_burstyn
1 julia_roberts
2 liam_hemsworth
3 meryl_streep
4 jack_lemmon
5 jennifer_lawrence
6 kate_mara
7 adele_exarchopoulos
8 marilyn_monroe
9 michael_caine
10 spencer_tracy
11 mel_gibson
12 amanda_plummer
13 johnny_depp
14 brad_pitt
15 demi_moore
16 jessica_chastain
17 jimmy_fallon
18 charlize_theron
19 george_c._scott
20 gene_hackman
21 benicio_del_toro
22 gene_kelly
23 samuel_l._jackson
24 tom_hanks
25 emmy_rossum
26 janet_leigh
27 steve_mcqueen
28 gary_cooper
29 orson_welles
30 maggie_gyllenhaal
31 tommy_lee_jones
32 jeff_bridges
33 naomi_watts
34 alyson_hannigan
35 rachel_weisz
36 ralph_fiennes
37 hailee_steinfeld
38 alan_arkin
39 kirk_douglas
40 jim_carrey
41 joaquin_phoenix
42 robert_redford
43 felicity_jones
44 kim_basinger
45 kirsten_dunst
46 natalie_portman
47 albert_finney
48 matt_damon
49 cameron_diaz
50 sharon_stone
51 charlton_heston
52 grace_kelly
53 julia_louis-dreyfus
54 jodie_foster
55 marion_cotillard
56 gregory_peck
57 humphrey_bogart
58 burt_lancaster
59 ma

In [44]:
# Save to file
names_df.to_csv('../raw_data/names.csv')
scoring_df.to_csv('../raw_data/scoring.csv')
projections_df.to_csv('../raw_data/projections.csv')
indices_df.to_csv('../raw_data/indices.csv')


In [22]:
# For scores analysis
def image_check(name, names_df=names_df, scoring_df=scoring_df):
    names = names_df[name]
    scores = scoring_df[name]
    return pd.concat([names,scores], axis=1)

In [45]:
names_df.head()

name,ellen_burstyn,julia_roberts,liam_hemsworth,meryl_streep,jack_lemmon,jennifer_lawrence,kate_mara,adele_exarchopoulos,marilyn_monroe,michael_caine,...,eva_green,emily_blunt,hugh_jackman,robert_duvall,clark_gable,octavia_spencer,joan_allen,bill_murray,jeremy_irons,edward_g._robinson
0,peter_o'toole,julianne_moore,chris_hemsworth,nicole_kidman,john_goodman,melanie_laurent,emma_watson,emmy_rossum,kim_novak,peter_o'toole,...,julie_delpy,jodie_foster,chris_hemsworth,anthony_hopkins,laurence_olivier,viola_davis,maggie_smith,kevin_spacey,christoph_waltz,albert_finney
1,sigourney_weaver,evangeline_lilly,matt_damon,naomi_watts,alec_guinness,johnny_depp,bibi_andersson,emily_blunt,marlene_dietrich,jon_voight,...,jennifer_connelly,adele_exarchopoulos,matthew_mcconaughey,robert_redford,sean_connery,don_cheadle,michelle_pfeiffer,robin_williams,henry_fonda,george_c._scott
2,sharon_stone,hilary_swank,heath_ledger,carey_mulligan,robert_mitchum,carey_mulligan,robin_wright,olga_kurylenko,meryl_streep,robert_duvall,...,emma_watson,alyson_hannigan,brad_pitt,spencer_tracy,james_stewart,anthony_hopkins,uma_thurman,russell_crowe,daniel_day-lewis,charles_laughton
3,joan_allen,faye_dunaway,leonardo_dicaprio,jennifer_lawrence,john_wayne,emma_mackey,lea_seydoux,eva_green,angelina_jolie,john_goodman,...,saoirse_ronan,julianne_moore,tom_hanks,jason_robards,henry_fonda,morgan_freeman,amanda_plummer,jim_carrey,philip_seymour_hoffman,jack_lemmon
4,maggie_smith,jodie_foster,brad_pitt,angelina_jolie,joe_pesci,chloe_grace_moretz,jessica_chastain,elizabeth_olsen,virginia_cherrill,jack_nicholson,...,amy_adams,marion_cotillard,russell_crowe,michael_caine,gene_kelly,ian_mckellen,ellen_burstyn,john_goodman,clint_eastwood,elliot_page


In [46]:
scoring_df

name,ellen_burstyn,julia_roberts,liam_hemsworth,meryl_streep,jack_lemmon,jennifer_lawrence,kate_mara,adele_exarchopoulos,marilyn_monroe,michael_caine,...,eva_green,emily_blunt,hugh_jackman,robert_duvall,clark_gable,octavia_spencer,joan_allen,bill_murray,jeremy_irons,edward_g._robinson
0,0.393694,0.363772,0.255083,0.140787,0.223894,0.078622,0.295861,0.370248,0.331272,0.226414,...,0.352416,0.390768,0.309979,0.186653,0.293713,0.343306,0.370138,0.190542,0.254345,0.373177
1,0.401021,0.378794,0.333176,0.162265,0.271252,0.103272,0.368113,0.393614,0.382534,0.242548,...,0.384863,0.393614,0.323990,0.203195,0.309910,0.494182,0.385836,0.241941,0.308656,0.405609
2,0.405558,0.386467,0.411394,0.177884,0.282591,0.113155,0.398993,0.414373,0.400170,0.249888,...,0.387402,0.399558,0.325256,0.244336,0.328509,0.529435,0.388018,0.243102,0.311805,0.405720
3,0.406163,0.398142,0.423737,0.199677,0.283379,0.115136,0.399680,0.421046,0.415377,0.260621,...,0.390164,0.417410,0.342058,0.246964,0.334536,0.536601,0.402839,0.289938,0.314134,0.405788
4,0.426537,0.399806,0.445019,0.208655,0.288183,0.139725,0.417491,0.447967,0.430180,0.270665,...,0.406319,0.428567,0.357280,0.249888,0.336980,0.567086,0.406163,0.293471,0.322536,0.413066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,0.844788,0.940765,0.957998,0.864209,0.926329,0.894636,0.956685,0.838640,0.990504,0.933551,...,0.832967,0.892392,0.949379,0.781702,0.956859,0.954681,0.799548,0.869307,0.902664,0.895595
202,0.859890,0.955673,0.989778,0.876334,0.928763,0.904712,0.959069,0.842059,0.993273,0.942540,...,0.841539,0.893257,0.950903,0.782696,0.980053,0.954964,0.804758,0.877596,0.907769,0.929150
203,0.888442,0.970522,0.990261,0.894891,0.954474,0.906295,0.963079,0.846488,1.013070,0.963079,...,0.843241,0.896073,0.991371,0.810398,0.983847,0.976027,0.816429,0.897704,0.919542,0.940765
204,0.895959,0.975988,1.038924,0.896980,0.956301,0.918412,0.977768,0.871848,1.059219,0.982461,...,0.854757,0.909300,0.995844,0.830546,0.995343,0.989778,0.816469,0.925115,0.934413,0.944075


In [47]:
image_check('margot_robbie')

Unnamed: 0,margot_robbie,margot_robbie.1
0,hailee_steinfeld,0.326541
1,samara_weaving,0.354098
2,naomi_watts,0.356592
3,daisy_ridley,0.360571
4,charlize_theron,0.387094
...,...,...
201,charlton_heston,0.858020
202,samuel_l._jackson,0.859447
203,michael_caine,0.871580
204,viola_davis,0.872041


In [48]:
projections_df

name,ellen_burstyn,julia_roberts,liam_hemsworth,meryl_streep,jack_lemmon,jennifer_lawrence,kate_mara,adele_exarchopoulos,marilyn_monroe,michael_caine,...,eva_green,emily_blunt,hugh_jackman,robert_duvall,clark_gable,octavia_spencer,joan_allen,bill_murray,jeremy_irons,edward_g._robinson
0,-5.847637,7.192966,-46.503056,27.585815,-14.197795,28.443249,15.308876,21.569750,26.108665,-5.847637,...,11.746453,6.481924,-46.503056,-9.912743,-35.519753,2.031495,-8.277208,-23.146187,-20.010386,-14.826070
1,-3.770584,8.548803,-47.497498,27.637901,-17.027782,18.340479,15.025901,20.585474,25.694407,-8.263890,...,23.698853,21.816792,-43.950195,-30.168154,-42.034348,-0.060083,6.369839,-12.828065,-34.080925,-16.429581
2,-2.187772,42.972565,-27.665766,28.691648,-33.132259,28.691648,24.607462,21.876064,26.767708,-9.866195,...,15.308876,20.497644,-45.268753,-39.824951,-32.049522,-9.912743,-3.270234,-25.035971,-20.143358,-16.108551
3,5.769936,-2.402707,-47.549248,28.582142,-13.423986,29.779570,34.669415,23.508514,30.494530,-14.197795,...,37.669670,7.192966,-23.326687,-40.362965,-34.080925,0.184028,-29.809515,-24.055618,-20.793697,-14.366277
4,-8.277208,6.481924,-45.268753,30.494530,-33.508835,30.360929,11.865150,21.064133,17.206284,-6.705454,...,23.222748,20.451962,-25.035971,-7.877814,-28.860556,-11.147629,-5.725354,-14.197795,-17.471899,3.953416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,30.737099,-15.171880,11.224054,-45.268753,40.929520,-45.268753,-32.050415,-35.519753,-45.268753,15.308876,...,2.031495,-0.249916,30.890013,30.737099,8.257996,-35.519753,21.876064,30.737099,30.890013,6.369839
202,-46.592503,-35.074825,2.125116,43.648884,43.648884,-23.940500,-4.791681,-21.825199,-25.649073,8.548803,...,0.184028,-46.592503,30.737099,31.605349,34.636566,-36.355904,19.078037,30.890013,30.360929,8.257996
203,29.779570,3.953416,-8.277208,-46.592503,34.636566,-25.649073,-7.877814,-36.048618,-44.558472,15.199759,...,2.125116,-35.519753,8.548803,34.636566,42.329990,-36.028496,29.779570,31.433588,34.636566,8.904810
204,31.605349,-36.028496,8.548803,-46.503056,42.329990,8.548803,-13.423986,-0.249916,-46.592503,34.636566,...,-0.859690,-36.048618,30.360929,31.433588,43.648884,-46.592503,31.605349,34.636566,31.433588,43.648884


### Plotting test

In [49]:
hidden_celebrity = 'margot_robbie' # Pass a hidden celebrity for a test. 
names_df=pd.read_csv("../raw_data/names.csv")
indices_df=pd.read_csv("../raw_data/indices.csv")
scoring_df=pd.read_csv("../raw_data/scoring.csv")
projections_df=pd.read_csv("../raw_data/projections.csv")
plotting_df = pd.concat([names_df[hidden_celebrity], indices_df[hidden_celebrity], scoring_df[hidden_celebrity], projections_df[hidden_celebrity]], axis=1)
plotting_df.columns = ['names','indices', 'distances', 'projections']
plotting_df['radial_projections'] = plotting_df['projections'].apply(lambda x: (x- plotting_df['projections'].max())/(plotting_df['projections'].min()-plotting_df['projections'].max())*360)
plotting_df

Unnamed: 0,names,indices,distances,projections,radial_projections
0,samara_weaving,184,0.354098,44.562260,7.603532
1,naomi_watts,33,0.356592,27.637901,72.352058
2,daisy_ridley,102,0.360571,31.433588,57.830673
3,charlize_theron,18,0.387094,41.601143,18.932054
4,katheryn_winnick,188,0.401245,30.737099,60.495272
...,...,...,...,...,...
201,charlton_heston,51,0.858020,-4.791681,196.419837
202,samuel_l._jackson,23,0.859447,-0.249916,179.044136
203,michael_caine,9,0.871580,-7.877814,208.226640
204,viola_davis,179,0.872041,2.031495,170.316008


In [50]:
fig = px.scatter_polar(plotting_df, r='distances', theta="radial_projections", hover_data='names')
fig.show()

### Query test

In [37]:
def celeb_and_score_query(celebrity, hidden_celebrity=hidden_celebrity, names_df=names_df, score_df=scoring_df):
    celeb_rank = names_df[hidden_celebrity][names_df[hidden_celebrity] == celebrity].index
    celeb_score = scoring_df[hidden_celebrity].loc[celeb_rank].iloc[0]
    return celebrity, celeb_score
    

In [None]:
celeb_and_score_query('margot_robbie')