# Data Preparation

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

## 1. NimStim experiment data

In [2]:
NIMSTIM_CLEANED = "../experiment/NimStim/a1478908-cleaned.csv"
nimstim = pd.read_csv(NIMSTIM_CLEANED,header=0)

In [3]:
def get_average(values): 
    values = [int(x) for x in values.split('\n')]
    n = len(values)
    result = sum(values)/n
    return int(np.round(result))

In [4]:
def get_canonical_name(path): 
    return path.split('/')[-1]

In [5]:
age_average = nimstim["age"].apply(get_average)

In [6]:
nimstim["age"] = age_average

In [7]:
nimstim["image_url"] = nimstim["image_url"].apply(get_canonical_name)

In [8]:
nimstim.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,age,gender,gender:confidence,age_gold,gender_gold,image_url
0,2617202715,False,finalized,3,1/29/2020 15:10:35,26,female,1.0,,,02F_NE_C.jpeg
1,2617202716,False,finalized,3,1/29/2020 15:10:35,20,female,1.0,,,03F_NE_C.jpeg
2,2617202717,False,finalized,3,1/29/2020 15:10:20,28,female,1.0,,,05F_NE_C.jpeg
3,2617202718,False,finalized,3,1/29/2020 15:10:20,28,female,1.0,,,06F_NE_C.jpeg
4,2617202719,False,finalized,3,1/29/2020 15:09:53,31,female,1.0,,,07F_NE_C.jpeg


## 2. AirBnb experiment data 

In [9]:
AIRBNB_RAW = "../experiment/AirBnb/a1478920-cleaned.csv"
airbnb = pd.read_csv(AIRBNB_RAW,header=0)

**Note:** unlike the NimStim experiment, the data here is too big to be manually cleansed, so it is raw. Unlike NimStim, restrictions (validators) on the input have been added, so the only cleansing would be dealing with outliers. 

First, we need to get rid of images with multiple faces: 

In [10]:
display("Only one face detected: ", airbnb["multiple_faces"][airbnb["multiple_faces"]=="no"].count())
display("Multiple faces detected: ", airbnb["multiple_faces"][airbnb["multiple_faces"]=="yes"].count())

'Only one face detected: '

549

'Multiple faces detected: '

218

Let's leave just the single-face entries: 

In [11]:
airbnb = airbnb[airbnb["multiple_faces"]=="no"]

In [12]:
airbnb = airbnb.drop(["_unit_id", "_unit_state", "_golden", "_trusted_judgments", "_last_judgment_at", 
                                             "age_gold", "emotion_gold", "gender_gold", "multiple_faces_gold", 
                                             "race_gold"], axis=1)

In [13]:
airbnb["image_url"] = airbnb["image_url"].apply(get_canonical_name)

In [14]:
airbnb["age"] = airbnb["age"].apply(get_average)

In [15]:
airbnb

Unnamed: 0,age,emotion,emotion:confidence,gender,gender:confidence,multiple_faces,multiple_faces:confidence,race,race:confidence,image_url
0,26,happy,1.0000,female,1.0000,no,1.0,asian,0.6667,cffc6e2d-cea8-484b-a44c-16b9402ed896.jpg
1,26,happy,1.0000,male,1.0000,no,1.0,white,1.0000,34202cee-1a17-413d-acfc-73f2eeb15130.jpg
2,29,neutral,1.0000,male,0.6667,no,1.0,white,1.0000,f3070346-8e85-41e8-aa77-d9dd9b4ced40.jpg
3,28,neutral,0.6667,male,1.0000,no,1.0,asian,1.0000,f9d35343-fb95-4c64-9abf-026cd142f318.jpg
4,30,neutral,1.0000,male,0.6667,no,1.0,white,1.0000,e260b413-5234-4c07-8293-78029f00421e.jpg
...,...,...,...,...,...,...,...,...,...,...
760,38,happy,0.9594,female,0.9594,no,1.0,white,0.9239,eca787fa-1a0b-4f9b-b447-c7c057f328f9.jpg
762,27,happy,0.9591,male,1.0000,no,1.0,asian,0.7586,5477d977-4382-48c1-abbb-e74b54dab720.jpg
764,26,neutral,0.8370,male,1.0000,no,1.0,white,0.7617,41d985a0-50e1-49da-83c0-07eeab29aa11.jpg
765,26,neutral,0.7188,female,1.0000,no,1.0,asian,0.4821,cc1ee4e5-61ad-4e72-bc22-3e9398210dbf.jpg


## 3. Combining the Ground Truth 

### 3.1. CFD

In [16]:
CFD_NORMING_DATA = "../api_processing/Datasets/CFD/CFD_norming_data.csv"
cfd_norming_data = pd.read_csv(CFD_NORMING_DATA,header=0)

Also, will one of the api processing tables for the already extracted metadata: 

In [17]:
MICROSOFT_CFD = "../tables/microsoft_CFD.csv"
microsoft_cfd = pd.read_csv(MICROSOFT_CFD,header=0)

In [18]:
def is_neutral(file_name): 
    if file_name[-5] == "N": 
        return True 
    return False

Leaving neutral rows only, we can see we have norming data for all of the neutral images. 

In [19]:
microsoft_cfd[microsoft_cfd.apply(is_neutral, axis=1)].shape

(597, 10)

In [20]:
cfd_norming_data.shape

(597, 69)

In [21]:
metadata = microsoft_cfd[["File Name", "Target ID", "Race", "Gender", "Expression"]]

In [22]:
def get_age_for(target_id): 
    return int(cfd_norming_data.loc[cfd_norming_data["Target"] == target_id].Age)

In [23]:
ages = metadata["Target ID"].apply(get_age_for)

In [24]:
metadata["Age"] = ages

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


So we have the complete metadata for CFD now: 

In [25]:
metadata.head()

Unnamed: 0,File Name,Target ID,Race,Gender,Expression,Age
0,CFD-WM-026-019-A.jpg,WM-026,W,M,A,25
1,CFD-WM-026-006-HO.jpg,WM-026,W,M,HO,25
2,CFD-WM-026-004-HC.jpg,WM-026,W,M,HC,25
3,CFD-WM-026-001-N.jpg,WM-026,W,M,N,25
4,CFD-WM-026-013-F.jpg,WM-026,W,M,F,25


### 3.2. NimStim 

In [26]:
races = {1: "white", 2: "white", 3: "white", 5: "white", 6:"white", 7:"white", 8:"white", 9:"white", 10:"white", 11:"black",
 12: "black", 13: "black", 14: "black", 15: "asian", 16: "asian", 17: "asian", 18: "asian", 19: "asian", 20: "white", 
 21: "white", 22: "white", 23: "white", 24: "white", 25: "white", 26: "white", 27: "white", 28: "white", 29: "white", 
 30: "white", 31: "white", 32: "white", 33: "white", 34: "white", 35: "white", 36: "white", 37: "latino", 38: "black", 
 39: "black", 40: "black", 41: "black", 42: "black", 43: "black", 45: "asian"
}
# Note: unsure about 3 and 37

In [27]:
def url_to_id(url): 
    return int(url[:2])

In [28]:
nimstim["Target ID"] = nimstim["image_url"].apply(url_to_id)
nimstim.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,age,gender,gender:confidence,age_gold,gender_gold,image_url,Target ID
0,2617202715,False,finalized,3,1/29/2020 15:10:35,26,female,1.0,,,02F_NE_C.jpeg,2
1,2617202716,False,finalized,3,1/29/2020 15:10:35,20,female,1.0,,,03F_NE_C.jpeg,3
2,2617202717,False,finalized,3,1/29/2020 15:10:20,28,female,1.0,,,05F_NE_C.jpeg,5
3,2617202718,False,finalized,3,1/29/2020 15:10:20,28,female,1.0,,,06F_NE_C.jpeg,6
4,2617202719,False,finalized,3,1/29/2020 15:09:53,31,female,1.0,,,07F_NE_C.jpeg,7


In [29]:
AMAZON_NIMSTIM = "../tables/amazon_NimStim.csv" # Note: using Amazon as Microsoft uses wrong (.BMP) extension
amazon_nimstim = pd.read_csv(AMAZON_NIMSTIM,header=0)

In [30]:
amazon_nimstim

Unnamed: 0,File Name,Target ID,Gender,Emotion,Mouth,age_low,age_high,smile_value,smile_confidence,gender_value,gender_confidence,top_emotion
0,01F_AN_O.jpeg,1,F,AN,O,25,39,False,86.550598,Female,96.860107,ANGRY
1,01F_CA_C.jpeg,1,F,CA,C,22,34,False,99.292320,Female,97.433662,CALM
2,01F_CA_O.jpeg,1,F,CA,O,22,34,False,98.794647,Female,98.804298,SURPRISED
3,01F_DI_C.jpeg,1,F,DI,C,26,40,False,79.647949,Female,96.154381,ANGRY
4,01F_DI_O.jpeg,1,F,DI,O,23,37,False,80.629723,Female,98.592545,ANGRY
...,...,...,...,...,...,...,...,...,...,...,...,...
668,45M_NE_C.jpeg,45,M,NE,C,24,38,False,99.139053,Male,98.601120,CALM
669,45M_NE_O.jpeg,45,M,NE,O,22,34,False,99.524788,Male,99.666840,CALM
670,45M_SA_C.jpeg,45,M,SA,C,23,37,False,98.591789,Male,96.003944,FEAR
671,45M_SA_O.jpeg,45,M,SA,O,37,55,False,93.306198,Male,99.696747,SAD


In [31]:
amazon_nimstim = amazon_nimstim[["File Name", "Target ID", "Gender", "Emotion"]]

In [32]:
nimstim_races = amazon_nimstim["Target ID"].apply(lambda x: races[x])
amazon_nimstim["Race"] = nimstim_races

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [33]:
amazon_nimstim

Unnamed: 0,File Name,Target ID,Gender,Emotion,Race
0,01F_AN_O.jpeg,1,F,AN,white
1,01F_CA_C.jpeg,1,F,CA,white
2,01F_CA_O.jpeg,1,F,CA,white
3,01F_DI_C.jpeg,1,F,DI,white
4,01F_DI_O.jpeg,1,F,DI,white
...,...,...,...,...,...
668,45M_NE_C.jpeg,45,M,NE,asian
669,45M_NE_O.jpeg,45,M,NE,asian
670,45M_SA_C.jpeg,45,M,SA,asian
671,45M_SA_O.jpeg,45,M,SA,asian


In [34]:
def get_nimstim_age(target_id): 
    return int(nimstim.loc[nimstim["Target ID"] == target_id].age)

In [35]:
amazon_nimstim["Age"] = amazon_nimstim["Target ID"].apply(get_nimstim_age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [36]:
nimstim_meta = amazon_nimstim.copy()

### 3.3. AirBnb

In [37]:
airbnb

Unnamed: 0,age,emotion,emotion:confidence,gender,gender:confidence,multiple_faces,multiple_faces:confidence,race,race:confidence,image_url
0,26,happy,1.0000,female,1.0000,no,1.0,asian,0.6667,cffc6e2d-cea8-484b-a44c-16b9402ed896.jpg
1,26,happy,1.0000,male,1.0000,no,1.0,white,1.0000,34202cee-1a17-413d-acfc-73f2eeb15130.jpg
2,29,neutral,1.0000,male,0.6667,no,1.0,white,1.0000,f3070346-8e85-41e8-aa77-d9dd9b4ced40.jpg
3,28,neutral,0.6667,male,1.0000,no,1.0,asian,1.0000,f9d35343-fb95-4c64-9abf-026cd142f318.jpg
4,30,neutral,1.0000,male,0.6667,no,1.0,white,1.0000,e260b413-5234-4c07-8293-78029f00421e.jpg
...,...,...,...,...,...,...,...,...,...,...
760,38,happy,0.9594,female,0.9594,no,1.0,white,0.9239,eca787fa-1a0b-4f9b-b447-c7c057f328f9.jpg
762,27,happy,0.9591,male,1.0000,no,1.0,asian,0.7586,5477d977-4382-48c1-abbb-e74b54dab720.jpg
764,26,neutral,0.8370,male,1.0000,no,1.0,white,0.7617,41d985a0-50e1-49da-83c0-07eeab29aa11.jpg
765,26,neutral,0.7188,female,1.0000,no,1.0,asian,0.4821,cc1ee4e5-61ad-4e72-bc22-3e9398210dbf.jpg


In [38]:
airbnb_metadata = airbnb[["image_url", "age", "emotion", "gender", "race"]]

In [39]:
airbnb_metadata.columns = ["File Name", "Age", "Emotion", "Gender", "Race"]

In [40]:
airbnb_metadata

Unnamed: 0,File Name,Age,Emotion,Gender,Race
0,cffc6e2d-cea8-484b-a44c-16b9402ed896.jpg,26,happy,female,asian
1,34202cee-1a17-413d-acfc-73f2eeb15130.jpg,26,happy,male,white
2,f3070346-8e85-41e8-aa77-d9dd9b4ced40.jpg,29,neutral,male,white
3,f9d35343-fb95-4c64-9abf-026cd142f318.jpg,28,neutral,male,asian
4,e260b413-5234-4c07-8293-78029f00421e.jpg,30,neutral,male,white
...,...,...,...,...,...
760,eca787fa-1a0b-4f9b-b447-c7c057f328f9.jpg,38,happy,female,white
762,5477d977-4382-48c1-abbb-e74b54dab720.jpg,27,happy,male,asian
764,41d985a0-50e1-49da-83c0-07eeab29aa11.jpg,26,neutral,male,white
765,cc1ee4e5-61ad-4e72-bc22-3e9398210dbf.jpg,26,neutral,female,asian


### 3.4. AI-Generated Faces

In [41]:
AMAZON_AI = "../tables/amazon_AI.csv"
amazon_ai = pd.read_csv(AMAZON_AI,header=0)

In [42]:
ai_meta = amazon_ai[["File Name", "Target ID", "Gender", "Race", "Age"]]

In [43]:
ai_meta["Emotion"] = None
ai_meta.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,File Name,Target ID,Gender,Race,Age,Emotion
0,5dd083b5def8b400084c30c5_female_asian_adult.jpeg,5dd083b5def8b400084c30c5,female,asian,adult,
1,5dd08728def8b400084c8a23_female_asian_adult.jpeg,5dd08728def8b400084c8a23,female,asian,adult,
2,5dd08797def8b400084c939d_male_black_child.jpeg,5dd08797def8b400084c939d,male,black,child,
3,5dd088d4def8b400084cad1c_female_asian_adult.jpeg,5dd088d4def8b400084cad1c,female,asian,adult,
4,5dd08950def8b400084cb699_female_asian_adult.jpeg,5dd08950def8b400084cb699,female,asian,adult,


### 3.5. Combining the DataFrames

In [44]:
# CFD
metadata = metadata.rename(columns = {"Expression": "Emotion"})
metadata["Origin"] = "CFD"
metadata.head()

Unnamed: 0,File Name,Target ID,Race,Gender,Emotion,Age,Origin
0,CFD-WM-026-019-A.jpg,WM-026,W,M,A,25,CFD
1,CFD-WM-026-006-HO.jpg,WM-026,W,M,HO,25,CFD
2,CFD-WM-026-004-HC.jpg,WM-026,W,M,HC,25,CFD
3,CFD-WM-026-001-N.jpg,WM-026,W,M,N,25,CFD
4,CFD-WM-026-013-F.jpg,WM-026,W,M,F,25,CFD


In [45]:
# NimStim
nimstim_meta.head()
nimstim_meta["Origin"] = "NimStim"

In [46]:
# AirBnb
airbnb_metadata.head()
airbnb_metadata["Target ID"] = airbnb_metadata["File Name"].apply(lambda x: x.split('.')[0])
airbnb_metadata["Origin"] = "AirBnb"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [47]:
airbnb_metadata.head()

Unnamed: 0,File Name,Age,Emotion,Gender,Race,Target ID,Origin
0,cffc6e2d-cea8-484b-a44c-16b9402ed896.jpg,26,happy,female,asian,cffc6e2d-cea8-484b-a44c-16b9402ed896,AirBnb
1,34202cee-1a17-413d-acfc-73f2eeb15130.jpg,26,happy,male,white,34202cee-1a17-413d-acfc-73f2eeb15130,AirBnb
2,f3070346-8e85-41e8-aa77-d9dd9b4ced40.jpg,29,neutral,male,white,f3070346-8e85-41e8-aa77-d9dd9b4ced40,AirBnb
3,f9d35343-fb95-4c64-9abf-026cd142f318.jpg,28,neutral,male,asian,f9d35343-fb95-4c64-9abf-026cd142f318,AirBnb
4,e260b413-5234-4c07-8293-78029f00421e.jpg,30,neutral,male,white,e260b413-5234-4c07-8293-78029f00421e,AirBnb


In [48]:
# AI
ai_meta["Origin"] = "AI"
ai_meta

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,File Name,Target ID,Gender,Race,Age,Emotion,Origin
0,5dd083b5def8b400084c30c5_female_asian_adult.jpeg,5dd083b5def8b400084c30c5,female,asian,adult,,AI
1,5dd08728def8b400084c8a23_female_asian_adult.jpeg,5dd08728def8b400084c8a23,female,asian,adult,,AI
2,5dd08797def8b400084c939d_male_black_child.jpeg,5dd08797def8b400084c939d,male,black,child,,AI
3,5dd088d4def8b400084cad1c_female_asian_adult.jpeg,5dd088d4def8b400084cad1c,female,asian,adult,,AI
4,5dd08950def8b400084cb699_female_asian_adult.jpeg,5dd08950def8b400084cb699,female,asian,adult,,AI
...,...,...,...,...,...,...,...
243,5dd09cb3def8b400084dc569_female_white_young-ad...,5dd09cb3def8b400084dc569,female,white,young-adult,,AI
244,5dd09cb3def8b400084dc56a_female_white_young-ad...,5dd09cb3def8b400084dc56a,female,white,young-adult,,AI
245,5dd09cb3def8b400084dc56b_female_white_adult.jpeg,5dd09cb3def8b400084dc56b,female,white,adult,,AI
246,5dd09cb4def8b400084dc56c_female_white_young-ad...,5dd09cb4def8b400084dc56c,female,white,young-adult,,AI


In [49]:
dataframes = [metadata, nimstim_meta, airbnb_metadata, ai_meta]

In [50]:
inputs = pd.concat(dataframes, sort=False, ignore_index=True)
inputs

Unnamed: 0,File Name,Target ID,Race,Gender,Emotion,Age,Origin
0,CFD-WM-026-019-A.jpg,WM-026,W,M,A,25,CFD
1,CFD-WM-026-006-HO.jpg,WM-026,W,M,HO,25,CFD
2,CFD-WM-026-004-HC.jpg,WM-026,W,M,HC,25,CFD
3,CFD-WM-026-001-N.jpg,WM-026,W,M,N,25,CFD
4,CFD-WM-026-013-F.jpg,WM-026,W,M,F,25,CFD
...,...,...,...,...,...,...,...
2672,5dd09cb3def8b400084dc569_female_white_young-ad...,5dd09cb3def8b400084dc569,white,female,,young-adult,AI
2673,5dd09cb3def8b400084dc56a_female_white_young-ad...,5dd09cb3def8b400084dc56a,white,female,,young-adult,AI
2674,5dd09cb3def8b400084dc56b_female_white_adult.jpeg,5dd09cb3def8b400084dc56b,white,female,,adult,AI
2675,5dd09cb4def8b400084dc56c_female_white_young-ad...,5dd09cb4def8b400084dc56c,white,female,,young-adult,AI


## 4. Adding the tool results 

### 4.1. Amazon

In [51]:
AMAZON_CFD = "../tables/amazon_CFD.csv"
AMAZON_NIMSTIM = "../tables/amazon_NimStim.csv"
AMAZON_AI = "../tables/amazon_AI.csv"
AMAZON_AIRBNB = "../tables/amazon_AirBnb.csv"

In [52]:
amazon_cfd = pd.read_csv(AMAZON_CFD,header=0)
amazon_nimstim = pd.read_csv(AMAZON_NIMSTIM,header=0)
amazon_ai = pd.read_csv(AMAZON_AI,header=0)
amazon_airbnb = pd.read_csv(AMAZON_AIRBNB,header=0)
amazon = [amazon_cfd, amazon_nimstim, amazon_ai, amazon_airbnb]
amazon = pd.concat(amazon, sort=False, ignore_index=True)

Deleting and renaming columns. 

In [53]:
amazon_outputs = amazon[['File Name', 'age_low', 'age_high', 'smile_value', 'smile_confidence',
       'gender_value', 'gender_confidence', 'top_emotion']]

In [54]:
amazon_outputs = amazon_outputs.rename(columns={'age_low': 'amazon_age_low', 
                       'age_high': 'amazon_age_high', 
                       'smile_value': 'amazon_smile', 
                       'smile_confidence': 'amazon_smile_confidence',
                       'gender_value': 'amazon_gender', 
                       'gender_confidence': 'amazon_gender_confidence', 
                       'top_emotion': 'amazon_emotion'})

In [55]:
amazon_outputs.head()

Unnamed: 0,File Name,amazon_age_low,amazon_age_high,amazon_smile,amazon_smile_confidence,amazon_gender,amazon_gender_confidence,amazon_emotion
0,CFD-WM-026-019-A.jpg,22,34,False,97.711411,Male,81.990562,ANGRY
1,CFD-WM-026-006-HO.jpg,22,34,True,98.808716,Male,96.583267,HAPPY
2,CFD-WM-026-004-HC.jpg,22,34,False,94.573059,Female,52.528931,CALM
3,CFD-WM-026-001-N.jpg,22,34,False,99.664253,Female,59.811039,CONFUSED
4,CFD-WM-026-013-F.jpg,22,34,False,94.766983,Female,69.630272,FEAR


**Note:** Amazon has processed both of the single-face and multi-face entries of AirBnb - when doing the left join, we need to omit the entries that are not in the `inputs` table. Also note - by left joining, we are effectively only leaving out the entries in AirBnb that have been *recognised by all APIs* and that *consist of one face only*. 

In [56]:
display(len(amazon_airbnb))
display(len(airbnb_metadata))

894

549

In [57]:
processing_data = pd.merge(inputs, amazon_outputs, left_on="File Name", right_on="File Name", how="left")

### 4.2. Clarifai

In [58]:
CLARIFAI_CFD = "../tables/clarifai_CFD.csv"
CLARIFAI_NIMSTIM = "../tables/clarifai_NimStim.csv"
CLARIFAI_AI = "../tables/clarifai_AI.csv"
CLARIFAI_AIRBNB = "../tables/clarifai_AirBnb.csv"

In [59]:
clarifai_cfd = pd.read_csv(CLARIFAI_CFD,header=0)
clarifai_nimstim = pd.read_csv(CLARIFAI_NIMSTIM,header=0)
clarifai_ai = pd.read_csv(CLARIFAI_AI,header=0)
clarifai_airbnb = pd.read_csv(CLARIFAI_AIRBNB,header=0)
clarifai = [clarifai_cfd, clarifai_nimstim, clarifai_ai, clarifai_airbnb]
clarifai = pd.concat(clarifai, sort=False, ignore_index=True)

In [60]:
clarifai.columns

Index(['File Name', 'Target ID', 'Image ID', 'Race', 'Gender', 'Expression',
       'Clarifai Race', 'Race Confidence', 'Clarifai Gender',
       'Gender Confidence', 'Clarifai Age', 'Age Confidence', 'Emotion',
       'Mouth', 'Age'],
      dtype='object')

In [61]:
clarifai_outputs = clarifai[['File Name', 'Clarifai Race', 'Race Confidence', 'Clarifai Gender',
       'Gender Confidence', 'Clarifai Age', 'Age Confidence']]

In [62]:
clarifai_outputs = clarifai_outputs.rename(columns={
    'Clarifai Race' : 'clarifai_race', 
    'Race Confidence' : 'clarifai_race_confidence', 
    'Clarifai Gender' : 'clarifai_gender',
    'Gender Confidence' : 'clarifai_gender_confidence',
    'Clarifai Age' : 'clarifai_age', 
    'Age Confidence' : 'clarifai_age_confidence'
})

In [63]:
processing_data = pd.merge(processing_data, clarifai_outputs, left_on="File Name", right_on="File Name", how="left")

In [64]:
processing_data.head()

Unnamed: 0,File Name,Target ID,Race,Gender,Emotion,Age,Origin,amazon_age_low,amazon_age_high,amazon_smile,amazon_smile_confidence,amazon_gender,amazon_gender_confidence,amazon_emotion,clarifai_race,clarifai_race_confidence,clarifai_gender,clarifai_gender_confidence,clarifai_age,clarifai_age_confidence
0,CFD-WM-026-019-A.jpg,WM-026,W,M,A,25,CFD,22,34,False,97.711411,Male,81.990562,ANGRY,"hispanic, latino, or spanish origin",0.457294,masculine,0.990514,36,0.491376
1,CFD-WM-026-006-HO.jpg,WM-026,W,M,HO,25,CFD,22,34,True,98.808716,Male,96.583267,HAPPY,white,0.993373,masculine,0.979519,39,0.407927
2,CFD-WM-026-004-HC.jpg,WM-026,W,M,HC,25,CFD,22,34,False,94.573059,Female,52.528931,CALM,white,0.973847,masculine,0.994148,27,0.548631
3,CFD-WM-026-001-N.jpg,WM-026,W,M,N,25,CFD,22,34,False,99.664253,Female,59.811039,CONFUSED,white,0.975381,masculine,0.997049,28,0.501461
4,CFD-WM-026-013-F.jpg,WM-026,W,M,F,25,CFD,22,34,False,94.766983,Female,69.630272,FEAR,white,0.967802,masculine,0.998019,28,0.496877


In [65]:
FACEPP_CFD = "../tables/face++_CFD.csv"
FACEPP_NIMSTIM = "../tables/face++_NimStim.csv"
FACEPP_AI = "../tables/face++_AI.csv"
FACEPP_AIRBNB = "../tables/face++_AirBnb.csv"

In [66]:
facepp_cfd = pd.read_csv(FACEPP_CFD,header=0)
facepp_nimstim = pd.read_csv(FACEPP_NIMSTIM,header=0)
facepp_ai = pd.read_csv(FACEPP_AI,header=0)
facepp_airbnb = pd.read_csv(FACEPP_AIRBNB,header=0)

### 4.3. Face++
Face++ needs some minor data standatdizing. It wrongly uses the .jpg instead of the .jpeg extension in `facepp_nimstim`'s `File Name`. 

In [67]:
facepp_nimstim["File Name"] = facepp_nimstim["File Name"].map(lambda x: x.split('.')[0] + ".jpeg")

In [68]:
facepp = [facepp_cfd, facepp_nimstim, facepp_ai, facepp_airbnb]
facepp = pd.concat(facepp, sort=False, ignore_index=True)

In [69]:
facepp.columns

Index(['File Name', 'Target ID', 'Image ID', 'Race', 'Gender', 'Expression',
       'Face++ Gender', 'Face++ Age', 'Face++ Emotion', 'Face++ Smiling',
       'Emotion', 'Mouth', 'Age'],
      dtype='object')

In [70]:
facepp_outputs = facepp[['File Name', 'Face++ Gender', 'Face++ Age', 'Face++ Emotion', 'Face++ Smiling']]

In [71]:
facepp_outputs = facepp_outputs.rename(columns={
    'Face++ Gender' : 'face++_gender', 
    'Face++ Age' : 'face++_age', 
    'Face++ Emotion' : 'face++_emotion', 
    'Face++ Smiling' : 'face++_smile'
})

In [72]:
processing_data = pd.merge(processing_data, facepp_outputs, left_on="File Name", right_on="File Name", how="left")

In [73]:
processing_data.head()

Unnamed: 0,File Name,Target ID,Race,Gender,Emotion,Age,Origin,amazon_age_low,amazon_age_high,amazon_smile,...,clarifai_race,clarifai_race_confidence,clarifai_gender,clarifai_gender_confidence,clarifai_age,clarifai_age_confidence,face++_gender,face++_age,face++_emotion,face++_smile
0,CFD-WM-026-019-A.jpg,WM-026,W,M,A,25,CFD,22,34,False,...,"hispanic, latino, or spanish origin",0.457294,masculine,0.990514,36,0.491376,Male,31,neutral,False
1,CFD-WM-026-006-HO.jpg,WM-026,W,M,HO,25,CFD,22,34,True,...,white,0.993373,masculine,0.979519,39,0.407927,Male,29,happiness,True
2,CFD-WM-026-004-HC.jpg,WM-026,W,M,HC,25,CFD,22,34,False,...,white,0.973847,masculine,0.994148,27,0.548631,Male,31,neutral,False
3,CFD-WM-026-001-N.jpg,WM-026,W,M,N,25,CFD,22,34,False,...,white,0.975381,masculine,0.997049,28,0.501461,Male,25,neutral,False
4,CFD-WM-026-013-F.jpg,WM-026,W,M,F,25,CFD,22,34,False,...,white,0.967802,masculine,0.998019,28,0.496877,Male,24,anger,False


### 4.4. Microsoft
With Microsoft we need to be extra careful because of the .BMP and .TIFF images in the NimStim processing table. For AirBnb, the `Source` field (URL) needs to be converted into a canonical `File Name`. For AI, the identifier (`ID`) needs to be converted into a `File Name`. Also, the output fields are inconsistent across the tables so that needs to be adjusted as well (I checked and all the other APIs produce consistent output field names across the processing output tables). 

In [74]:
MS_CFD = "../tables/microsoft_CFD.csv"
MS_NIMSTIM = "../tables/microsoft_NimStim.csv"
MS_AI = "../tables/microsoft_AI.csv"
MS_AIRBNB = "../tables/microsoft_AirBnb.csv"

In [75]:
microsoft_cfd = pd.read_csv(MS_CFD,header=0)
microsoft_nimstim = pd.read_csv(MS_NIMSTIM,header=0)
microsoft_ai = pd.read_csv(MS_AI,header=0)
microsoft_airbnb = pd.read_csv(MS_AIRBNB,header=0)

First, fixing `microsoft_ai`.

In [76]:
microsoft_ai.head()

Unnamed: 0,ID,Source,Gender,Ethinicty,Age,Microsoft Gender,Microsoft Age,Microsoft Emotion,Microsoft Smiling
0,5dd09c1cdef8b400084dbeee,https://images.generated.photos/6fmZ07UXh9qfB9...,male,white,infant,female,1,happiness,0.999
1,5dd09c17def8b400084dbeb4,https://images.generated.photos/leempnkLRrt7DY...,male,white,infant,female,0,neutral,0.222
2,5dd09bc1def8b400084dbb08,https://images.generated.photos/LphN-Lj7rYR3NP...,male,white,infant,female,0,neutral,0.001
3,5dd09bbcdef8b400084dbaca,https://images.generated.photos/wSvds6lkw8fBOe...,male,white,infant,female,5,neutral,0.001
4,5dd09b90def8b400084db8e7,https://images.generated.photos/wX509wmnl3WZd_...,male,white,infant,male,1,happiness,0.825


In [77]:
def get_file_name(image_id): 
    return (inputs.loc[inputs["Target ID"] == image_id]['File Name'].values[0])

In [78]:
get_file_name('5dd09bbcdef8b400084dbaca')

'5dd09bbcdef8b400084dbaca_male_white_infant.jpeg'

In [79]:
microsoft_ai["File Name"] = microsoft_ai["ID"].map(get_file_name)

In [80]:
microsoft_ai.columns

Index(['ID', 'Source', 'Gender', 'Ethinicty', 'Age', 'Microsoft Gender',
       'Microsoft Age', 'Microsoft Emotion', 'Microsoft Smiling', 'File Name'],
      dtype='object')

In [81]:
microsoft_ai = microsoft_ai[['File Name', 'Microsoft Gender',
       'Microsoft Age', 'Microsoft Emotion', 'Microsoft Smiling']]

In [82]:
microsoft_ai = microsoft_ai.rename(columns={
    'Microsoft Gender' : 'microsoft_gender',
    'Microsoft Age' : 'microsoft_age', 
    'Microsoft Emotion' : 'microsoft_emotion', 
    'Microsoft Smiling' : 'microsoft_smile'
})

In [83]:
microsoft_ai["File Name"]

0       5dd09c1cdef8b400084dbeee_male_white_infant.jpeg
1       5dd09c17def8b400084dbeb4_male_white_infant.jpeg
2       5dd09bc1def8b400084dbb08_male_white_infant.jpeg
3       5dd09bbcdef8b400084dbaca_male_white_infant.jpeg
4       5dd09b90def8b400084db8e7_male_white_infant.jpeg
                             ...                       
243    5dd091e2def8b400084d4174_female_black_adult.jpeg
244    5dd09059def8b400084d2bbe_female_black_adult.jpeg
245    5dd08fe1def8b400084d24e1_female_black_adult.jpeg
246    5dd08b6bdef8b400084cdda5_female_black_adult.jpeg
247    5dd08a64def8b400084ccae7_female_black_adult.jpeg
Name: File Name, Length: 248, dtype: object

Now, dealing with `microsoft_airbnb`.

In [84]:
microsoft_airbnb["File Name"] = microsoft_airbnb["Source"].map(get_canonical_name)

In [85]:
microsoft_airbnb = microsoft_airbnb[["File Name", "Gender", "Age", "Emotion", "Smile"]]

In [86]:
microsoft_airbnb = microsoft_airbnb.rename(columns = {
    "Gender" : 'microsoft_gender', 
    "Age" : 'microsoft_age', 
    "Emotion" : 'microsoft_emotion', 
    "Smile" : 'microsoft_smile'
})

And finaly `microsoft_nimstim`. To start with, all teh .tiff images are duplicates, so they need to be deleted. 

In [87]:
microsoft_nimstim = microsoft_nimstim[microsoft_nimstim["File Name"].map(lambda x: 'tiff' not in x)]

In [88]:
microsoft_nimstim["File Name"] = microsoft_nimstim["File Name"].map(lambda x: x.split('.')[0] + '.jpeg')

In [89]:
microsoft_nimstim = microsoft_nimstim[['File Name', 'Microsoft Gender',
       'Microsoft Age', 'Microsoft Emotion', 'Microsoft Smiling']]

In [90]:
microsoft_nimstim = microsoft_nimstim.rename(columns={
    'Microsoft Gender' : 'microsoft_gender',
    'Microsoft Age' : 'microsoft_age', 
    'Microsoft Emotion' : 'microsoft_emotion', 
    'Microsoft Smiling' : 'microsoft_smile'
})

We also need to normalise `microsoft_cfd` to follow the same field convention as the other tables (otherwise the concatenation of tables will not be successful). 

In [91]:
microsoft_cfd = microsoft_cfd[['File Name', 'Microsoft Gender',
       'Microsoft Age', 'Microsoft Emotion', 'Microsoft Smiling']]

In [92]:
microsoft_cfd = microsoft_cfd.rename(columns={
    'Microsoft Gender' : 'microsoft_gender',
    'Microsoft Age' : 'microsoft_age', 
    'Microsoft Emotion' : 'microsoft_emotion', 
    'Microsoft Smiling' : 'microsoft_smile'
})

In [93]:
microsoft_outputs = [microsoft_cfd, microsoft_nimstim, microsoft_ai, microsoft_airbnb]
microsoft_outputs = pd.concat(microsoft_outputs, sort=False, ignore_index=True)

In [94]:
microsoft_outputs.head()

Unnamed: 0,File Name,microsoft_gender,microsoft_age,microsoft_emotion,microsoft_smile
0,CFD-WM-026-019-A.jpg,male,22.0,neutral,0.001
1,CFD-WM-026-006-HO.jpg,male,21.0,happiness,1.0
2,CFD-WM-026-004-HC.jpg,male,23.0,happiness,0.774
3,CFD-WM-026-001-N.jpg,male,21.0,neutral,0.0
4,CFD-WM-026-013-F.jpg,male,23.0,surprise,0.0


In [95]:
processing_data = pd.merge(processing_data, microsoft_outputs, left_on="File Name", right_on="File Name", how="left")

Finally, our combined table looks like this: 

In [96]:
processing_data

Unnamed: 0,File Name,Target ID,Race,Gender,Emotion,Age,Origin,amazon_age_low,amazon_age_high,amazon_smile,...,clarifai_age,clarifai_age_confidence,face++_gender,face++_age,face++_emotion,face++_smile,microsoft_gender,microsoft_age,microsoft_emotion,microsoft_smile
0,CFD-WM-026-019-A.jpg,WM-026,W,M,A,25,CFD,22,34,False,...,36,0.491376,Male,31,neutral,False,male,22.0,neutral,0.001
1,CFD-WM-026-006-HO.jpg,WM-026,W,M,HO,25,CFD,22,34,True,...,39,0.407927,Male,29,happiness,True,male,21.0,happiness,1.000
2,CFD-WM-026-004-HC.jpg,WM-026,W,M,HC,25,CFD,22,34,False,...,27,0.548631,Male,31,neutral,False,male,23.0,happiness,0.774
3,CFD-WM-026-001-N.jpg,WM-026,W,M,N,25,CFD,22,34,False,...,28,0.501461,Male,25,neutral,False,male,21.0,neutral,0.000
4,CFD-WM-026-013-F.jpg,WM-026,W,M,F,25,CFD,22,34,False,...,28,0.496877,Male,24,anger,False,male,23.0,surprise,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2672,5dd09cb3def8b400084dc569_female_white_young-ad...,5dd09cb3def8b400084dc569,white,female,,young-adult,AI,18,30,True,...,36,0.511536,Female,26,happiness,True,female,24.0,happiness,1.000
2673,5dd09cb3def8b400084dc56a_female_white_young-ad...,5dd09cb3def8b400084dc56a,white,female,,young-adult,AI,19,31,True,...,36,0.585459,Female,29,happiness,True,female,24.0,happiness,1.000
2674,5dd09cb3def8b400084dc56b_female_white_adult.jpeg,5dd09cb3def8b400084dc56b,white,female,,adult,AI,26,40,True,...,38,0.693285,Female,43,happiness,True,female,31.0,happiness,1.000
2675,5dd09cb4def8b400084dc56c_female_white_young-ad...,5dd09cb4def8b400084dc56c,white,female,,young-adult,AI,20,32,True,...,39,0.630657,Female,24,happiness,True,female,30.0,happiness,1.000


In [97]:
output_fields = processing_data.columns[7:]

Test for NaN values in the output fields. 

In [98]:
for field in output_fields: 
    if sum(processing_data[field].map(pd.isna)) is not 0: 
        raise Exception("NaN value encountered in the output fields.")

To export as a .csv file, uncomment the cell below: 

In [99]:
# processing_data.to_csv('./processing_data.csv', index=False)

## 5. Standardization

In [100]:
processing_data.columns

Index(['File Name', 'Target ID', 'Race', 'Gender', 'Emotion', 'Age', 'Origin',
       'amazon_age_low', 'amazon_age_high', 'amazon_smile',
       'amazon_smile_confidence', 'amazon_gender', 'amazon_gender_confidence',
       'amazon_emotion', 'clarifai_race', 'clarifai_race_confidence',
       'clarifai_gender', 'clarifai_gender_confidence', 'clarifai_age',
       'clarifai_age_confidence', 'face++_gender', 'face++_age',
       'face++_emotion', 'face++_smile', 'microsoft_gender', 'microsoft_age',
       'microsoft_emotion', 'microsoft_smile'],
      dtype='object')

### 5.1. Race

In [101]:
display(set(processing_data["Race"]))
display(set(processing_data["clarifai_race"]))

{'A',
 'B',
 'L',
 'W',
 'asian',
 'black',
 'latino',
 'middle_eastern',
 'other',
 'white'}

{'american indian or alaska native',
 'asian',
 'black or african american',
 'hispanic, latino, or spanish origin',
 'middle eastern or north african',
 'native hawaiian or pacific islander',
 'white'}

In [102]:
race_mapping = {
    1: ('A', 'asian'),
    2: ('B', 'black', 'black or african american'),
    3: ('L', 'latino', 'hispanic, latino, or spanish origin'),
    4: ('W', 'white'),
    5: ('middle_eastern', 'middle eastern or north african'),
    0: ('other', 'american indian or alaska native', 'native hawaiian or pacific islander'),
}

### 5.2. Gender

In [103]:
gender_columns = ["Gender", "amazon_gender", "clarifai_gender", "face++_gender", "microsoft_gender"]
genders = set()
for col in gender_columns: 
    genders = genders | set(processing_data[col])

In [104]:
genders

{'F',
 'Female',
 'M',
 'Male',
 'f',
 'female',
 'feminine',
 'male',
 'masculine',
 'unsure'}

In [105]:
gender_mapping = {
    1: ('F', 'Female', 'f', 'female', 'feminine'), 
    2: ('M', 'Male', 'male', 'masculine'), 
    0: ('unsure')
}

### 5.3. Emotion and Smile 

In [106]:
emotion_columns = ["Emotion", "amazon_emotion", "face++_emotion", "microsoft_emotion"]
emotions = set()
for col in emotion_columns: 
    emotions = emotions | set(processing_data[col])

In [107]:
emotions

{'A',
 'AN',
 'ANGRY',
 'CA',
 'CALM',
 'CONFUSED',
 'DI',
 'DISGUSTED',
 'F',
 'FE',
 'FEAR',
 'HA',
 'HAPPY',
 'HC',
 'HO',
 'N',
 'NE',
 None,
 'SA',
 'SAD',
 'SP',
 'SURPRISED',
 'an',
 'anger',
 'angry',
 'ca',
 'contempt',
 'di',
 'disgust',
 'fear',
 'happiness',
 'happy',
 'neutral',
 'other',
 'sad',
 'sadness',
 'scared',
 'surprise',
 'surprised'}

In [108]:
emotion_mapping = {
    # Angry
    1: ('A','AN','ANGRY', 'an', 'anger', 'angry'),
    # Calm / Neutral 
    2: ('CA', 'CALM', 'N', 'NE', 'ca', 'neutral'),
    # Disgusted
    3: ('DI', 'DISGUSTED', 'di', 'disgust'),
    # Fearful
    4: ('F', 'FE', 'FEAR', 'fear', 'scared'),
    # Happy
    5: ('HA', 'HAPPY', 'HC', 'HO', 'happiness', 'happy'),
    # Sad
    6: ('SA', 'SAD', 'sad', 'sadness'),
    # Surprised
    7: ('SURPRISED', 'surprise', 'surprised'),
    # Other 
    0: ('CONFUSED', None, 'contempt', 'other'),
}

In [112]:
smile_columns = ["amazon_smile", "face++_smile", "microsoft_smile"]
smiles = set()
for col in smile_columns: 
    smiles = smiles | set(processing_data[col])

In [109]:
st_data = processing_data.copy()