# CFD Processed Data

In [70]:
import numpy as np
import pandas as pd

from scipy.stats import spearmanr

import matplotlib.pyplot as plt

## Importing the data sets

Setting the data paths: 

In [71]:
# CSV with Clarifai's predictions for the CFD images; relative path, resolved against the kernel's working directory.
CLARIFAI_OUTPUT_PATH = "../api_processing/Code/Clarifai/CFD/clarifai_output_corrected.csv"

In [72]:
# CSV with Face++'s predictions for the CFD images; relative path, resolved against the kernel's working directory.
FACEPP_OUTPUT_PATH = "../api_processing/Code/Face++/CFD/face++_output.csv"

In [73]:
# CSV with Microsoft's predictions for the CFD images; relative path, resolved against the kernel's working directory.
MICROSOFT_OUTPUT_PATH = "../api_processing/Code/Microsoft/CFD/microsoft_output_corrected.csv"

In [74]:
# CSV with Amazon's predictions for the CFD images; relative path, resolved against the kernel's working directory.
AMAZON_OUTPUT_PATH = "../api_processing/Code/Amazon/CFD/amazon_output.csv"

Importing CSV outputs as Pandas data frames.

In [75]:
# One raw frame per API; header=0 means the first CSV row supplies the column names.
clarifai_raw = pd.read_csv(filepath_or_buffer=CLARIFAI_OUTPUT_PATH, header=0)
facepp_raw = pd.read_csv(filepath_or_buffer=FACEPP_OUTPUT_PATH, header=0)
microsoft_raw = pd.read_csv(filepath_or_buffer=MICROSOFT_OUTPUT_PATH, header=0)
amazon_raw = pd.read_csv(filepath_or_buffer=AMAZON_OUTPUT_PATH, header=0)

## Browsing the data

Displaying the first 5 entries of each (sorted) table. Sorting by `File Name` (the primary key of the entries) lets us compare the corresponding entries across the different APIs. 

### Clarifai

In [76]:
# First five Clarifai rows in File Name order, so they line up with the other APIs' previews.
clarifai_raw.sort_values("File Name").head(n=5)

Unnamed: 0,File Name,Target ID,Image ID,Race,Gender,Expression,Clarifai Race,Race Confidence,Clarifai Gender,Gender Confidence,Clarifai Age,Age Confidence
715,CFD-AF-200-228-N.jpg,AF-200,228,A,F,N,asian,0.999892,feminine,0.99557,39,0.764153
688,CFD-AF-201-060-N.jpg,AF-201,60,A,F,N,asian,0.998641,masculine,0.657457,39,0.461942
900,CFD-AF-202-122-N.jpg,AF-202,122,A,F,N,asian,0.735281,masculine,0.78087,39,0.587385
303,CFD-AF-203-077-N.jpg,AF-203,77,A,F,N,asian,0.998931,feminine,0.938326,34,0.489016
559,CFD-AF-204-067-N.jpg,AF-204,67,A,F,N,asian,0.999699,feminine,0.919285,38,0.703313


### Face++

In [77]:
# First five Face++ rows in File Name order, so they line up with the other APIs' previews.
facepp_raw.sort_values("File Name").head(n=5)

Unnamed: 0,File Name,Target ID,Image ID,Race,Gender,Expression,Face++ Gender,Face++ Age,Face++ Emotion,Face++ Smiling
543,CFD-AF-200-228-N.jpg,AF-200,228,A,F,N,Female,48,surprise,False
934,CFD-AF-201-060-N.jpg,AF-201,60,A,F,N,Female,24,neutral,False
551,CFD-AF-202-122-N.jpg,AF-202,122,A,F,N,Male,30,neutral,False
1147,CFD-AF-203-077-N.jpg,AF-203,77,A,F,N,Female,26,neutral,False
1081,CFD-AF-204-067-N.jpg,AF-204,67,A,F,N,Female,20,sadness,False


### Microsoft 

In [78]:
# First five Microsoft rows in File Name order, so they line up with the other APIs' previews.
microsoft_raw.sort_values("File Name").head(n=5)

Unnamed: 0,File Name,Target ID,Image ID,Race,Gender,Expression,Microsoft Gender,Microsoft Age,Microsoft Emotion,Microsoft Smiling
459,CFD-AF-200-228-N.jpg,AF-200,228,A,F,N,female,32.0,neutral,0.0
380,CFD-AF-201-060-N.jpg,AF-201,60,A,F,N,female,22.0,neutral,0.0
933,CFD-AF-202-122-N.jpg,AF-202,122,A,F,N,female,22.0,neutral,0.0
993,CFD-AF-203-077-N.jpg,AF-203,77,A,F,N,female,19.0,neutral,0.0
1009,CFD-AF-204-067-N.jpg,AF-204,67,A,F,N,female,28.0,neutral,0.0


### Amazon

In [79]:
# First five Amazon rows in File Name order, so they line up with the other APIs' previews.
amazon_raw.sort_values("File Name").head(n=5)

Unnamed: 0,File Name,Target ID,Image ID,Race,Gender,Expression,age_low,age_high,smile_value,smile_confidence,gender_value,gender_confidence,top_emotion
459,CFD-AF-200-228-N.jpg,AF-200,228,A,F,N,22,34,False,96.689507,Female,98.523636,SURPRISED
380,CFD-AF-201-060-N.jpg,AF-201,60,A,F,N,22,34,False,98.7957,Female,93.703087,CALM
933,CFD-AF-202-122-N.jpg,AF-202,122,A,F,N,22,34,False,99.194702,Female,91.597496,CALM
993,CFD-AF-203-077-N.jpg,AF-203,77,A,F,N,20,32,False,96.263527,Female,87.574516,ANGRY
1009,CFD-AF-204-067-N.jpg,AF-204,67,A,F,N,22,34,False,99.297867,Female,99.072227,CALM


#### Displaying basic statistics about the sets: 

In [80]:
# Summarise every column regardless of dtype (string, numeric and boolean alike).
# The previous dtype list relied on the np.object / np.bool aliases, which were
# deprecated in NumPy 1.20 and removed in NumPy 1.24 (AttributeError on current NumPy).
clarifai_raw.describe(include="all")

Unnamed: 0,File Name,Target ID,Image ID,Race,Gender,Expression,Clarifai Race,Race Confidence,Clarifai Gender,Gender Confidence,Clarifai Age,Age Confidence
count,1207,1207,1207.0,1207,1207,1207,1207,1207.0,1207,1207.0,1207.0,1207.0
unique,1207,597,,4,2,5,6,,2,,,
top,CFD-WF-035-019-F.jpg,WF-007,,B,F,N,black or african american,,masculine,,,
freq,1,5,,526,644,597,490,,713,,,
mean,,,59.38691,,,,,0.88258,,0.916526,36.070423,0.574756
std,,,69.992716,,,,,0.190902,,0.12773,8.096281,0.116007
min,,,1.0,,,,,0.213848,,0.50126,1.0,0.261275
25%,,,9.0,,,,,0.840955,,0.88658,34.0,0.491227
50%,,,26.0,,,,,0.989538,,0.987086,39.0,0.566697
75%,,,88.0,,,,,0.999417,,0.998416,39.0,0.652895


**Problem**: The statistics are not very useful, because most of the categories are represented by strings, not numbers. Let's replace strings with numerical values. 

## Standardizing Values 
We need to turn `String` values into numerical values.   

Let's collect the Chicago Face Database (CFD) features in a dictionary. We will then align the mapping of each API's values with this one. 

In [81]:
# Ground-truth label codes -> integers, keyed by the column they apply to.
# The API mappings defined after this reuse the same integers for matching categories.
chicago_features = {
    "Race": {
        "A": 0,
        "W": 1,
        "L": 2,
        "B": 3,
    },
    "Gender": {
        "F": 0,
        "M": 1,
    },
    "Expression": {
        "N": 0,
        "A": 1,
        # HO and HC deliberately share one code.
        "HO": 2,
        "HC": 2,
        "F": 3,
    },
}

**Note:** HO and HC are both happy expressions (open-mouth and closed-mouth, respectively), so we map them to the same value. 

In [82]:
# Clarifai label strings -> integers, keyed by the Clarifai output column.
clarifai_features = {
    "Clarifai Race": {
        # Codes 0-3 match the values used in chicago_features['Race'].
        "asian": 0,
        "white": 1,
        "hispanic, latino, or spanish origin": 2,
        "black or african american": 3,
        # Clarifai-only categories (no ground-truth counterpart); code 4 is unused.
        "american indian or alaska native": 5,
        "middle eastern or north african": 6,
    },
    "Clarifai Gender": {
        "feminine": 0,
        "masculine": 1,
    },
}

In [83]:
# Face++ label values -> integers, keyed by the Face++ output column.
facepp_features = {
    "Face++ Gender": {
        "Female": 0,
        "Male": 1,
    },
    "Face++ Emotion": {
        # Codes 0-3 reuse the values in chicago_features['Expression'];
        # 4-6 are emotions the ground-truth labels do not contain.
        "neutral": 0,
        "anger": 1,
        "happiness": 2,
        "fear": 3,
        "surprise": 4,
        "disgust": 5,
        "sadness": 6,
    },
    # Boolean flag -> 0/1.
    "Face++ Smiling": {
        False: 0,
        True: 1,
    },
}

In [84]:
# Amazon label values -> integers, keyed by the Amazon output column.
amazon_features = {
    # Boolean flag -> 0/1.
    "smile_value": {
        False: 0,
        True: 1,
    },
    "gender_value": {
        "Female": 0,
        "Male": 1,
    },
    "top_emotion": {
        # Codes 0-6 follow the same numbering as the other emotion mappings in this notebook.
        "CALM": 0,
        "ANGRY": 1,
        "HAPPY": 2,
        "FEAR": 3,
        "SURPRISED": 4,
        "DISGUSTED": 5,
        "SAD": 6,
        # 7 is skipped here; the Microsoft mapping uses it for "contempt".
        "CONFUSED": 8,
    },
}

In [85]:
# Microsoft label strings -> integers, keyed by the Microsoft output column.
microsoft_features = {
    "Microsoft Gender": {
        "female": 0,
        "male": 1,
    },
    "Microsoft Emotion": {
        # Codes 0-6 follow the same numbering as the other emotion mappings in this notebook.
        "neutral": 0,
        "anger": 1,
        "happiness": 2,
        "fear": 3,
        "surprise": 4,
        "disgust": 5,
        "sadness": 6,
        "contempt": 7,
    },
}

In [86]:
# Standardized Face++ frame: ground-truth and Face++ label columns are replaced by their
# integer codes; all other columns are copied unchanged. Values absent from a mapping
# come out of Series.map as NaN.
facepp_value_maps = {**chicago_features, **facepp_features}
facepp = facepp_raw.copy()
for column, codes in facepp_value_maps.items():
    facepp[column] = facepp_raw[column].map(codes)

In [87]:
# Standardized Clarifai frame: ground-truth and Clarifai label columns are replaced by their
# integer codes; all other columns are copied unchanged. Values absent from a mapping
# come out of Series.map as NaN.
clarifai_value_maps = {**chicago_features, **clarifai_features}
clarifai = clarifai_raw.copy()
for column, codes in clarifai_value_maps.items():
    clarifai[column] = clarifai_raw[column].map(codes)

In [None]:
# NOTE(review): dead code - a commented-out duplicate of the Face++ standardization cell.
# Presumably a placeholder for applying microsoft_features / amazon_features, which are
# defined in this notebook but never mapped onto microsoft_raw / amazon_raw. Adapt or delete.
# facepp = facepp_raw.copy()
# for field in {**chicago_features, **facepp_features}: 
#     facepp[field] = facepp_raw[field].map({**chicago_features, **facepp_features}[field])

In [None]:
# NOTE(review): dead code - a commented-out duplicate of the Face++ standardization cell.
# Presumably a placeholder for applying microsoft_features / amazon_features, which are
# defined in this notebook but never mapped onto microsoft_raw / amazon_raw. Adapt or delete.
# facepp = facepp_raw.copy()
# for field in {**chicago_features, **facepp_features}: 
#     facepp[field] = facepp_raw[field].map({**chicago_features, **facepp_features}[field])

In [53]:
# Spot-check the standardized Face++ frame: label columns should now hold integer codes.
facepp.head(n=5)

Unnamed: 0,File Name,Target ID,Image ID,Race,Gender,Expression,Face++ Gender,Face++ Age,Face++ Emotion,Face++ Smiling
0,CFD-BM-015-012-A.jpg,BM-015,12,3,1,1,1,38,1,0
1,CFD-BF-039-031-N.jpg,BF-039,31,3,0,0,0,24,0,0
2,CFD-BF-048-006-HC.jpg,BF-048,6,3,0,2,0,52,2,1
3,CFD-AM-218-085-N.jpg,AM-218,85,0,1,0,1,42,0,0
4,CFD-AF-249-092-N.jpg,AF-249,92,0,0,0,0,35,4,0


In [51]:
# Spot-check the standardized Clarifai frame: label columns should now hold integer codes.
clarifai.head(n=5)

Unnamed: 0,File Name,Target ID,Image ID,Race,Gender,Expression,Clarifai Race,Race Confidence,Clarifai Gender,Gender Confidence,Clarifai Age,Age Confidence
0,CFD-LF-212-066-N.jpg,LF-212,66,2,0,0,0,0.446271,0,0.883454,1,0.917314
1,CFD-BF-205-141-N.jpg,BF-205,141,3,0,0,3,0.999764,1,0.931101,1,0.450996
2,CFD-WF-030-002-N.jpg,WF-030,2,1,0,0,1,0.887473,1,0.553333,1,0.474595
3,CFD-WF-001-003-N.jpg,WF-001,3,1,0,0,1,0.966461,1,0.955751,1,0.527979
4,CFD-LF-239-148-N.jpg,LF-239,148,2,0,0,2,0.429596,0,0.833086,1,0.740188


Now, let's add a new feature to each table indicating whether the gender classification was correct (1) or incorrect (0). 


In [None]:
# 1 where Clarifai's predicted gender code equals the ground-truth gender code, else 0.
# A single vectorized comparison replaces the row-by-row iterrows() loop. Rows where
# either side is NaN compare unequal and therefore stay 0, exactly as in the loop.
# Re-running the cell simply recomputes the column.
clarifai["Gender Classification"] = (
    clarifai["Gender"] == clarifai["Clarifai Gender"]
).astype("int64")
        

In [None]:
# 1 where Face++'s predicted gender code equals the ground-truth gender code, else 0.
# A single vectorized comparison replaces the row-by-row iterrows() loop. Rows where
# either side is NaN compare unequal and therefore stay 0, exactly as in the loop.
# Re-running the cell simply recomputes the column.
facepp["Gender Classification"] = (
    facepp["Gender"] == facepp["Face++ Gender"]
).astype("int64")

Finally, the tables look as follows: 

In [54]:
# Preview of the final Clarifai frame.
# NOTE(review): the saved output of this cell has no "Gender Classification" column,
# so it appears to predate the cell that adds it - re-run the notebook top to bottom.
clarifai.head(n=5)

Unnamed: 0,File Name,Target ID,Image ID,Race,Gender,Expression,Clarifai Race,Race Confidence,Clarifai Gender,Gender Confidence,Clarifai Age,Age Confidence
0,CFD-LF-212-066-N.jpg,LF-212,66,2,0,0,0,0.446271,0,0.883454,1,0.917314
1,CFD-BF-205-141-N.jpg,BF-205,141,3,0,0,3,0.999764,1,0.931101,1,0.450996
2,CFD-WF-030-002-N.jpg,WF-030,2,1,0,0,1,0.887473,1,0.553333,1,0.474595
3,CFD-WF-001-003-N.jpg,WF-001,3,1,0,0,1,0.966461,1,0.955751,1,0.527979
4,CFD-LF-239-148-N.jpg,LF-239,148,2,0,0,2,0.429596,0,0.833086,1,0.740188


In [55]:
# Preview of the final Face++ frame.
# NOTE(review): the saved output of this cell has no "Gender Classification" column,
# so it appears to predate the cell that adds it - re-run the notebook top to bottom.
facepp.head(n=5)

Unnamed: 0,File Name,Target ID,Image ID,Race,Gender,Expression,Face++ Gender,Face++ Age,Face++ Emotion,Face++ Smiling
0,CFD-BM-015-012-A.jpg,BM-015,12,3,1,1,1,38,1,0
1,CFD-BF-039-031-N.jpg,BF-039,31,3,0,0,0,24,0,0
2,CFD-BF-048-006-HC.jpg,BF-048,6,3,0,2,0,52,2,1
3,CFD-AM-218-085-N.jpg,AM-218,85,0,1,0,1,42,0,0
4,CFD-AF-249-092-N.jpg,AF-249,92,0,0,0,0,35,4,0


And the statistics: 

In [56]:
# Summary statistics of the standardized Clarifai frame. With no arguments, describe()
# reports numeric columns only, so string columns such as File Name are left out.
clarifai.describe()

Unnamed: 0,Image ID,Race,Gender,Expression,Clarifai Race,Race Confidence,Clarifai Gender,Gender Confidence,Clarifai Age,Age Confidence
count,1207.0,1207.0,1207.0,1207.0,1207.0,1207.0,1207.0,1207.0,1207.0,1207.0
mean,59.38691,1.870754,0.466446,1.006628,1.829329,0.88258,0.590721,0.916526,36.070423,0.574756
std,69.992716,1.079677,0.49908,1.11523,1.20007,0.190902,0.491905,0.12773,8.096281,0.116007
min,1.0,0.0,0.0,0.0,0.0,0.213848,0.0,0.50126,1.0,0.261275
25%,9.0,1.0,0.0,0.0,1.0,0.840955,0.0,0.88658,34.0,0.491227
50%,26.0,2.0,0.0,1.0,2.0,0.989538,1.0,0.987086,39.0,0.566697
75%,88.0,3.0,1.0,2.0,3.0,0.999417,1.0,0.998416,39.0,0.652895
max,357.0,3.0,1.0,3.0,6.0,1.0,1.0,0.999999,71.0,0.983919


In [57]:
# Summary statistics of the standardized Face++ frame. With no arguments, describe()
# reports numeric columns only, so string columns such as File Name are left out.
facepp.describe()

Unnamed: 0,Image ID,Race,Gender,Expression,Face++ Gender,Face++ Age,Face++ Emotion,Face++ Smiling
count,1207.0,1207.0,1207.0,1207.0,1207.0,1207.0,1207.0,1207.0
mean,59.38691,1.870754,0.466446,1.006628,0.529412,34.6628,1.444076,0.318144
std,69.992716,1.079677,0.49908,1.11523,0.499341,9.429399,1.740641,0.465949
min,1.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0
25%,9.0,1.0,0.0,0.0,0.0,28.0,0.0,0.0
50%,26.0,2.0,0.0,1.0,1.0,33.0,1.0,0.0
75%,88.0,3.0,1.0,2.0,1.0,41.0,2.0,1.0
max,357.0,3.0,1.0,3.0,1.0,73.0,6.0,1.0


### 3. Spearman test for Clarifai

In [None]:
# Column vectors of shape (n, 1), one per variable used in the Spearman tests.
gender_classification = clarifai["Gender Classification"].to_numpy().reshape(-1, 1)
race = clarifai["Race"].to_numpy().reshape(-1, 1)
gender = clarifai["Gender"].to_numpy().reshape(-1, 1)

# (n, 2) matrices with one variable per column.
race_and_genderclass = np.hstack((race, gender_classification))
race_and_gender = np.hstack((race, gender))

**Note:** we can pass both arrays separately or together as a matrix. 

In [None]:
# Rank correlation between the ground-truth race code and whether Clarifai got gender right (1) or wrong (0).
# NOTE(review): Race is a nominal variable with arbitrary integer codes (0-3), so the sign and
# size of rho depend on the chosen code order - interpret with care.
spearmanr(race, gender_classification)

In [None]:
# Same two variables (race, gender classification) passed as the columns of one 2-D array
# instead of as two separate arguments.
spearmanr(race_and_genderclass)

In [None]:
# Rank correlation between ground-truth gender (0 = F, 1 = M) and whether Clarifai got gender right (1) or wrong (0).
spearmanr(gender, gender_classification)

In [None]:
# Three variables at once: the two columns of race_and_gender plus gender_classification.
# Per the SciPy documentation, with more than two variables the statistic is a square
# correlation matrix over all columns (3 x 3 here) rather than a single coefficient;
# p is the matching matrix of p-values.
rho, p = spearmanr(race_and_gender, gender_classification)
rho

In [None]:
# Rows Clarifai got wrong (Gender Classification == 0), split by ground-truth gender (0 = F, 1 = M).
misclassified_women = clarifai.loc[
    clarifai["Gender"].eq(0) & clarifai["Gender Classification"].eq(0)
]
misclassified_men = clarifai.loc[
    clarifai["Gender"].eq(1) & clarifai["Gender Classification"].eq(0)
]

In [None]:
# Row counts of the two misclassified subsets.
print("# of misclassified women: ", misclassified_women.shape[0])
print("# of misclassified men: ", misclassified_men.shape[0])

In [None]:
# Misclassified women (Gender == 0, Gender Classification == 0) restricted to one
# ground-truth race code each: 3 = B, 1 = W.
misclassified_black_women = clarifai.loc[
    clarifai["Gender"].eq(0)
    & clarifai["Gender Classification"].eq(0)
    & clarifai["Race"].eq(3)
]
misclassified_white_women = clarifai.loc[
    clarifai["Gender"].eq(0)
    & clarifai["Gender Classification"].eq(0)
    & clarifai["Race"].eq(1)
]

In [None]:
# Row counts of the two race-specific subsets.
print("# of misclassified black women: ", misclassified_black_women.shape[0])
print("# of misclassified white women: ", misclassified_white_women.shape[0])

TODO: Recompute Spearman with gender (for race)

### 4. Spearman test for Face++

In [None]:
# Column vectors of shape (n, 1), one per variable used in the Face++ Spearman tests.
fpp_gender_classification = facepp["Gender Classification"].to_numpy().reshape(-1, 1)
fpp_race = facepp["Race"].to_numpy().reshape(-1, 1)
fpp_gender = facepp["Gender"].to_numpy().reshape(-1, 1)

In [None]:
# Rank correlation between the ground-truth race code and whether Face++ got gender right (1) or wrong (0).
# NOTE(review): Race is a nominal variable with arbitrary integer codes (0-3), so the sign and
# size of rho depend on the chosen code order - interpret with care.
spearmanr(fpp_race, fpp_gender_classification)

In [None]:
# Rank correlation between ground-truth gender (0 = F, 1 = M) and whether Face++ got gender right (1) or wrong (0).
spearmanr(fpp_gender, fpp_gender_classification)

In [None]:
# Rows Face++ got wrong (Gender Classification == 0), split by ground-truth gender (0 = F, 1 = M).
# The stray trailing "\" line continuation was removed: at the end of a cell it leaves the
# statement unterminated, and elsewhere it would silently splice in the following line.
fpp_misclassified_women = facepp[(facepp["Gender"] == 0) & (facepp["Gender Classification"] == 0)]
fpp_misclassified_men = facepp[(facepp["Gender"] == 1) & (facepp["Gender Classification"] == 0)]

In [None]:
# Row counts of the two Face++ misclassified subsets.
print("# of misclassified women: ", fpp_misclassified_women.shape[0])
print("# of misclassified men: ", fpp_misclassified_men.shape[0])

In [None]:
# Misclassified women (Gender == 0, Gender Classification == 0) restricted to one
# ground-truth race code each: 3 = B, 1 = W.
fpp_misclassified_black_women = facepp.loc[
    facepp["Gender"].eq(0)
    & facepp["Gender Classification"].eq(0)
    & facepp["Race"].eq(3)
]
fpp_misclassified_white_women = facepp.loc[
    facepp["Gender"].eq(0)
    & facepp["Gender Classification"].eq(0)
    & facepp["Race"].eq(1)
]

In [None]:
# Row counts of the two race-specific Face++ subsets.
print("# of misclassified black women: ", fpp_misclassified_black_women.shape[0])
print("# of misclassified white women: ", fpp_misclassified_white_women.shape[0])

TODO: Exclude repetitions