# Analyze Celebrity Recognition Results
## In this file, we test AWS's celebrity recognition model on the top 40 Chinese and top 40 Western (from Anglophone countries) celebrity face images. 

In [66]:
import pandas as pd # library for data manipulation
import boto3 # Amazon Web Services (AWS) Software Development Kit (SDK) for Python; used to interact with AWS services, such as S3 (Simple Storage Service),
import json # imports the built-in json module, which provides methods for encoding and decoding JSON data
import s3fs # imports the s3fs library, which is a Python interface to interact with data stored on Amazon S3
import io # imports the built-in io module, which provides the Python interfaces for stream handling.
import os
import subprocess
import re

import numpy as np
from IPython.display import Image as IImage

from PIL import Image, ImageDraw, ExifTags, ImageColor
from matplotlib.pyplot import imshow


In [107]:
# Initialize rekognition client 
client=boto3.client('rekognition')

In [108]:
# load csvs for Western and Chinese celebrities

western_celebs_csv = pd.read_csv('/home/ec2-user/SageMaker/Celebrity_recognition_project/western_celebs.csv')
chinese_celebs_csv = pd.read_csv('/home/ec2-user/SageMaker/Celebrity_recognition_project/chinese_celebs.csv')


In [109]:
def recognize_celebs(bucket_name):
    """
    Recognize celebrities in images containing the specified celebrity name.

    Parameters:
    bucket_name (str): The bucket name (bucket must contain all celebrity images)

    Returns:
    dictionary: 
    
    The function searches for images in a list of file names that contain the specified celebrity name.
    It then uses a recognition service to identify celebrity faces in each image.
    For each image, the recognized celebrity names and matching confidence scores are stored as a tuple 
    If no celebrity matches are found, store (None, 0)
    
    Note:
    - You need to have the 'client' and other required variables set up for this function to work correctly.
    - The function does not include code to visualize each image being compared per iteration. You can add that functionality separately.
    """
    s3 = boto3.client('s3')
    confidence_dict = {}

    # List objects at the root level
    objects = s3.list_objects_v2(Bucket=bucket_name, Delimiter='/')

    # Iterate through each "folder" (common prefix)
    for common_prefix in objects.get('CommonPrefixes', []):
        folder_key = common_prefix['Prefix']
        # Check if folder_key exists in confidence_dict
        if folder_key not in confidence_dict:
            confidence_dict[folder_key] = {}
        # List all objects in the folder
        folder_objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_key)
        pics = folder_objects.get('Contents', [])
        for pic in pics:
            pic_name = pic['Key']
            # Check if pic_key exists in the nested dictionary
            if pic['Key'] not in confidence_dict[folder_key]:
                confidence_dict[folder_key][pic_name] = {}
            response = client.recognize_celebrities(Image={'S3Object':{'Bucket': bucket_name,'Name': pic_name}})
            if 'CelebrityFaces' in response and response['CelebrityFaces'] != []:
                for celebrity in response['CelebrityFaces']:
                    # Access keys you want from the response
                    match_name = celebrity.get('Name')
                    confidence_score = celebrity.get('MatchConfidence')
                    # Process the keys as needed
                    print(f"File: {pic['Key']}, Match Name: {match_name}, Match Confidence: {confidence_score}")  
                    confidence_dict[folder_key][pic_name]['match_name'] = match_name
                    confidence_dict[folder_key][pic_name]['confidence_score'] = confidence_score

            else: 
                print(f'No celebrity matches found for {pic_name}')
                confidence_dict[folder_key][pic_name]['match_name'] = 'None'
                confidence_dict[folder_key][pic_name]['confidence_score'] = 0 
        
    return confidence_dict



In [148]:
results_chinese_celebs = recognize_celebs('chinese-celebs')

File: Andy Lau/Andy Lau_42.jpg, Match Name: Andy Lau, Match Confidence: 89.07782745361328
File: Andy Lau/Andy Lau_44.jpg, Match Name: Andy Lau, Match Confidence: 99.75121307373047
File: Andy Lau/Andy Lau_46.jpg, Match Name: Andy Lau, Match Confidence: 96.03267669677734
File: Andy Lau/Andy Lau_7.jpg, Match Name: Andy Lau, Match Confidence: 99.63941955566406
File: Andy Lau/Andy Lau_9.jpg, Match Name: Andy Lau, Match Confidence: 88.46404266357422
File: Angelababy/Angelababy_47.jpg, Match Name: Angelababy, Match Confidence: 99.1843490600586
File: Angelababy/Angelababy_5.jpg, Match Name: Angelababy, Match Confidence: 99.60169982910156
File: Angelababy/Angelababy_50.jpg, Match Name: Angelababy, Match Confidence: 99.8139877319336
File: Angelababy/Angelababy_6.jpg, Match Name: Angelababy, Match Confidence: 99.24556732177734
File: Angelababy/Angelababy_8.jpg, Match Name: Angelababy, Match Confidence: 99.60244750976562
File: Chen Xuedong/Chen Xuedong_48.jpg, Match Name: Cheney Chen, Match Confid

In [149]:
# print out the dictionary in a more readable format

for celebrity, images in results_chinese_celebs.items():
    print(f"{celebrity}:")
    for image, data in images.items():
        print(f"  {image}: {data}")


Andy Lau/:
  Andy Lau/Andy Lau_42.jpg: {'match_name': 'Andy Lau', 'confidence_score': 89.07782745361328}
  Andy Lau/Andy Lau_44.jpg: {'match_name': 'Andy Lau', 'confidence_score': 99.75121307373047}
  Andy Lau/Andy Lau_46.jpg: {'match_name': 'Andy Lau', 'confidence_score': 96.03267669677734}
  Andy Lau/Andy Lau_7.jpg: {'match_name': 'Andy Lau', 'confidence_score': 99.63941955566406}
  Andy Lau/Andy Lau_9.jpg: {'match_name': 'Andy Lau', 'confidence_score': 88.46404266357422}
Angelababy/:
  Angelababy/Angelababy_47.jpg: {'match_name': 'Angelababy', 'confidence_score': 99.1843490600586}
  Angelababy/Angelababy_5.jpg: {'match_name': 'Angelababy', 'confidence_score': 99.60169982910156}
  Angelababy/Angelababy_50.jpg: {'match_name': 'Angelababy', 'confidence_score': 99.8139877319336}
  Angelababy/Angelababy_6.jpg: {'match_name': 'Angelababy', 'confidence_score': 99.24556732177734}
  Angelababy/Angelababy_8.jpg: {'match_name': 'Angelababy', 'confidence_score': 99.60244750976562}
Chen Xuedong/

In [150]:
# Count how many incorrect, correct scores there are and calculate average confidence 
correct = {}
unidentified = {}
incorrect = {}
avg_correct_confidence = {}
avg_incorrect_confidence = {} 

for celebrity, images in results_chinese_celebs.items():
    celebrity = celebrity.rstrip('//')
    correct[celebrity] = 0
    incorrect[celebrity] = 0
    unidentified[celebrity] = 0
    avg_correct_confidence[celebrity] = 0
    avg_incorrect_confidence[celebrity] = 0
    for image, data in images.items():
        if celebrity in data['match_name']:
            avg_correct_confidence[celebrity] += data['confidence_score']
            correct[celebrity] += 1
        elif ('None' in data['match_name']):
            unidentified[celebrity] += ('None' == data['match_name'])
        else:
            avg_incorrect_confidence[celebrity] += data['confidence_score']
            incorrect[celebrity] += 1
            
    if incorrect[celebrity] != 0:
        avg_incorrect_confidence[celebrity] /= incorrect[celebrity]
    if correct[celebrity] != 0:
        avg_correct_confidence[celebrity] /= correct[celebrity]

In [151]:
# Convert dictionaries to pandas dataframes

correct = pd.DataFrame(list(correct.items()), columns=['Celebrity', 'correct'])
unidentified = pd.DataFrame(list(unidentified.items()), columns=['Celebrity', 'unidentified'])
incorrect = pd.DataFrame(list(incorrect.items()), columns=['Celebrity', 'incorrect'])
avg_correct_confidence = pd.DataFrame(list(avg_correct_confidence.items()), columns=['Celebrity', 'avg_correct_confidence'])
avg_incorrect_confidence = pd.DataFrame(list(avg_incorrect_confidence.items()), columns=['Celebrity', 'avg_incorrect_confidence'])



In [152]:
# Merge together dictionaries

chinese_celebs_df = pd.merge(pd.merge(correct, incorrect), unidentified)
df1 = pd.merge(avg_correct_confidence, avg_incorrect_confidence)
chinese_celebs_df = pd.merge(chinese_celebs_df, df1)

In [169]:
# Data cleaning (replace 0 values with NaN in confidence columns)
chinese_celebs_df['avg_incorrect_confidence'].replace(0., np.NaN, inplace = True)
chinese_celebs_df['avg_correct_confidence'].replace(0., np.NaN, inplace = True)
chinese_celebs_df

# Merge together the original celebrity list csv and the results dataframe
chinese_celebs_df = pd.merge(chinese_celebs_csv, chinese_celebs_df, left_on='Name', right_on='Celebrity', how='inner')

chinese_celebs_df.drop(['Celebrity', 'Rank'], axis = 1, inplace = True)


KeyError: 'Celebrity'

In [170]:
# export results to csv
chinese_celebs_df.to_csv('chinese_celebs_results.csv')


# Calculate Accuracy

In [167]:
miss_rate = np.sum(chinese_celebs_df['incorrect']) / (np.sum(chinese_celebs_df['incorrect']) + np.sum(chinese_celebs_df['correct']))
accuracy = 1 - miss_rate
print(f'The percent of Chinese celebrities incorrectly or not identified is: {miss_rate * 100} %')
print(f'The percent of Chinese celebrities correctly identified is: {accuracy * 100} %')


The percent of Chinese celebrities incorrectly or not identified is: 19.11764705882353 %
The percent of Chinese celebrities correctly identified is: 80.88235294117648 %


# Repeating the same procedure with Western celebrities:

In [171]:
%%capture
western_celeb_results_raw = recognize_celebs('western-celebs')

In [172]:
# print out the 'western_celeb_results_raw' dictionary in a more readable format

for celebrity, images in western_celeb_results_raw.items():
    print(f"{celebrity}:")
    for image, data in images.items():
        print(f"  {image}: {data}")


Adam Sandler/:
  Adam Sandler/Adam Sandler_30.jpg: {'match_name': 'Adam Sandler', 'confidence_score': 99.46633911132812}
  Adam Sandler/Adam Sandler_5.jpg: {'match_name': 'Adam Sandler', 'confidence_score': 99.68639373779297}
  Adam Sandler/Adam Sandler_6.jpg: {'match_name': 'Adam Sandler', 'confidence_score': 91.56005096435547}
  Adam Sandler/Adam Sandler_7.jpg: {'match_name': 'Adam Sandler', 'confidence_score': 99.04778289794922}
  Adam Sandler/Adam Sandler_9.jpg: {'match_name': 'Adam Sandler', 'confidence_score': 89.05445861816406}
Adele/:
  Adele/Adele_5.jpg: {'match_name': 'None', 'confidence_score': 0}
  Adele/Adele_6.jpg: {'match_name': 'Adele', 'confidence_score': 97.05509948730469}
  Adele/Adele_7.jpg: {'match_name': 'Adele', 'confidence_score': 80.65895080566406}
  Adele/Adele_8.jpg: {'match_name': 'Ángela Ponce', 'confidence_score': 87.249755859375}
  Adele/Adele_9.jpg: {'match_name': 'Adele', 'confidence_score': 83.32523345947266}
Andrew Luck/:
  Andrew Luck/Andrew Luck_25.

In [173]:
# Count how many incorrect, correct scores there are and calculate average confidence 
correct = {}
unidentified = {}
incorrect = {}
avg_correct_confidence = {}
avg_incorrect_confidence = {} 

for celebrity, images in western_celeb_results_raw.items():
    celebrity = celebrity.rstrip('//')
    correct[celebrity] = 0
    incorrect[celebrity] = 0
    unidentified[celebrity] = 0
    avg_correct_confidence[celebrity] = 0
    avg_incorrect_confidence[celebrity] = 0
    for image, data in images.items():
        if celebrity in data['match_name']:
            avg_correct_confidence[celebrity] += data['confidence_score']
            correct[celebrity] += 1
        elif ('None' in data['match_name']):
            unidentified[celebrity] += ('None' == data['match_name'])
        else:
            avg_incorrect_confidence[celebrity] += data['confidence_score']
            incorrect[celebrity] += 1
            
    if incorrect[celebrity] != 0:
        avg_incorrect_confidence[celebrity] /= incorrect[celebrity]
    if correct[celebrity] != 0:
        avg_correct_confidence[celebrity] /= correct[celebrity]

In [175]:
# Convert dictionaries to pandas dataframes

correct = pd.DataFrame(list(correct.items()), columns=['Celebrity', 'correct'])
unidentified = pd.DataFrame(list(unidentified.items()), columns=['Celebrity', 'unidentified'])
incorrect = pd.DataFrame(list(incorrect.items()), columns=['Celebrity', 'incorrect'])
avg_correct_confidence = pd.DataFrame(list(avg_correct_confidence.items()), columns=['Celebrity', 'avg_correct_confidence'])
avg_incorrect_confidence = pd.DataFrame(list(avg_incorrect_confidence.items()), columns=['Celebrity', 'avg_incorrect_confidence'])



In [176]:
# Merge together dictionaries

western_celebs_df = pd.merge(pd.merge(correct, incorrect), unidentified)
df1 = pd.merge(avg_correct_confidence, avg_incorrect_confidence)
western_celebs_df = pd.merge(western_celebs_df, df1)

In [179]:
# Data cleaning (replace 0 values with NaN in confidence columns)
western_celebs_df['avg_incorrect_confidence'].replace(0., np.NaN, inplace = True)
western_celebs_df['avg_correct_confidence'].replace(0., np.NaN, inplace = True)

# Merge together the original celebrity list csv and the results dataframe
western_celebs_df = pd.merge(western_celebs_csv, western_celebs_df, left_on='Name', right_on='Celebrity', how='inner')

western_celebs_df.drop(['Celebrity', 'Rank'], axis = 1, inplace = True)

# export results to csv
western_celebs_df.to_csv('western_celebs_results.csv')


KeyError: 'Celebrity'

In [180]:
western_celebs_df

Unnamed: 0,Name,Earnings,Occupation,Country,correct,incorrect,unidentified,avg_correct_confidence,avg_incorrect_confidence
0,Diddy,$130 million,Musician,U.S.,0,5,0,,98.206206
1,Beyoncé,$105 million,Musician,U.S.,4,0,1,91.656811,
2,J.K. Rowling,$95 million,Author,U.K.,0,5,0,,99.116017
3,Drake,$94 million,Musician,Canada,5,0,0,96.644044,
4,The Weeknd,$92 million,Musician,Canada,4,0,1,98.875992,
5,Howard Stern,$90 million,Personality,U.S.,5,0,0,99.951685,
6,James Patterson,$87 million,Author,U.S.,5,0,0,99.532245,
7,LeBron James,$86 million,Athlete,U.S.,5,0,0,99.450798,
8,Justin Bieber,$83.5 million,Musician,Canada,1,0,4,78.264984,
9,Dr. Phil McGraw,$79 million,Personality,U.S.,0,5,0,,99.877449


# Calculate Accuracy

In [178]:
miss_rate = np.sum(western_celebs_df['incorrect']) / (np.sum(western_celebs_df['incorrect']) + np.sum(western_celebs_df['correct']))
accuracy = 1 - miss_rate
print(f'The percent of Western celebrities incorrectly or not identified is: {miss_rate * 100} %')
print(f'The percent of Western celebrities correctly identified is: {accuracy * 100} %')


The percent of Western celebrities incorrectly or not identified is: 14.393939393939394 %
The percent of Western celebrities correctly identified is: 85.60606060606061 %
