In [41]:
import numpy as np
import pandas as pd
import re
import ast

In [307]:
# create dataframe with all photos with ENFJ as base dataframe (thus ENFJ not included in personality types str)
original_df = pd.read_csv('../Tagged Photos//ENFJ-1000.csv')
original_df['personality_type'] = 'ENFJ'

personality_types_str = ['ENTJ', 'ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP','ESTP', \
                         'ESFP', 'ENFP', 'ENTP', 'ESTJ', 'ESFJ']

for personality_type in personality_types_str:
    temp_df = pd.read_csv('../Tagged Photos//' + personality_type + '-1000.csv')
    temp_df['personality_type'] = personality_type
    original_df = pd.concat([original_df, temp_df], axis = 0)
    
# create copy of original dataframe to work on
df = original_df

# drop all people in duplicate rows - some people are in multiple groups
df.drop_duplicates(subset = 'Labels', keep = False, inplace = True)
df.reset_index(inplace = True, drop = True)

In [308]:
# get dictionary mapping tag to confidence of the tag for a given string (label)
def get_tag(tag_string):
    tag_list = ast.literal_eval(tag_string)
    tag_dict = {}
    for i in range(0, len(tag_list)):
        tag_dict[tag_list[i]['Name']] = int(tag_list[i]['Confidence'])
    return(tag_dict)

In [309]:
#create exhaustive list of all tags in dataset - this is to be used to create column names in dataframe
tag_list = []
for i in range (0, len(df)):
    tag_dict = get_tag(df['Labels'][i])
    for key, value in tag_dict.items(): 
        if key in tag_list:
            None
        else:
            tag_list.append(key)

In [310]:
#create a column for each tag 
for i in tag_list:
    df[i] = 0

In [311]:
# add confidence value of each tag for each picture (0 if tag is not in picture)
for i in range (0, len(df)):
    tag_dict = get_tag(df['Labels'][i])
    temp_df = pd.DataFrame(tag_dict, index = [i])
    columns = [key for key, value in tag_dict.items()]
    for j in columns:
        df.loc[i,j] = temp_df.loc[i,j]

In [329]:
# create df copy to play with confidence intervals
df_copy = df.copy()

In [321]:
import time

start_time = time.time()

# remove tags with less than 75% confidence, convert values to binary
for i in range(0, len(df_copy)):
    for j in tag_list:
        if df_copy.loc[i,j] < 75:
            df_copy.loc[i,j] = 0
        else:
            df_copy.loc[i,j] = 1
            
end_time = time.time() - start_time
print(end_time)

6.070268869400024


In [330]:
start_time = time.time()

# dictionary with count of each tag 
tag_count = {}
for col in tag_list:
    tag_count[col] = df_copy[df_copy[col] > 0].shape[0]
    
                
end_time = time.time() - start_time
print(end_time)

3.6472368240356445


In [338]:
# print tags that have been used over x times
for key, value in tag_count.items():
    if value > 0:
        print(key, value)

Human 10060
People 10125
Person 10403
Apparel 991
Clothing 1922
Maillot 235
Female 4729
Dress 250
Bra 59
Lingerie 62
Underwear 67
Art 766
Modern Art 363
Face 7032
Selfie 8250
Alphabet 87
Ampersand 85
Portrait 7104
Architecture 229
Castle 33
Mansion 30
Palace 17
Fort 22
Smile 3789
Parliament 4
Downtown 321
Plaza 47
Town 288
Town Square 47
Head 2046
Campus 11
Housing 141
Monastery 40
Building 282
Office Building 49
Convention Center 41
Glasses 2099
Goggles 2111
Sunglasses 780
Outdoors 1558
Drawing 210
Doodle 33
Blonde 715
Woman 2770
Teddy Bear 19
Toy 86
Afro Hairstyle 215
Hair 1078
Baby 325
Child 417
Kid 417
Costume 536
Bridesmaid 39
Plant 1303
Potted Plant 522
Flower 681
Flower Arrangement 220
Flower Bouquet 211
Blossom 603
Rose 57
Floral Design 59
Flora 580
Accessories 589
Petal 99
Beverage 237
Drink 212
Asphalt 44
Tarmac 44
Sand 163
Soil 299
Dirt Road 154
Gravel 154
Road 410
Boat 80
Watercraft 46
Leisure Activities 846
Dimples 1645
Animal 410
Cat 73
Mammal 370
Manx 25
Pet 270
Girl 156

Barbie 1
Blueberry 1
Cylinder 2
Navel 1
Stomach 1
Hacienda 1
Apse 1
Tart 1
Strawberry 1
Bandage 1
First Aid 1
Bullfighter 1
Bullfighting 1
Manatee 1
Control Tower 1
Seaweed 1
Key 1
Tsunami 1
Shopping Bag 1
Executive 1
Pop Bottle 1
Soda 3
Coke 3
Supermarket 2
Shower 1
Dolphin 1
Crib 2
Cobra 1
Sliding Door 2
Pencil 1
Bun 1
Bagel 1
Jazz 1
Moth 1
Bongo 1
Pen 1
Bar Stool 1
Paintball 1
Snail 1
Koi 1
Goldfish 1
Puddle 1
Suede 1
Iguana 1
Bomb 1
Jaguar Car 1
Popcorn 1
Snack 1
Broccoli 1
Fly 1
Anisoptera 1
Dragonfly 1
Black Widow 1
Mailbox 1
Postbox 1
Dvd 1
Coyote 1
Mesa 1
Medication 1
Cradle 1
Traffic Light 1
Frying Pan 2
Wok 2
Calf 1
Ribs 1
Jaguar 1
Toucan 1
Christmas Stocking 1
Stocking 1
Steak 1


In [230]:
# drop columns with less than 5 tags 
for key, value in tag_count.items():
    if value < 5:
        df.drop(key, axis = 1, inplace = True)

In [348]:
import pickle

df.to_pickle('tagged_df') 