In [None]:
import pandas as pd
import matplotlib.pyplot as plt 


In [None]:
# Read OASIS_bygender.csv, then preview the rows whose mean valence rating
# from men is 3 or higher.
raw_data = pd.read_csv('OASIS_bygender.csv')
raw_data.loc[raw_data['Valence_mean_men'] >= 3]

In [None]:
# Display one image from the images folder as a quick check that it can be read.
acorn_image = plt.imread('images/Acorns 1.jpg')
plt.imshow(acorn_image)

In [None]:
# Numeric codes used in the 'Category' column:
#   1 = animals, 2 = objects, 3 = people, 4 = scenes
# To list the themes of one category, e.g. animals:
#   raw_data[raw_data['Category'] == 1]['Theme']
cates = dict(animals=1, objects=2, people=3, scenes=4)

In [None]:
# Valence and arousal were rated on a 1-7 scale; print the largest and
# smallest mean rating observed for each measure and gender.
rating_ranges = [
    ('Valence max min men:', 'Valence_mean_men'),
    ('Valence max min women:', 'Valence_mean_women'),
    ('Arousal max min men:', 'Arousal_mean_men'),
    ('Arousal max min women:', 'Arousal_mean_women'),
]
for label, column in rating_ranges:
    ratings = raw_data[column]
    print(label, ratings.max(), ratings.min())

In [None]:
def print_theme_types(data,category):
    '''
    Return the distinct themes that belong to one category.

    Despite its name, this function prints nothing; it only returns a list.

    data: DataFrame with 'Category' and 'Theme' columns.
    category: value of the 'Category' column to select (see the cates mapping).

    Returns a list of the unique 'Theme' values for that category, in order of
    first appearance. Rows with a missing Theme are skipped.
    '''
    in_category = data['Category'] == category
    # dropna(): a missing Theme is not a usable theme name and would break the
    # substring checks that are later run on this list.
    themes = list(data.loc[in_category, 'Theme'].dropna().unique())
    return themes
# All distinct themes in the 'people' category; shown as the cell output.
people_themes = print_theme_types(data=raw_data, category=cates['people'])
people_themes

In [None]:
# Substrings that mark a 'people' theme as graphic.
# NOTE(review): 'Severed' and 'finger' are two separate entries; this may have
# been meant as the single string 'Severed finger' - confirm against the data.
people_graphic = [
    'BDSM',
    'Child labor',
    'Dead bodies',
    'Injury',
    'Nude',
    'Severed',
    'finger',
]


def get_not_graphic(themes, graph_str_list):
    '''
    Return the themes that contain none of the strings in graph_str_list.

    Matching is a case-sensitive substring test; the input order is preserved.
    '''
    themes_safe = []
    for theme in themes:
        if not any(marker in theme for marker in graph_str_list):
            themes_safe.append(theme)
    return themes_safe
# People themes left after removing the graphic ones; preview the first ten.
people_themes_safe = get_not_graphic(themes=people_themes, graph_str_list=people_graphic)
people_themes_safe[:10]

In [None]:
# All distinct themes in the 'animals' category.
animal_themes = print_theme_types(data=raw_data, category=cates['animals'])

In [None]:

# Substrings that mark an 'animals' theme as graphic. The comma matters:
# without it Python joins the adjacent literals into the single string
# 'Animal carcassDog attack', which matches no theme.
animal_graphic = ['Animal carcass', 'Dog attack']
# Animal themes left after removing the graphic ones; preview the first ten.
animal_themes_safe = get_not_graphic(themes=animal_themes, graph_str_list=animal_graphic)
animal_themes_safe[:10]

In [None]:
# Object themes are probably not too graphic, so they are only listed here.
object_themes = print_theme_types(data=raw_data, category=cates['objects'])

In [None]:
# All distinct themes in the 'scenes' category.
scene_themes = print_theme_types(data=raw_data, category=cates['scenes'])

In [None]:
# Only the animal and people categories are screened; rows of any other
# category are accepted unconditionally.
themes_to_check = {cates['animals']:animal_themes_safe,
                    cates['people']:people_themes_safe}
# Index labels (as yielded by iterrows) of the rows that pass the screening.
safe_inds = []
for i,row in raw_data.iterrows():
    safe_list = themes_to_check.get(row['Category'],None)
    # Compare with None explicitly: an empty safe list means "reject every
    # theme of this category", not "this category is not screened".
    if safe_list is None or row['Theme'] in safe_list:
        safe_inds.append(i)
    else:
        print('rejected:',row['Theme'])

In [None]:
# safe_inds holds index labels collected from raw_data.iterrows(), so select
# with .loc (label-based) rather than .iloc (position-based). .copy() makes
# safe_df an independent frame, so columns can be added to it later without
# writing into a slice of raw_data.
safe_df = raw_data.loc[safe_inds].copy()
safe_df.head()

In [None]:
# Number of quantile bins per rating column.
bin_num = 3
vals_2_bin = ['Valence_mean_men','Valence_mean_women','Arousal_mean_women','Arousal_mean_men']
# safe_df is a row selection of raw_data; take an explicit copy before adding
# columns so the assignments below do not target a slice
# (avoids SettingWithCopyWarning and keeps this cell safe to re-run).
safe_df = safe_df.copy()
for val in vals_2_bin:
    col_name = val +'_bin'
    # Equal-frequency bins labelled 1..bin_num (1 = lowest ratings).
    safe_df[col_name] = pd.qcut(safe_df[val],q=bin_num,labels=list(range(1,bin_num+1)))
safe_df.head()

In [None]:

# Among images in valence bin 3 for men whose rating SD is at least 1,
# find the one with the highest mean valence.
top_bin_men = safe_df[(safe_df['Valence_mean_men_bin']==3) & (safe_df['Valence_SD_men'] >= 1)]
idx = top_bin_men['Valence_mean_men'].idxmax()

# idxmax() returns an index label, so look the row up with .loc. safe_df has
# had rows removed, so the same number used as a position (.iloc) would point
# at a different row or past the end of the frame.
theme = safe_df.loc[idx, 'Theme']
# NOTE(review): an earlier cell reads from 'images/' (lower case) while this
# one uses 'Images/'; on a case-sensitive filesystem only one can be right.
file_name = 'Images/'+theme+'.jpg'
plt.imshow(plt.imread(file_name))

In [None]:
# Every (valence bin, arousal bin) pair for bins 1..3, valence varying slowest.
combos = [(v_bin, a_bin) for v_bin in range(1, 4) for a_bin in range(1, 4)]



In [None]:
def _sample_up_to(frame, n, random_state=None):
    '''Randomly draw min(n, len(frame)) rows of frame; empty frame if nothing can be drawn.'''
    n = min(n, len(frame))
    if n <= 0:
        return frame.iloc[:0]
    return frame.sample(n, random_state=random_state)


def get_subset_of_images_bygender(data,path,sex = 'men',num_images=160,num_bins=3,random_state=None):
    '''
    Pick about num_images rows of data, spread evenly over the
    (valence bin, arousal bin) combinations for one gender.

    data: DataFrame with a 'Category' column plus the columns
        'Valence_mean_<sex>_bin' and 'Arousal_mean_<sex>_bin', whose values
        are bin labels 1..num_bins.
    path: currently unused - nothing is written to disk. Kept so existing
        calls keep working; save the returned frame yourself if needed.
    sex: suffix of the rating columns to use, 'men' or 'women'.
    num_images: target number of rows.
    num_bins: number of bins per dimension (num_bins**2 combinations).
    random_state: optional seed / random state passed to every
        DataFrame.sample call, for reproducible selections.

    Each combination contributes round(num_images / num_bins**2) rows. A
    combination with too few rows keeps all of them and is padded with
    pictures of people (Category 3) from the same valence bin. If the
    de-duplicated result is still short of num_images, it is topped up with
    animal or people pictures (Category 1 or 3) that are not selected yet.

    Returns a DataFrame of the selected rows. Because of rounding it can hold
    slightly more than num_images rows, and fewer when data runs out.
    '''
    v_mean_col_bin = 'Valence_mean_' + sex + '_bin'
    a_mean_col_bin = 'Arousal_mean_' + sex + '_bin'

    # Bin labels run from 1 to num_bins, giving (1,1), (1,2), ... (num_bins,num_bins).
    combos = [(v_bin, a_bin)
              for v_bin in range(1, num_bins + 1)
              for a_bin in range(1, num_bins + 1)]
    num_images_per_combo = round(num_images / len(combos))

    is_person = data['Category'] == 3
    pieces = []
    for v_bin, a_bin in combos:
        in_v_bin = data[v_mean_col_bin] == v_bin
        in_a_bin = data[a_mean_col_bin] == a_bin
        combo_subset = data[in_v_bin & in_a_bin]

        if len(combo_subset) > num_images_per_combo:
            combo_pieces = [_sample_up_to(combo_subset, num_images_per_combo, random_state)]
        else:
            # Not enough rows in this combo: keep them all, then ignore arousal
            # and pad with pictures of people from the same valence bin.
            # ~in_a_bin keeps rows already in combo_subset out of the pool.
            num_needed = num_images_per_combo - len(combo_subset)
            fallback_pool = data[in_v_bin & is_person & ~in_a_bin]
            combo_pieces = [combo_subset,
                            _sample_up_to(fallback_pool, num_needed, random_state)]

        print(sum(len(piece) for piece in combo_pieces))
        pieces.extend(piece for piece in combo_pieces if len(piece) > 0)

    # A padding row can also be drawn by its own combo, so drop repeated rows.
    sample_df = pd.concat(pieces).drop_duplicates() if pieces else data.iloc[:0]

    shortfall = num_images - len(sample_df)
    if shortfall > 0:
        # Top up with people or animals that are not selected yet. Drawing once
        # from the remaining rows cannot loop forever when the pool runs dry.
        extra_pool = data[data['Category'].isin([1, 3]) & ~data.index.isin(sample_df.index)]
        extra_sample = _sample_up_to(extra_pool, shortfall, random_state)
        if len(extra_sample) > 0:
            sample_df = pd.concat([sample_df, extra_sample]) if len(sample_df) > 0 else extra_sample

    return sample_df