In [52]:
import pandas as pd
import os
import numpy as np

In [5]:
# Root folder of the raw dataset, relative to the notebook's working directory.
data_path = '../data/raw/'
# Image folder located directly under the raw data root.
image_dir = os.path.join(data_path, 'fairface_pad025')

In [6]:
# Read the train and validation label tables from the raw data folder.
label_files = ('fairface_label_train.csv', 'fairface_label_val.csv')
train_csv, val_csv = [
    pd.read_csv(os.path.join(data_path, label_file)) for label_file in label_files
]

# Stack both splits into a single frame. Each split keeps its own row labels,
# so index values repeat between the train rows and the validation rows.
df = pd.concat(objs=[train_csv, val_csv])

df.head(n=5)

Unnamed: 0,file,age,gender,race,service_test
0,train/1.jpg,50-59,Male,East Asian,True
1,train/2.jpg,30-39,Female,Indian,False
2,train/3.jpg,3-9,Female,Black,False
3,train/4.jpg,20-29,Female,Indian,True
4,train/5.jpg,20-29,Female,Indian,True


### Data resampled to reflect the US population

In [47]:
"""
Population stats manually added based on 2020 US Census. 
The Asian populations are estimated based on 2016 population data.
"""

# Race labels present in the data, and the share of the resampled dataset
# that each label should make up (the shares sum to 1.0).
races = df["race"].unique()
population = {
    "White": 0.601,
    "Black": 0.134,
    "Latino_Hispanic": 0.185,
    "East Asian": 0.022,
    "Southeast Asian": 0.022,
    "Indian": 0.012,
    "Middle Eastern": 0.024,
}

# Fixed seed so that Restart & Run All reproduces the same sample.
POPULATION_SAMPLE_SEED = 42
population_rng = np.random.RandomState(POPULATION_SAMPLE_SEED)

# Size of the resampled dataset, anchored on the White row count: the number
# of rows for which the available White rows make up exactly their share.
total = int(len(df.loc[df.race == 'White']) / population['White'])

# Draw each group's quota without replacement. pandas raises ValueError if a
# group has fewer rows than its quota; a race label missing from `population`
# raises KeyError.
race_samples = []
for race in races:
    num_rows = int(total * population[race])
    race_samples.append(
        df.loc[df.race == race].sample(num_rows, random_state=population_rng)
    )

# Concatenate once (instead of growing the frame inside the loop), then
# shuffle the rows and renumber the index from 0.
combined = pd.concat(race_samples) if race_samples else df.head(0)
biased_df = (
    combined
    .sample(frac=1, random_state=population_rng)
    .reset_index(drop=True)
)

In [48]:
# Optional export of the resampled frame, left disabled; uncomment the next
# line to write it to disk.
#biased_df.to_csv('../data/biased/biased.csv')
# Preview the first rows of the resampled frame.
biased_df.head()

Unnamed: 0,file,age,gender,race,service_test
0,train/2640.jpg,20-29,Female,White,True
1,train/58337.jpg,40-49,Male,White,True
2,train/37193.jpg,0-2,Female,White,False
3,train/24471.jpg,40-49,Male,White,False
4,train/19604.jpg,40-49,Male,White,False


### Randomly Assign Classification Based on Incarceration Rates

In [63]:
# Fixed seed so the labels and the row sample are reproducible on re-run.
STAT_SEED = 42
stat_rng = np.random.RandomState(STAT_SEED)

# Synthetic binary label drawn independently of every column:
# 1 with probability 0.1, otherwise 0. One value per row of biased_df.
equal = stat_rng.choice([0, 1], size=biased_df.shape[0], p=[0.9, 0.1])

# Non-biased df with equal representation, equally distributed statistic:
# a uniform random subset of the full data with as many rows as biased_df.
# The label array is attached by position, not by index.
non_biased_df_equal = (
    df.sample(biased_df.shape[0], random_state=stat_rng)
    .assign(stat=equal)
)
non_biased_df_equal.head()

Unnamed: 0,file,age,gender,race,service_test,stat
2307,val/2308.jpg,50-59,Male,Middle Eastern,False,0
10843,val/10844.jpg,3-9,Male,East Asian,False,0
41134,train/41135.jpg,0-2,Male,Black,False,1
75439,train/75440.jpg,10-19,Male,Southeast Asian,True,0
14061,train/14062.jpg,more than 70,Female,East Asian,False,0


In [64]:
# Biased df with equally distributed statistic
biased_df_equal = biased_df.assign(stat=equal)
biased_df_equal.head()

Unnamed: 0,file,age,gender,race,service_test,stat
0,train/2640.jpg,20-29,Female,White,True,0
1,train/58337.jpg,40-49,Male,White,True,0
2,train/37193.jpg,0-2,Female,White,False,1
3,train/24471.jpg,40-49,Male,White,False,0
4,train/19604.jpg,40-49,Male,White,False,0


In [None]:
# Non-biased df with equal representation, biased statistic