In [1]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn import model_selection
from tqdm import notebook

# Repository root relative to this notebook. It has to be on sys.path
# before the project-local `utils` package below can be imported.
BASE_DIR = '../../../'
sys.path.append(BASE_DIR)

# custom code
import utils.utils

# Shared experiment configuration (random seed, per-dataset split settings).
CONFIG = utils.utils.load_config("../../config.json")

Using TensorFlow backend.


In [2]:
# The dataset name is the name of the folder this notebook lives in; it is also
# the key used to look up this experiment's settings in CONFIG.
DATASET = os.path.basename(os.getcwd())
RANDOM_SEED = CONFIG['random_seed']

experiment_config = CONFIG['experiment_configs'][DATASET]
VAL_FULL_SPLIT = experiment_config['val_full_split']
HYPER_VAL_SPLIT = experiment_config['hyper_val_split']

# Outputs are namespaced by dataset, random seed and val split so that runs with
# different settings do not overwrite each other.
run_subdir = f'{DATASET}/rs={RANDOM_SEED}/vs={VAL_FULL_SPLIT}'
PROCESSED_DIR = os.path.join(BASE_DIR, f'processed/{run_subdir}')
MODELS_DIR = os.path.join(BASE_DIR, f'models/{run_subdir}')
# original data folder
DATA_F = os.path.join(BASE_DIR, "data/adience/")

for run_dir in (PROCESSED_DIR, MODELS_DIR):
    os.makedirs(run_dir, exist_ok=True)

In [3]:
# Symlink the base model of the adience run into this run's models folder so it
# is reused rather than copied.
# NOTE(review): the destination hard-codes 'adience_ablation', while MODELS_DIR
# (created above) is derived from DATASET, i.e. the current folder name. The two
# only coincide when this notebook lives in a folder named 'adience_ablation'.
src = os.path.abspath(
    os.path.join(BASE_DIR, f'models/adience/rs={RANDOM_SEED}/adience_mt=base.h5')
)
dest = os.path.abspath(
    os.path.join(BASE_DIR, f'models/adience_ablation/rs={RANDOM_SEED}/vs={VAL_FULL_SPLIT}/adience_ablation_mt=base.h5')
)
# os.symlink raises FileExistsError when `dest` already exists, which made this
# cell fail on every re-run. Skip only when the correct link is already in
# place; anything else sitting at `dest` still raises.
if not (os.path.islink(dest) and os.readlink(dest) == src):
    os.symlink(src, dest)


In [4]:
# Symlink the train / hyper_train artefacts of the adience run: they do not
# depend on the val split, so every val-split run can share them.
src_root = os.path.join(BASE_DIR, f'processed/adience/rs={RANDOM_SEED}')
dest_root = os.path.join(BASE_DIR, f'processed/adience_ablation/rs={RANDOM_SEED}/vs={VAL_FULL_SPLIT}')

# (entry name, whether the entry is a directory)
shared_entries = (
    ('train.csv', False),
    ('train', True),
    ('hyper_train.csv', False),
    ('hyper_train', True),
)

for entry_name, is_directory in shared_entries:
    src = os.path.abspath(os.path.join(src_root, entry_name))
    dest = os.path.abspath(os.path.join(dest_root, entry_name))
    # os.symlink raises FileExistsError when `dest` already exists, which made
    # this cell fail on every re-run. Skip only when the correct link is already
    # in place; anything else sitting at `dest` still raises.
    if os.path.islink(dest) and os.readlink(dest) == src:
        continue
    os.symlink(src, dest, target_is_directory=is_directory)


In [5]:
# we can be clever here... we need to do the following:
# set up val (different per val split), hyper_val (different per val split), and test (consistent across val splits).
# we can load the existing val, hyper_val, and test sets computed by adience and reshuffle
# them to match the new conditions. we can reuse the existing embedding features so that
# FWEG runs faster

In [6]:
# load the existing val / hyper_val / test dataframes of the adience run for manipulation
adience_processed_f = os.path.join(BASE_DIR, f'processed/adience/rs={RANDOM_SEED}/')
adience_val_df, adience_hyper_val_df, adience_test_df = (
    utils.utils.load_sorted_df(adience_processed_f, split_name)
    for split_name in ("val", "hyper_val", "test")
)

In [7]:
# Load the embeddings already computed by the adience run so they can be
# re-partitioned here without being recomputed.
ADIENCE_X_TRAIN_FULL_EMB_SAVEPATH = utils.utils.get_savepath(adience_processed_f, "x_train_full_emb", ".npy")
ADIENCE_X_VAL_EMB_SAVEPATH = utils.utils.get_savepath(adience_processed_f, "x_val_emb", ".npy")
ADIENCE_X_HYPER_VAL_EMB_SAVEPATH = utils.utils.get_savepath(adience_processed_f, "x_hyper_val_emb", ".npy")
ADIENCE_X_TEST_EMB_SAVEPATH = utils.utils.get_savepath(adience_processed_f, "x_test_emb", ".npy")

# Fail early if the adience run has not produced its embeddings yet.
assert os.path.exists(ADIENCE_X_TRAIN_FULL_EMB_SAVEPATH)

x_train_full_emb = np.load(ADIENCE_X_TRAIN_FULL_EMB_SAVEPATH)
x_val_emb = np.load(ADIENCE_X_VAL_EMB_SAVEPATH)
x_hyper_val_emb = np.load(ADIENCE_X_HYPER_VAL_EMB_SAVEPATH)
x_test_emb = np.load(ADIENCE_X_TEST_EMB_SAVEPATH)

In [8]:
# Pool the adience val / hyper_val / test rows and their embeddings; the two
# concatenations must use the same split order so that row i of all_df
# corresponds to row i of all_emb.
all_df = pd.concat([adience_val_df, adience_hyper_val_df, adience_test_df])
# Read the embeddings from disk rather than from x_val_emb / x_hyper_val_emb /
# x_test_emb: this cell rebinds those names to the re-split arrays further down,
# so re-running it would otherwise pair shuffled embeddings with the original
# dataframes without any shape error to reveal the misalignment.
all_emb = np.concatenate([
    np.load(ADIENCE_X_VAL_EMB_SAVEPATH),
    np.load(ADIENCE_X_HYPER_VAL_EMB_SAVEPATH),
    np.load(ADIENCE_X_TEST_EMB_SAVEPATH),
])
assert len(all_df) == len(all_emb), "dataframes and embeddings must have one row per sample"

def get_age_and_gender(df):
    """Return one ``(age, gender)`` tuple per row of ``df``, in row order.

    The result is used as the stratification key for the train/test splits, so
    each age/gender combination is treated as its own stratum.
    """
    ages = df['age'].values
    genders = df['gender'].values
    return list(zip(ages, genders))

# to keep the test set consistent, we split val/test deterministically
# then depending on the current val_split, we choose subset of val
# to make val/hyper_val
# max_val_split is the fraction of the pooled samples set aside as the
# candidate val pool; the remaining samples form the test set.
max_val_split = 0.5
# absolute number of samples the val_full set should have for this run
expected_val_full_size = int(len(all_df) * VAL_FULL_SPLIT)

# this executes the same for a particular random seed, independent of val split
# train_test_split returns (train, test) for each input array, in input order:
# the "train" parts are used as the test set here, and the "test" parts (the
# max_val_split fraction) become the val_full pool. Dataframe rows and
# embedding rows are shuffled together, so they stay aligned.
# Note: x_test_emb is rebound here (and x_val_emb / x_hyper_val_emb below).
age_and_gender = get_age_and_gender(all_df)
df_faces_test, df_faces_val_full, x_test_emb, x_val_full_emb = model_selection.train_test_split(
    all_df,
    all_emb,
    test_size=max_val_split,
    stratify=age_and_gender,
    random_state=RANDOM_SEED,
)

# NOTE(review): when expected_val_full_size >= len(df_faces_val_full) the pool is
# used as is. For VAL_FULL_SPLIT > max_val_split this means val_full is smaller
# than requested while the output folder is still named vs={VAL_FULL_SPLIT}.
if expected_val_full_size < len(df_faces_val_full):
    # reduce val_full set to the expected size. the remaining samples are just dropped
    # (an integer test_size is an absolute sample count, so the "test" parts
    # kept here have exactly expected_val_full_size rows)
    age_and_gender = get_age_and_gender(df_faces_val_full)
    _, df_faces_val_full, _, x_val_full_emb = model_selection.train_test_split(
        df_faces_val_full,
        x_val_full_emb,
        test_size=expected_val_full_size,
        stratify=age_and_gender,
        random_state=RANDOM_SEED,
    )

# split off a hyper val from the val set
# ("train" parts -> val, "test" parts of size HYPER_VAL_SPLIT -> hyper_val)
age_and_gender = get_age_and_gender(df_faces_val_full)
df_faces_val, df_faces_hyper_val, x_val_emb, x_hyper_val_emb = model_selection.train_test_split(
    df_faces_val_full,
    x_val_full_emb,
    test_size=HYPER_VAL_SPLIT,
    stratify=age_and_gender,
    random_state=RANDOM_SEED,
)

# the intermediate pool is no longer needed once val / hyper_val exist
del df_faces_val_full, x_val_full_emb

In [9]:
# sanity check: (rows, columns) of the val / hyper_val / test dataframes
df_faces_val.shape, df_faces_hyper_val.shape, df_faces_test.shape

((996, 5), (996, 5), (1992, 5))

In [10]:
# sanity check: the row counts must match those of the dataframes shown above
x_val_emb.shape, x_hyper_val_emb.shape, x_test_emb.shape

((996, 10), (996, 10), (1992, 10))

In [11]:
def sort_and_align(df, x_emb):
    """Sort ``df`` by gender and reorder ``x_emb`` with the same permutation.

    ``x_emb[i]`` must describe row ``i`` of ``df`` (by position) on input; the
    same holds for the returned pair. The returned dataframe has a fresh
    0..n-1 index, which is the index the images are later saved under (grouped
    by gender), so that a loader reading the folder with shuffle=False yields
    samples in the order of the returned embeddings.

    NOTE(review): this relies on the loader ordering files the same way as the
    numeric index. Loaders that sort file names as strings would put '10.jpg'
    before '2.jpg' -- confirm against the loader actually used downstream.

    Returns:
        (sorted dataframe with reset index, correspondingly reordered x_emb)
    """
    # After the first reset the index labels equal the row positions, so the
    # index of the sorted frame is exactly the permutation applied to the rows.
    by_gender = df.reset_index(drop=True).sort_values(by=['gender'])
    permutation = by_gender.index
    return by_gender.reset_index(drop=True), x_emb[permutation]

In [12]:
# Put every split into its on-disk order (grouped by gender, fresh index) and
# reorder the matching embeddings the same way.
df_faces_val, x_val_emb = sort_and_align(df_faces_val, x_val_emb)
df_faces_hyper_val, x_hyper_val_emb = sort_and_align(df_faces_hyper_val, x_hyper_val_emb)
df_faces_test, x_test_emb = sort_and_align(df_faces_test, x_test_emb)

In [13]:
def get_im_path(df, idx):
    """Build the path of the source image for the row labelled ``idx`` in ``df``.

    The path is ``DATA_F/faces/<user_id>/coarse_tilt_aligned_face.<face_id>.<original_image>``,
    with the three components read from the row's columns of the same names.
    """
    user_id, image_name, face_id = (
        df[column].loc[idx] for column in ('user_id', 'original_image', 'face_id')
    )
    file_name = f"coarse_tilt_aligned_face.{face_id}.{image_name}"
    return os.path.join(DATA_F, "faces", user_id, file_name)

def symlink_df(df, folder):
    """Symlink every image referenced by ``df`` into ``PROCESSED_DIR/<folder>/<gender>/``.

    Each link is named ``<index label><extension of the source image>``, so the
    index of ``df`` determines the file names inside each gender folder.

    Parameters:
        df: dataframe with the columns read by ``get_im_path`` plus ``gender``.
        folder: name of the split folder below ``PROCESSED_DIR`` (the gender
            sub-folders must already exist).

    Raises:
        OSError: propagated from ``os.symlink``, e.g. ``FileExistsError`` when a
            link already exists or ``FileNotFoundError`` when the gender folder
            is missing.
    """
    for i in notebook.tqdm(df.index):
        src = os.path.abspath(get_im_path(df, i))
        gender = df['gender'].loc[i]
        # splitext only looks at the last path component, so a dot in a
        # directory name can never be mistaken for the file extension
        ext = os.path.splitext(src)[1]
        dest = os.path.join(PROCESSED_DIR, folder, gender, f"{i}{ext}")
        # os.symlink always returns None and reports failure by raising OSError,
        # so there is no return value to check
        os.symlink(src, dest, target_is_directory=False)

In [14]:
# Create <split>/<gender> folders for the image symlinks. exist_ok=False makes
# this fail if a split folder is already there, so links from a previous run
# are never silently mixed with new ones.
valid_gender_groups = ['m', 'f']
for split_name in ('val', 'hyper_val', 'test'):
    split_dir = os.path.join(PROCESSED_DIR, split_name)
    os.makedirs(split_dir, exist_ok=False)
    for gender in valid_gender_groups:
        os.makedirs(os.path.join(split_dir, gender), exist_ok=False)

In [15]:
# link the val images into PROCESSED_DIR/val/<gender>/
symlink_df(df_faces_val, "val")

HBox(children=(FloatProgress(value=0.0, max=996.0), HTML(value='')))




In [16]:
# link the hyper_val images into PROCESSED_DIR/hyper_val/<gender>/
symlink_df(df_faces_hyper_val, "hyper_val")

HBox(children=(FloatProgress(value=0.0, max=996.0), HTML(value='')))




In [17]:
# link the test images into PROCESSED_DIR/test/<gender>/
symlink_df(df_faces_test, "test")

HBox(children=(FloatProgress(value=0.0, max=1992.0), HTML(value='')))




In [18]:
# Persist the sorted dataframes next to the image folders. The index is not
# written; row order alone carries the alignment with the embeddings.
for split_name, split_df in (
    ("val", df_faces_val),
    ("hyper_val", df_faces_hyper_val),
    ("test", df_faces_test),
):
    split_df.to_csv(os.path.join(PROCESSED_DIR, f"{split_name}.csv"), index=False)

In [19]:
# Destination paths of this run's embeddings under PROCESSED_DIR; same names and
# suffix as the ADIENCE_* paths they were loaded from.
X_TRAIN_FULL_EMB_SAVEPATH = utils.utils.get_savepath(PROCESSED_DIR, "x_train_full_emb", ".npy")
X_VAL_EMB_SAVEPATH = utils.utils.get_savepath(PROCESSED_DIR, "x_val_emb", ".npy")
X_HYPER_VAL_EMB_SAVEPATH = utils.utils.get_savepath(PROCESSED_DIR, "x_hyper_val_emb", ".npy")
X_TEST_EMB_SAVEPATH = utils.utils.get_savepath(PROCESSED_DIR, "x_test_emb", ".npy")

In [20]:
# The train_full embeddings are carried over unchanged from the adience run;
# val / hyper_val / test are the re-split, re-sorted arrays built above.
embeddings_to_save = (
    (X_TRAIN_FULL_EMB_SAVEPATH, x_train_full_emb),
    (X_VAL_EMB_SAVEPATH, x_val_emb),
    (X_HYPER_VAL_EMB_SAVEPATH, x_hyper_val_emb),
    (X_TEST_EMB_SAVEPATH, x_test_emb),
)
for savepath, embedding in embeddings_to_save:
    np.save(savepath, embedding)