In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# standard library
import os
import sys

# third-party
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import model_selection

# Directory three levels above the notebook's working directory. It is put on
# sys.path before the project-local `utils` package is imported (presumably
# that is where the package lives -- confirm against the repo layout).
BASE_DIR = '../../../'
sys.path.append(BASE_DIR)

# custom code
import utils.utils

CONFIG = utils.utils.load_config("../../config.json")

In [18]:
# The dataset name is the name of the folder this notebook runs in; it selects
# the matching entry under CONFIG["experiment_configs"].
DATASET = os.path.basename(os.getcwd())
RANDOM_SEED = CONFIG['random_seed']

EXPERIMENT_CONFIG = CONFIG["experiment_configs"][DATASET]

# Type of label noise:
#   asym: classes flip to a single other class
#   sym:  classes flip uniformly to any other class
TYPE = EXPERIMENT_CONFIG["type"]
# Chance that a label is flipped.
NOISE_P = EXPERIMENT_CONFIG["noise_p"]
# Split settings used by the splitting cells below.
HYPER_TRAIN_SPLIT = EXPERIMENT_CONFIG["hyper_train_split"]
VAL_FULL_SPLIT = EXPERIMENT_CONFIG["val_full_split"]
HYPER_VAL_SPLIT = EXPERIMENT_CONFIG["hyper_val_split"]

print(RANDOM_SEED, TYPE, NOISE_P)

# Output folders are namespaced by dataset name and random seed.
PROCESSED_DIR = os.path.join(BASE_DIR, f'processed/{DATASET}/rs={RANDOM_SEED}')
MODELS_DIR = os.path.join(BASE_DIR, f'models/{DATASET}/rs={RANDOM_SEED}')

# The processed arrays are saved here.
PROCESSED_SAVEPATH = utils.utils.get_savepath(
    PROCESSED_DIR, DATASET, ".npz", t=TYPE, np=NOISE_P
)

# Only warn (do not stop) when this configuration was already processed.
already_processed = os.path.exists(PROCESSED_SAVEPATH)
if already_processed:
    print(f"warning: processing has been done for rs={RANDOM_SEED}_t={TYPE}_np={NOISE_P}")

25 asym 0.6


In [19]:
# Create both output folders up front; folders that already exist are left alone.
for output_dir in (PROCESSED_DIR, MODELS_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [20]:
def load_data():
    '''
    Loads CIFAR-10, mean-centers the images and shuffles the training split.

    Seeds numpy's global RNG with RANDOM_SEED so the shuffle is reproducible.
    The per-pixel, per-channel mean of the training images is subtracted from
    both the training and the test images (centering only, no variance
    scaling).

    Returns (x, y, x_test, y_test): the centered training images and their
    labels in shuffled order, then the centered test images and their labels
    in their original order. Both label arrays are flattened to 1D.
    '''
    np.random.seed(RANDOM_SEED)

    train_split, test_split = tf.keras.datasets.cifar10.load_data()
    images, labels = train_split
    images_test, labels_test = test_split

    image_shape = (32, 32, 3)
    images = images.reshape(images.shape[0], *image_shape)
    images_test = images_test.reshape(images_test.shape[0], *image_shape)

    # center on the mean training image; the test images reuse the same mean
    mean_image = images.mean(axis=0)
    images = images - mean_image
    images_test = images_test - mean_image

    # labels: 2D -> 1D
    labels = labels.ravel()
    labels_test = labels_test.ravel()

    # shuffle the training examples, keeping images and labels aligned
    shuffled_order = np.random.permutation(images.shape[0])
    return images[shuffled_order], labels[shuffled_order], images_test, labels_test

In [21]:
# Load the data once: x/y are the mean-centered, shuffled training images and
# labels; x_test/y_test are the mean-centered test images and labels.
x, y, x_test, y_test = load_data()

In [22]:
# Sanity check: shapes of the loaded arrays, in (x, y, x_test, y_test) order.
tuple(arr.shape for arr in (x, y, x_test, y_test))

((50000, 32, 32, 3), (50000,), (10000, 32, 32, 3), (10000,))

In [23]:
# Absolute sizes of the hyper-train and val_full splits, computed from the
# configured split values times the number of training examples.
# Note: int() truncates, so a float product such as 14499.999... becomes 14499.
n_total = x.shape[0]
num_ht, num_v = (
    int(split * n_total) for split in (HYPER_TRAIN_SPLIT, VAL_FULL_SPLIT)
)
print(num_ht, num_v)

10000 2000


In [24]:
# Stage 1: split the loaded training data into train_full / val_full.
# val_full gets exactly num_v examples, stratified on the labels.
x_train_full, x_val_full, y_train_full, y_val_full = model_selection.train_test_split(
    x, y, test_size=num_v, stratify=y, random_state=RANDOM_SEED
)
del x, y  # no longer needed

# TODO: experimental alternative, currently disabled -- take val_full out of
# the test set instead (train_test_split on x_test / y_test with
# test_size=VAL_FULL_SPLIT, stratify=y_test, random_state=RANDOM_SEED).
# Decide which variant to keep and handle it.

# Stage 2: split val_full into val / hyper_val, stratified on the labels.
x_val, x_hyper_val, y_val, y_hyper_val = model_selection.train_test_split(
    x_val_full,
    y_val_full,
    random_state=RANDOM_SEED,
    stratify=y_val_full,
    test_size=HYPER_VAL_SPLIT,
)
del x_val_full, y_val_full  # no longer needed

In [25]:
def add_noise(y, nc, noise_type=None, noise_p=None, seed=None):
    '''
    Returns a noisy copy of the `y` labels, which belong to `nc` classes.
    The input array itself is not modified.

    y: 1D array of integer class labels in 0..nc-1.
    nc: number of classes.
    noise_type: how to apply a flip; defaults to the TYPE global.
        sym: flip to a different class randomly and uniformly
        asym: flip to a specific class always (one with semantic meaning);
              only labels that have a transition can flip
    noise_p: chance of flipping each (eligible) label; defaults to the
        NOISE_P global.
    seed: seed for numpy's global RNG; defaults to the RANDOM_SEED global.

    Raises ValueError if the noise type is not recognized.
    '''
    if noise_type is None:
        noise_type = TYPE
    if noise_p is None:
        noise_p = NOISE_P
    if seed is None:
        seed = RANDOM_SEED

    np.random.seed(seed)

    y = np.copy(y)

    if noise_type == 'asym':
        # mappings, from https://arxiv.org/pdf/1609.03683.pdf
        # TRUCK → AUTOMOBILE (9, 1)
        # BIRD → AIRPLANE (2, 0)
        # DEER → HORSE (4, 7)
        # CAT ↔ DOG (3, 5)
        noise_transitions = {
            9: 1,
            2: 0,
            4: 7,
            3: 5,
            5: 3,
        }

        for i in range(len(y)):
            yi_true = y[i]
            # membership is checked first, so only labels that can flip
            # consume a random draw
            if yi_true in noise_transitions and np.random.uniform() < noise_p:
                # flip to corresponding asym noise transition
                y[i] = noise_transitions[yi_true]

    elif noise_type == 'sym':
        for i in range(len(y)):
            yi_true = y[i]
            if np.random.uniform() < noise_p:
                # Pick one of the nc - 1 *other* classes uniformly: draw from
                # 0..nc-2 and step over the true class. randint's upper bound
                # is exclusive, so drawing randint(0, nc - 1) and rejecting
                # the true class could never produce class nc - 1.
                rand_c = np.random.randint(0, nc - 1)
                if rand_c >= yi_true:
                    rand_c += 1
                y[i] = rand_c

    else:
        raise ValueError(f"unrecognized type {noise_type}")

    return y

In [26]:
# Keep the clean labels as y_train_full_true and rebind y_train_full to a
# noisy copy (add_noise works on a copy, so the clean labels stay intact).
# NOTE: this cell is not idempotent -- running it a second time would store the
# already-noisy labels as "true". Re-run from the data-loading cell instead.
y_train_full_true, y_train_full = y_train_full, add_noise(y_train_full, nc=10)

In [27]:
# Split hyper_train (num_ht examples) off the training data. The images, the
# noisy labels and the true labels are split together so they stay aligned.
(
    x_train, x_hyper_train,
    y_train, y_hyper_train,
    y_train_true, y_hyper_train_true,
) = model_selection.train_test_split(
    x_train_full,
    y_train_full,
    y_train_full_true,
    random_state=RANDOM_SEED,
    stratify=y_train_full,  # stratify on noisy label
    test_size=num_ht,
)

In [28]:
# Report the shape of every split: images on the first line, labels on the second.
print(*(arr.shape for arr in (x_train, x_hyper_train, x_val, x_hyper_val, x_test)))
print(*(arr.shape for arr in (y_train, y_hyper_train, y_val, y_hyper_val, y_test)))

(38000, 32, 32, 3) (10000, 32, 32, 3) (1000, 32, 32, 3) (1000, 32, 32, 3) (10000, 32, 32, 3)
(38000,) (10000,) (1000,) (1000,) (10000,)


In [33]:
# Collect every split under the name it is stored as, then write them all to
# PROCESSED_SAVEPATH in a single np.savez call.
processed_arrays = dict(
    # train: images, noisy labels, true labels
    x_train=x_train,
    y_train=y_train,
    y_train_true=y_train_true,
    # hyper train: images, noisy labels, true labels
    x_hyper_train=x_hyper_train,
    y_hyper_train=y_hyper_train,
    y_hyper_train_true=y_hyper_train_true,
    # validation
    x_val=x_val,
    y_val=y_val,
    # hyper validation
    x_hyper_val=x_hyper_val,
    y_hyper_val=y_hyper_val,
    # test
    x_test=x_test,
    y_test=y_test,
)
np.savez(PROCESSED_SAVEPATH, **processed_arrays)