In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import math
import pickle
import os
import tensorflow as tf
import numpy as np
import pandas as pd


BASE_DIR = '../../../'
import sys
sys.path.append(BASE_DIR)

# custom code
import utils.utils
CONFIG = utils.utils.load_config("../../config.json")

In [4]:
DATASET = os.path.basename(os.getcwd()) # name of folder this file is in
RANDOM_SEED = CONFIG['random_seed']
BATCH_SIZE = CONFIG["experiment_configs"][DATASET]["batch_size"]

print(RANDOM_SEED)

PROCESSED_DIR = os.path.join(BASE_DIR, f'processed/{DATASET}/rs={RANDOM_SEED}')
MODELS_DIR = os.path.join(BASE_DIR, f'models/{DATASET}/rs={RANDOM_SEED}')

BASE_MODEL_SAVEPATH = utils.utils.get_savepath(MODELS_DIR, DATASET, ".h5", mt="base") # mt = model_type

if os.path.exists(BASE_MODEL_SAVEPATH):
    print(f"warning: model has been done for rs={RANDOM_SEED}")

55


In [7]:
train_df = pd.read_csv(os.path.join(PROCESSED_DIR, "train.csv"))
hyper_train_df = pd.read_csv(os.path.join(PROCESSED_DIR, "hyper_train.csv"))
val_df = pd.read_csv(os.path.join(PROCESSED_DIR, "val.csv"))
hyper_val_df = pd.read_csv(os.path.join(PROCESSED_DIR, "hyper_val.csv"))
test_df = pd.read_csv(os.path.join(PROCESSED_DIR, "test.csv"))

val_full_df = pd.concat([val_df, hyper_val_df])

In [8]:
x_train = train_df.drop('label', axis=1).values
y_train = train_df['label'].values

x_hyper_train = hyper_train_df.drop('label', axis=1).values
y_hyper_train = hyper_train_df['label'].values

x_val_full = val_full_df.drop('label', axis=1).values
y_val_full = val_full_df['label'].values

x_test = test_df.drop('label', axis=1).values
y_test = test_df['label'].values

In [9]:
y_train = tf.keras.utils.to_categorical(y_train)
y_hyper_train = tf.keras.utils.to_categorical(y_hyper_train)
y_val_full = tf.keras.utils.to_categorical(y_val_full)
y_test = tf.keras.utils.to_categorical(y_test)

In [10]:
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=x_train.shape[1]),
    tf.keras.layers.Dense(2, activation=tf.nn.softmax),
])

In [11]:
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'], )

In [12]:
class WeightsSaver(tf.keras.callbacks.Callback):
    def __init__(self, save_freq):
        self.save_freq = save_freq
        self.batch = 0

    def on_batch_end(self, batch, logs=None):
        if (self.batch % self.save_freq) == 0:
            filename = f"adult_periodic_base_batch={self.batch}.h5"
            filepath = os.path.join(MODELS_DIR, filename)
            self.model.save_weights(filepath)
        self.batch += 1

# save every 50 batches
ws = WeightsSaver(50)

callbacks = [ws]

In [13]:
model.fit(
    x_train,
    y_train,
    batch_size = BATCH_SIZE,
    epochs = 2,
    validation_data = (x_hyper_train, y_hyper_train),
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f4b296a5fd0>

In [14]:
# see the model at a batch
batch = 1200

model.load_weights( os.path.join(MODELS_DIR, f"adult_periodic_base_batch={batch}.h5") )


In [15]:
# train acc
preds_train = utils.utils.compute_preds(
    model,
    x_train,
    batch_size=BATCH_SIZE,
)
(np.argmax(preds_train, axis=1) == np.argwhere(y_train)[:,1]).mean()

0.7388952555581022

In [16]:
# hyper train acc
preds_hyper_train = utils.utils.compute_preds(
    model,
    x_hyper_train,
    batch_size=1024,
)
(np.argmax(preds_hyper_train, axis=1) == np.argwhere(y_hyper_train)[:,1]).mean()

0.7451144611948632

In [18]:
# val acc
preds_val_full = utils.utils.compute_preds(
    model,
    x_val_full,
    batch_size=BATCH_SIZE,
)
(np.argmax(preds_val_full, axis=1) == np.argwhere(y_val_full)[:,1]).mean()

0.5828220858895705

In [19]:
# test acc
preds_test = utils.utils.compute_preds(
    model,
    x_test,
    batch_size=BATCH_SIZE,
)
(np.argmax(preds_test, axis=1) == np.argwhere(y_test)[:,1]).mean()

0.5797865740166274