# Data Preparation

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('sample_data/data.csv').dropna()
del data['educational-num']
data.insert(9, 'capital', data['capital-gain'] - data['capital-loss'])
del data['capital-gain'], data['capital-loss']

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40727 entries, 0 to 43956
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             40727 non-null  int64 
 1   workclass       40727 non-null  object
 2   fnlwgt          40727 non-null  int64 
 3   education       40727 non-null  object
 4   marital-status  40727 non-null  object
 5   occupation      40727 non-null  object
 6   relationship    40727 non-null  object
 7   race            40727 non-null  object
 8   gender          40727 non-null  object
 9   capital         40727 non-null  int64 
 10  hours-per-week  40727 non-null  int64 
 11  native-country  40727 non-null  object
 12  income_>50K     40727 non-null  int64 
dtypes: int64(5), object(8)
memory usage: 4.4+ MB


In [4]:
from sklearn.preprocessing import LabelEncoder

ind_cat = []    # categorical index
ind_cont = []   # continuous index

for i, v in enumerate(data.columns[:-1]):
  if data[v].dtype == 'object':
    ind_cat.append(i)
    le_str = 'le'+str(i)
    globals()[le_str] = LabelEncoder().fit(data[v])
    print("{} {}".format(le_str, v))
  else:
    ind_cont.append(i)

print()
print(f"ind_cont = {ind_cont}")
print(f"ind_cat = {ind_cat}")

le1 workclass
le3 education
le4 marital-status
le5 occupation
le6 relationship
le7 race
le8 gender
le11 native-country

ind_cont = [0, 2, 9, 10]
ind_cat = [1, 3, 4, 5, 6, 7, 8, 11]


# Conversion to Array

In [5]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

In [6]:
def convertx(df):
  mat = np.zeros([len(df), len(df.columns)-1], dtype='int32')
  for i in ind_cont:
    mat[:,i] = df.iloc[:,i]
  for i in ind_cat:
    mat[:,i] = globals()['le'+str(i)].transform(df.iloc[:,i])
  return mat

def converty(df):
  return df.iloc[:,-1].to_numpy(dtype='bool')

In [7]:
X_train, y_train = convertx(train), converty(train)
X_test, y_test = convertx(test), converty(test)

# GPU Environment Preparation

In [8]:
# List all NVIDIA GPUs as avaialble in this computer (or Colab's session)
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-42eb22e8-4622-3ac5-9050-bf6111b696a6)


In [9]:
import sys
print(f"Python {sys.version}\n")

import numpy as np
print(f"NumPy {np.__version__}\n")

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
print(f"TensorFlow {tf.__version__}")
print(f"tf.keras.backend.image_data_format() = {tf.keras.backend.image_data_format()}")

# Count the number of GPUs as detected by tensorflow
gpus = tf.config.list_physical_devices('GPU')
print(f"TensorFlow detected {len(gpus)} GPU(s):")
for i, gpu in enumerate(gpus):
  print(f".... GPU No. {i}: Name = {gpu.name}, Type = {gpu.device_type}")

Python 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]

NumPy 1.21.6

TensorFlow 2.8.2
tf.keras.backend.image_data_format() = channels_last
TensorFlow detected 1 GPU(s):
.... GPU No. 0: Name = /physical_device:GPU:0, Type = GPU


In [10]:
# Set fixed seeding values for reproducability during experiments
# Skip this cell if random initialization (with varied results) is needed
np.random.seed(1234)
tf.random.set_seed(5678)

# Dimensionality

In [11]:
input_dim = np.uint8(X_train.shape[1])  # the number of features per one input
print(f"input_dim = {input_dim}")
output_dim = np.uint8(1)                # the number of output classes
print(f"output_dim = {output_dim}")

input_dim = 12
output_dim = 1


In [12]:
# Training data
n_train = np.uint8(X_train.shape[0])
print(f"X_train: shape = {X_train.shape}, dtype = {X_train.dtype}, min = {np.min(X_train)}, max = {np.max(X_train)}")
print(f"y_train: shape = {y_train.shape}, dtype = {y_train.dtype}, min = {np.min(y_train)}, max = {np.max(y_train)}")

X_train: shape = (32581, 12), dtype = int32, min = -4356, max = 1490400
y_train: shape = (32581,), dtype = bool, min = False, max = True


In [13]:
# Testing data
n_test = np.uint8(X_test.shape[0])
print(f"X_test: shape = {X_test.shape}, dtype = {X_test.dtype}, min = {np.min(X_test)}, max = {np.max(X_test)}")
print(f"y_test: shape = {y_test.shape}, dtype = {y_test.dtype}, min = {np.min(y_test)}, max = {np.max(y_test)}")

X_test: shape = (8146, 12), dtype = int32, min = -3770, max = 1038553
y_test: shape = (8146,), dtype = bool, min = False, max = True


# User-Defined MLP

In [14]:
import timeit

def mlp(hsize, hdepth, dropout, lr, bs, ep, vsplit):
  '''
    INPUTS
      hsize: the number of nodes in a hidden layer
      hdepth: the number of all hidden layers
      dropout: dropout rate
      lr: learning rate
      bs: batch size
      ep: the number of epochs
      vsplit: validation split rate

    OUTPUTS
      train_acc, val_acc, test_acc (accuracy: percentage)
      train_loss, val_loss, test_loss (loss)
      rtime (running time: seconds)
  '''

  # Start a timer
  start = timeit.default_timer()

  # Sequential network architecture
  model = tf.keras.models.Sequential()
  model.add(tf.keras.Input(shape=(input_dim,)))
  for i in range(hdepth):
    model.add(tf.keras.layers.Dense(hsize, activation='relu'))
    model.add(tf.keras.layers.BatchNormalization(axis=-1))  
  model.add(tf.keras.layers.Dense(output_dim, activation='sigmoid'))

  # Model compilation
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), 
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=False) ,
                metrics=['acc'])
  
  # Model training
  model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath="bestmodel.hdf5",
                                                                 save_weights_only=True,
                                                                 monitor='val_acc',
                                                                 mode='max',
                                                                 save_best_only=True)
  history = model.fit(X_train, y_train, 
                      batch_size=bs, epochs=ep, 
                      verbose=0,  # no print
                      validation_split=vsplit, 
                      callbacks=[model_checkpoint_callback])  # keep saving model
  
  # End the timer
  rtime = timeit.default_timer() - start  # in seconds

  # Model evaluation
  val_acc = max(history.history['val_acc'])
  ind = history.history['val_acc'].index(val_acc)
  val_loss = history.history['val_loss'][ind]
  train_acc = history.history['acc'][ind]
  train_loss = history.history['loss'][ind]
  test_eval = model.evaluate(X_test, y_test,
                             batch_size=bs,
                             verbose=0)
  test_acc = test_eval[1]
  test_loss = test_eval[0]

  # Outputs
  return {'train_acc': np.float32(train_acc*100),
          'val_acc': np.float32(val_acc*100),
          'test_acc': np.float32(test_acc*100),
          'train_loss': np.float32(train_loss),
          'val_loss': np.float32(val_loss),
          'test_loss': np.float32(test_loss),
          'rtime': np.float32(rtime)}

# Experimentation

In [15]:
r_hsize = range(7, 9)
print(f"hsize = {list(r_hsize)}")
r_hdepth = range(3, 5)
print(f"hdepth = {list(r_hdepth)}")
ls_dropout = np.array([0.1, 0.3], dtype='float32')
print(f"dropout = {ls_dropout}")
lr = np.float32(0.001)
bs, ep, vsplit = np.uint16(128), np.uint8(20), np.float32(0.2)

hsize = [7, 8]
hdepth = [3, 4]
dropout = [0.1 0.3]


In [16]:
df_eval = pd.DataFrame(columns = ['hsize', 'hdepth', 'dropout',
                                  'lr', 'bs', 'ep', 'vsplit',
                                  'train_acc', 'val_acc', 'test_acc',
                                  'train_loss', 'val_loss', 'test_loss',
                                  'rtime'])

rows = []

for hsize in r_hsize:
  for hdepth in r_hdepth:
    for dropout in ls_dropout:
      dc = {'hsize': np.uint8(hsize), 'hdepth': np.uint8(hdepth), 
            'dropout': np.float32(dropout), 'lr': np.float32(lr), 
            'bs': np.uint16(bs), 'ep': np.uint8(ep), 
            'vsplit': np.float32(vsplit)}
      dc.update(mlp(hsize, hdepth, dropout, lr, bs, ep, vsplit))
      rows.append(dc)

df_eval = pd.DataFrame(rows)

In [17]:
df_eval.head()

Unnamed: 0,hsize,hdepth,dropout,lr,bs,ep,vsplit,train_acc,val_acc,test_acc,train_loss,val_loss,test_loss,rtime
0,7,3,0.1,0.001,128,20,0.2,79.596375,80.128899,28.308374,0.491822,0.497203,1.724796,46.741764
1,7,3,0.3,0.001,128,20,0.2,79.385361,79.42305,78.750305,0.445546,0.571725,0.638634,21.75869
2,7,4,0.1,0.001,128,20,0.2,79.316299,79.868034,55.462803,0.495542,0.498364,0.819377,21.697035
3,7,4,0.3,0.001,128,20,0.2,79.35083,79.714592,79.081757,0.497686,0.497554,0.500895,42.191311
4,8,3,0.1,0.001,128,20,0.2,79.281769,79.330978,30.309355,0.450523,0.476836,1.537304,21.491909


In [18]:
df_eval.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   hsize       8 non-null      int8   
 1   hdepth      8 non-null      int8   
 2   dropout     8 non-null      float32
 3   lr          8 non-null      float32
 4   bs          8 non-null      int16  
 5   ep          8 non-null      int8   
 6   vsplit      8 non-null      float32
 7   train_acc   8 non-null      float32
 8   val_acc     8 non-null      float32
 9   test_acc    8 non-null      float32
 10  train_loss  8 non-null      float32
 11  val_loss    8 non-null      float32
 12  test_loss   8 non-null      float32
 13  rtime       8 non-null      float32
dtypes: float32(10), int16(1), int8(3)
memory usage: 488.0 bytes


In [19]:
df_eval.to_csv('sample_data/eval_mlp.csv', index=False)

In [20]:
df_eval.iloc[df_eval['test_acc'].argmax(),:]  # maximum accuracy

hsize           7.000000
hdepth          4.000000
dropout         0.300000
lr              0.001000
bs            128.000000
ep             20.000000
vsplit          0.200000
train_acc      79.350830
val_acc        79.714592
test_acc       79.081757
train_loss      0.497686
val_loss        0.497554
test_loss       0.500895
rtime          42.191311
Name: 3, dtype: float32

In [21]:
df_eval.iloc[df_eval['rtime'].argmin(),:]  # minimum running time

hsize           8.000000
hdepth          3.000000
dropout         0.100000
lr              0.001000
bs            128.000000
ep             20.000000
vsplit          0.200000
train_acc      79.281769
val_acc        79.330978
test_acc       30.309355
train_loss      0.450523
val_loss        0.476836
test_loss       1.537304
rtime          21.491909
Name: 4, dtype: float32