In [29]:
%cd ~/africa_poverty_clean
%load_ext autoreload
%autoreload 2
%matplotlib inline

/home/jupyter/africa_poverty_clean
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from glob import glob
from batchers import dataset_constants, tfrecord_paths_utils
from models import processing
#from models.loss_utils import r2_metric
import datetime

### Constants

In [31]:
DHS_TFRECORDS_PATH_ROOT = 'data/dhs_tfrecords/' # CHANGE
CSV_PATH = 'data/dhs_clusters.csv' # CHANGE
CLUSTERS_DF = pd.read_csv(CSV_PATH, float_precision='high', index_col=False)
IR_BANDS = ['NIR', 'SWIR1', 'SWIR2']
RGB_BANDS = ['RED', 'GREEN', 'BLUE']
MEANS = dataset_constants._MEANS_DHS
STDS = dataset_constants._STD_DEVS_DHS
BATCH_SIZE = 64
DATASET = "DHS_OOC_A"
SHUFFLE = 50
PREFETCH = 2
EPOCHS = 5 # CHANGE
CHECKPOINT_PATH = 'models/checkpoints' # CHANGE

In [38]:
def process_datasets(dataset, shuffle, batch_size, prefetch):
    
    # for testing - comment out
    #tfrecord_paths = []
    #for country, year in dataset: # SY: added this for testing
    #    country_year = f"{country}_{year}"
    #    glob_path = os.path.join(
    #        DHS_TFRECORDS_PATH_ROOT, country_year + '*', '*.tfrecord.gz')
    #    tfrecord_paths += glob(glob_path)
    #tfrecord_paths = np.sort(tfrecord_paths)

    # for testing - comment out
    #train_tfrecord_paths = tfrecord_paths[:3]
    #val_tfrecord_paths = tfrecord_paths[3:6]
    #test_tfrecord_paths = tfrecord_paths[:3]
    #print(train_tfrecord_paths, val_tfrecord_paths, test_tfrecord_paths)
    
    tfrecord_path = DHS_TFRECORDS_PATH_ROOT
    #feature_map = get_first_feature_map(tfrecord_path)
    #feature_types = get_feature_types(feature_map)
    
    train_tfrecord_paths = tfrecord_paths_utils.dhs_ooc(dataset, split="train")
    val_tfrecord_paths = tfrecord_paths_utils.dhs_ooc(dataset, split="val")
    test_tfrecord_paths = tfrecord_paths_utils.dhs_ooc(dataset, split="test")
    
#     train_tfrecord_paths = train_tfrecord_paths[:100]
#     val_tfrecord_paths = val_tfrecord_paths[100:200]
#     test_tfrecord_paths = test_tfrecord_paths[:3]

    print(len(train_tfrecord_paths), len(val_tfrecord_paths), len(test_tfrecord_paths))

    train_ds = tf.data.TFRecordDataset(train_tfrecord_paths, compression_type="GZIP")
    val_ds = tf.data.TFRecordDataset(val_tfrecord_paths, compression_type="GZIP")
    test_ds = tf.data.TFRecordDataset(test_tfrecord_paths, compression_type="GZIP")

    # normalize and resize
    train_ds = train_ds.map(processing.process_tfrecords)
    val_ds = val_ds.map(processing.process_tfrecords)  
    test_ds = test_ds.map(processing.process_tfrecords)

    # train_ds = train_ds.map(processing.augment)
    # val_ds = val_ds.map(processing.augment)  
    # test_ds = test_ds.map(processing.augment)

    train_ds = train_ds.cache()
    train_ds = train_ds.shuffle(shuffle)
    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(prefetch)

    val_ds = val_ds.cache()
    val_ds = val_ds.shuffle(shuffle)
    val_ds = val_ds.batch(batch_size)
    val_ds = val_ds.prefetch(prefetch)

    test_ds = test_ds.cache()
    test_ds = test_ds.shuffle(shuffle)
    test_ds = test_ds.batch(batch_size)
    test_ds = test_ds.prefetch(prefetch)

    return train_ds, val_ds, test_ds

In [57]:
def vgg16_model_train_val(train_ds, val_ds, epochs, checkpoint_path):
    checkpoint_path = checkpoint_path

    input_tensor = tf.keras.Input(shape=(224, 224, 3), name = 'images')
    vgg16_model = tf.keras.applications.vgg16.VGG16(include_top = False, input_tensor = input_tensor)    
    x = vgg16_model.output
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    predictions = tf.keras.layers.Dense(1, activation='linear')(x)
    model = tf.keras.Model(inputs=vgg16_model.input, outputs=predictions)
    model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])
    
    file_path = os.path.join(checkpoint_path, f"{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}.hdf5")
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        file_path,
        monitor = 'val_loss',
        verbose = 2,
        save_best_only = False,
        save_freq = 30,
    )
    hist = model.fit(
        train_ds, epochs=epochs, validation_data=val_ds,
        callbacks=[cp_callback], verbose=2)
    
    # save the model
    trained_model = hist.model
    now = datetime.datetime.now()
    date_time = now.strftime("%Y%m%d-%H%M%S")
    model_name = f"{checkpoint_path}/model-{date_time}.h5"
    trained_model.save(model_name)

    return trained_model, hist.history, hist.params

In [45]:
# evaluate function to be implemented TODO
# def vgg16_test(test_ds, checkpoint_path, batch_size):
#     trained_model = model.load_weights(checkpoint_path)
#     trained_model.evaluate()

In [41]:
train_ds, val_ds, test_ds = process_datasets(DATASET, SHUFFLE, BATCH_SIZE, PREFETCH)

11797 3909 3963
Tensor("stack_1:0", shape=(224, 224, 3), dtype=float32) (224, 224, 3)
Tensor("stack_1:0", shape=(224, 224, 3), dtype=float32) (224, 224, 3)
Tensor("stack_1:0", shape=(224, 224, 3), dtype=float32) (224, 224, 3)


In [42]:
train_ds

<DatasetV1Adapter shapes: {images: (?, 224, 224, 3), y: (?,)}, types: {images: tf.float32, y: tf.float32}>

In [58]:
trained_model, perf, params = vgg16_model_train_val(train_ds, val_ds, EPOCHS, CHECKPOINT_PATH)

Train on None steps
Epoch 1/5


2023-03-04 12:05:57.062942: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:145] Filling up shuffle buffer (this may take a while): 28 of 50
2023-03-04 12:06:02.674871: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:195] Shuffle buffer filled.



Epoch 00001: saving model to models/checkpoints/20230304-120542.hdf5

Epoch 00001: saving model to models/checkpoints/20230304-120542.hdf5

Epoch 00001: saving model to models/checkpoints/20230304-120542.hdf5

Epoch 00001: saving model to models/checkpoints/20230304-120542.hdf5

Epoch 00001: saving model to models/checkpoints/20230304-120542.hdf5

Epoch 00001: saving model to models/checkpoints/20230304-120542.hdf5


ValueError: When using data tensors as input to a model, you should specify the `steps_per_epoch` argument.

In [23]:
import pdb

In [None]:
def process_and_train(dataset, shuffle, batch_size, prefetch, bands, epochs, checkpoint_path):
    img_bands = bands
    #pdb.set_trace()
    train_ds, val_ds, test_ds = process_datasets(dataset, shuffle, batch_size, prefetch)
    #pdb.set_trace()
    trained_model, perf, params = vgg16_model_train_val(train_ds, val_ds, epochs, checkpoint_path)
    return trained_model, perf, params

### VGG16 Model: Infrared

In [None]:
trained, perf, params = process_and_train(
    dataset=DATASET, shuffle=SHUFFLE, batch_size=BATCH_SIZE, 
    prefetch=PREFETCH, bands=IR_BANDS, 
    epochs=EPOCHS, checkpoint_path=CHECKPOINT_PATH
    )

In [None]:
input_tensor = tf.keras.Input(shape=(224, 224, 3), name = 'images')
vgg16_model = tf.keras.applications.vgg16.VGG16(include_top = False, input_tensor = input_tensor)
x = vgg16_model.output
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(256, activation='relu')(x)
predictions = tf.keras.layers.Dense(1, activation='linear')(x)
model = tf.keras.Model(inputs=vgg16_model.input, outputs=predictions)
model.compile(loss='mse', optimizer='adam', metrics=["mse"])
model.fit(train_ds, epochs=2, validation_data=val_ds)