In [2]:
%cd ~/code/saoriyosano/poverty_mapper

/home/saori/code/saoriyosano/poverty_mapper


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from glob import glob

In [5]:
from batchers import dataset_constants

### Constants

In [42]:
DHS_TFRECORDS_PATH_ROOT = 'data/dhs_tfrecords/'
CSV_PATH = 'data/dhs_clusters.csv'
CLUSTERS_DF = pd.read_csv(CSV_PATH, float_precision='high', index_col=False)
BANDS = ['NIR', 'SWIR1', 'SWIR2']
MEANS = dataset_constants._MEANS_DHS
STDS = dataset_constants._STD_DEVS_DHS
BATCH_SIZE = 64

---

In [43]:
# country_year = list(CLUSTERS_DF.groupby(['country', 'year']).count().to_dict()['lat'].keys())                                     
country_year = [('angola', 2011)]

In [44]:
# create list of tfrecord files
tfrecord_paths = []
for country, year in country_year: # SY: added this for testing
    country_year = f"{country}_{year}"
    glob_path = os.path.join(
        DHS_TFRECORDS_PATH_ROOT, country_year + '*', '*.tfrecord.gz')
    tfrecord_paths += glob(glob_path)
tfrecord_paths = np.sort(tfrecord_paths)
tfrecord_paths = tfrecord_paths[1:]

In [45]:
dataset = tf.data.TFRecordDataset(tfrecord_paths)

In [None]:
# def normalize(ex):
#     for band in BANDS:
#         ex[band] = lambda: (ex[band] - MEANS[band]) / STDS[band]
#     return ex

In [71]:
# normalize images
# stacks 3 bands into an image
# reshape to (255, 255) and crop to (224, 224)

def process_tfrecords(ex):
    '''
    Args
    - example_proto: a tf.train.Example protobuf

    Returns: dict {'images': img, 'labels': label, 'locs': loc, 'years': year, ...}
    - img: tf.Tensor, shape [224, 224, C], type float32
        - channel order is [B, G, R, SWIR1, SWIR2, TEMP1, NIR, NIGHTLIGHTS]
    - label: tf.Tensor, scalar or shape [2], type float32
        - not returned if both self.label_name and self.nl_label are None
        - [label, nl_label] (shape [2]) if self.label_name and self.nl_label are both not None
        - otherwise, is a scalar tf.Tensor containing the single label
    - loc: tf.Tensor, shape [2], type float32, order is [lat, lon]
    - year: tf.Tensor, scalar, type int32
        - default value of -1 if 'year' is not a key in the protobuf
    - may include other keys if self.scalar_features is not None
    '''
    img_bands = BANDS  # bands that we want to include in the returned img

    scalar_float_keys = ['lat', 'lon', 'year', 'wealthpooled']
    
    keys_to_features = {}
    for band in img_bands:
        keys_to_features[band] = tf.io.FixedLenFeature(shape=[255**2], dtype=tf.float32)
    for key in scalar_float_keys:
        keys_to_features[key] = tf.io.FixedLenFeature(shape=[], dtype=tf.float32)
    # if self.scalar_features is not None:
    #     for key, dtype in self.scalar_features.items():
    #         keys_to_features[key] = tf.io.FixedLenFeature(shape=[], dtype=dtype)
    print(keys_to_features)
    ex = tf.io.parse_single_example(ex, features=keys_to_features)
    loc = tf.stack([ex['lat'], ex['lon']])
    year = tf.cast(ex.get('year', -1), tf.int32)

    # img = float('nan')
    # if self.normalize is not None:
    means = dataset_constants._MEANS_DHS
    std_devs = dataset_constants._STD_DEVS_DHS
    print(means, std_devs)
        # for each band, reshape to (255, 255) and crop to (224, 224)
        # then subtract mean and divide by std dev
    for band in BANDS:
        ex[band].set_shape([255 * 255])
        ex[band] = tf.reshape(ex[band], [255, 255])[15:-16, 15:-16]
        ex[band] = (ex[band] - means[band]) / std_devs[band]
    img = tf.stack([ex[band] for band in img_bands], axis=2)
    print(type(img))
    label = ex.get('wealthpooled', float('nan'))

    # result = {'images': img, 'locs': loc, 'years': year, 'wealthpooled': label}
    result = {'images': img, 'y': label}
    return result


In [72]:
dataset_proc = dataset.map(process_tfrecords)

{'NIR': FixedLenFeature(shape=[65025], dtype=tf.float32, default_value=None), 'SWIR1': FixedLenFeature(shape=[65025], dtype=tf.float32, default_value=None), 'SWIR2': FixedLenFeature(shape=[65025], dtype=tf.float32, default_value=None), 'lat': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 'lon': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 'year': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 'wealthpooled': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None)}
{'BLUE': 0.059183, 'GREEN': 0.088619, 'RED': 0.104145, 'SWIR1': 0.246874, 'SWIR2': 0.168728, 'TEMP1': 299.078023, 'NIR': 0.253074, 'DMSP': 4.005496, 'VIIRS': 1.096089} {'BLUE': 0.022926, 'GREEN': 0.03188, 'RED': 0.051458, 'SWIR1': 0.088857, 'SWIR2': 0.08324, 'TEMP1': 4.300303, 'NIR': 0.058973, 'DMSP': 23.038301, 'VIIRS': 4.786354}
<class 'tensorflow.python.framework.ops.Tensor'>


In [None]:
tf,

In [48]:
dataset_proc = dataset_proc.cache()
dataset_proc = dataset_proc.shuffle(20)
dataset_proc = dataset_proc.batch(BATCH_SIZE)
dataset_proc = dataset_proc.prefetch(2)


### ResNet18

In [32]:
X = np.ones((1, 224, 224, 3))

In [195]:
def ResNet18():
    # Input layer
    input_tensor = tf.keras.Input(shape=(224, 224, 3), name='images') 
    # rgb = input_tensor[:, :, :, :3]  # Extract first three channels from input dictionary
    # x = tf.image.rgb_to_grayscale(rgb)  # Convert to grayscale
    # First convolutional block
    x = tf.keras.layers.Conv2D(filters=64, kernel_size=7, strides=2, padding='same')(input_tensor)
    # x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=3, strides=2, padding='same')(x)
    # Residual blocks
    filters = 64
    for i in range(4):
        # First convolutional layer of the residual block
        shortcut = x
        x = tf.keras.layers.Conv2D(filters=filters, kernel_size=3, strides=1, padding='same')(x)
        # x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation('relu')(x)
        # Second convolutional layer of the residual block
        x = tf.keras.layers.Conv2D(filters=filters, kernel_size=3, strides=1, padding='same')(x)
        # x = tf.keras.layers.BatchNormalization()(x)
        # Add shortcut to the residual block
        if shortcut.shape[-1] != filters:
            shortcut = tf.keras.layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding='valid')(shortcut)
        x = tf.keras.layers.Add()([x, shortcut])
        x = tf.keras.layers.Activation('relu')(x)
        # Increase number of filters every other residual block
        if i % 2 == 1:
            filters *= 2
    # Final layers
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(units=256, activation='relu')(x)
    x = tf.keras.layers.Dense(units=1, activation='linear')(x)
    # Create and return model
    model = tf.keras.Model(inputs=input_tensor, outputs=x)
    return model

In [198]:
model = ResNet18()

In [199]:
model.compile(
    loss='mse', optimizer='adam', model = ResNet18()
    )

In [200]:
y

array([ 2.59561753,  2.20962048,  0.90646887,  1.1053592 ,  1.87934387,
        1.74931741,  0.18236142,  0.69121599,  0.527798  ,  0.89116049,
        0.936777  , -0.05265219, -0.07346991,  0.72167271, -0.11726858,
        0.38161853, -1.14221025, -0.85293049, -0.46350324, -1.0571233 ,
       -1.0633744 ,  0.4095335 , -0.90661067, -1.17829394, -1.04396415,
       -1.13135445,  0.15251048, -0.81384754, -0.43085682, -0.24942476,
       -0.83738708, -0.93222094, -0.94360435, -1.14929593, -1.0485065 ,
       -0.82958251, -0.62332767, -0.94889927, -0.95115322, -1.06607962,
       -1.24290645, -1.22812164, -1.16334105,  0.90219295,  0.20976599,
       -0.16437396, -0.21236886,  0.20641752,  0.29319105, -0.04077934,
       -0.3678565 , -0.81828356, -0.87132818, -0.8352325 , -0.90951586,
       -0.80119652,  1.03297925,  1.30253994,  0.64373565,  1.17173481,
        0.94592243,  0.21190238,  1.8354553 ,  1.37665677,  1.06142521,
        0.65685993,  0.0344386 , -0.62296993, -1.24064398, -1.21

In [201]:
test = model.fit(X, y[0:1], epochs=3)

ValueError: Invalid argument "model" passed to K.function with TensorFlow backend

### VGG16

In [49]:
input_tensor = tf.keras.Input(shape=(224, 224, 3), name = 'images')

In [50]:
vgg16 = tf.keras.applications.vgg16.VGG16(include_top = False, input_tensor = input_tensor)

In [None]:
vgg16.summary()

In [51]:
x = vgg16.output
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(256, activation='relu')(x)
predictions = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs=vgg16.input, outputs=predictions)

In [52]:
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [None]:
model.summary()

In [39]:
y = [1]

In [53]:
test = model.fit(dataset_proc, epochs=2)

Train on None steps
Epoch 1/2


DataLossError: corrupted record at 0
	 [[{{node IteratorGetNext_2}}]]

In [63]:
# Define the feature description for your dataset
feature_description = {
    'images': tf.io.FixedLenFeature(shape=(224,224,3), dtype=tf.float32),
    'y': tf.io.FixedLenFeature([], tf.float32)
}

# Parse each record using the feature description
def _parse_function(example_proto):
    return tf.io.parse_single_example(example_proto, feature_description)

@tf.function
def iterate_dataset():
    # Iterate through the dataset and print out each record
    record1 = []
    for record in dataset:
        parsed_record = _parse_function(record)
        record1 = parsed_record['images']
        break
    return record1

_ = iterate_dataset()

In [68]:
sess = tf.compat.v1.Session()

# Run the computation graph and fetch the value of z
z_value = sess.run(_)
print(z_value)

DataLossError: corrupted record at 0
	 [[{{node ReduceDataset}}]]