# CNN: Predict NTL and Extract Features

## Setup

In [1]:
from numpy.random import seed

import os, datetime
import numpy as np
import pandas as pd
import json
import re 

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import plot_model

from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import models
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.layers import Concatenate

import logging, os 
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from glob import glob

import config as cf

In [2]:
def _parse_function_s2(proto, num_classes=3, augment=True):
    """Parse one serialized TFRecord example into an ``(image, label)`` pair.

    Parameters
    ----------
    proto : scalar string tensor
        One serialized example containing at least the features
        ``'b_rgb'`` (PNG-encoded bytes) and ``'viirs_ntl_group'`` (int64).
    num_classes : int, default 3
        Depth of the one-hot label vector. The default reproduces the
        previously hard-coded number of classes.
    augment : bool, default True
        When True, apply a random left/right and a random up/down flip to
        the image (the original, training-time behaviour). Pass False for a
        deterministic image, e.g. when extracting features.

    Returns
    -------
    tuple
        ``(image, label)`` where ``image`` is the decoded ``b_rgb`` PNG
        divided by 10000 and ``label`` is ``viirs_ntl_group`` one-hot
        encoded with depth ``num_classes``.
    """
    ## Define Features
    keys_to_features = {
        'viirs_ntl_group': tf.io.FixedLenFeature([], tf.int64),
        'b_rgb': tf.io.FixedLenFeature([], tf.string),
    }

    #### Load one example
    parsed_features = tf.io.parse_single_example(proto, keys_to_features)

    ## RGB
    rgb = tf.io.decode_png(parsed_features['b_rgb'], dtype=tf.dtypes.uint16)
    # Scale the raw 16-bit values; per the original author's note this is
    # meant to bring them within 0 and 1 -- TODO confirm the value range.
    rgb = rgb / 10000
    if augment:
        rgb = tf.image.random_flip_left_right(rgb)
        rgb = tf.image.random_flip_up_down(rgb)

    # Discrete target. For a continuous target, return
    # parsed_features['viirs_ntl_group'] as-is instead of the one-hot vector.
    label = tf.one_hot(parsed_features['viirs_ntl_group'], num_classes)

    return rgb, label

def create_dataset(filepath, shuffle=True, batch_size=None, shuffle_buffer=None):
    """Build a batched ``tf.data`` pipeline of parsed TFRecord examples.

    Adapted from
    https://gist.github.com/Smokrow/2df26111248fa327547801ac14bb9cac

    Parameters
    ----------
    filepath : str or sequence of str
        Path(s) of the TFRecord file(s) to read.
    shuffle : bool, default True
        Whether to shuffle the parsed examples (the original behaviour).
        Pass False whenever the output must stay in record order, e.g. when
        predictions are later matched to ids read separately from the files.
    batch_size : int, optional
        Examples per batch; falls back to the global ``BATCH_SIZE``.
    shuffle_buffer : int, optional
        Buffer size for shuffling; falls back to the global
        ``SHUFFLE_BUFFER``. Ignored when ``shuffle`` is False.

    Returns
    -------
    tf.data.Dataset
        Batches of whatever ``_parse_function_s2`` returns per example. The
        dataset is finite: ``.repeat()`` is not applied.
    """
    # Globals are resolved at call time so existing one-argument calls keep
    # working exactly as before.
    if batch_size is None:
        batch_size = BATCH_SIZE

    # This works with arrays as well
    dataset = tf.data.TFRecordDataset(filepath)

    # Parse every record; num_parallel_calls sets the number of parallel loaders
    dataset = dataset.map(_parse_function_s2, num_parallel_calls=8)

    if shuffle:
        if shuffle_buffer is None:
            shuffle_buffer = SHUFFLE_BUFFER
        dataset = dataset.shuffle(shuffle_buffer)

    return dataset.batch(batch_size)

In [27]:
#### UID
def decode_fn_uid(record_bytes):
    """Parse one serialized example, keeping only its ``uid`` feature.

    Parameters
    ----------
    record_bytes : scalar string tensor
        One serialized example read from a TFRecord file.

    Returns
    -------
    dict
        The result of ``tf.io.parse_single_example`` for a schema with the
        single scalar string feature ``'uid'``.
    """
    # Schema: only the record identifier is needed here
    uid_schema = {"uid": tf.io.FixedLenFeature([], dtype=tf.string)}
    return tf.io.parse_single_example(record_bytes, uid_schema)

def extract_uid(TF_FILES):
    """Return the ``uid`` of every record in the given TFRecord file(s).

    Records are read without shuffling, so the returned list follows the
    order in which the dataset yields them.

    Parameters
    ----------
    TF_FILES : str or list of str
        TFRecord path(s) to read.

    Returns
    -------
    list
        One ``.numpy()`` value per record's ``uid`` string tensor
        (for a ``tf.string`` scalar this is a ``bytes`` object, not ``str``).
    """
    uid_records = tf.data.TFRecordDataset([TF_FILES]).map(decode_fn_uid)
    return [record['uid'].numpy() for record in uid_records]

In [28]:
#### Run selection
# Both values are used to build the model, TFRecord and output paths below.
SURVEY_NAME = 'DHS'
SATELLITE = 'l8'

#### Parameters
BATCH_SIZE = 32         # examples per batch
SHUFFLE_BUFFER = 2048   # buffer size passed to Dataset.shuffle in create_dataset

In [29]:
## Locate the saved CNN model for the selected survey / satellite
name_suffix = '{}_{}'.format(SATELLITE, 'rgb')

CNN_MODEL_PATH = os.path.join(
    cf.GOOGLEDRIVE_DIRECTORY,
    'Data', SURVEY_NAME, 'FinalData', 'cnn_models',
    'model_{}.h5'.format(name_suffix),
)

In [30]:
# Load the trained Keras model from the '.h5' file at CNN_MODEL_PATH;
# the feature extractor further down is built from this model's layers.
model = load_model(CNN_MODEL_PATH)

In [31]:
## Load TFRecords

TF_DIR = os.path.join(cf.GOOGLEDRIVE_DIRECTORY,
                      'Data',
                      SURVEY_NAME,
                      'FinalData',
                      'Individual Datasets',
                      'cnn_' + SATELLITE,
                      'tfrecords')

# os.listdir returns entries in arbitrary order; sort so the record order
# (and therefore the row order of the extracted features) is reproducible.
TF_FILES = sorted(
    os.path.join(TF_DIR, file_name)
    for file_name in os.listdir(TF_DIR)
    if ".tfrecord" in file_name
)

if not TF_FILES:
    raise FileNotFoundError("No .tfrecord files found in " + TF_DIR)

# For a quick smoke test, subset the files here, e.g. TF_FILES = TF_FILES[0:2]

# Build an ORDERED pipeline for inference. create_dataset() shuffles its
# output, which would scramble the feature rows relative to the uids that
# extract_uid() reads from the same files in file order -- every feature row
# would then be written next to the wrong uid. Dataset.map keeps element
# order by default, so skipping the shuffle is enough to keep rows aligned.
# NOTE(review): _parse_function_s2 applies random flips, so the extracted
# features can still differ slightly between runs; consider a
# non-augmenting parser for inference.
all_dataset = (
    tf.data.TFRecordDataset(TF_FILES)
    .map(_parse_function_s2, num_parallel_calls=8)
    .batch(BATCH_SIZE)
)

In [32]:
## Grab features
# Sub-model that maps the CNN's inputs to the output of its 'fc1' layer
fc1_output = model.get_layer(name='fc1').output
feature_extractor = Model(inputs=model.inputs, outputs=fc1_output)

# One feature row per record, columns named cnn_feat_<column index>
features = feature_extractor.predict(all_dataset)
features_df = pd.DataFrame(features).add_prefix('cnn_feat_')

# uids are re-read from the files in file order, so all_dataset must yield
# its records in that same order for each row to get the right uid.
features_df['uid'] = extract_uid(TF_FILES)

In [35]:
# Write the extracted features (with their uid column) to the shared data
# folder, without the DataFrame index.
CNN_FEATURES_PATH = os.path.join(
    cf.DROPBOX_DIRECTORY,
    'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'cnn_features',
    'cnn_features_{}_rgb.csv'.format(SATELLITE),
)
features_df.to_csv(CNN_FEATURES_PATH, index=False)