# CNN

## Setup

In [None]:
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
# https://www.pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/

from numpy.random import seed


import os, datetime
import numpy as np
import pandas as pd
import json
import re 

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras import datasets, layers, models

import keras
from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras import models
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model, Model
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.applications.inception_v3 import preprocess_input

from tensorflow.keras.utils import plot_model
import tensorflow.keras as K

import logging, os 
import random
import tensorflow as tf
import matplotlib.pyplot as plt

import config as cf

# Set seeds for every random number generator imported by this notebook.
# Note that using a GPU can still introduce randomness.
SEED = 42
seed(SEED)                # NumPy global RNG (used by np.random.choice / np.random.shuffle)
random.seed(SEED)         # Python's built-in `random` module
tf.random.set_seed(SEED)  # TensorFlow global seed

## Functions

In [None]:
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly

class DataGenerator(keras.utils.Sequence):
    '''
    Generates batches of arrays and one-hot labels for Keras.

    Each sample is loaded from ``<npy_path>/<ID>.npy`` when its batch is requested.

    Inputs:
        list_IDs (list) sample IDs; each ID is a .npy file name without the extension
        labels (dict) maps sample ID -> class index
        batch_size (int) number of samples per batch
        dim (tuple) dimensions of one sample, excluding the trailing channel axis
        n_channels (int) size of the trailing channel axis
        n_classes (int) number of classes used for one-hot encoding
        shuffle (bool) if True, sample order is reshuffled at construction and
            after every epoch
        npy_path (str) directory containing the .npy files. When None (default),
            the module-level NPY_PATH is looked up at load time.
    '''
    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32), n_channels=1,
                 n_classes=10, shuffle=True, npy_path=None):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.npy_path = npy_path
        # Builds self.indexes (and shuffles it when requested)
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Only full batches are counted; samples beyond the last full batch
        # are not served in that epoch.
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Positions (into list_IDs) that belong to this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Translate positions into sample IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Load arrays and labels for those IDs
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Loads the arrays for the given IDs and one-hot encodes their labels'

        # Resolve the directory lazily so a generator created before NPY_PATH
        # is assigned still works, as long as NPY_PATH exists by load time.
        npy_dir = NPY_PATH if self.npy_path is None else self.npy_path

        # Size the buffers by the IDs actually received, so a short slice never
        # leaves uninitialised rows in the returned batch.
        n_samples = len(list_IDs_temp)
        X = np.empty((n_samples, *self.dim, self.n_channels))
        y = np.empty(n_samples, dtype=int)

        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i,] = np.load(os.path.join(npy_dir, ID + '.npy'))

            # Store class
            y[i] = self.labels[ID]

        return X, to_categorical(y, num_classes=self.n_classes)

In [None]:
def define_model_imagenet(height, width, num_classes):
    '''
    Builds and compiles a classifier on top of a frozen, ImageNet-weighted VGG16.

    Inputs:
        height, width (int) spatial size of the 3-channel input images
        num_classes (int) number of units in the softmax output layer
    Returns:
        model (keras.Model object) compiled with rmsprop, categorical
            cross-entropy loss and an accuracy metric
    '''

    # https://medium.com/abraia/first-steps-with-transfer-learning-for-custom-image-classification-with-keras-b941601fcad5
    # https://towardsdatascience.com/cnn-transfer-learning-fine-tuning-9f3e7c5806b2

    #### Base model: VGG16 without its top classifier, global max pooling on the output
    backbone = VGG16(
        weights='imagenet',
        include_top=False,
        input_shape=(height, width, 3),
        pooling="max",
    )

    # Freeze every VGG16 layer so only the new head below is trainable
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = False

    #### Classification head stacked on the output of the last base layer
    features = backbone.layers[-1].output
    hidden = Flatten()(features)
    hidden = Dense(100, activation='relu', name='fc1')(hidden)
    hidden = Dropout(0.3)(hidden)
    class_probs = Dense(num_classes, activation='softmax', name='predictions')(hidden)

    model = Model(backbone.input, class_probs)
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

def evaluate_model(model, training_generator, validation_generator, CNN_MODEL_CHECKPOINT):
    '''
    Fits model with early stopping, saves the best epoch to disk, and prints
    the accuracy on the validation generator.

    Inputs:
        model (keras.Model object) compiled model; evaluation must yield exactly
            (loss, accuracy), i.e. one loss and one 'accuracy' metric
        training_generator (keras.utils.Sequence) batches used for fitting
        validation_generator (keras.utils.Sequence) batches used for validation
            during fitting and for the final accuracy printout
        CNN_MODEL_CHECKPOINT (str) file path where the best model is saved
    Returns:
        history (keras History object) returned by model.fit
    # https://towardsdatascience.com/step-by-step-guide-to-using-pretrained-models-in-keras-c9097b647b29
    '''

    # Use early stopping to help with overfitting
    es = EarlyStopping(monitor='val_loss', mode='min', patience=2, verbose=False)

    # Save best model based on validation loss
    mc = ModelCheckpoint(CNN_MODEL_CHECKPOINT, monitor='val_loss', mode='min',
                         verbose=True, save_best_only=True)

    # Fit model
    history = model.fit(x=training_generator,
                        validation_data=validation_generator,
                        use_multiprocessing=True,
                        epochs=50,
                        callbacks=[es, mc],
                        workers=6)

    # Show accuracy on the validation generator. This scores the weights left in
    # `model` after fitting, not the checkpoint file written by ModelCheckpoint.
    _, accuracy = model.evaluate(validation_generator, verbose=False)
    print(f'                              Accuracy: {accuracy}')

    return history

## Parameters

In [None]:
# Folder name of the survey inside the project data directories
SURVEY_NAME = "DHS"
# Suffix of the 'cnn_<SATELLITE>' folder that holds the .npy files
SATELLITE = "l8"
# Prefix of the .npy file names ('<BAND>_<uid>.npy')
BAND = "BRGB"
# Survey column used as the prediction target
TARGET_VAR = "wealth_index"

### Load Numpy Files

In [None]:
# Directory containing the .npy files
NPY_PATH = os.path.join(cf.GOOGLEDRIVE_DIRECTORY,
             'Data',
             SURVEY_NAME,
             'FinalData',
             'Individual Datasets',
            'cnn_' + SATELLITE,
             'npy')

# List of npy files: keep only names of the form '<BAND>_<uid>.npy'.
# BAND is escaped so it is matched literally, and the '.npy' suffix is required
# so unrelated files in the folder never end up in the uid list.
NPY_PREFIX = BAND + '_'
NPY_SUFFIX = '.npy'
reg = re.compile(r'^' + re.escape(NPY_PREFIX) + r'.*' + re.escape(NPY_SUFFIX) + r'$')
# os.listdir returns names in arbitrary order; sort for a stable listing
NPY_FILES = sorted(filter(reg.search, os.listdir(NPY_PATH)))

# List of uids: strip exactly the leading prefix and the trailing suffix, so a
# uid that itself contains 'BRGB_' or '.npy' is left intact
uids = [file[len(NPY_PREFIX):-len(NPY_SUFFIX)] for file in NPY_FILES]

### Prepare Survey Data

In [None]:
#### Load survey data
survey_path = os.path.join(cf.DROPBOX_DIRECTORY, 'Data', SURVEY_NAME, 'FinalData',
                           'Individual Datasets', 'survey_socioeconomic.csv')
survey_df = pd.read_csv(survey_path)

#### Subset survey

# Keep rows where the target variable is present and where we have an
# associated numpy array. The explicit .copy() makes the result an independent
# frame, so the column assignments below do not write into a slice of the
# frame read from disk (avoids SettingWithCopyWarning).
has_target = survey_df[TARGET_VAR].notna()
has_array = survey_df['uid'].isin(uids)
survey_df = survey_df[has_target & has_array].copy()

#### Variable Clean/Add

# Prep target variable: round to whole numbers, then shift by one so the
# smallest class is 0
survey_df[TARGET_VAR] = survey_df[TARGET_VAR].round() - 1

# Add band name (matches the .npy file names without their extension)
survey_df['band_uid'] = BAND + '_' + survey_df['uid']

# Indicate Train/Test: each row is independently assigned to 'train' with
# probability 0.8 and to 'test' with probability 0.2
survey_df['traintest'] = np.random.choice(a = ['train', 'test'],
                                      p = [0.8, 0.2],
                                      size = survey_df.shape[0])

### Dictionaries for Generator

In [None]:
# Partition Dictionary: band-prefixed IDs grouped by their train/test assignment
partition = {
    split: survey_df.loc[survey_df['traintest'] == split, 'band_uid'].tolist()
    for split in ('train', 'test')
}

# Convenience handles to the same two lists
train_uids = partition['train']
test_uids = partition['test']

In [None]:
# Lookup from band-prefixed ID to its target value
labels = {
    band_uid: target
    for band_uid, target in zip(survey_df['band_uid'], survey_df[TARGET_VAR])
}

## Implement CNN

In [None]:
# Parameters
params = {'dim': (224,224),
          'batch_size': 32,
          'n_classes': 5,
          'n_channels': 3,
          'shuffle': True}

# Generators
training_generator = DataGenerator(partition['train'], labels, **params)

# Validation keeps a fixed order. DataGenerator serves only full batches, so
# shuffling would change which samples fall outside the last full batch and
# the validation loss would be computed on a different subset every epoch.
validation_generator = DataGenerator(partition['test'], labels,
                                     **{**params, 'shuffle': False})

In [None]:
# File the model checkpoint is saved to: ~/Desktop/CNN_DEPVAR.h5, resolved
# against the current user's home directory instead of one hard-coded account.
CNN_MODEL_PATH = os.path.join(os.path.expanduser('~'), 'Desktop', 'CNN_DEPVAR.h5')

In [None]:
# Build the model at the image size and class count the generators use
height, width = params['dim']
model = define_model_imagenet(height, width, params['n_classes'])

# Keep the returned History so the per-epoch metrics can be inspected or
# plotted in later cells instead of being discarded
history = evaluate_model(model, training_generator, validation_generator, CNN_MODEL_PATH)