<a href="https://colab.research.google.com/github/worldbank/Pakistan-Poverty-from-Sky/blob/master/DataWork/03_predict_ntl_with_dtl/03_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# __Predicting NTL using DTL__

__Code in Github:__ _/DataWork/03_predict_ntl_with_dtl/02_cnn.ipynb

## **Filepaths and Libraries**

In [2]:
PARAM_NAME = "Nbands3_nNtlBins3_minNTLbinCount16861"

In [3]:
# Set seeds. Note that using a GPU can still introduce randomness.
# (also not taking into account tensorflow randomness)
from numpy.random import seed
seed(42)

In [5]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
#drive.flush_and_unmount()

In [7]:
# Filepaths
import os, datetime

CNN_DIR = os.path.join('/content/drive', 'My Drive', 'World Bank', 'Pakistan Poverty Estimation', 'Data', 'CNN', PARAM_NAME)

CNN_FILENAME = os.path.join(CNN_DIR, 'script_CNN.h5')
CNN_PARAMS_FILENAME = os.path.join(CNN_DIR, 'CNN_parameters.json')
NTL_FILENAME = os.path.join(CNN_DIR, 'ntl.npy')
NTL_CONT_FILENAME = os.path.join(CNN_DIR, 'ntl_continuous.npy')
DTL_FILENAME = os.path.join(CNN_DIR, 'dtl.npy')
PREDICTION_FILENAME = os.path.join(CNN_DIR, 'cnn_predictions_truth_values.csv')
PREDICTION_FILENAME_CONT = os.path.join(CNN_DIR, 'cnn_predictions_continuous_truth_values.csv')

In [8]:
import os, datetime
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, GlobalAveragePooling2D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.applications.vgg16 import VGG16
from keras.applications.inception_v3 import preprocess_input

import logging, os 
import random
import tensorflow as tf

## **Functions**

In [9]:
def transform_target(gdf, orig_target_name, n_bins):
    '''
    Creates log NTL variable and bins into 5 classes using k-means clutering.
    '''
    # Perform log(x+1) for defined domain
    transformed_target_name = f'log_{orig_target_name}'
    gdf[transformed_target_name] = np.log(gdf[orig_target_name] + 1)
    # Bin target
    target = gdf[transformed_target_name].to_numpy().reshape(-1,1)
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='kmeans')
    gdf[FINAL_TARGET_NAME] = discretizer.fit_transform(target)

def normalize(X):
    '''
    Normalizes features.
    '''
    return X.astype('float32') / 255.0

def define_model_imagenet(height, width, channels, num_classes):
    '''
    Defines and compiles CNN model.
    
    Inputs:
        height, width, channels, num_classes (int)
    Returns:
        model (keras.Model object)
    '''

    # https://medium.com/abraia/first-steps-with-transfer-learning-for-custom-image-classification-with-keras-b941601fcad5
    # https://towardsdatascience.com/cnn-transfer-learning-fine-tuning-9f3e7c5806b2

    #### Base model
    input_shape = (height, width, channels)
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=input_shape)

    for layer in base_model.layers:
        layer.trainable = False

    #### Model Customization
    # We take the last layer of our the model and add it to our classifier
    last = base_model.layers[-1].output
    x = Flatten()(last)
    x = Dense(100, activation='relu', name='fc1')(x)
    x = Dropout(0.3)(x)
    x = Dense(num_classes, activation='softmax', name='predictions')(x)
    model = Model(base_model.input, x)
    # We compile the model
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

    return model

def define_model_imagenet_cont(height, width, channels, num_classes):
    '''
    Defines and compiles CNN model [continuous].
    
    Inputs:
        height, width, channels, num_classes (int)
    Returns:
        model (keras.Model object)
    '''

    # https://medium.com/abraia/first-steps-with-transfer-learning-for-custom-image-classification-with-keras-b941601fcad5
    # https://towardsdatascience.com/cnn-transfer-learning-fine-tuning-9f3e7c5806b2

    #### Base model
    input_shape = (height, width, channels)
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=input_shape)

    for layer in base_model.layers:
        layer.trainable = False

    #### Model Customization
    # We take the last layer of our the model and add it to our classifier
    last = base_model.layers[-1].output
    x = Flatten()(last)
    x = Dense(100, activation='relu', name='fc1')(x)
    x = Dropout(0.3)(x)
    #x = Dense(num_classes, activation='softmax', name='predictions')(x)
    x = Dense(1, kernel_initializer='normal')(x)

    model = Model(base_model.input, x)
    # We compile the model
    model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['accuracy'])

    return model


def evaluate_model(model, trainX, trainY, testX, testY):
    '''
    Fits model, evaluates model, saves best model over epochs and cross-validations.
    
    Inputs:
        model (CNN model) keras.Model object
        trainX, trainY (numpy.ndarray) 4D array of DTL features and 2D array of targets for training
        testX, testY (numpy.ndarray) 4D array of DTL features and 2D array of targets for testing
        current_kfold (int) iteration in kfold cross-val, default=None for no cross-val
        display_metrics (bool) Default=False
    Returns:
        None
    # https://towardsdatascience.com/step-by-step-guide-to-using-pretrained-models-in-keras-c9097b647b29
    '''

    # Use early stopping to help with overfitting
    es = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=False)

    # Save best model based on accuracy
    mc = ModelCheckpoint(CNN_FILENAME, monitor='val_loss', mode='min', 
                         verbose=True, save_best_only=True)

    # Fit model
    model.fit(trainX, trainY, 
            epochs=100, 
            batch_size=500, 
            validation_data=(testX, testY), 
            callbacks=[es, mc], 
            verbose=False)

    # Show accuracy
    loss, accuracy = model.evaluate(testX, testY, verbose=False)
    print(f'                              Accuracy: {accuracy}')

    #return model
        

def evaluate_with_crossval(model, dataX, dataY, k=2):
    '''
    Performs evaulation with K-fold cross validation.
    
    Inputs:
        model (keras.Model object)
        dataX, dataY (numpy.ndarray) 4D array of DTL features and 2D array of targets 
                                     for training
        k (int)
    Returns:
        None
    '''
    # Define k-fold cross-val
    kfold = KFold(k, shuffle=True, random_state=1)
    # Loop through folds
    count = 1
    for train_idx, test_idx in kfold.split(dataX):
        print(f'{datetime.datetime.now()}    --- Current K-fold: {count} ---')
        # Select subsets for training and testing
        trainX, trainY, testX, testY = dataX[train_idx], dataY[train_idx], \
                                       dataX[test_idx], dataY[test_idx]
        # Pass to evaluate_model function
        evaluate_model(model, trainX, trainY, testX, testY)
        count += 1

def display_eval_metrics(model, testX, testY, n_ntl_bins):
    '''
    Displays evaluation metrics for a given trained model.
    '''
    # Get predictions
    predY = model.predict(testX)
    predY = np.argmax(predY, axis = 1)
    testY_bins = np.argmax(testY, axis = 1)
    # Generate classification report
    classes = ['Radiance Level %01d' %i for i in range(1,n_ntl_bins+1)]
    print(classification_report(testY_bins, predY, target_names=classes))

## **Load Parameters**

In [10]:
with open(CNN_PARAMS_FILENAME, 'r') as fp:
    cnn_param_dict = json.load(fp)

N_bands = cnn_param_dict['N_bands']
n_ntl_bins = cnn_param_dict['n_ntl_bins']
image_height = cnn_param_dict['image_height']
image_width = cnn_param_dict['image_width']
bands = cnn_param_dict['bands']
min_ntl_bin_count = cnn_param_dict['bands']

## **Load and Prep Data**

In [11]:
# Load Data
NTL = np.load(NTL_FILENAME)
DTL = np.load(DTL_FILENAME)

# SPLIT DATA INTO TRAINING AND TESTING
trainX, testX, raw_trainY, raw_testY = train_test_split(DTL, NTL, 
                                                        test_size=0.2)


# PREP TRAINING AND TESTING DATA
trainY = to_categorical(raw_trainY)
testY = to_categorical(raw_testY)


#print(np.unique(NTL, return_counts=True))

#print(np.unique(raw_trainY, return_counts=True))
#print(np.unique(raw_testY, return_counts=True))

#print(np.unique(trainY, return_counts=True))
#print(np.unique(testY, return_counts=True))

# PREP PIXELS IN FEATURES
trainX, testX = normalize(trainX), normalize(testX)

In [12]:
# Prep Continuous X and Ys
NTL = np.load(NTL_CONT_FILENAME)

NTL_LOG = np.log(NTL + 1)

trainX_cont, testX_cont, raw_trainY_cont, raw_testY_cont = train_test_split(DTL, NTL_LOG, 
                                                                            test_size=0.2)

trainX_cont, testX_cont = normalize(trainX_cont), normalize(testX_cont)

In [13]:
model_cont = define_model_imagenet_cont(image_height, image_width, N_bands, n_ntl_bins)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [14]:
evaluate_model(model_cont, trainX_cont, raw_trainY_cont, testX_cont, raw_testY_cont)


Epoch 00001: val_loss improved from inf to 0.66457, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00002: val_loss improved from 0.66457 to 0.61604, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00003: val_loss improved from 0.61604 to 0.59966, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00004: val_loss improved from 0.59966 to 0.59099, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00005: val_loss improved from 0.59099 to 0.57950, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00006: val_l

In [15]:
best_model = load_model(CNN_FILENAME)

### Save Dataframe of Predicted Values

# Predict Values
predY = best_model.predict(testX_cont)
#predY = np.argmax(predY, axis = 1)
#testY_bins = np.argmax(testY, axis = 1)

# Make Dataframe
results_df = pd.DataFrame({'predY': predY[:,0], 'testY': raw_testY_cont})

# Save Dataframe
results_df.to_csv(PREDICTION_FILENAME_CONT, index=False)  

In [None]:
#results_df.head()
#raw_testY_cont
#predY
results_df.head()

Unnamed: 0,predY,testY
0,2.517897,2.367192
1,7.907559,8.239053
2,3.862707,0.117567
3,-2.416992,0.217285
4,1.38027,0.675854


## **Run Model**

In [None]:
model = define_model_imagenet(image_height, image_width, N_bands, n_ntl_bins)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
evaluate_model(model, trainX, trainY, testX, testY)


Epoch 00001: val_loss improved from inf to 0.79896, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00002: val_loss improved from 0.79896 to 0.77657, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00003: val_loss improved from 0.77657 to 0.75853, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00004: val_loss improved from 0.75853 to 0.75729, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00005: val_loss improved from 0.75729 to 0.74811, saving model to /content/drive/My Drive/World Bank/Pakistan Poverty Estimation/Data/CNN/Nbands3_nNtlBins3_minNTLbinCount16861/script_CNN.h5

Epoch 00006: val_l

In [None]:
# DISPLAY IN-DEPTH EVALUTAION METRICS
best_model = load_model(CNN_FILENAME)
display_eval_metrics(model, testX, testY, n_ntl_bins)

                  precision    recall  f1-score   support

Radiance Level 1       0.80      0.73      0.77      3420
Radiance Level 2       0.63      0.68      0.66      3429
Radiance Level 3       0.65      0.65      0.65      3268

        accuracy                           0.69     10117
       macro avg       0.69      0.69      0.69     10117
    weighted avg       0.69      0.69      0.69     10117



In [None]:
### Save Dataframe of Predicted Values

# Predict Values
predY = best_model.predict(testX) # model.predict(testX)
predY = np.argmax(predY, axis = 1)
testY_bins = np.argmax(testY, axis = 1)

# Make Dataframe
results_df = pd.DataFrame({'predY': predY, 'testY': testY_bins})

# Save Dataframe
results_df.to_csv(PREDICTION_FILENAME, index=False)  