In [None]:
'''
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/working'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
'''

In [None]:
# Seed every random-number source this notebook touches (Python hashing,
# TensorFlow, NumPy and the stdlib `random` module) with one shared constant.
RANDOM_SEED_CONSTANT = 42  # FOR_REPRODUCIBILITY

import os
# IMPORTANT: this line has to run *before* tensorflow is imported.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here may not affect the already-running kernel — TODO confirm.
os.environ['PYTHONHASHSEED']=str(RANDOM_SEED_CONSTANT)

import tensorflow   
tensorflow.random.set_seed(RANDOM_SEED_CONSTANT)

import numpy as np
np.random.seed(RANDOM_SEED_CONSTANT)

import random
random.seed(RANDOM_SEED_CONSTANT)

In [None]:
from tensorflow import keras
# Record the TensorFlow / Keras versions this notebook runs with.
print(tensorflow.__version__)
print(keras.__version__)

# NOTE(review): this imports `load_model` from the standalone `keras` package,
# while the rest of the notebook uses `tensorflow.keras`. The name is re-bound
# by the later `from tensorflow.keras.models import load_model` import.
from keras.models import load_model

# Prevent NHWC errors
#https://www.nuomiphp.com/eplan/en/50125.html
from tensorflow.keras import backend as K

# Switch the backend to channels-last if it is currently set to channels-first.
if K.image_data_format()=='channels_first':
    K.set_image_data_format('channels_last')

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import roc_auc_score, balanced_accuracy_score

In [None]:
def print_binaryclassif_perf(labels, predictions, p=0.5, verbose=True):
    """Print binary-classification metrics and draw the confusion matrix.

    Parameters
    ----------
    labels : numpy array
        Ground-truth class labels (0 or 1). Must have the same shape and type
        as `predictions`, and dtype int32 (checked by the asserts below).
    predictions : numpy array
        Predicted scores for the positive class (e.g. sigmoid outputs).
    p : float
        Decision threshold; a score strictly greater than `p` counts as class 1.
    verbose : bool
        When True, also print the four confusion-matrix cells.

    Raises
    ------
    AssertionError
        If `labels` and `predictions` disagree in shape or type, or if
        `labels` is not int32.
    """
    assert(labels.shape==predictions.shape)
    assert(type(labels)==type(predictions))

    # Keep the raw scores: ROC AUC is a ranking metric and must be computed on
    # them, not on the thresholded 0/1 predictions.
    scores = predictions
    predictions = (scores > p).astype('int32')
    assert(labels.dtype==predictions.dtype)

    # Pin the label order so the matrix is always 2x2, even when one class is
    # missing from this sample; otherwise the 4-way unpacking below would fail.
    cm = confusion_matrix(labels, predictions, labels=[0, 1])

    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    tn, fp, fn, tp  = cm.ravel()

    if verbose:
        print('True Negatives\t: ',  tn)
        print('False Positives\t: ', fp)
        print('False Negatives\t: ', fn)
        print('True Positives\t: ',  tp)

    # Specificity is undefined when there are no actual negatives.
    n_negatives = tn + fp
    specificity = tn / n_negatives if n_negatives else float('nan')

    print('Accuracy\t= {}'.format(accuracy_score(labels, predictions)))
    print('Precision\t= {}'.format(precision_score(labels, predictions)))
    print('TPR/Recall\t= {} (a.k.a Sensitivity)'.format(recall_score(labels, predictions)))
    print('TNR\t\t= {} (a.k.a Specificity)'.format(specificity))
    print('F1_score\t= {}'.format(f1_score(labels, predictions)))
    print('roc_auc\t\t = {}'.format(roc_auc_score(labels, scores)))
    print('Balanced acc\t = {}'.format(balanced_accuracy_score(labels, predictions)))

def print_metrics_on(model, ds):
    '''Print classification metrics of `model` on a batched dataset.

    Dataset can be val_ds or train_ds or test_ds; it must yield
    (images, labels) batches.

    Labels and predictions are gathered in a SINGLE pass over `ds`. Iterating
    the dataset once for the labels and a second time inside `model.predict`
    is unsafe: a dataset built with shuffling enabled can yield its batches in
    a different order on each pass, which silently misaligns labels and
    predictions.
    '''
    batch_labels = []
    batch_scores = []
    for images, targets in ds:
        batch_labels.append(np.asarray(targets))
        # Predict on exactly the batch whose labels were just recorded.
        batch_scores.append(np.asarray(model.predict_on_batch(images)))

    ytrue_val = np.concatenate(batch_labels, axis=0).flatten()
    ytrue_val = ytrue_val.astype('int32')
    ypred_val = np.concatenate(batch_scores, axis=0).flatten()
    print_binaryclassif_perf(ytrue_val, ypred_val)

def plot_learning_history(history, filetosave=None):
    """Plot every metric recorded in `history.history` on one figure.

    The y-axis is clamped to [0, 1] and a grid is drawn. The figure is shown
    when `filetosave` is None, otherwise it is written to that path.
    """
    curves = pd.DataFrame(history.history)
    curves.plot(figsize=(8, 5))
    plt.grid(True)
    current_axes = plt.gca()
    current_axes.set_ylim(0, 1)

    if filetosave is not None:
        plt.savefig(filetosave)
        return
    plt.show()

In [None]:
# Kaggle input locations: the directory holding the face images and the CSV
# that is read later for the per-image `glasses` label.
IMAGES_DIR    = '/kaggle/input/glasses-or-no-glasses/faces-spring-2020/faces-spring-2020/'
PATH_TRAINCSV = '/kaggle/input/glasses-or-no-glasses/train.csv'

In [None]:
# Sanity-check the input directory. Print only a small sample plus the total
# count instead of every path, so the cell output stays readable.
MAX_FILES_TO_PRINT = 10
image_paths = [os.path.join(dirname, filename)
               for dirname, _, filenames in os.walk(IMAGES_DIR)
               for filename in filenames]
for image_path in image_paths[:MAX_FILES_TO_PRINT]:
    print(image_path)
print('{} files in total under {}'.format(len(image_paths), IMAGES_DIR))

In [None]:
# Prepare the data by creating two directories of glass and noglass images.
# exist_ok=True keeps this cell idempotent: re-running it (without restarting
# the session) no longer raises FileExistsError.
DIR_IMAGES_SUBFOLDERED = '/kaggle/working/images_ord'
for class_label in ('0', '1'):
    os.makedirs(os.path.join(DIR_IMAGES_SUBFOLDERED, class_label), exist_ok=True)

In [None]:
# Link every image listed in the CSV into the sub-folder named after its
# `glasses` label, which is read as a string so it can be used directly as a
# directory name. Symlinks are used instead of copying the files.
df = pd.read_csv(PATH_TRAINCSV, dtype={'glasses':str})
for row in df.itertuples():
    srcimg       = 'face-{}.png'.format(row.id)
    path_srcimg  = os.path.join(IMAGES_DIR, srcimg)
    path_dstimg  = os.path.join(DIR_IMAGES_SUBFOLDERED, row.glasses, srcimg)
    # lexists() is also True for a dangling symlink; exists() would report
    # False there and the os.symlink call below would then raise FileExistsError.
    if not os.path.lexists(path_dstimg):
        os.symlink(path_srcimg, path_dstimg)

In [None]:
BATCH_SIZE = 32 # defaults for most functions in tf2
IMG_HEIGHT = 56
IMG_WIDTH  = 56
VALIDATION_SPLIT = 0.1


def _load_image_subset(subset):
    """Load one side ('training' or 'validation') of the train/validation split.

    Both subsets must be created with identical arguments (in particular the
    same seed and validation_split), so the shared settings live here rather
    than being duplicated in two calls.

    NOTE(review): no `shuffle` argument is passed, so the library default
    applies. If that default shuffles, batch order may differ between passes
    over the returned dataset — TODO confirm.
    """
    return keras.preprocessing.image_dataset_from_directory(
        directory=DIR_IMAGES_SUBFOLDERED,
        label_mode='binary',
        color_mode='rgb',
        image_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=BATCH_SIZE,
        seed=RANDOM_SEED_CONSTANT,
        validation_split=VALIDATION_SPLIT,
        subset=subset)


train_ds = _load_image_subset('training')
val_ds = _load_image_subset('validation')

In [None]:
def make_vgg16_based_cnn(BIAS_OPDENSE_LAYER=None, verbose=True):
    """Build and compile a binary classifier on top of a frozen VGG16 backbone.

    The backbone is VGG16 with ImageNet weights and without its classification
    head. On top of it sit Flatten -> Dense(256, relu) -> Dropout(0.15) ->
    Dense(1, sigmoid).

    Parameters
    ----------
    BIAS_OPDENSE_LAYER : float or None
        Optional constant initial bias for the output layer, to counteract
        imbalanced classes. None means a zero bias.
    verbose : bool
        When True, print the number of trainable weights of the backbone
        before and after freezing it.

    Returns
    -------
    The compiled Sequential model (binary cross-entropy loss, RMSprop
    optimizer, metrics 'acc' and 'auc').
    """
    # To counteract the impact of imbalanced classes, we add an output bias
    if BIAS_OPDENSE_LAYER is not None:
        output_bias = keras.initializers.Constant(BIAS_OPDENSE_LAYER)
    else:
        output_bias = 'zeros'

    init_glorotuni = keras.initializers.GlorotUniform(seed=RANDOM_SEED_CONSTANT)

    model_backbone  = VGG16(weights='imagenet',
                      include_top=False,          # Don't use the prediction part of this network. We don't need it
                      input_shape=(IMG_HEIGHT, IMG_WIDTH, 3))

    print('Backbone model')
    # summary() prints by itself; wrapping it in print() only adds a stray "None".
    model_backbone.summary()

    # Freeze the backbone unconditionally. It used to happen only inside
    # `if verbose:`, so verbose=False silently left all of VGG16 trainable.
    n_trainable_before = len(model_backbone.trainable_weights)
    model_backbone.trainable = False
    if verbose:
        print('No. of trainable weights before freezing the model_backbone:', n_trainable_before)
        print('No. of trainable weights after freezing the model_backbone:', len(model_backbone.trainable_weights))

    model = keras.models.Sequential()
    model.add(model_backbone)
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(256, activation='relu', kernel_initializer=init_glorotuni))
    model.add(keras.layers.Dropout(rate=0.15))
    model.add(keras.layers.Dense(1, activation='sigmoid',kernel_initializer=init_glorotuni, 
                           bias_initializer=output_bias))

    print('Model to do transfer learning')
    model.summary()

    # `learning_rate` replaces the deprecated `lr` argument name.
    rmsprop = keras.optimizers.RMSprop(learning_rate=2e-5)

    # Set information about which loss function and what optimization algorithm we will use to optimize it.
    # The AUC metric is named explicitly so its key stays 'auc' / 'val_auc'
    # even when this function is called more than once in a session.
    model.compile(loss='binary_crossentropy', 
                  optimizer=rmsprop, 
                  metrics=['acc', tensorflow.keras.metrics.AUC(name='auc')])

    print('No. of trainable tensors after compilation:', len(model.trainable_weights))

    return model

In [None]:
# Build the model with the default arguments (no custom output bias, verbose printing on).
model_cnn = make_vgg16_based_cnn()

In [None]:
# Early-stopping callback: watches validation accuracy ('val_acc', higher is
# better) with a patience of 5 epochs and restore_best_weights enabled.
# NOTE(review): this callback is not passed to the `model_cnn.fit(...)` call
# in this notebook, so as written it has no effect on training.
early_stopping = tensorflow.keras.callbacks.EarlyStopping(
                 monitor='val_acc', 
                 verbose=1,
                 patience=5,
                 mode='max',
                 restore_best_weights=True)

In [None]:
# Display the GPU devices visible to TensorFlow.
tensorflow.config.list_physical_devices('GPU')

In [None]:
# Train for 2 epochs of 10 batches each, validating on val_ds after every epoch.
# NOTE(review): these small values look like a quick smoke-test setting — TODO confirm.
# NOTE(review): no `callbacks` argument is given, so the `early_stopping`
# callback defined earlier is not used here.
history =  model_cnn.fit(
                train_ds,
                epochs=2,
                steps_per_epoch=10,
                validation_data=val_ds)

In [None]:
#model_cnn.save('/kaggle/working/model_cnn_trained.h5')

In [None]:
# These values should correspond to the val_loss, val_acc and val_auc printed
# for epoch 2 of the training run above.
model_cnn.evaluate(val_ds)

In [None]:
#plot_learning_history(history)

In [None]:
# Confusion matrix and classification metrics of the trained model on the validation set.
print_metrics_on(model_cnn, val_ds)

In [None]:
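Why are `preds1` and `preds2` below different? Both cells run the same `model_cnn.predict(val_ds)` call on the same trained model, so the model itself is not the cause. The likely explanation is ordering: `val_ds` was created by `image_dataset_from_directory` without `shuffle=False`, so the dataset is reshuffled on every pass and the same images come back in a different order each time `predict` iterates over it.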

In [None]:
# First of two identical prediction passes over val_ds; print the first and
# last five scores so they can be compared with the second pass.
preds1 = model_cnn.predict(val_ds).flatten() 
print(preds1[0:5])
print(preds1[-5:])

In [None]:
# Second prediction pass over val_ds with the exact same call; print the first
# and last five scores so they can be compared with the first pass.
preds2 = model_cnn.predict(val_ds).flatten() 
print(preds2[0:5])
print(preds2[-5:])