In [1]:
import datetime
import os
from collections import defaultdict
import gc
from joblib import dump, load
import random
import time
from typing import List, Tuple

from albumentations import (
    Compose, Flip, Rotate
)

import dask
import dask.array as da
from google.cloud import storage
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, precision_score, precision_recall_curve

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.metrics import Accuracy

from data_science.augmented_image_sequence_from_npy import AugmentedImageSequenceFromNpy
from data_science.keras.model_checkpoint_gcs import ModelCheckpointGCS
from data_science.keras.cnn_models import basic_cnn_model, basic_cnn_model_with_best_practices
from data_science.serialization_utils import numpy_to_json, sklearn_precision_recall_curve_to_dict
from data_science.sklearn_batch_generator import SklearnBatchGenerator

In [None]:
n_classes = 1
n_epochs = 100
batch_size = 128

early_stopping_patience = 6

use_small_dataset = True
use_random_small_dataset = False

In [2]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/app/.gcs/big-earth-252219-fb2e5c109f78.json'

In [3]:
pal = sns.color_palette()

random_seed = 0
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

root = '/home/jovyan/work/data/big_earth'
project_name = "cloud_and_shadow"
log_dir = os.path.join(root, "model/logs")
model_dir = os.path.join(root, "model/models")
gcs_model_dir = "gcs://big_earth/model/models"

for directory in [log_dir, model_dir]:
    if not os.path.exists(directory):
        os.mkdir(directory)

In [4]:
gcs_client = storage.Client()
bucket = gcs_client.bucket("big_earth")

In [5]:
import json
# np.array(json.loads(df['binarized_labels'].iloc[0])).shape

def prepare_data(df):
    df['has_cloud_and_shadow_target'] = df['has_cloud_and_shadow_target'].apply(lambda x: np.array(json.loads(x)))
    df['binarized_labels'] = df['binarized_labels'].apply(lambda x: np.array(json.loads(x)))    
    df['image_path'] = root + "/npy_image_files/" + df['image_prefix'] + ".npy"
    return df

df = pd.read_csv(root + "/metadata/metadata.csv")
df = prepare_data(df)
print(df['binarized_labels'].iloc[0].shape)
print(df['has_cloud_and_shadow_target'].iloc[0].shape)
df = df.set_index('image_prefix', drop=False)

(1, 44)
(1,)


In [6]:
# has_cloud_and_shadow = df[df['has_cloud_and_shadow'] == 1]
# sample_no_cloud_and_shadow = df[df['has_cloud_and_shadow'] == 0].sample(
#     n=len(has_cloud_and_shadow), random_state=random_seed)

# print("len(sample_no_cloud_and_shadow)", len(sample_no_cloud_and_shadow), "len(has_cloud_and_shadow)", 
#       len(has_cloud_and_shadow))

# train, valid, test = balanced_class_train_test_splits(*[sample_no_cloud_and_shadow, has_cloud_and_shadow])

In [7]:
# len(pd.read_csv(root + '/google_automl_cloud_and_shadow_dataset.csv'))

google_automl_dataset = pd.read_csv(root + '/google_automl_cloud_and_shadow_dataset_small.csv')
google_automl_dataset['image_prefix'] = google_automl_dataset['gcs_uri'].str.split('/').apply(lambda x: x[-1].replace(".png", ""))
google_automl_dataset = google_automl_dataset.set_index('image_prefix', drop=False)

train = df.loc[google_automl_dataset[google_automl_dataset['set'] == 'TRAIN'].index]
valid = df.loc[google_automl_dataset[google_automl_dataset['set'] == 'VALIDATION'].index]
test = df.loc[google_automl_dataset[google_automl_dataset['set'] == 'TEST'].index]

print(len(train), len(valid), len(test))
print(len(train) + len(valid) + len(test) == len(google_automl_dataset))

1907 253 240
True


In [8]:
# npy_image_dir = root + "/npy_image_files"
# npy_files = [npy_image_dir + "/" + file + ".npy" for file in train['image_prefix'].values]
# start = time.time()
# stats = stats_for_numpy_images(npy_files,  use_test_data=False)
# stats.to_csv('cloud_and_shadow_stats.csv', index_label='band')
# print(time.time() - start)

stats = pd.read_csv('cloud_and_shadow_stats.csv', index_col='band')

In [9]:
def join_histories(histories):
    full_history = defaultdict(list)

    for history in histories:
        for key, value in history.history.items():
            full_history[key].extend(value)
    return full_history


def graph_model_history(history):
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    t = f.suptitle('Basic CNN Performance', fontsize=12)
    f.subplots_adjust(top=0.85, wspace=0.3)

    max_epoch = len(history['val_loss']) + 1
    epoch_list = list(range(1, max_epoch))
    ax1.plot(epoch_list, history['accuracy'], label='Train Accuracy')
    ax1.plot(epoch_list, history['val_accuracy'], label='Validation Accuracy')
    ax1.set_xticks(np.arange(1, max_epoch, 5))
    ax1.set_ylabel('Accuracy Value')
    ax1.set_xlabel('Epoch')
    ax1.set_title('Accuracy')
    l1 = ax1.legend(loc="best")

    ax2.plot(epoch_list, history['loss'], label='Train Loss')
    ax2.plot(epoch_list, history['val_loss'], label='Validation Loss')
    ax2.set_xticks(np.arange(1, max_epoch, 5))
    ax2.set_ylabel('Loss Value')
    ax2.set_xlabel('Epoch')
    ax2.set_title('Loss')
    l2 = ax2.legend(loc="best")


def predict(model, model_path, x, batch_size, n_classes):
    thresholds = np.array([0.5 for _ in range(n_classes)])
    model.load_weights(model_path)

    predict_generator = AugmentedImageSequenceFromNpy(x=x, y=None, batch_size=batch_size,
                                                        augmentations=AUGMENTATIONS_TEST)
    # Generators
    pred_test_probs = model.predict_generator(predict_generator)
    pred_test_labels = pd.DataFrame(pred_test_probs, columns=classes)
    pred_test_labels = pred_test_labels.apply(lambda x: x > thresholds, axis=1)
    # Convert boolean predictions to labels
    pred_test_lables = pred_test_labels.apply(lambda row: ' '.join(row[row].index), axis=1)

    del predict_generator
    gc.collect()

    return pred_test_labels

AUGMENTATIONS_TRAIN = Compose([
    Flip(p=0.5),
    Rotate(limit=(0, 360), p=0.5)
])

AUGMENTATIONS_TEST = Compose([])

In [10]:
x_train = train['image_path'].values
x_valid = valid['image_path'].values
x_test = test['image_path'].values

target = 'has_cloud_and_shadow_target'
y_train = np.stack(train[target].values)
y_valid = np.stack(valid[target].values)
y_test = np.stack(test[target].values)

print(y_train.shape, y_train[0].shape)

if use_small_dataset:
    n_epochs = 3
    x_train = np.concatenate([x_train[:50], x_train[-50:]])
    x_valid = np.concatenate([x_valid[:50], x_valid[-50:]])
    x_test = np.concatenate([x_test[:50], x_test[-50:]])

    y_train = np.concateate([y_train[:50], y_train[-50:]])
    y_valid = np.concatenate([y_valid[:50], y_valid[-50:]])
    y_test = np.concatenate([y_test[:50], y_test[-50:]])
elif use_random_small_dataset:
    shape = (100, 1)
    x_train = np.concatenate([np.ones(shape), np.zeros(shape)])
    y_train = np.concatenate([np.ones(shape), np.zeros(shape)])

    x_valid = np.concatenate([np.ones(shape), np.zeros(shape)])
    y_valid = np.concatenate([np.ones(shape), np.zeros(shape)])

    y_train = np.random.randint(0, 2, (len(train), 44))
    y_valid = np.random.randint(0, 2, (len(valid), 44))
    y_test = np.random.randint(0, 2, (len(test), 44))
    y_test_labels = test['labels'].values

a = AugmentedImageSequenceFromNpy(x=x_train, y=y_train,
                                  batch_size=batch_size,
                                  augmentations=AUGMENTATIONS_TRAIN, stats=stats)

for x, y in a:
    print(x.shape, y.shape)
    break

a.on_epoch_end()

(1907, 1) (1,)
(100, 120, 120, 3) (100, 1)


In [11]:
from sklearn.linear_model import LogisticRegression

# sanity check the generator output
train_batch_generator = SklearnBatchGenerator(x_train, y_train, batch_size, AUGMENTATIONS_TRAIN, stats)
valid_batch_generator = SklearnBatchGenerator(x_valid, y_valid, batch_size, AUGMENTATIONS_TEST, stats)

train_batch_generator.on_epoch_end()
valid_batch_generator.on_epoch_end()

clf = LogisticRegression()
x, y = train_batch_generator[0]
print(x.shape, x[0].shape, y.shape, y[0].shape)
clf.fit(x, y)

x, y = valid_batch_generator[0]
pred = clf.predict(x)
accuracy_score(pred, y)

(100, 43200) (43200,) (100,) ()




0.58

In [12]:
import json
import os

def get_model_and_metadata_from_gcs(bucket, model_dir, model_file_ext, model_load_func, gcs_model_dir, experiment_name):
    model_and_metadata_filepath = os.path.join(model_dir, experiment_name)
    metadata_filepath = f"{model_and_metadata_filepath}_metadata.json"
    model_filepath = f"{model_and_metadata_filepath}.{model_file_ext}"
    
    gcs_model_and_metadata_filepath = os.path.join(gcs_model_dir, experiment_name)
    gcs_metadata_filepath = f"{gcs_model_and_metadata_filepath}_metadata.json"
    gcs_model_filepath = f"{gcs_model_and_metadata_filepath}.{model_file_ext}"
    
    print(gcs_model_filepath)

    gcs_metadata_blob = bucket.blob(gcs_metadata_filepath)
    gcs_model_blob = bucket.blob(gcs_model_filepath)
    
    if gcs_metadata_blob.exists():
        print('Loading previously trained model.')
        gcs_metadata_blob.download_to_filename(metadata_filepath)

        with open(metadata_filepath, 'r') as json_file:
            model_metadata = json.load(json_file)

        gcs_model_blob.download_to_filename(model_filepath)

        model = model_load_func(model_filepath)
        return model, model_metadata

    return None, None

In [13]:
import joblib 

history = list()

train_batch_generator = SklearnBatchGenerator(x_train, y_train, batch_size, AUGMENTATIONS_TRAIN, stats)
valid_batch_generator = SklearnBatchGenerator(x_valid, y_valid, batch_size, AUGMENTATIONS_TEST, stats)

classes = np.array([0, 1])
epochs_without_improvement = 0

now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
experiment_name = f"sgd_classifier_default_2020_1_31"
gcs_model_dir = "model/models"
model_path = os.path.join(model_dir, experiment_name + ".joblib")
model_gcs_path = os.path.join(gcs_model_dir, experiment_name + ".joblib")
model_metadata_path = os.path.join(model_dir, experiment_name + "_metadata.json")
model_metadata_gcs_path = os.path.join(gcs_model_dir, experiment_name + "_metadata.json")

model, model_base_metadata = get_model_and_metadata_from_gcs(bucket, model_dir, "joblib", joblib.load, gcs_model_dir, 
                                                             experiment_name)

if model is not None:
    print('Resuming training at epoch', model_base_metadata['epoch'])
else:
    model = SGDClassifier()
    model_base_metadata = {
        'data': 'train_valid_google_automl_cloud_and_shadow_dataset_small.csv',
        'data_prep': 'normalization_augmentation',
        'experiment_name': experiment_name,
        'experiment_start_time': now,
        'model': SGDClassifier.__name__,
        'random_state': random_seed
    }
        
# Shuffle the data
train_batch_generator.on_epoch_end()
valid_batch_generator.on_epoch_end()
train_start = time.time()
best_model = None
for epoch in range(int(model_base_metadata['epoch']) + 1, n_epochs):
    start = time.time()
    for batch_x, batch_y in train_batch_generator.make_one_shot_iterator():
        model.partial_fit(batch_x, batch_y, classes=classes)
    
    if epoch % 10 == 0:
        print("training completed in", time.time() - start, "seconds")
    
    start = time.time()
    
    actual_y_train, pred_y_train = train_batch_generator.get_predictions(clf)
    actual_y_valid, pred_y_valid = valid_batch_generator.get_predictions(clf)
    
    if epoch % 10 == 0:
        print("prediction completed in", time.time() - start, "seconds")

    epoch_time = f"{time.time() - start:.4f}"
    epoch_metrics = {
        'accuracy_train': sklearn.metrics.accuracy_score(actual_y_train, pred_y_train),
        'accuracy_valid': sklearn.metrics.accuracy_score(actual_y_valid, pred_y_valid),        
        "f1_score_train": sklearn.metrics.f1_score(actual_y_train, pred_y_train),
        "f1_score_valid": sklearn.metrics.f1_score(actual_y_valid, pred_y_valid),        
    }
    history.append(epoch_metrics)
    
    print("epoch_num", epoch, "-", epoch_time, "sec -", epoch_metrics['accuracy_valid'])
        
    if len(history) < 2:
        continue
        
    if epoch_metrics['accuracy_valid'] <= history[-2]['accuracy_valid']:
        epochs_without_improvement += 1
    else:
        dump(model, model_path)
        with open(model_metadata_path, 'w+') as json_file:
            model_base_metadata.update({
                'epoch': str(epoch),
                'confusion_matrix': numpy_to_json(confusion_matrix(actual_y_valid, pred_y_valid)),
                'precision_recall_curve': sklearn_precision_recall_curve_to_dict(
                    sklearn.metrics.precision_recall_curve(actual_y_valid, pred_y_valid)),
                'history': history,
                'train_time_elapsed': time.time() - train_start
            })
            json.dump(model_base_metadata, json_file)
        
        for filename, gcs_filename in [(model_path, model_gcs_path), (model_metadata_path, model_metadata_gcs_path)]:
            blob = bucket.blob(gcs_filename)
            blob.upload_from_filename(filename)
            
        epochs_without_improvement = 0
    
    if epochs_without_improvement == early_stopping_patience:
        print("Ending training due to no improvement")
        break
    
    
    train_batch_generator.on_epoch_end()
    valid_batch_generator.on_epoch_end()


model/models/sgd_classifier_default_2020_1_31.joblib
Loading previously trained model.
Resuming training at epoch 5
epoch_num 6 - 2.6665 sec - 0.58
epoch_num 7 - 1.7891 sec - 0.58
epoch_num 8 - 2.0057 sec - 0.58
epoch_num 9 - 2.1827 sec - 0.58
training completed in 1.1429970264434814 seconds
prediction completed in 2.1457488536834717 seconds
epoch_num 10 - 2.1466 sec - 0.58
epoch_num 11 - 2.0036 sec - 0.58
epoch_num 12 - 3.6074 sec - 0.58
Ending training due to no improvement


In [14]:
import json
import time
import warnings

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.lib.io import file_io


class ModelCheckpointGCS(ModelCheckpoint):
    """
    Computes scikit-learn metrics on train and validation data whenever model reaches a new high "monitor" value. Saves
    model and training metadata to disk and gcs. Assumes GOOGLE_APPLICATION_CREDENTIALS has been set.
    """

    def __init__(self, filepath, gcs_filepath, gcs_bucket, model_metadata, monitor='val_loss', verbose=0, mode='auto', period=1):
        model_filepath = f"{filepath}.h5"
        super(ModelCheckpointGCS, self).__init__(filepath=model_filepath, monitor=monitor, verbose=verbose,
                                                 save_best_only=True, save_weights_only=False,
                                                 mode=mode, period=period)
        self.model_filepath = model_filepath
        self.model_metadata_filepath = f"{filepath}_metadata.json"
        self.gcs_bucket = gcs_bucket
        self.gcs_model_filepath = f"{gcs_filepath}.h5"
        self.gcs_model_metadata_filepath = f"{gcs_filepath}_metadata.json"
        self.model_metadata = model_metadata
        self.train_start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """
        Based on
        https://github.com/tensorflow/tensorflow/blob/r2.1/tensorflow/python/keras/callbacks.py#L983
        :param epoch:
        :param logs:
        :return:
        """
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            warnings.warn('Can save best model only with %s available, '
                          'skipping.' % (self.monitor), RuntimeWarning)
        else:
            if self.monitor_op(current, self.best):
                if self.verbose > 0:
                    print('Epoch %05d: %s improved from %0.5f to %0.5f,'
                          ' saving model to %s'
                          % (epoch, self.monitor, self.best,
                             current, self.model_filepath))
                self.best = current

                # Save model
                self.model.save(self.model_filepath, overwrite=True)

                blob = self.gcs_bucket.blob(self.gcs_model_filepath)
                blob.upload_from_filename(self.model_filepath)

                self.model_metadata.update({
                    'epoch': str(epoch),
                    'history': {key: value.astype(np.float64) for key, value in logs.items()},
                    'elapsed_train_time': time.time() - self.train_start_time
                })
                
                with open(self.model_metadata_filepath, 'w+') as json_file:
                    json.dump(self.model_metadata, json_file)
                
                blob = self.gcs_bucket.blob(self.gcs_model_metadata_filepath)
                blob.upload_from_filename(self.model_metadata_filepath)

            else:
                if self.verbose > 0:
                    print('Epoch %05d: %s did not improve' %
                          (epoch, self.monitor))


In [20]:
now = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M")
experiment_name = f"{project_name}_basic_cnn_2020_1_31"

model, model_base_metadata = get_model_and_metadata_from_gcs(bucket, model_dir, "h5", load_model, gcs_model_dir, experiment_name)

model_and_metadata_filepath = os.path.join(model_dir, experiment_name)
gcs_model_and_metadata_filepath = os.path.join(gcs_model_dir, experiment_name)

if model is not None:
    print('Resuming training at epoch', int(model_base_metadata['epoch']) + 1)
else:
    model = basic_cnn_model((120, 120, 3), n_classes=n_classes)
    model_base_metadata = {
        'data': 'train_valid_google_automl_cloud_and_shadow_dataset_small.csv',
        'data_prep': 'normalization_augmentation',
        'experiment_name': experiment_name,
        'experiment_start_time': now,
        'model': 'keras_cnn',
        'random_state': random_seed,
        # so that initial_epoch is 0
        'epoch': -1
    }        

print(f'len(train): {len(x_train)}')
print(f'len(valid): {len(x_valid)}')

histories = []
metrics = [Accuracy()]
loss = 'binary_crossentropy'
metric_to_monitor = 'val_accuracy'

optimizer = Adam()
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

verbosity = 0
# Generators
train_generator = AugmentedImageSequenceFromNpy(x=x_train, y=y_train, batch_size=batch_size,
                                                augmentations=AUGMENTATIONS_TRAIN, stats=stats)

valid_generator = AugmentedImageSequenceFromNpy(x=x_valid, y=y_valid, batch_size=batch_size,
                                                augmentations=AUGMENTATIONS_TEST, stats=stats)

callbacks = [
    EarlyStopping(monitor=metric_to_monitor, patience=early_stopping_patience, verbose=verbosity),
    ReduceLROnPlateau(monitor=metric_to_monitor, factor=0.5, patience=early_stopping_patience, min_lr=0.000001),
    TensorBoard(log_dir, histogram_freq=1),
    ModelCheckpointGCS(filepath=model_and_metadata_filepath, gcs_filepath=gcs_model_and_metadata_filepath, 
                       gcs_bucket=bucket, model_metadata=model_base_metadata, monitor=metric_to_monitor, 
                       verbose=verbosity)
]

history = model.fit(train_generator, initial_epoch=int(model_base_metadata['epoch']) + 1,
                              epochs=n_epochs,
                              steps_per_epoch=len(train_generator),
                              callbacks=callbacks,
                              validation_data=valid_generator, validation_steps=len(valid_generator),
                              shuffle=True, verbose=1)

actual_y_train, pred_y_train = train_generator.get_predictions(model)
actual_y_valid, pred_y_valid = valid_generator.get_predictions(model)

metadata_filepath = f"{model_and_metadata_filepath}_metadata.json"
with open(metadata_filepath, 'r') as json_file:
    best_model_metadata = json.load(json_file)

best_model_metadata.update({
    'accuracy_train': sklearn.metrics.accuracy_score(actual_y_train, pred_y_train),
    'accuracy_valid': sklearn.metrics.accuracy_score(actual_y_valid, pred_y_valid),
    'f1_score_train': sklearn.metrics.f1_score(actual_y_train, pred_y_train),
    'f1_score_valid': sklearn.metrics.f1_score(actual_y_valid, pred_y_valid),
    'confusion_matrix': numpy_to_json(sklearn.metrics.confusion_matrix(actual_y_valid, pred_y_valid)),
    'precision_recall_curve': sklearn_precision_recall_curve_to_dict(
        sklearn.metrics.precision_recall_curve(actual_y_valid, pred_y_valid)),
})

with open(metadata_filepath, 'w+') as json_file:
    json.dump(best_model_metadata, json_file)

blob = bucket.blob(f"{gcs_model_and_metadata_filepath}_metadata.json")
blob.upload_from_filename(metadata_filepath)

# Attempt to avoid memory leaks
del train_generator
del valid_generator
gc.collect()



# if os.environ.get("SHOULD_PREDICT", "True") == "True":
#     pred_test_labels = predict(model=model, weight_dir=model_path, x=x_test, batch_size=batch_size, n_classes=n_classes)
#     clf_report = classification_report(y_test_labels, pred_test_labels, target_names=classes)
#     print(clf_report)

model/models/cloud_and_shadow_basic_cnn_2020_1_31.h5
Loading previously trained model.
Resuming training at epoch 4
len(train): 100
len(valid): 100
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 1 steps, validate for 1 steps
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


In [None]:
# from sklearn.metrics import classification_report

# test_generator = AugmentedImageSequenceFromNpy(x=x_test, y=None, batch_size=batch_size,
#                                                         augmentations=AUGMENTATIONS_TEST)
# y_pred = model.predict(test_generator)
# y_pred_binary = [0 if pred < .5 else 1 for pred in y_pred]
# clf = classification_report(y_test, y_pred_binary,  target_names=['has_clouds'])

In [None]:
# print(pd.DataFrame(y_pred)[0].unique())
# print(pd.DataFrame(y_pred_binary)[0].unique())
# pd.DataFrame(y_pred)[0].unique()
# pd.DataFrame(y_pred_binary)[0].unique()
# full_histories = join_histories(histories)
# graph_model_history(full_histories)