In [228]:
import datetime
import os
from collections import defaultdict
import gc
import random
import time
from typing import List, Tuple

from albumentations import (
    Compose, Flip, Rotate
)

import dask
import dask.array as da
from data_science.augmented_image_sequence_from_npy import AugmentedImageSequenceFromNpy
from data_science.cnn_models import basic_cnn_model, basic_cnn_model_with_best_practices
from data_science.sklearn_batch_generator import SklearnBatchGenerator

from joblib import dump, load

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, precision_score, precision_recall_curve

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import Accuracy, Precision, Recall

from tensorflow_addons.metrics import FBetaScore, F1Score


pal = sns.color_palette()



ModuleNotFoundError: No module named 'data_science.cnn_models'

In [193]:
random_seed = 0
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

In [194]:
root = '/home/jovyan/work/data/big_earth'
project_name = "cloud_and_shadow"
log_dir = os.path.join(root, "model/logs")
model_dir = os.path.join(root, "model/models")
checkpoint_dir = os.path.join(root, "model/checkpoints")

gcs_client = storage.Client()
bucket = gcs_client.bucket(bucket_name)

for directory in [log_dir, model_dir, checkpoint_dir]:
    if not os.path.exists(directory):
        os.mkdir(directory)

In [195]:
import json
# np.array(json.loads(df['binarized_labels'].iloc[0])).shape

def prepare_data(df):
    df['has_cloud_and_shadow_target'] = df['has_cloud_and_shadow_target'].apply(lambda x: np.array(json.loads(x)))
    df['binarized_labels'] = df['binarized_labels'].apply(lambda x: np.array(json.loads(x)))    
    df['image_path'] = root + "/npy_image_files/" + df['image_prefix'] + ".npy"
    return df

df = pd.read_csv(root + "/metadata/metadata.csv")
df = prepare_data(df)
print(df['binarized_labels'].iloc[0].shape)
print(df['has_cloud_and_shadow_target'].iloc[0].shape)
df = df.set_index('image_prefix', drop=False)

(1, 44)
(1,)


In [196]:
# has_cloud_and_shadow = df[df['has_cloud_and_shadow'] == 1]
# sample_no_cloud_and_shadow = df[df['has_cloud_and_shadow'] == 0].sample(
#     n=len(has_cloud_and_shadow), random_state=random_seed)

# print("len(sample_no_cloud_and_shadow)", len(sample_no_cloud_and_shadow), "len(has_cloud_and_shadow)", 
#       len(has_cloud_and_shadow))

# train, valid, test = balanced_class_train_test_splits(*[sample_no_cloud_and_shadow, has_cloud_and_shadow])

In [197]:
google_automl_dataset = pd.read_csv(root + '/google_automl_cloud_and_shadow_dataset_small.csv')
google_automl_dataset['image_prefix'] = google_automl_dataset['gcs_uri'].str.split('/').apply(lambda x: x[-1].replace(".png", ""))
google_automl_dataset = google_automl_dataset.set_index('image_prefix', drop=False)

train = df.loc[google_automl_dataset[google_automl_dataset['set'] == 'TRAIN'].index]
valid = df.loc[google_automl_dataset[google_automl_dataset['set'] == 'VALIDATION'].index]
test = df.loc[google_automl_dataset[google_automl_dataset['set'] == 'TEST'].index]

print(len(train), len(valid), len(test))
print(len(train) + len(valid) + len(test) == len(google_automl_dataset))

1907 253 240
True


In [198]:
# npy_image_dir = root + "/npy_image_files"
# npy_files = [npy_image_dir + "/" + file + ".npy" for file in train['image_prefix'].values]
# start = time.time()
# stats = stats_for_numpy_images(npy_files,  use_test_data=False)
# stats.to_csv('cloud_and_shadow_stats.csv', index_label='band')
# print(time.time() - start)

stats = pd.read_csv('cloud_and_shadow_stats.csv', index_col='band')

50.17684245109558


In [199]:
def train_model(model, x_train: np.array, y_train: np.array, x_valid: np.array,
          y_valid: np.array, n_epochs, n_classes, batch_size, log_dir, model_path):
    """
    Based on from https://www.kaggle.com/infinitewing/keras-solution-and-my-experience-0-92664
    """
    print(f'Split train: {len(x_train)}')
    print(f'Split valid: {len(x_valid)}')

    histories = []
    learn_rates = [0.001, 0.0001, 0.00001]
    metrics = [Accuracy(), Precision(), Recall(), F1Score(num_classes=n_classes, average='micro'),
               FBetaScore(num_classes=n_classes, beta=2.0, average='micro')]
    loss = 'binary_crossentropy'
    metric_to_monitor = 'val_loss'

    for learn_rate_num, learn_rate in enumerate(learn_rates):
        print(f'Training model on fold with learn_rate {learn_rate}')
        optimizer = Adam(lr=learn_rate, momentum=0.9)
        model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

        verbosity = 0
        callbacks = [
            EarlyStopping(monitor=metric_to_monitor, patience=2, verbose=verbosity),
            ReduceLROnPlateau(monitor=metric_to_monitor, factor=0.5, patience=2, min_lr=0.000001),
            TensorBoard(log_dir, histogram_freq=1),
            ModelCheckpoint(model_path, monitor=metric_to_monitor, save_weights_only=False, save_best_only=True,
                            verbose=verbosity)
        ]

        # Generators
        train_generator = AugmentedImageSequenceFromNpy(x=x_train, y=y_train, batch_size=batch_size,
                                                        augmentations=AUGMENTATIONS_TRAIN)

        valid_generator = AugmentedImageSequenceFromNpy(x=x_valid, y=y_valid, batch_size=batch_size,
                                                        augmentations=AUGMENTATIONS_TEST)

        history = model.fit_generator(generator=train_generator,
                                      epochs=n_epochs,
                                      steps_per_epoch=len(train_generator),
                                      callbacks=callbacks,
                                      validation_data=valid_generator, validation_steps=len(valid_generator),
                                      shuffle=True, verbose=1)
        histories.append(history)

    # Attempt to avoid memory leaks
    del train_generator
    del valid_generator
    gc.collect()

    return histories


def join_histories(histories):
    full_history = defaultdict(list)

    for history in histories:
        for key, value in history.history.items():
            full_history[key].extend(value)
    return full_history


def graph_model_history(history):
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    t = f.suptitle('Basic CNN Performance', fontsize=12)
    f.subplots_adjust(top=0.85, wspace=0.3)

    max_epoch = len(history['val_loss']) + 1
    epoch_list = list(range(1, max_epoch))
    ax1.plot(epoch_list, history['accuracy'], label='Train Accuracy')
    ax1.plot(epoch_list, history['val_accuracy'], label='Validation Accuracy')
    ax1.set_xticks(np.arange(1, max_epoch, 5))
    ax1.set_ylabel('Accuracy Value')
    ax1.set_xlabel('Epoch')
    ax1.set_title('Accuracy')
    l1 = ax1.legend(loc="best")

    ax2.plot(epoch_list, history['loss'], label='Train Loss')
    ax2.plot(epoch_list, history['val_loss'], label='Validation Loss')
    ax2.set_xticks(np.arange(1, max_epoch, 5))
    ax2.set_ylabel('Loss Value')
    ax2.set_xlabel('Epoch')
    ax2.set_title('Loss')
    l2 = ax2.legend(loc="best")


def predict(model, model_path, x, batch_size, n_classes):
    thresholds = np.array([0.5 for _ in range(n_classes)])
    model.load_weights(model_path)

    predict_generator = AugmentedImageSequenceFromNpy(x=x, y=None, batch_size=batch_size,
                                                        augmentations=AUGMENTATIONS_TEST)
    # Generators
    pred_test_probs = model.predict_generator(predict_generator)
    pred_test_labels = pd.DataFrame(pred_test_probs, columns=classes)
    pred_test_labels = pred_test_labels.apply(lambda x: x > thresholds, axis=1)
    # Convert boolean predictions to labels
    pred_test_lables = pred_test_labels.apply(lambda row: ' '.join(row[row].index), axis=1)

    del predict_generator
    gc.collect()

    return pred_test_labels

AUGMENTATIONS_TRAIN = Compose([
    Flip(p=0.5),
    Rotate(limit=(0, 360), p=0.5)
])

AUGMENTATIONS_TEST = Compose([])

In [213]:
n_classes = 1

n_epochs = 100
model = basic_cnn_model((120, 120, 3), n_classes=n_classes)

# Test the correctness and speed of loading one batch
batch_size = 128

x_train = train['image_path'].values
x_valid = valid['image_path'].values
x_test = test['image_path'].values

target = 'has_cloud_and_shadow_target'
y_train = np.stack(train[target].values)
y_valid = np.stack(valid[target].values)
y_test = np.stack(test[target].values)

print(y_train.shape, y_train[0].shape)

use_small_dataset = True
use_random_small_dataset = False
if use_small_dataset:
    x_train = np.concatenate([x_train[:50], x_train[50:]])
    x_valid = np.concatenate([x_valid[:50], x_valid[50:]])
    x_test = np.concatenate([x_test[:50], x_test[50:]])

    y_train = np.concatenate([y_train[:50], y_train[50:]])
    y_valid = np.concatenate([y_valid[:50], y_valid[50:]])
    y_test = np.concatenate([y_test[:50], y_test[50:]])
elif use_random_small_dataset:
    shape = (100, 1)
    x_train = np.concatenate([np.ones(shape), np.zeros(shape)])
    y_train = np.concatenate([np.ones(shape), np.zeros(shape)])

    x_valid = np.concatenate([np.ones(shape), np.zeros(shape)])
    y_valid = np.concatenate([np.ones(shape), np.zeros(shape)])

    y_train = np.random.randint(0, 2, (len(train), 44))
    y_valid = np.random.randint(0, 2, (len(valid), 44))
    y_test = np.random.randint(0, 2, (len(test), 44))
    y_test_labels = test['labels'].values

a = AugmentedImageSequenceFromNpy(x=x_train, y=y_train,
                                  batch_size=batch_size,
                                  augmentations=AUGMENTATIONS_TRAIN, stats=stats)

for x, y in a:
    print(x.shape, y.shape)
    break

a.on_epoch_end()

(1907, 1) (1,)
(128, 120, 120, 3) (128, 1)


In [224]:
from sklearn.linear_model import LogisticRegression

# sanity check the generator output
train_batch_generator = SklearnBatchGenerator(x_train, y_train, batch_size, AUGMENTATIONS_TRAIN, stats)
valid_batch_generator = SklearnBatchGenerator(x_valid, y_valid, batch_size, AUGMENTATIONS_TEST, stats)

train_batch_generator.on_epoch_end()
valid_batch_generator.on_epoch_end()

clf = LogisticRegression()
x, y = train_batch_generator[0]
print(x.shape, x[0].shape, y.shape, y[0].shape)
clf.fit(x, y)

x, y = valid_batch_generator[0]
pred = clf.predict(x)
accuracy_score(pred, y)

(128, 43200) (43200,) (128,) ()




0.5625

In [226]:


clf = SGDClassifier()
history = list()

train_batch_generator = SklearnBatchGenerator(x_train, y_train, batch_size, AUGMENTATIONS_TRAIN, stats)
valid_batch_generator = SklearnBatchGenerator(x_valid, y_valid, batch_size, AUGMENTATIONS_TEST, stats)

n_epochs = 100
n_batches = len(x_train) // batch_size
classes = np.array([0, 1])
early_stopping_patience = 6
epochs_without_improvement = 0

now = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M")
experiment_name = f"sgd_classifier_default_{now}"
model_path = os.path.join(model_dir, experiment_name + ".joblib")
model_metadata_path = os.path.join(model_dir, experiment_name + "_metadata.json")

# Shuffle the data
train_batch_generator.on_epoch_end()
valid_batch_generator.on_epoch_end()
for epoch in range(n_epochs):
    start = time.time()
    for batch_x, batch_y in train_batch_generator.make_one_shot_iterator():
        clf.partial_fit(batch_x, batch_y, classes=classes)
    
    if epoch % 10 == 0:
        print("training completed in", time.time() - start, "seconds")
        
    actual_y_train, pred_y_train = train_batch_generator.get_predictions(clf)
    actual_y_valid, pred_y_valid = valid_batch_generator.get_predictions(clf)

    epoch_time = f"{time.time() - start:.4f}"
    epoch_metrics = {
        'accuracy_train': accuracy_score(actual_y_train, pred_y_train),
        'accuracy_valid': accuracy_score(actual_y_valid, pred_y_valid),        
        "f1_score_train": f1_score(actual_y_train, pred_y_train),
        "f1_score_valid": f1_score(actual_y_valid, pred_y_valid),        
    }
    history.append(epoch_metrics)
    
    print("epoch_num", epoch, "-", epoch_time, "sec -", epoch_metrics['accuracy_valid'])
        
    if len(history) < 2:
        continue
        
    if epoch_metrics['accuracy_valid'] <= history[-2]['accuracy_valid']:
        epochs_without_improvement += 1
    else:
        dump(clf, model_path)
        with open(model_metadata_path, 'w+') as json_file:
            json.dump({
                'data': 'npy_image_files_cloud_and_shadow_equal_split_no_cloud_and_shadow',
                'data_prep': 'normalization_augmentation',
                'experiment_name': experiment_name,
                'experiment_start_time': now,
                'model': SGDClassifier.__name__,
                'random_state': random_seed,
                'confusion_matrix': confusion_matrix(actual_y_valid, pred_y_valid),
                'precision_recall_curve': precision_recall_curve(actual_y_valid, pred_y_valid),
                'history': history
            }, json_file)
        
        for filename in [model_path, model_metadata_path]:
            blob = bucket.blob(filename)
            blob.upload_from_filename(filename)
            
        epochs_without_improvement = 0
    
    if epochs_without_improvement == early_stopping_patience:
        print("Ending training due to no improvement")
        break
                                    
    train_batch_generator.on_epoch_end()
    valid_batch_generator.on_epoch_end()


training completed in 198.89089703559875 seconds
epoch_num 0 - 395.3008 sec - 0.6521739130434783
epoch_num 1 - 373.6191 sec - 0.43873517786561267
epoch_num 2 - 357.0683 sec - 0.4782608695652174


NameError: name 'work_dir' is not defined

In [None]:
if os.environ.get("SHOULD_TRAIN", "True") == "True":
    now = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M")
    experiment_name = f"{project_name}_basic_cnn_{now}"
    model_path = os.path.join(model_dir, experiment_name)
    cnn_checkpoint_path = os.path.join(checkpoint_dir, f"{experiment_name}.cpkt")

#     histories = train(model, x_train=x_train,
#                       y_train=y_train,
#                       x_valid=x_valid,
#                       y_valid=y_valid,
#                       n_epochs=n_epochs,
#                       n_classes=n_classes,
#                       batch_size=batch_size,
#                       log_dir=log_dir,
#                       model_path=model_path)
    
    print(f'len(train): {len(x_train)}')
    print(f'len(valid): {len(x_valid)}')

    histories = []
    f1_score = F1Score(num_classes=n_classes)
    metrics = [Accuracy(), Precision(), Recall(), f1_score,
               FBetaScore(num_classes=n_classes, beta=2.0)]
    loss = 'binary_crossentropy'
    metric_to_monitor = 'val_accuracy'

    optimizer = Adam()
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    verbosity = 0
    callbacks = [
        EarlyStopping(monitor=metric_to_monitor, patience=early_stopping_patience, verbose=verbosity),
        ReduceLROnPlateau(monitor=metric_to_monitor, factor=0.5, patience=early_stopping_patience, min_lr=0.000001),
        TensorBoard(log_dir, histogram_freq=1),
        ModelCheckpoint(cnn_checkpoint_path, monitor=metric_to_monitor, save_weights_only=False, save_best_only=True,
                        verbose=verbosity)
    ]

    # Generators
    train_generator = AugmentedImageSequenceFromNpy(x=x_train, y=y_train, batch_size=batch_size,
                                                    augmentations=AUGMENTATIONS_TRAIN)

    valid_generator = AugmentedImageSequenceFromNpy(x=x_valid, y=y_valid, batch_size=batch_size,
                                                    augmentations=AUGMENTATIONS_TEST)

    history = model.fit_generator(train_generator,
                                  epochs=n_epochs,
                                  steps_per_epoch=len(train_generator),
                                  callbacks=callbacks,
                                  validation_data=valid_generator, validation_steps=len(valid_generator),
                                  shuffle=True, verbose=1)
    
    model.save(model_path)
    # Attempt to avoid memory leaks
    del train_generator
    del valid_generator
    gc.collect()
    
    

# if os.environ.get("SHOULD_PREDICT", "True") == "True":
#     pred_test_labels = predict(model=model, weight_dir=model_path, x=x_test, batch_size=batch_size, n_classes=n_classes)
#     clf_report = classification_report(y_test_labels, pred_test_labels, target_names=classes)
#     print(clf_report)

In [None]:
from sklearn.metrics import classification_report

test_generator = AugmentedImageSequenceFromNpy(x=x_test, y=None, batch_size=batch_size,
                                                        augmentations=AUGMENTATIONS_TEST)
y_pred = model.predict(test_generator)
y_pred_binary = [0 if pred < .5 else 1 for pred in y_pred]
clf = classification_report(y_test, y_pred_binary,  target_names=['has_clouds'])

In [None]:
print(pd.DataFrame(y_pred)[0].unique())
print(pd.DataFrame(y_pred_binary)[0].unique())

In [None]:
print(clf)

In [None]:
pd.DataFrame(y_pred)[0].unique()

In [None]:
pd.DataFrame(y_pred_binary)[0].unique()

In [None]:
full_histories = join_histories(histories)
graph_model_history(full_histories)