In [2]:
!pip install hyperopt seaborn opencv-python

Collecting hyperopt
  Using cached hyperopt-0.2.3-py3-none-any.whl (1.9 MB)
Collecting seaborn
  Using cached seaborn-0.10.0-py3-none-any.whl (215 kB)
Collecting opencv-python
  Using cached opencv_python-4.2.0.32-cp36-cp36m-manylinux1_x86_64.whl (28.2 MB)
Processing /root/.cache/pip/wheels/6e/9c/ed/4499c9865ac1002697793e0ae05ba6be33553d098f3347fb94/future-0.18.2-py3-none-any.whl
Processing /root/.cache/pip/wheels/0f/12/87/f2ce9e3aeb87e36bde305edc44e4877932b039491b97f96090/networkx-2.2-py2.py3-none-any.whl
Installing collected packages: future, networkx, hyperopt, seaborn, opencv-python
Successfully installed future-0.18.2 hyperopt-0.2.3 networkx-2.2 opencv-python-4.2.0.32 seaborn-0.10.0


In [3]:
import datetime
import gc
import json
import os
import random
import time
from collections import defaultdict
from typing import List, Tuple

from albumentations import (
    Compose, Flip, Rotate
)

import cv2
import dask
import dask.array as da
from google.cloud import storage
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from joblib import dump, load
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, precision_score, precision_recall_curve

import tensorflow as tf
from tensorflow.keras.applications import inception_v3, resnet_v2
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.metrics import Accuracy

from data_engineering.dask_image_stats_collector import stats_for_numpy_images
from data_science.graph_utils import graph_model_history
from data_science.keras.model_checkpoint_gcs import ModelCheckpointGCS
from data_science.keras.cnn_models import basic_cnn_model, basic_cnn_model_with_regularization, pretrained_model
from data_science.serialization_utils import numpy_to_json, sklearn_precision_recall_curve_to_dict
from data_science.sklearn_batch_generator import SklearnBatchGenerator
from data_science.train import get_model_and_metadata_from_gcs, train_keras_model

In [4]:
# Report, one value per line: the GPU device name TensorFlow sees, then whether this
# TensorFlow build was compiled with GPU support and with CUDA.
print(
    tf.test.gpu_device_name(),
    tf.test.is_built_with_gpu_support(),
    tf.test.is_built_with_cuda(),
    sep='\n',
)


True
True


In [5]:
# Root directory of the local dataset. The BIG_EARTH_ROOT environment variable overrides the
# default so the notebook can run outside the original container without editing this cell.
root = os.environ.get('BIG_EARTH_ROOT', '/home/jovyan/work/data/big_earth')
os.listdir(root)

['BigEarthNet-v1.0',
 '.DS_Store',
 'google_automl_cloud_and_shadow_dataset_small.csv',
 'cloud_and_shadow_stats.csv',
 'npy_files',
 'patches_with_cloud_and_shadow.csv',
 'patches_with_seasonal_snow.csv',
 'BigEarthNet-v1.0.tar.gz',
 'model',
 'google_automl_cloud_and_shadow_dataset.csv',
 'metadata',
 'png_image_files',
 'npy_image_files']

In [6]:
# Seed every random number generator the notebook touches.
random_seed = 0
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

# Keep credentials that are already configured in the environment; only fall back to the
# key file mounted in the container when nothing is set.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '/app/.gcs/big-earth-252219-fb2e5c109f78.json')
gcs_client = storage.Client()
bucket = gcs_client.bucket("big_earth")

# Training configuration.
n_classes = 1
n_epochs = 100
batch_size = 128

early_stopping_patience = 6
# Switches read by the data-preparation cell to shrink or fake the dataset.
use_small_dataset = False
use_random_small_dataset = False

project_name = "cloud_and_shadow"
model_dir = os.path.join(root, "model/models")
log_dir = os.path.join(root, "model/logs")
# Blob prefix inside the bucket for saved models.
gcs_model_dir = "model/models"
# Full GCS URI for TensorBoard logs.
gcs_log_dir = "gs://big_earth/model/logs"

# exist_ok=True already tolerates existing directories, so no separate exists() check
# (which would also be racy) is needed.
for directory in [log_dir, model_dir]:
    os.makedirs(directory, exist_ok=True)

In [8]:
import json
# np.array(json.loads(df['binarized_labels'].iloc[0])).shape

def prepare_data(df, data_root=None):
    """Decode the JSON label columns and attach the path of each image's .npy file.

    The input frame is left untouched; a prepared copy is returned, so the cell can be
    re-run safely. Values that are already decoded (not strings) are passed through
    ``np.asarray`` instead of being JSON-decoded again.

    Args:
        df: metadata frame with the columns ``has_cloud_and_shadow_target`` and
            ``binarized_labels`` (JSON-encoded strings) and ``image_prefix``.
        data_root: directory that contains ``npy_image_files``. Defaults to the
            notebook-level ``root``.

    Returns:
        A copy of ``df`` with both label columns converted to numpy arrays and a new
        ``image_path`` column.
    """
    def _to_array(value):
        # Strings come straight from the CSV; anything else was decoded on an earlier run.
        if isinstance(value, str):
            return np.array(json.loads(value))
        return np.asarray(value)

    base_dir = root if data_root is None else data_root
    prepared = df.copy()
    for column in ('has_cloud_and_shadow_target', 'binarized_labels'):
        prepared[column] = prepared[column].apply(_to_array)
    prepared['image_path'] = base_dir + "/npy_image_files/" + prepared['image_prefix'] + ".npy"
    return prepared

# Load the image metadata, decode its label columns and show the shape of one decoded label.
df = prepare_data(pd.read_csv(root + "/metadata/metadata.csv"))
print(df['binarized_labels'].iloc[0].shape)
print(df['has_cloud_and_shadow_target'].iloc[0].shape)
df = df.set_index('image_prefix', drop=False)

# The AutoML CSV assigns every image to TRAIN / VALIDATION / TEST; the image prefix is the
# file name of its GCS URI without the ".png" extension.
google_automl_dataset = pd.read_csv( '/app/data_science/google_automl_cloud_and_shadow_dataset_small.csv')
google_automl_dataset['image_prefix'] = google_automl_dataset['gcs_uri'].map(
    lambda uri: uri.split('/')[-1].replace(".png", ""))
google_automl_dataset = google_automl_dataset.set_index('image_prefix', drop=False)

# Select the metadata rows of each split through the shared image_prefix index.
train, valid, test = (
    df.loc[google_automl_dataset.loc[google_automl_dataset['set'] == split_name].index]
    for split_name in ('TRAIN', 'VALIDATION', 'TEST')
)

# Split sizes, then whether the three splits together account for every AutoML row.
split_sizes = [len(train), len(valid), len(test)]
print(*split_sizes)
print(sum(split_sizes) == len(google_automl_dataset))

(1, 44)
(1,)
1907 253 240
True


In [9]:
# Band statistics are cached on disk; they are only recomputed when the cache is missing.
stats_csv_path = root + '/cloud_and_shadow_stats.csv'

if not os.path.exists(stats_csv_path):
    npy_image_dir = root + "/npy_image_files"

    # Statistics over every training image.
    npy_files = [npy_image_dir + "/" + file + ".npy" for file in train['image_prefix'].values]
    start = time.time()
    stats = stats_for_numpy_images(npy_files,  use_test_data=False)
    stats['data'] = 'all'
    stat_list = [stats]

    # Statistics per class.
    no_cloud = train[train['has_cloud_and_shadow'] == 0]
    cloud = train[train['has_cloud_and_shadow'] == 1]
    print(len(no_cloud), len(cloud))

    for name, data in (('no_cloud', no_cloud), ('cloud', cloud)):
        npy_files = [npy_image_dir + "/" + file + ".npy" for file in data['image_prefix'].values]
        stats = stats_for_numpy_images(npy_files,  use_test_data=False)
        stats['data'] = name
        stat_list.append(stats)

    # Keep the original index as an explicit 'band' column before flattening the index.
    all_stats = pd.concat(stat_list)
    all_stats['band'] = all_stats.index
    all_stats = all_stats.reset_index().drop('index', axis=1)
    all_stats.to_csv(stats_csv_path, index=False)

    print(f'stats computed in {time.time() - start}')
else:
    all_stats = pd.read_csv(stats_csv_path)

# Only the statistics over the whole training split are passed to the training calls.
band_stats = all_stats.loc[all_stats['data'] == 'all']

In [10]:
# Inputs are the .npy paths of each split; targets are the stacked per-image label arrays.
target = 'has_cloud_and_shadow_target'
x_train, x_valid, x_test = (split['image_path'].values for split in (train, valid, test))
y_train, y_valid, y_test = (np.stack(split[target].values) for split in (train, valid, test))

print(y_train.shape, y_train[0].shape)

if use_small_dataset:
    # Keep only the first and last `size` samples of every split and shorten training.
    size = batch_size
    n_epochs = 3
    x_train, x_valid, x_test = (
        np.concatenate([arr[:size], arr[-size:]]) for arr in (x_train, x_valid, x_test))
    y_train, y_valid, y_test = (
        np.concatenate([arr[:size], arr[-size:]]) for arr in (y_train, y_valid, y_test))
elif use_random_small_dataset:
    # Synthetic inputs: 100 rows of ones followed by 100 rows of zeros.
    shape = (100, 1)
    x_train, y_train, x_valid, y_valid = (
        np.concatenate([np.ones(shape), np.zeros(shape)]) for _ in range(4))

    # NOTE(review): the targets assigned just above are immediately replaced by random
    # 44-column labels sized like the real splits, and x_test is left as real paths —
    # confirm which variant is intended before enabling this flag.
    y_train, y_valid, y_test = (
        np.random.randint(0, 2, (len(split), 44)) for split in (train, valid, test))
    y_test_labels = test['labels'].values

print(y_train.shape, y_train[0].shape)

(1907, 1) (1,)
(1907, 1) (1,)


In [11]:
# Overfit on all training data: no validation split, no augmentation, no extra image processing.
experiment_name = "{}_basic_cnn_2020_1_31".format(project_name)
model = basic_cnn_model((120, 120, 3), n_classes)
result = train_keras_model(
    random_seed=random_seed,
    x_train=x_train,
    y_train=y_train,
    x_valid=None,
    y_valid=None,
    image_augmentations=None,
    image_processor=None,
    band_stats=band_stats,
    bucket=bucket,
    model_dir=model_dir,
    gcs_model_dir=gcs_model_dir,
    gcs_log_dir=gcs_log_dir,
    experiment_name=experiment_name,
    start_model=model,
    should_train_from_scratch=True,
    optimizer=Adam,
    lr=3e-4,
    should_upload_to_gcs=False,
    n_epochs=200,
    early_stopping_patience=10,
)
# Histogram of the predicted probabilities returned by the run.
pd.DataFrame(result['y_pred_probs']).hist()

Downloading model blob.
len(train): 1907
Epoch 1/200
Epoch 2/200
 3/15 [=====>........................] - ETA: 33s - loss: 0.6812 - accuracy: 0.5807

KeyboardInterrupt: 

In [None]:
# Score the current `model` on the validation split.
# band_stats (the statistics over the whole training split) is what every training call in
# this notebook passes; the bare `stats` name only exists when the stats cache had to be
# rebuilt, and then holds the statistics of the last per-class subset.
# NOTE(review): get_image_dataset and get_predictions_for_dataset are not imported in the
# notebook's import cell — confirm which module provides them.
valid_generator = get_image_dataset(x=x_valid, y=y_valid, augmentations=None, band_stats=band_stats,
                                    batch_size=batch_size)
actual_y_valid, pred_y_valid, pred_y_valid_probs = get_predictions_for_dataset(valid_generator, model)
print('val_loss', binary_crossentropy(actual_y_valid, pred_y_valid_probs).numpy())
print('val_accuracy', accuracy_score(actual_y_valid, pred_y_valid))

In [None]:
# Start basic_cnn_model_with_regularization by adding batch normalization and validation set
# NOTE(review): the line above mentions a validation set, yet x_valid / y_valid are passed
# as None below — confirm whether the validation split should be supplied here.
experiment_name = "{}_basic_cnn_with_regularization_2020_1_31".format(project_name)
model = basic_cnn_model_with_regularization((120, 120, 3), n_classes)
result = train_keras_model(
    random_seed=random_seed,
    x_train=x_train,
    y_train=y_train,
    x_valid=None,
    y_valid=None,
    image_augmentations=None,
    image_processor=None,
    band_stats=band_stats,
    bucket=bucket,
    model_dir=model_dir,
    gcs_model_dir=gcs_model_dir,
    gcs_log_dir=gcs_log_dir,
    experiment_name=experiment_name,
    start_model=model,
    should_train_from_scratch=False,
    optimizer=Adam,
    lr=3e-4,
    should_upload_to_gcs=True,
    n_epochs=200,
    early_stopping_patience=30,
)
# Histogram of the predicted probabilities returned by the run.
pd.DataFrame(result['y_pred_probs']).hist()

In [None]:
# Pass the recorded training history to the project's graph_model_history helper.
# NOTE(review): this needs a `history` object exposing `.history`; the training cells keep
# train_keras_model's return value in `result` and index it like a dict — confirm which
# name holds the training history so the cell survives a kernel restart.
graph_model_history(history.history)

In [20]:
# Training-time augmentation pipeline: a flip and a rotation with limit (0, 360),
# each applied with probability 0.5.
augmentations_train = Compose([Flip(p=0.5), Rotate(limit=(0, 360), p=0.5)])

In [22]:
# Add image augmentation
experiment_name = "{}_basic_cnn_with_regularization_and_aug_2020_1_31".format(project_name)
model = basic_cnn_model_with_regularization((120, 120, 3), n_classes)

result = train_keras_model(
    random_seed=random_seed,
    x_train=x_train,
    y_train=y_train,
    x_valid=None,
    y_valid=None,
    image_augmentations=augmentations_train,
    image_processor=None,
    band_stats=band_stats,
    bucket=bucket,
    model_dir=model_dir,
    gcs_model_dir=gcs_model_dir,
    gcs_log_dir=gcs_log_dir,
    experiment_name=experiment_name,
    start_model=model,
    should_train_from_scratch=False,
    optimizer=Adam,
    lr=3e-4,
    should_upload_to_gcs=True,
    n_epochs=100,
    early_stopping_patience=30,
)
# Histogram of the predicted probabilities returned by the run.
pd.DataFrame(result['y_pred_probs']).hist()

Downloading model blob.
Resuming training at epoch 41
len(train): 1907
Epoch 42/100
      1/Unknown - 4s 4s/step - loss: 0.2063 - accuracy: 0.9062

ProfilerNotRunningError: Cannot stop profiling. No profiler is running.

In [None]:
# Pass the recorded training history to the project's graph_model_history helper.
# NOTE(review): this needs a `history` object exposing `.history`; the training cells keep
# train_keras_model's return value in `result` and index it like a dict — confirm which
# name holds the training history so the cell survives a kernel restart.
graph_model_history(history.history)

In [21]:
# Try hyperparameter optimization
def optimize(max_evals=30):
    """Search over learning rate and optimizer class with hyperopt's TPE algorithm.

    Args:
        max_evals: number of trials passed to ``fmin``. Defaults to 30, the value that
            used to be hard-coded, so ``optimize()`` behaves as before.

    Returns:
        ``(best, trials)``: the value returned by ``fmin`` and the ``Trials`` object
        that recorded every evaluation.
    """
    def train_keras_with_hyperopt_params(params):
        """Train once with the sampled hyperparameters and return the hyperopt result dict."""
        # Every option in the search space is a (label, value) pair: the label goes into
        # the experiment name, the value into the training call.
        lr_label, lr_value = params['learning_rate']
        optimizer_label, optimizer_cls = params['optimizer']
        experiment_name = (f"{project_name}_basic_cnn_lr_{lr_label}_optimizer_"
                           f"{optimizer_label}_2020_2_08")
        # NOTE(review): x_train[:10] / n_epochs=2 look like smoke-test settings, and the
        # notebook-level `model` is shared by every trial — confirm both before a real search.
        result = train_keras_model(
            random_seed=random_seed, x_train=x_train[:10], y_train=y_train[:10], x_valid=x_valid, y_valid=y_valid,
            image_augmentations=augmentations_train, image_processor=None, band_stats=band_stats, bucket=bucket, model_dir=model_dir,
            gcs_model_dir=gcs_model_dir, gcs_log_dir=gcs_log_dir, experiment_name=experiment_name, start_model=model,
            should_train_from_scratch=True, optimizer=optimizer_cls, lr=lr_value,
            should_upload_to_gcs=True, n_epochs=2, early_stopping_patience=10)

        # NOTE(review): hyperopt also expects a 'loss' entry in an ok result — confirm that
        # train_keras_model's result provides it.
        result['status'] = STATUS_OK
        return result

    space = {
        'learning_rate': hp.choice('learning_rate', [
            ('1e-4', 1e-4), ('1e-3', 1e-3), ('1e-2', 1e-2)]),
        'optimizer': hp.choice('optimizer', [
            ('Adam', Adam), ('SGD', SGD), ('RMSprop', RMSprop)])
    }
    trials = Trials()
    best = fmin(fn=train_keras_with_hyperopt_params,
                algo=tpe.suggest,
                space=space,
                max_evals=max_evals,
                trials=trials)

    return best, trials


best, trials = optimize()

len(train): 10                                        
len(valid): 253                                       
Epoch 1/2                                             
      1/Unknown                                       
 - 1s 604ms/step - loss: 0.9517 - accuracy: 0.2000    
                                                      
 - 22s 22s/step - loss: 0.9517 - accuracy: 0.2000 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 2/2                                             
  0%|          | 0/30 [00:24<?, ?trial/s, best loss=?]

job exception: Cannot stop profiling. No profiler is running.


  0%|          | 0/30 [00:25<?, ?trial/s, best loss=?]


ProfilerNotRunningError: Cannot stop profiling. No profiler is running.

In [12]:
# Try transfer learning
# https://keras.io/applications/#inceptionv3
# Input shape handed to the pretrained model and used as the resize target below.
inception_input_shape = (299, 299, 3)
experiment_name = "{}_inception_v3_with_aug_2020_2_08".format(project_name)
model = pretrained_model(inception_v3.InceptionV3, inception_input_shape, n_classes)

def inception_v3_image_processor(image):
    """Resize ``image`` to the InceptionV3 input size and apply InceptionV3 preprocessing.

    The image is resized with bicubic interpolation to the first two entries of
    ``inception_input_shape`` and then passed through ``inception_v3.preprocess_input``.
    """
    target_size = (inception_input_shape[0], inception_input_shape[1])
    resized = cv2.resize(image, dsize=target_size, interpolation=cv2.INTER_CUBIC)
    preprocessed = inception_v3.preprocess_input(resized)
    return preprocessed


# Train the pretrained model from scratch with augmentation; every image goes through
# inception_v3_image_processor first.
result = train_keras_model(
    random_seed=random_seed,
    x_train=x_train,
    y_train=y_train,
    x_valid=None,
    y_valid=None,
    image_augmentations=augmentations_train,
    image_processor=inception_v3_image_processor,
    band_stats=band_stats,
    bucket=bucket,
    model_dir=model_dir,
    gcs_model_dir=gcs_model_dir,
    gcs_log_dir=gcs_log_dir,
    experiment_name=experiment_name,
    start_model=model,
    should_train_from_scratch=True,
    optimizer=Adam,
    lr=3e-4,
    should_upload_to_gcs=True,
    n_epochs=100,
    early_stopping_patience=30,
)

# Histogram of the predicted probabilities returned by the run.
pd.DataFrame(result['y_pred_probs']).hist()

KeyboardInterrupt: 

In [None]:
# Pass the recorded training history to the project's graph_model_history helper.
# NOTE(review): this needs a `history` object exposing `.history`; the training cells keep
# train_keras_model's return value in `result` and index it like a dict — confirm which
# name holds the training history so the cell survives a kernel restart.
graph_model_history(history.history)

In [None]:
# Transfer learning with a ResNet V2 backbone.
resnet_v2_input_shape = (224, 224, 3)
# The resnet_v2 module has no InceptionV3; ResNet50V2 is its smallest backbone.
# The model is built for the ResNet input shape, not the Inception one.
model = pretrained_model(resnet_v2.ResNet50V2, resnet_v2_input_shape, n_classes)
experiment_name = f"{project_name}_resnet_v2_with_aug_2020_2_08"

def resnet_v2_image_processor(image):
    """Resize ``image`` to the ResNet V2 input size and apply ResNet V2 preprocessing.

    The image is resized with bicubic interpolation to the first two entries of
    ``resnet_v2_input_shape`` and then passed through ``resnet_v2.preprocess_input``.
    """
    resized_img = cv2.resize(image, dsize=(resnet_v2_input_shape[0], resnet_v2_input_shape[1]),
                             interpolation=cv2.INTER_CUBIC)
    return resnet_v2.preprocess_input(resized_img)

# Use the ResNet processor defined above (not the Inception one) and keep the return value
# as `result`, indexed like a dict, exactly as the other training cells do.
result = train_keras_model(
    random_seed=random_seed,
    x_train=x_train, y_train=y_train, x_valid=None, y_valid=None, image_augmentations=augmentations_train,
    image_processor=resnet_v2_image_processor,
    band_stats=band_stats,
    bucket=bucket, model_dir=model_dir, gcs_model_dir=gcs_model_dir, gcs_log_dir=gcs_log_dir,
    experiment_name=experiment_name, start_model=model, should_train_from_scratch=True, optimizer=Adam, lr=3e-4,
    should_upload_to_gcs=True,
    n_epochs=100, early_stopping_patience=30)

# Histogram of the predicted probabilities returned by the run.
pd.DataFrame(result['y_pred_probs']).hist()

In [None]:
# Pass the recorded training history to the project's graph_model_history helper.
# NOTE(review): this needs a `history` object exposing `.history`; the other training cells
# keep train_keras_model's return value in `result` and index it like a dict — confirm
# which name holds the training history so the cell survives a kernel restart.
graph_model_history(history.history)

In [None]:
# Build a long-format frame (stat_name, stat_value, threshold) for plotting precision and
# recall against the decision threshold.
# NOTE(review): precision / recall / thresholds are not computed anywhere in the visible
# notebook — presumably they come from sklearn's precision_recall_curve, whose precision and
# recall arrays carry one more entry than thresholds; confirm the source.
# Trimming to len(thresholds) drops that extra entry and, unlike a plain [:-1], gives the
# same result when the cell is run again.
n_thresholds = len(thresholds)
precision = precision[:n_thresholds]
recall = recall[:n_thresholds]
df = pd.concat([
    pd.DataFrame({'stat_name': ['precision'] * n_thresholds,
                  'stat_value': precision,
                  'threshold': thresholds}),
    pd.DataFrame({'stat_name': ['recall'] * n_thresholds,
                  'stat_value': recall,
                  'threshold': thresholds})
])

In [None]:
# Precision-recall curve with the area under it shaded.
ax = sns.lineplot(x=recall, y=precision, color='darkblue')
# Bare arrays carry no names, so label the axes explicitly.
ax.set(xlabel='recall', ylabel='precision')
# The Axes method is fill_between; the previous misspelling raised AttributeError.
ax.fill_between(recall, precision, color="darkblue", alpha=0.3)

In [None]:
# Precision (red) and recall (blue) as functions of the decision threshold.
stat_colors = {'precision': 'red', 'recall': 'blue'}
ax = sns.lineplot(data=df, x='threshold', y='stat_value', hue='stat_name', palette=stat_colors)
# ax.fill_between(recall,precision, color="darkblue", alpha=0.3)