<h1> Wildebeest Detection using U-Net from VHR satellite images</h1>
Code Author: Zijing Wu

***The code is developed for educational project purposes.***


In [None]:
#If you are using Google Colaboratory to run this code, please upload the whole folder to your Google Drive, and run this cell to install the requirements.

#connect to the google drive if you use Google Colaboratory
# from google.colab import drive
# drive.mount('/content/drive')


In [None]:
import tensorflow as tf

In [None]:
#import os
#os.environ["CUDA_VISIBLE_DEVICES"]="2"
#tf.device('/device:GPU:0')

In [None]:
# `tf.device(...)` only builds a context manager; called bare (outside a `with`
# block) it is never entered and therefore places nothing on the GPU.
# List the GPUs TensorFlow can actually see instead, so this cell reports
# something useful. An empty list means training will run on the CPU.
tf.config.list_physical_devices('GPU')

In [None]:
#check the GPU colab assigns to you
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)


# Load libraries and prepare the dataset

## Import libraries and modules

In [None]:
import os
import rasterio

import numpy as np               # numerical array manipulation
#from tqdm import tqdm
# import cv2
import random
from rasterio.windows import Window

%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (15, 10)
import matplotlib.pyplot as plt
import gc

import pandas as pd

In [None]:
#set the sys path to where the modules are located
import sys
sys.path.insert(0,"core")

#If you are using Google Colaboratory, modify the path here
#sys.path.insert(0,"/content/drive/MyDrive/Colab/zijingwu-Satellite-based-monitoring-of-wildebeest/core")
from preprocess import *
from data_generator import DataGenerator, SimpleDataGenerator

from model import *

from evaluation import *

from visualization import *

import importlib

from predict import *

## Set data file directories

In [None]:
# --- Project locations -------------------------------------------------------
# Root folder of the project and the folder holding the sample data.
# Adjust both when running on another machine (or on Google Colaboratory).
Folder = "/home/zijing/wildebeest"
Data_folder = "/home/zijing/wildebeest/SampleData"

# --- Patch geometry ----------------------------------------------------------
# Side length of the square patches and the image bands fed to the network.
PATCH_SIZE = 336
INPUT_BANDS = [0, 1, 2]
NUMBER_BANDS = len(INPUT_BANDS)

In [None]:
# Example: record the training-data file links of every acquisition in one CSV file.

year_list = ["data_2009Aug", "data_2010Sep", "data_2013Aug", "data_2015Jul", "data_2018Aug", "data_2020Oct"]

# The images are not always exactly patch_size x patch_size.
# prep_directory() "crops" them further by recording the window dimension info
# in the directory file; the DataGenerator class is written for that layout.
# If your data is already cropped properly, use SimpleDataGenerator instead.

# Every acquisition is passed the same output CSV, so build its path once.
out_path = os.path.join(Data_folder, 'update_train2023_match2023_4_dict_train_filelinks.csv')

for year in year_list:
    head = year
    train_dir = Data_folder + '/' + year + '/' + '3_Train_test/train'
    image_path = train_dir + '/image'
    label_path = train_dir + '/mask'
    prep_directory(head, image_path, label_path, out_path, bandorder="123", stretch=0, stride=PATCH_SIZE)

# Model generation (U-Net) and training

## K-fold splitting ensemble
To achieve a more robust and reliable model, we adopted K-fold splitting to create an ensemble model.
We split the training dataset into k folds. We use 1 fold as the validation dataset during training, and the remaining k-1 folds as the training dataset. This way we will have k models with each model learning different variations of the dataset.
Then the predictions of all the k models are averaged to get the final prediction.

In [None]:
import sklearn
from sklearn.model_selection import KFold

# Number of folds (k) used for the ensemble.
num_folds = 5

# Containers for per-fold scores (training, validation and test).
fold = []
precision_per_fold, recall_per_fold, f1_per_fold, loss_per_fold = [], [], [], []
Val_precision_per_fold, Val_recall_per_fold, Val_f1_per_fold = [], [], []
Test_precision_per_fold, Test_recall_per_fold, Test_f1_per_fold = [], [], []

# Read the three file-link tables stored in the data folder.
filelink_csvs = ['update_dict_comb_filelinks.csv',
                 'update_match2023_dict_comb_filelinks.csv',
                 'update_match2023_4_dict_comb_filelinks.csv']
df_ori, df2023, df2023_4 = (pd.read_csv(os.path.join(Data_folder, csv_name))
                            for csv_name in filelink_csvs)

# Stack the tables, then keep only the windows that are at least 335 px wide and high.
train_df = pd.concat([df_ori, df2023, df2023_4]).reset_index(drop=True)
large_enough = (train_df['Window_width'] >= 335) & (train_df['Window_height'] >= 335)
train_df = train_df[large_enough].reset_index(drop=True)

# Fixed random_state so the folds are the same on every run.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=3)

# Store the row positions of the training and validation samples of every fold.
split = []
for train, val in kf.split(train_df.index):
    com = dict(train=train, val=val)
    split.append(com)
    for part in ('train', 'val'):
        print(com[part])

## Train the model

In [None]:
# Define the callbacks shared by all folds: learning-rate reduction on plateau,
# early stopping, and TensorBoard logging. The per-fold ModelCheckpoint is
# created inside the training loop because its file name contains the fold number.
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau, TensorBoard

# Folder that receives the checkpoints and the training-history CSV files.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
weight_path = os.path.join(Folder, "tmp/test")
os.makedirs(weight_path, exist_ok=True)

# NOTE(review): both callbacks below watch the *training* loss ('loss'), while the
# checkpoint in the training cell watches 'val_loss' - confirm this is intended.
reduceLROnPlat = ReduceLROnPlateau(monitor='loss', factor=0.33,
                                   patience=10, verbose=1, mode='min',
                                   min_delta=0.0001, cooldown=4, min_lr=1e-16)

early = EarlyStopping(monitor="loss", mode="min", verbose=2, patience=20)

#Use this directory if you are using Google Colaboratory
log_dir = Folder+"/tmp/logs/UNet"
# log_dir=Folder+"/tmp/logs/UNet" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Only arguments that the TF2 / Keras TensorBoard callback still supports are passed.
# The TF1-era options `write_grads`, `embeddings_layer_names` and `embeddings_data`
# are ignored with a warning by TF2 and rejected by newer Keras releases, so they
# are left out; the remaining settings are unchanged.
tensorboard = TensorBoard(log_dir=log_dir, histogram_freq=0, write_graph=True, write_images=False,
                          embeddings_freq=0, embeddings_metadata=None, update_freq='epoch')



In [None]:
#load tensorboard
%reload_ext tensorboard

%tensorboard --logdir /home/zijing/wildebeest/tmp/logs/UNet #put the absolute path here
# %tensorboard --logdir /content/drive/MyDrive/Colab/Wildebeest-UNet/tmp/logs/UNet


In [None]:
# K-fold ensemble training: one U-Net is trained per fold, using that fold as the
# validation set and the remaining folds as the training set. The weights with the
# lowest validation loss of each fold are kept as 'best_weights_fold_<n>.hdf5'.
fold_no = 1          # 1-based fold to start with (raise it to resume a later fold)
i = fold_no - 1      # 0-based position of that fold in `split`

NUMBER_EPOCHS = 60
BATCH_SIZE = 12

#set weight for tversky loss in core/model.py
# NOTE(review): `weight_set` is not passed to anything in this cell - confirm
# where (or whether) it is consumed.
weight_set = 0.8

lr_set = 1.0e-04
drop_set = 0

# A fold whose best validation loss never drops below 1 is retrained. Cap the
# number of attempts so that a model which cannot converge does not retrain forever.
MAX_ATTEMPTS_PER_FOLD = 5
attempt = 0

# The images are not always exactly patch_size x patch_size.
# prep_directory() "crops" them further by recording the window dimension info
# in the directory file; the DataGenerator class is written for that layout.
# If your data is already cropped properly, use SimpleDataGenerator instead.

# Band list and data folder come from the configuration cell so that they cannot
# drift apart from INPUT_BANDS / Data_folder. Each generator gets its own list copy.
train_params = {'patchsize': PATCH_SIZE,
                'batch_size': BATCH_SIZE,
                'input_image_channel': list(INPUT_BANDS),
                'shuffle': True,
                'augment': True,
                'folder': Data_folder}
val_params = {'patchsize': PATCH_SIZE,
              'batch_size': BATCH_SIZE,
              'input_image_channel': list(INPUT_BANDS),
              'shuffle': True,
              'augment': False,
              'folder': Data_folder}

while i < num_folds:
    attempt += 1

    # reset_index() (without drop) keeps the previous row labels in an 'index' column.
    train_data = train_df.iloc[split[i]['train']].reset_index()
    val_data = train_df.iloc[split[i]['val']].reset_index()

    training_generator = DataGenerator(train_data, **train_params)
    validation_generator = DataGenerator(val_data, **val_params)

    # Every fold starts from a freshly built model (no pretrained weights).
    pretrained_weight_path = None
    model = unet(pretrained_weights=pretrained_weight_path, input_size = (PATCH_SIZE,PATCH_SIZE,NUMBER_BANDS),
                 lr = lr_set, drop_out = drop_set)
    model.summary()
    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Checkpoint name pattern: fold number, epoch and validation loss. The same
    # pattern is rebuilt further down to locate the best checkpoint of this fold.
    check_path = os.path.join(weight_path, 'weights.'+str(fold_no)+'_{epoch:02d}-{val_loss:.4f}.hdf5')
    checkpoint = ModelCheckpoint(check_path, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min', save_weights_only = True)
    callbacks_list = [checkpoint, reduceLROnPlat, early, tensorboard] #reduceLROnPlat is not required with adaDelta

    hist = model.fit(training_generator,
                     epochs=NUMBER_EPOCHS,
                     validation_data=validation_generator,
                     callbacks=callbacks_list,
                     verbose=1
                     # use_multiprocessing=True,
                     # workers=8
                    )
    val_loss_history = hist.history['val_loss']

    # Summarize the loss history. Both curves are drawn so that the two legend
    # entries ('train', 'validation') each refer to a plotted line.
    plt.plot(hist.history['loss'])
    plt.plot(val_loss_history)
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

    # Save the training history of this attempt (later attempts overwrite the file).
    history = pd.DataFrame(hist.history)
    history.to_csv(os.path.join(weight_path, 'hist_fold{}.csv'.format(fold_no)))

    # find the best weights with lowest validation loss
    hist_loss = [item for item in val_loss_history if not(pd.isnull(item))]
    if len(hist_loss) == 0:
        print(f'No valid validation loss was recorded for fold {fold_no}; stopping.')
        break
    best_loss = np.min(hist_loss)
    best_epoch = val_loss_history.index(best_loss)+1

    # Criterion of lowest loss: sometimes the loss does not decrease at all,
    # in which case the model of this fold has to be retrained.
    fold_succeeded = best_loss < 1

    if fold_succeeded:
        best_path = os.path.join(weight_path, 'weights.{}_{:02d}-{:.4f}.hdf5'.format(fold_no,best_epoch,best_loss))
        print(best_path)

        # NOTE(review): the model is deleted right after this load, so it presumably
        # only serves as a check that the checkpoint is readable - confirm.
        model.load_weights(best_path)

        #rename the best weights
        os.rename(best_path,os.path.join(weight_path, 'best_weights_fold_{}.hdf5'.format(fold_no)))
    else:
        print("The loss did not decrease significantly. Retrain this model...")

    # Release the model and the Keras session state before the next training run.
    del model
    del hist
    gc.collect()
    tf.keras.backend.clear_session()

    if fold_succeeded:
        # Increase fold number
        fold_no = fold_no + 1
        i = i + 1
        attempt = 0
    elif attempt >= MAX_ATTEMPTS_PER_FOLD:
        raise RuntimeError(
            f'Fold {fold_no} did not reach a validation loss below 1 after '
            f'{MAX_ATTEMPTS_PER_FOLD} attempts; check the data and the hyper-parameters.')



# References
***References***

Ankit. (2020). ankitkariryaa/An-unexpectedly-large-count-of-trees-in-the-western-Sahara-and-Sahel: Paper version (v1.0.0). Zenodo. https://doi.org/10.5281/zenodo.3978185