useful links:

- Data Preparation for Variable Length Input Sequences, URL: https://machinelearningmastery.com/data-preparation-variable-length-input-sequences-sequence-prediction/
- Masking and padding with Keras, URL: https://www.tensorflow.org/guide/keras/masking_and_padding
- Step-by-step understanding LSTM Autoencoder layers, URL: https://towardsdatascience.com/step-by-step-understanding-lstm-autoencoder-layers-ffab055b6352
- Understanding input_shape parameter in LSTM with Keras, URL: https://stats.stackexchange.com/questions/274478/understanding-input-shape-parameter-in-lstm-with-keras
- tf.convert_to_tensor, URL: https://www.tensorflow.org/api_docs/python/tf/convert_to_tensor
- ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int) in Python, URL: https://datascience.stackexchange.com/questions/82440/valueerror-failed-to-convert-a-numpy-array-to-a-tensor-unsupported-object-type

In [None]:
"""
* Copyright 2020, Maestria de Humanidades Digitales,
* Universidad de Los Andes
*
* Developed for the Msc graduation project in Digital Humanities
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# ===============================
# native python libraries
# ===============================
import csv
import datetime
import json
import os
import random
import re
from collections import Counter
from collections import OrderedDict
from collections import deque

# ===============================
# extension python libraries
# ===============================
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import ExcelFile
from pandas import ExcelWriter

# sample handling sklearn package
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer

# # Keras + Tensorflow ML libraries
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Masking
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import UpSampling2D
from tensorflow.keras.layers import GlobalMaxPooling2D

# ===============================
# developed python libraries
# ===============================

In [None]:
def temporalize(data, lookback):
    """
    Convert a 2-D array of records into overlapping look-back windows,
    the layout required as input for an LSTM network.

    Adapted from https://towardsdatascience.com/step-by-step-understanding-lstm-autoencoder-layers-ffab055b6352

    Args:
        data: 2-D numpy array, indexed as data[row, :].
        lookback: number of consecutive rows gathered in each window.

    Returns:
        An object-dtype numpy array with one entry per window. Window i
        holds rows i+2 .. i+lookback+1 of data, each kept as a
        (1, n_columns) slice. It is empty when data has no more than
        lookback + 1 rows.
    """
    output_X = []
    # the number of windows must come from `data` itself, not from a global
    for i in range(len(data) - lookback - 1):
        # gather the records of this window up to the lookback period
        window = [data[[i + j + 1], :] for j in range(1, lookback + 1)]
        output_X.append(np.array(window, dtype="object"))
    return np.array(output_X, dtype="object")

In [None]:
def read_img(img_fpn):
    """Read the image file at `img_fpn` with OpenCV using the
    IMREAD_UNCHANGED flag and return whatever cv2.imread gives back."""
    return cv2.imread(img_fpn, cv2.IMREAD_UNCHANGED)

In [None]:
def std_img(img, minv, maxv):
    """
    Min-max scale an image so that `minv` maps to 0 and `maxv` maps to 1.

    Args:
        img: numpy array with the pixel values.
        minv: lowest possible pixel value (e.g. 0).
        maxv: highest possible pixel value (e.g. 255).

    Returns:
        A float32 numpy array with the same shape as `img`.

    Raises:
        ValueError: if `maxv` equals `minv`, since the range would be zero.
    """
    rangev = maxv - minv
    if rangev == 0:
        raise ValueError("maxv must differ from minv to scale an image")
    # subtract the lower bound first; with minv == 0 this matches a plain division
    ans = (img.astype("float32") - float(minv)) / float(rangev)
    return ans

In [None]:
def pad_img(img, h, w):
    """
    Zero-pad an image so that its first two axes measure (h, w).

    The image is centred; when the missing amount is odd, the extra
    row/column goes to the bottom/right side.

    Args:
        img: numpy array with at least two axes; axes beyond the first
            two (e.g. channels) are left untouched.
        h: target size of axis 0.
        w: target size of axis 1.

    Returns:
        A new numpy array padded with constant zeros.

    Raises:
        ValueError: if the image is already larger than the target size.
    """
    extra_h = int(h) - img.shape[0]
    extra_w = int(w) - img.shape[1]
    if extra_h < 0 or extra_w < 0:
        # a negative pad would wrap around in an unsigned cast
        raise ValueError(
            "image of shape %s does not fit in target size (%s, %s)"
            % (img.shape, h, w)
        )

    # in case the missing amount is an odd number
    top_pad = extra_h // 2
    bottom_pad = extra_h - top_pad
    left_pad = extra_w // 2
    right_pad = extra_w - left_pad

    # pad only the two spatial axes, whatever the number of trailing axes
    pad_width = ((top_pad, bottom_pad), (left_pad, right_pad))
    pad_width += ((0, 0),) * (img.ndim - 2)
    return np.pad(img, pad_width, mode="constant", constant_values=0)

In [None]:
def get_images(rootf, src_df, src_col, tgt_col):
    """
    Load every image listed in a dataframe column and store the pixel
    data in another column.

    Args:
        rootf: folder that each file path in `src_col` is joined to.
        src_df: dataframe with the file paths; it is modified in place.
        src_col: name of the column holding the image file paths.
        tgt_col: name of the column that receives the loaded images.

    Returns:
        The same dataframe object, with `tgt_col` set.
    """
    loaded = [
        read_img(os.path.join(rootf, rel_path))
        for rel_path in src_df[src_col]
    ]
    src_df[tgt_col] = loaded
    return src_df

In [None]:
def padding_images(src_df, src_col, tgt_col, max_shape):
    """
    Pad every image of a dataframe column to a common size and store
    the result in another column.

    Args:
        src_df: dataframe with the images; it is modified in place.
        src_col: name of the column holding the image arrays.
        tgt_col: name of the column that receives the padded images.
        max_shape: sequence whose first two items give the target size.

    Returns:
        The same dataframe object, with `tgt_col` set.
    """
    # NOTE(review): max_shape[1] is used as the target height and
    # max_shape[0] as the target width -- confirm this matches the axis
    # order of the shapes max_shape was computed from.
    target_w, target_h = max_shape[0], max_shape[1]

    padded = [
        pad_img(np.array(raw_img, dtype="object"), target_h, target_w)
        for raw_img in src_df[src_col]
    ]
    src_df[tgt_col] = padded
    return src_df

In [None]:
def standarize_images(src_df, src_col, tgt_col):
    """
    Scale every image of a dataframe column with std_img, using 0 and
    255 as the value bounds, and store the result in another column.

    Args:
        src_df: dataframe with the images; it is modified in place.
        src_col: name of the column holding the image arrays.
        tgt_col: name of the column that receives the scaled images.

    Returns:
        The same dataframe object, with `tgt_col` set.
    """
    scaled = [
        std_img(np.array(raw_img, dtype="object"), 0, 255)
        for raw_img in src_df[src_col]
    ]
    src_df[tgt_col] = scaled
    return src_df

In [None]:
# function to get the max shape in the image dataset
def get_mshape(shape_data, imgt):
    """
    Compute the per-axis maximum over a collection of image shapes.

    Args:
        shape_data: iterable of shapes, each one either a tuple or the
            text of a tuple literal such as "(480, 640, 3)".
        imgt: image type; "rgb" reads three axes per shape, "bw" two.

    Returns:
        A tuple with the largest value found on each axis (all zeros
        for empty input), or None when `imgt` is not a known type.

    Raises:
        ValueError / SyntaxError: if a text entry is not a plain literal.
    """
    import ast

    if imgt == "rgb":
        ndims = 3
    elif imgt == "bw":
        ndims = 2
    else:
        return None

    maxima = [0] * ndims
    for raw_shape in shape_data:
        # the shapes come from a file, so they are parsed as literals
        # only; eval() would execute arbitrary expressions found there
        if isinstance(raw_shape, str):
            tshape = ast.literal_eval(raw_shape)
        else:
            tshape = raw_shape

        for axis in range(ndims):
            if tshape[axis] > maxima[axis]:
                maxima[axis] = tshape[axis]

    return tuple(maxima)

In [None]:
# notebook configuration

# folder names, relative to the project root
dataf = "Data"        # root data folder
imagef = "Img"        # image subfolder
reportf = "Reports"   # report subfolder
stdf = "Std"          # subfolder with the CSV files holding the ML dataframes

# file extensions: dataframe files and image files
fext = "csv"
imgf = "jpg"

# image type suffixes
rgb_sufix = "rgb"
bw_sufix = "bw"

# file name prefixes: standardized data and ML model data
stdprefix = "std-"
mltprefix = "ml-"

# today's date, used in report names
str_date = datetime.date.today().strftime("%d-%b-%Y")

# dataset size suffixes and gallery name
small_sufix = "Img-Data-Small"
large_sufix = "Img-Data-Large"
gallery_prefix = "VVG-Gallery"

# dataframe file names
small_fn = f"{stdprefix}{gallery_prefix}-{small_sufix}.{fext}"
large_fn = f"{stdprefix}{gallery_prefix}-{large_sufix}.{fext}"

# random seed
randseed = 42

# train vs test sample proportions
trainf = 0.80
testf = 0.20

# regex selecting the columns of interest: "ID" and those starting with "std_"
keeper_regex = r"(^ID$)|(^std_)"

# image type to work with
imgt = rgb_sufix

# default working values (swap the two lines below to use the large dataset)
work_fn = small_fn
work_imgt = imgt
work_sufix = small_sufix
# work_fn, work_imgt, work_sufix  = large_fn, imgt, large_sufix

In [None]:
# project root: the parent folder of the current working directory
root_folder = os.getcwd()
root_folder = os.path.split(root_folder)[0]
root_folder = os.path.normpath(root_folder)
print(root_folder)

In [None]:
# variable reading
# dataframe filepath: <root_folder>/<dataf>/<stdf>/<work_fn>
fn_path = os.path.join(root_folder, dataf, stdf, work_fn)
print(fn_path)

In [None]:
# reading training data
# loading the comma-separated, UTF-8 file with the python parsing engine
source_df = pd.read_csv(
                fn_path,
                sep=",",
                encoding="utf-8",
                engine="python",
            )

In [None]:
# checking everything is all right
source_df.head(5)
# checking the dataframe
source_df.info()

In [None]:
# reading images from folder and loading images into df
# working variables: column with the image paths, column that receives
# the pixel data, and column with the image shapes
src_col = work_imgt + "_img"
tgt_col = work_imgt + "_img" + "_data"
work_shape = work_imgt + "_shape"

print(src_col, tgt_col)
# the paths in src_col are joined to root_folder before being read
source_df = get_images(root_folder, source_df, src_col, tgt_col)

In [None]:
source_df.info()

In [None]:
# searching the biggest shape in the image files
print(work_shape)
shape_data = source_df[work_shape]
max_shape = get_mshape(shape_data, work_imgt)
print(max_shape)

In [None]:
# padding training data according to max shape of the images in gallery
pad_prefix = "pad_"
conv_prefix = "cnn_"
# source: the loaded pixel data; target: same name with the pad/cnn prefixes
src_col = work_imgt + "_img" + "_data"
tgt_col = pad_prefix + conv_prefix + src_col

print(src_col, tgt_col)
source_df = padding_images(source_df, src_col, tgt_col, max_shape)

In [None]:
source_df.info()
# test = source_df["pad_cnn_rgb_img_data"].value_counts()
# print(test)

In [None]:
# standardizing the loaded (unpadded) images into a new "std_" column
# working variables
print("standarizing regular images...")
src_col = work_imgt + "_img" + "_data"
tgt_col = "std_" + src_col

source_df = standarize_images(source_df, src_col, tgt_col)

In [None]:
# standardizing the padded images into a new "std_" column
print("standarizing padded images...")
src_col = pad_prefix + conv_prefix + work_imgt + "_img" + "_data"
tgt_col = "std_" + src_col

source_df = standarize_images(source_df, src_col, tgt_col)

In [None]:
source_df.info()

In [None]:
# selecting data to train
# want to keep the "ID" column and the columns starting with "std_"
df_columns = list(source_df.columns)
print("------ original input/interested columns ------")
print(df_columns)

# keep only the columns matched by keeper_regex
keep_columns = [i for i in df_columns if re.search(keeper_regex, i)]

print("\n\n------ Interesting columns ------")
print(keep_columns)

In [None]:
# creating the training dataframe with the selected columns only
train_df = pd.DataFrame(source_df, columns=keep_columns)

In [None]:
# checking the train dataframe
train_df.head(5)
train_df.info()

In [None]:
# names of the standardized image columns: unpadded and padded
regular_img_col = "std_" + work_imgt + "_img" + "_data"
padded_img_col = "std_" + pad_prefix + conv_prefix + work_imgt + "_img" + "_data"

In [None]:
# the padded images are the ones used for training
working_img_col = padded_img_col

In [None]:
# creating Train/Test sample
# getting the X, y to train; as this is an autoencoder both are the same
# NOTE(review): both arrays are built with dtype="object" -- confirm the
# later tensor conversion accepts this dtype
X = np.array([np.array(i, dtype="object") for i in train_df[working_img_col]], dtype="object")
y = np.array([np.array(j, dtype="object") for j in train_df[working_img_col]], dtype="object")

# X = train_df[padded_col]
# y = train_df[padded_col]

print(X.shape)
print(y.shape)

In [None]:
# inspecting the container types and shape of X
print(type(X[0]))
print(type(X[0][0]))
print(X.shape)

In [None]:
# inspecting the container types and shape of y
print(type(y[0]))
print(type(y[0][0]))
print(y.shape)

In [None]:
# shape of the first sample
print(X[0].shape)
print(y[0].shape)

In [None]:
# aliases used for the CNN sample
X_cnn = X
y_cnn = y

In [None]:
# dividing according to train/test proportions, with a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(X_cnn, y_cnn, train_size = trainf, test_size = testf, random_state = randseed)

In [None]:
# checking shapes
print("training shape (X, y)")
print("X: ", X_train.shape, " y: ", y_train.shape)

print("testing shape (X, y)")
print("X: ", X_test.shape, " y: ", y_test.shape)

# types at each nesting level of the four arrays
print("data types")
print(type(X_train), type(X_test), type(y_train), type(y_test))
print(type(X_train[0]), type(X_test[0]), type(y_train[0]), type(y_test[0]))
print(type(X_train[0][0]), type(X_test[0][0]), type(y_train[0][0]), type(y_test[0][0]))
print(type(X_train[0][0][0]), type(X_test[0][0][0]), type(y_train[0][0][0]), type(y_test[0][0][0]))

In [None]:
# preparing the arrays handed to the model
if working_img_col == regular_img_col:

    # unpadded images: the numpy arrays are passed through as they are
    Xtf_train = X_train
    Xtf_test = X_test
    ytf_train = y_train
    ytf_test = y_test

elif working_img_col == padded_img_col:
    # padded images: the arrays are converted to float64 tensors
    print("using", working_img_col)
    Xtf_train = tf.convert_to_tensor(X_train, dtype="float64")
    Xtf_test = tf.convert_to_tensor(X_test, dtype="float64")
    ytf_train = tf.convert_to_tensor(y_train, dtype="float64")
    ytf_test = tf.convert_to_tensor(y_test, dtype="float64")

# y_tensor = tf.convert_to_tensor(y, dtype=tf.float32)

In [None]:
# CNN autoencoder hyper-parameters

# number of filters of the first encoder / last decoder convolution
filters = 32
print("CNN filter number:", filters)

# per-sample input shape, taken from the first training image
in_shape = X_train[0].shape
# in_shape = (None, None, 3)
# in_shape = (None, None, 1)
print("Input training shape:", in_shape)

# batch size: 5% of the training samples, plus one
bs = int(X_train.shape[0]*0.05)+1
print("CNN learning batch size:", bs)

# convolution kernel size and pooling / upsampling size
ksize = (3,3)
psize = (2,2)
print("CNN kernel size:", ksize)
print("CNN pad size:", psize)

# neurons/processing units size in the dense layer (THIS SHOULD BE SOM!!!!)
mdn = 8*8*3
mid_reshape = (8,8,3)
print("Dense middle processing units:", mdn)
# dn2 = len(XB_set[0])*SECURITY_FACTOR

# output shape, same as the input because this is an autoencoder
out_shape = X_train[0].shape
print("Output prediction shape:", out_shape)

# activation functions
inn = "relu"
act = "relu"
# the targets are pixel values scaled to [0, 1] and the loss is
# binary_crossentropy, so every output value needs its own independent
# [0, 1] range; softmax would force the channels of a pixel to sum to 1
out = "sigmoid"

# dropout rate
ldrop = 0.2

# padding policy
pad = "same"

# random seed
randseed = 42

# parameters to compile model
# loss function
# ls = "mean_squared_error"
# ls = "categorical_crossentropy"
ls = "binary_crossentropy"

# optimization function
opti = "adam"
# evaluation score
met = ["accuracy"]

# parameters to execute training
# verbose mode
ver = 1
# training epochs
epo = 300
print("training epochs:", epo)

In [None]:
# model layers
# to avoid overfit you need to use dropout in some places
# NOTE(review): the options below describe LSTM stacks, not the CNN
# layers defined in this cell -- presumably kept from an earlier variant
# options:
# 1) Mask -> Drop -> LSTM1 -> LSTM2 -> LSTM3 -> Drop -> Dense -> Drop -> LSTM3 -> LSTM2 -> LSTM1 -> Drop -> TimeDistDense
# 1) Mask -> Drop -> LSTM1 -> LSTM2 -> LSTM3 -> Dense -> LSTM3 -> LSTM2 -> LSTM1 -> Drop -> TimeDistDense
# 2) Mask -> Drop -> LSTM1 -> LSTM2 -> LSTM2 -> Dense -> LSTM2 -> LSTM2 -> LSTM1 -> Drop -> TimeDistDense
# 3) Mask -> Drop -> LSTM1 -> LSTM2 -> Drop -> Dense -> Drop -> LSTM2 -> LSTM1 -> Drop -> TimeDistDense
# 3) Mask -> Drop -> LSTM1 -> Drop -> Dense -> Drop -> STM1 -> Drop -> TimeDistDense
# 5) Mask -> Drop -> LSTM1 -> Drop -> Dense -> Drop -> LSTM1 -> TimeDistDense

# layers of the autoencoder, in order: three Conv/Pool/Drop encoder
# stages, a middle dense layer, three Conv/Upsample/Drop decoder stages
# and an output convolution
cnn_layers = (

    # input layer (padding and prep)
    Input(shape = in_shape, name = "LayIn"),

    # first encoder stage: `filters` filters
    Conv2D(filters, ksize, activation = act, padding = pad, input_shape = in_shape, name = "EnConv1"),
    MaxPooling2D(psize, padding = pad, name = "EnPool1"),
    Dropout(ldrop, name = "EnDrop1"),

    # second encoder stage: half the filters
    Conv2D(int(filters/2), ksize, activation=act, padding = pad, name = "EnConv2"),
    MaxPooling2D(psize, padding = pad, name = "EnPool2"),
    Dropout(ldrop, name = "EnDrop2"),

    # third encoder stage: a quarter of the filters
    Conv2D(int(filters/4), ksize, activation=act, padding = pad, name = "EnConv3"),
    MaxPooling2D(psize, padding = pad, name = "EnPool3"),
    Dropout(ldrop, name = "EnDrop3"),

    # #from 2D to 1D
    # Flatten(name = "LayFlat"),
    # # mid dense encoding layer
    # # dense layer for abstraction (THIS SHOULD COULD SOM!!!!)
    # the Flatten/Reshape steps around this layer are commented out, so it
    # receives the output of the last encoder stage directly
    Dense(mdn, activation = act, name = "DenseMid"),
    # Dropout(ldrop, name = "MidDrop"),
    # # from 1D to 2D
    # Reshape(mid_reshape, name = "layReshape"),
    
    # first decoder stage: a quarter of the filters
    Conv2D(int(filters/4), ksize, activation = act, padding = pad, name = "DeConv1"),
    UpSampling2D(psize, name = "DeUpsam1"),
    Dropout(ldrop, name = "DeDrop1"),

    # second decoder stage: half the filters
    Conv2D(int(filters/2), ksize, activation = act, padding = pad, name = "DeConv2"),
    UpSampling2D(psize, name = "DeUpsam2"),
    Dropout(ldrop, name = "DeDrop2"),

    # third decoder stage: `filters` filters
    Conv2D(filters, ksize, activation = act, padding = pad, name = "DeConv3"),
    UpSampling2D(psize, name = "DeUpsam3"),
    Dropout(ldrop, name = "DeDrop3"),
    # output layer: 3 filters with the `out` activation
    # NOTE(review): three poolings followed by three upsamplings -- confirm
    # the output size matches the input size for the padded image shape
    # Reshape(inshape),
    Conv2D(3, ksize, activation = out, padding = pad, input_shape = out_shape, name = "LayOut"),
)

In [None]:
# defining model: a sequential stack of the layers listed in cnn_layers
cnn_model = Sequential(cnn_layers)
cnn_model.model_name = "CNN_Autoencoder"

In [None]:
# compile model with the configured loss, optimizer and metrics
cnn_model.compile(loss = ls, optimizer = opti, metrics = met)
cnn_model.summary()

In [None]:
# early stopping condition, because these are tests: watches the
# validation accuracy with a patience of 30 epochs and restores the
# best weights
EarlyStopCNN_Acc = EarlyStopping(monitor = "val_accuracy", min_delta = 0.01, patience = 30, verbose = 1, mode = "max", restore_best_weights = True)
In [None]:
# training model: the test sample doubles as validation data and the
# early stopping callback can end the run before `epo` epochs
history_cnn = cnn_model.fit(
    x = Xtf_train, 
    y = ytf_train,
    epochs = epo, 
    verbose = ver, 
    workers = 6,
    batch_size = bs, 
    callbacks = [EarlyStopCNN_Acc],
    shuffle = False,
    use_multiprocessing = True,
    validation_data = (Xtf_test, ytf_test),
)

In [None]:
# evaluating model on the test sample
cnn_eval = cnn_model.evaluate(x = Xtf_test, y = ytf_test)

In [None]:
# general evaluation: the loss first, then the metric given at compile time
print("avg loss: ", cnn_eval[0])
print("avg acc: ", cnn_eval[1])

In [None]:
# testing model
cnn_results = cnn_model.predict(X_test, batch_size = bs, verbose = 1)#, batch_size = bs)

In [None]:
# saving model
wdir = os.getcwd()
models_folder = "Models"
model_fname = "vvg_cnn_autoencoder"
model_fpn = os.path.join(root_folder, dataf, models_folder, model_fname)
print("The trained  model is:", model_fpn)
lstm_model.save(model_fpn)

In [None]:
# cheking test shape
print(lstm_results.shape)

In [None]:
# reporting results
# reporte de entrenamiento para el modelo
# base de la figura
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16,8))

# datos de la figura en de perdida y precision
ax1.plot(history_lstm.history["loss"], 'green', label = "Train Loss")
ax1.plot(history_lstm.history["val_loss"], 'royalblue', label = "Test Loss")
ax2.plot(history_lstm.history["accuracy"], 'green', label = "Train Accuracy")
ax2.plot(history_lstm.history["val_accuracy"], 'royalblue', label = "Test Accuracy")

# leyenda de la grafica
fig.suptitle("LEARNING BEHAVIOR")
ax1.grid(True)
ax2.grid(True)
ax1.set_title("Loss")
ax2.set_title("Accuracy")
ax1.set(xlabel = "Epoch [cycle]", ylabel = "loss [%]")
ax2.set(xlabel = "Epoch [cycle]", ylabel = "Acc [%]")
fig.legend()
fig.show()

In [None]:
# saving image in png file
work_learn_img = model_fname + "-" + str_date + "-" + work_sufix + "-learn-curve." + imgext
img_fpn = os.path.join(root_folder, dataf, reportf, work_learn_img)
print(os.path.exists(img_fpn))
print(img_fpn)

In [None]:
# saving rendered image
fig.savefig(img_fpn, dpi = fig.dpi)

In [None]:
# demo text kept for future use; nothing in the cells above reads it
lstm_test_text = """
                There is a khan's daughter
                Who steps on in a SWINGING manner
                And has the marks of twenty tigers,
                Who steps on in a GRACEFUL manner
                And has the marks of thirty manner
                Who steps on in an ELEGANT manner
                And has the marks of forty tigers,
                Who steps on in a DELICATE manner
                And has the marks of fifty tigers.
                """