useful links:

- Data Preparation for Variable Length Input Sequences, URL: https://machinelearningmastery.com/data-preparation-variable-length-input-sequences-sequence-prediction/
- Masking and padding with Keras, URL: https://www.tensorflow.org/guide/keras/masking_and_padding
- Step-by-step understanding LSTM Autoencoder layers, URL: https://towardsdatascience.com/step-by-step-understanding-lstm-autoencoder-layers-ffab055b6352
- XXXX, URL: xxxxx
- XXXX, URL: xxxxx
- XXXX, URL: xxxxx

In [226]:
"""
* Copyright 2020, Maestria de Humanidades Digitales,
* Universidad de Los Andes
*
* Developed for the Msc graduation project in Digital Humanities
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# ===============================
# native python libraries
# ===============================
import ast
import csv
import json
import os
import random
import re
from collections import Counter
from collections import OrderedDict
from collections import deque
from datetime import datetime

# ===============================
# extension python libraries
# ===============================
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
import matplotlib.pyplot as plt

# natural language processing packages
import gensim
from gensim import models
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# downloading the NLTK data packages used by this notebook
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# sample handling sklearn package
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer

# # Keras + Tensorflow ML libraries
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import GlobalMaxPooling2D

# ===============================
# developed python libraries
# ===============================

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [227]:
def temporalize(X, y, lookback):
    '''
    Convert input data into the 3-D windowed
    layout required for an LSTM network.

    For every start offset ``i`` in ``range(len(X) - lookback - 1)`` a window
    of ``lookback`` consecutive rows ``X[i+2] .. X[i+lookback+1]`` is gathered
    and paired with ``y[i+lookback+1]``, i.e. the target stored at the index
    of the window's last row.

    X must support 2-D fancy indexing (``X[[k], :]``); for a 2-D numpy array
    every gathered row keeps its leading axis (shape ``(1, n_columns)``).

    Returns a tuple ``(output_X, output_y)`` of plain Python lists.

    taken from https://towardsdatascience.com/step-by-step-understanding-lstm-autoencoder-layers-ffab055b6352
    '''
    windows = []
    targets = []
    n_windows = len(X) - lookback - 1
    for start in range(n_windows):
        last_row = start + lookback + 1
        # Gather the `lookback` records that end at `last_row`
        window = [X[[row], :] for row in range(start + 2, last_row + 1)]
        windows.append(window)
        targets.append(y[last_row])
    return windows, targets

In [228]:
# notebook configuration
# root data folder
dataf = "Data"

# subfolder with the OCR-transcribed txt data
targetf = "Target"

# subfolder with the CSV files containing the ML pandas dataframe
stdf = "Std"

# file extensions: dataframe and dictionary
fext = "csv"
dext = "dict"

# dataframe file names (small and large corpus)
small_fn = ".".join(("std-VVG-Gallery-Text-Data-Small", fext))
large_fn = ".".join(("std-VVG-Gallery-Text-Data-Large", fext))

# dictionary file names (small and large corpus)
sdict_fn = ".".join(("VVG-Gallery-Text-Data-Small", dext))
ldict_fn = ".".join(("VVG-Gallery-Text-Data-Large", dext))

# random seed
randseed = 42

# window size bounds and their integer midpoint
min_wsize, max_wsize = 20, 30
def_wsize = (min_wsize + max_wsize) // 2

# sample distribution: train vs test fractions
trainf, testf = 0.80, 0.20

# regex selecting the columns of interest (ID plus anything starting with STD_)
keeper_regex = r"(^ID$)|(^STD_)"

# active inputs: the small corpus by default, swap in the large one below
work_fn, work_dict = small_fn, sdict_fn
# work_fn = large_fn
# work_dict = ldict_fn

In [229]:
# resolving input locations
# folder with the standardized inputs, relative to the current working directory
std_dir = os.path.join(os.getcwd(), dataf, stdf)

# dataframe filepath
fn_path = os.path.join(std_dir, work_fn)
print(fn_path)

# gensim dictionary filepath
dict_path = os.path.join(std_dir, work_dict)
print(dict_path)

c:\Users\Felipe\Documents\GitHub\sa-artea\VVG-LSTM-TextAutoencoder\Data\Std\std-VVG-Gallery-Text-Data-Small.csv
c:\Users\Felipe\Documents\GitHub\sa-artea\VVG-LSTM-TextAutoencoder\Data\Std\VVG-Gallery-Text-Data-Small.dict


In [230]:
# reading words dictionary
# loading the gensim words dictionary saved at dict_path.
# `load` is called on the class itself, so no empty Dictionary has to be
# built just to be replaced by the loaded one.
# NOTE(review): gensim's load presumably unpickles the file - only point
# dict_path at files you trust.
vvg_dict = gensim.corpora.Dictionary.load(dict_path)
print(vvg_dict)

Dictionary(660 unique tokens: ['1', '11', '16', '1853', '1885']...)


In [231]:
# reading training data
# loading the CSV file at fn_path into a dataframe
read_options = dict(sep=",", encoding="utf-8", engine="python")
source_df = pd.read_csv(fn_path, **read_options)

In [232]:
# checking everything is all right: preview the first 5 rows
source_df.head(5)

Unnamed: 0,ID,CORE_TEXT,EXT_TEXT,complementary colours,this torso of Venus,drew,Van Gogh wrote,standing torso of Venus,he wrote,The Potato Eaters,...,1890,cityscape,1881,Brussels,TOKENS,PREP_TOKENS,BOWS_TOKENS,IDX_TOKENS,TFIDF_TOKENS,STD_DVEC_TOKENS
0,s0004V1962r,Head of a Woman Vincent van Gogh (1853 - 1890)...,F0388r JH0782 s0004V1962r 43.5 cm x 36.2 cm,localhost,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,"['head', 'of', 'a', 'woman', 'vincent', 'van',...","['head', 'woman', 'vincent', 'van', 'gogh', '1...","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1...","[39, 73, 70, 69, 38, 3, 5, 52, 49, 4, 53, 28, ...","[(0, 0.0014366751686058629), (1, 0.20742760148...","[0.12089483268014274, 0.10457729123726997, 0.1..."
1,s0006V1962,Head of a Woman Vincent van Gogh (1853 - 1890)...,"F0160 JH0722 s0006V1962 43.2 cm x 30.0 cm, 2.2...",https://www.vangoghmuseum.nl/en/stories/lookin...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,"['head', 'of', 'a', 'woman', 'vincent', 'van',...","['head', 'woman', 'vincent', 'van', 'gogh', '1...","[(0, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1...","[39, 73, 70, 69, 38, 3, 5, 52, 81, 4, 53, 28, ...","[(0, 0.0006632619049359525), (2, 0.10441989952...","[0.11162570186015741, 0.09655924305614885, 0.1..."
2,s0010V1962,Portrait of an Old Woman Vincent van Gogh (185...,"F0174 JH0978 s0010V1962 50.5 cm x 39.8 cm, 68....",localhost,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,"['portrait', 'of', 'an', 'old', 'woman', 'vinc...","['portrait', 'old', 'woman', 'vincent', 'van',...","[(0, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1...","[151, 150, 73, 70, 69, 38, 3, 5, 127, 33, 4, 5...","[(0, 0.0007658221722692316), (4, 0.10636887288...","[0.22113928023709148, 0.30323802477749107, 0.1..."
3,s0056V1962,"Torso of Venus Vincent van Gogh (1853 - 1890),...","F0216a JH1054 s0056V1962 46.0 cm x 38.0 cm, 55...",localhost,https://www.vangoghmuseum.nl/en/collection/s01...,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,"['torso', 'of', 'venus', 'vincent', 'van', 'go...","['torso', 'venus', 'vincent', 'van', 'gogh', '...","[(0, 1), (3, 1), (5, 1), (6, 1), (7, 1), (11, ...","[190, 193, 70, 69, 38, 3, 5, 55, 44, 160, 53, ...","[(0, 0.0008712743083825235), (6, 0.00175770197...","[0.11027555366028634, 0.08752905610112195, 0.0..."
4,s0058V1962,Woman with a Mourning Shawl Vincent van Gogh (...,"F0161 JH0788 s0058V1962 45.5 cm x 33.0 cm, 60 ...",localhost,localhost,https://www.vangoghmuseum.nl/en/collection/d00...,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,"['woman', 'with', 'a', 'mourning', 'shawl', 'v...","['woman', 'mourning', 'shawl', 'vincent', 'van...","[(0, 1), (3, 1), (4, 3), (5, 1), (6, 1), (7, 1...","[73, 205, 210, 70, 69, 38, 3, 5, 52, 48, 49, 4...","[(0, 0.0006110115453988034), (4, 0.12729967560...","[0.1334287674669934, 0.43723399354449305, 0.43..."


In [233]:
# checking the dataframe: column names, non-null counts and dtypes
source_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 56 columns):
 #   Column                                                                                              Non-Null Count  Dtype 
---  ------                                                                                              --------------  ----- 
 0   ID                                                                                                  59 non-null     object
 1   CORE_TEXT                                                                                           59 non-null     object
 2   EXT_TEXT                                                                                            59 non-null     object
 3   complementary colours                                                                               59 non-null     object
 4   this torso of Venus                                                                                 59 non-null     object
 

In [234]:
# selecting data to train
# only the columns matching keeper_regex are kept
df_columns = list(source_df)
print("------ original input/interested columns ------")
print(df_columns)

# filter the column names with the compiled keeper pattern
keeper_pattern = re.compile(keeper_regex)
keep_columns = list(filter(keeper_pattern.search, df_columns))

print("\n\n------ Interesting columns ------")
print(keep_columns)


------ original input/interested columns ------
['ID', 'CORE_TEXT', 'EXT_TEXT', 'complementary colours', 'this torso of Venus', 'drew', 'Van Gogh wrote', 'standing torso of Venus', 'he wrote', 'The Potato Eaters', 'which he painted a number of times', 'He would use this technique more than once in his later work', 'Head of a Woman', 'Head of a Man', 'Head of a Woman 1', 'Head of a Woman 2', 'Head of a Woman 3', 'Torso of Venus', 'Horse', 'Torso of Venus 1', 'Male Torso', 'Kneeling Ecorche', 'Torso of Venus 2', 'Portrait of a Prostitute', 'Head of an Old Man', 'Head of a Woman 4', 'Plaster Cast of a Womans Torso', 'Plaster Cast of a Womans Torso 1', 'Torso of Venus 3', 'Woman Sewing', 'Letter from Vincent van Gogh to Theo van Gogh with sketches of Head of a Woman and Head of a Woman', 'Head of a Prostitute', '1885', 'Nuenen', 'painting', 'heads', 'Antwerp', 'portrait', '1886', 'Paris', 'still life', 'nude', '1884', '1887', 'animal art', 'drawing', '1890', 'cityscape', '1881', 'Brussels'

In [235]:
# creating the training dataframe
# built from source_df, restricted to the columns listed in keep_columns
train_df = pd.DataFrame(source_df, columns=keep_columns)

In [236]:
# getting the column with the relevant data to train
# the first column whose name starts with STD_ is used as the dense vector column
std_columns = [name for name in df_columns if re.search(u"^STD_", name)]
dvector_col = std_columns[0]
print("Dense vector column in dataframe: ", str(dvector_col))

Dense vector column in dataframe:  STD_DVEC_TOKENS


In [237]:
# fix column data type
# after the CSV round-trip every dense vector is the text of a Python list,
# so each cell is parsed back into a numpy array.
# ast.literal_eval only accepts Python literals; the previous eval() would
# have executed any expression stored in the file.
work_corpus = train_df[dvector_col]
dvec_std_corpus = [np.array(ast.literal_eval(dvector)) for dvector in work_corpus]

# object dtype: one array entry per document vector
dvec_std_corpus = np.array(dvec_std_corpus, dtype="object")

In [238]:
# changing type in dataframe
# overwrite the text column with the numpy arrays held in dvec_std_corpus
train_df[dvector_col] = dvec_std_corpus

In [239]:
# checking the train dataframe
# info() prints its report itself; head() goes last because a notebook only
# renders the value of a cell's final expression (it was silently discarded
# when it came first).
train_df.info()
train_df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               59 non-null     object
 1   STD_DVEC_TOKENS  59 non-null     object
dtypes: object(2)
memory usage: 1.0+ KB


In [240]:
# padding training data according to max length of text corpus
pad_prefix = "PAD_"
recurrent_prefix = "LSTM_"

# getting the corpus dense vectors
dvec_std_corpus = train_df[dvector_col]

# converting the column into an object array of arrays.
# This name used to exist only in a commented-out line, so on a fresh kernel
# the print below raised a NameError.
npdvec_std_corpus = np.array(
    [np.array(x, dtype="object") for x in dvec_std_corpus],
    dtype="object",
)
print(npdvec_std_corpus.shape)

# padding the representation; padding="post" fills at the end of each vector
# NOTE(review): dtype='object' is kept unchanged; a numeric dtype is probably
# needed before these arrays can be fed to a model - confirm.
padded_corpus = pad_sequences(npdvec_std_corpus, dtype='object', padding="post")
print(padded_corpus.shape)

# creating the new column and saving padded data (one padded vector per row)
padded_col = pad_prefix + dvector_col
train_df[padded_col] = list(padded_corpus)
print(padded_corpus.shape)

(59,)
(59, 140)
(59, 140)


In [241]:
# checking the train dataframe
# info() prints its report itself; head() goes last because a notebook only
# renders the value of a cell's final expression (it was silently discarded
# when it came first).
train_df.info()
train_df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   59 non-null     object
 1   STD_DVEC_TOKENS      59 non-null     object
 2   PAD_STD_DVEC_TOKENS  59 non-null     object
dtypes: object(3)
memory usage: 1.5+ KB


In [242]:
# creating Train/Test sample
# autoencoder setup: inputs (X) and targets (y) are both built from the padded column
padded_rows = train_df[padded_col]
X = np.array([np.array(row, dtype="object") for row in padded_rows], dtype="object")
y = np.array([np.array(row, dtype="object") for row in padded_rows], dtype="object")

# despite its name, `timesteps` holds the full shape tuple of X
timesteps = X.shape

for shape in (timesteps, X[0].shape, y.shape):
    print(shape)

(59, 140)
(140,)
(59, 140)


In [243]:
# X, y = temporalize(X, y, timesteps)

In [244]:
# reshaping for LSTM model: append a trailing axis of size 1 to X and y
# (both reshapes use the dimensions of X)
n_rows, n_cols = X.shape[0], X.shape[1]
X_lstm = X.reshape((n_rows, n_cols, 1))
y_lstm = y.reshape((n_rows, n_cols, 1))
print(X_lstm.shape)
print(y_lstm.shape)

# X_train = train_inputs.reshape((split,3,2))
# X_test = X_test.reshape((test_inputs.shape[0], 3, 2))

(59, 140, 1)
(59, 140, 1)


In [245]:
# creating the column for the data reshaped for the LSTM
# the column name is the recurrent prefix followed by the dense vector column name
lstm_col = "".join((recurrent_prefix, dvector_col))
# one entry of X_lstm (along its first axis) per dataframe row
train_df[lstm_col] = [sample for sample in X_lstm]

In [246]:
# checking the train dataframe
# info() prints its report itself; head() goes last because a notebook only
# renders the value of a cell's final expression (it was silently discarded
# when it came first).
train_df.info()
train_df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ID                    59 non-null     object
 1   STD_DVEC_TOKENS       59 non-null     object
 2   PAD_STD_DVEC_TOKENS   59 non-null     object
 3   LSTM_STD_DVEC_TOKENS  59 non-null     object
dtypes: object(4)
memory usage: 2.0+ KB


In [247]:
# dividing according to train/test proportions
# NOTE(review): the split uses the 2-D X / y, not the reshaped X_lstm / y_lstm -
# confirm this is the intended input for the model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=trainf,
    test_size=testf,
    random_state=randseed,
)

In [248]:
# defining model

In [249]:
# training model

In [250]:
# saving model

In [251]:
# testing model

In [252]:
# saving results