# VINCENT VAN GOGH STANDARD DATA TEXT PROCESS

This script takes the gallery's text from the local data folder and processes it into a standard representation with Natural Language Processing and Word2Vec methods.

The process goes as follows:

1. Load the CSV into a pandas DataFrame.
2. Transform text columns into words lists.

    2.1 Clean the words in the text list.
    
4. Remove unnecessary words from the word lists with the stop-word dictionary.
5. 
6. Transform URLs columns into standard categories.

https://statsmaths.github.io/stat289-f18/solutions/tutorial19-gensim.html


1.	Each TXT page is a record in the input representation of the model.
2.	Removing unnecessary words with the Spanish stop dictionary.
3.	Recognizing a set of unique words in documents.
4.	Transforming the unique words into columns of the input model.
5.	Vectorizing each word of the document by frequency of appearance (word2vec).

**NOTE:** Because GitHub has limited storage capabilities and the digital archive data is private, the data in the folder _\\Data\\_ is just a sample for the code to work without errors.

In [153]:
"""
* Copyright 2020, Maestria de Humanidades Digitales,
* Universidad de Los Andes
*
* Developed for the Msc graduation project in Digital Humanities
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# ===============================
# native python libraries
# ===============================
import os
import copy
import sys
import csv
import re
import pprint

# ===============================
# extension python libraries
# ===============================
import pandas as pd
import numpy as np
import gensim
from gensim import models
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# downloading the NLTK data packages
# 'stopwords' and 'wordnet' back the stopwords.words('english') lookup and the
# WordNetLemmatizer used further down; 'punkt' is not referenced by any code
# visible in this notebook
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# ===============================
# developed python libraries
# ===============================


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [154]:
# notebook variable definitions
# root folder
dataf = "Data"

# subfolder with the OCR transcribed txt data
prepf = "Prep"

# subfolder with the CSV files containing the ML pandas dataframe
stdf = "Std"

# dataframe file extension
fext = "csv"

# dictionary extension
dext = "dict"

# dataframe file name
small_fn = "VVG-Gallery-Text-Data-Small" + "." + fext
large_fn = "VVG-Gallery-Text-Data-Large" + "." + fext


# NOTE: the patterns are raw strings so that \w reaches the regex engine
# untouched; in a plain string "\b" is a backspace character and "\w" is an
# invalid escape sequence.

# regex for _TEXT
text_re = r"\w+_TEXT"

# regex for ID
id_re = r"ID{1}"

# regex for others (URLs|Categories): every column name that neither id_re nor
# text_re matches from its first character. The previous pattern placed "^"
# after "ID", so it could never match and the category list was always empty.
cat_re = r"^(?!ID{1}|\w+_TEXT).+"

# default values
# work_fn = small_fn
work_fn = large_fn

In [155]:

# default dataframe schema: the column names below, in this order
dfSchema = list((
    # unique key for the text file
    "ID",
    # text file local path
    "FILE_PATH",
    # name of the text original document
    "DOC_NAME",
    # OCR extracted text
    "TEXT",
    # author of the document
    "AUTHOR",
    # learning target label, associated with the AUTHOR
    "LABEL",
    # cleaned text extracted from the document
    "CLEAN_TEXT",
    # text divided by sentences
    "SENTENCES",
    # number of sentences in the text
    "NUM_SENTENCES",
    # text divided by words
    "WORDS",
    # number of words in the text
    "NUM_WORDS",
    # unique tokens extracted from the text
    "TOKENS",
    # number of unique tokens in the text
    "NUM_TOKENS",
))

In [156]:
# reading the working CSV file into a pandas dataframe
# the file lives at <current working directory>/<dataf>/<prepf>/<work_fn>
fn_path = os.path.join(os.getcwd(), dataf, prepf, work_fn)
print(fn_path)

# comma separated, UTF-8 encoded, parsed with the python engine
text_df = pd.read_csv(fn_path, sep=",", encoding="utf-8", engine="python")

c:\Users\Felipe\Documents\GitHub\sa-artea\VVG-Gallery-StdDataProcessor\Notebooks\Data\Prep\VVG-Gallery-Text-Data-Large.csv


In [157]:
# column names of the loaded dataframe
df_cols = list(text_df.columns)

# compiled patterns for the text, ID and URLs/Category column families
text_r = re.compile(text_re)
id_r = re.compile(id_re)
cat_r = re.compile(cat_re)

# a column is kept when its pattern matches from the start of the column name
text_cols = [col for col in df_cols if text_r.match(col)]
id_cols = [col for col in df_cols if id_r.match(col)]
cat_cols = [col for col in df_cols if cat_r.match(col)]

In [158]:
# the working corpus is the content of the first matched text column
text_corpus = text_df[text_cols[0]].tolist()
print(len(text_corpus))

964


In [159]:
# working copy of the corpus, with every text converted to lower case
text_clean = [text.lower() for text in text_corpus]

print(len(text_clean), len(text_corpus))

964 964


In [160]:
# cleaning and preprocessing text for word2vec
# the patterns are compiled once, outside the loop
non_word_re = re.compile(r"\W")
split_number_re = re.compile(r"(\d{1,3}) (\d{1,2})")
spaces_re = re.compile(r"\s+")

# enumerate replaces the range loop and its redundant manual counter;
# each entry of text_clean is overwritten in place with its cleaned version
for i, text in enumerate(text_clean):
    # removing special characters
    text = non_word_re.sub(" ", text)
    # finding missing points between numbers
    text = split_number_re.sub(r"\1.\2", text)
    # removing excessive spaces
    text = spaces_re.sub(" ", text)
    text_clean[i] = text

print(len(text_clean), len(text_corpus))

964 964


In [161]:
# tokenising text: each cleaned text is split on whitespace into a list of words
text_tokens = [text.split() for text in text_clean]

print(len(text_tokens), len(text_clean), len(text_corpus))

964 964 964


In [162]:
# removing stopwords
# the English stop-word list is loaded once and kept as a set; it used to be
# fetched through stopwords.words('english') and scanned as a list for every
# single token
english_stopwords = set(stopwords.words('english'))

# one filtered token list per text, keeping the original token order
text_nsw_tokens = [
    [token for token in tokens if token not in english_stopwords]
    for tokens in text_tokens
]

print(len(text_nsw_tokens), len(text_tokens), len(text_clean), len(text_corpus))

964 964 964 964


In [163]:
# lemmatization of the text
token_lematizer = WordNetLemmatizer()

# one list of lemmas per text: every remaining token goes through the
# lemmatizer, keeping the token order
text_lemmas = [
    [token_lematizer.lemmatize(token) for token in tokens]
    for tokens in text_nsw_tokens
]

print(len(text_lemmas), len(text_nsw_tokens), len(text_tokens), len(text_clean), len(text_corpus))

964 964 964 964 964


In [164]:
# adding the token lists to the dataframe, one list per row
# TOKENS holds the raw whitespace tokens, PREP_TOKENS the lemmas left after
# stop-word removal
text_df["TOKENS"] = text_tokens
text_df["PREP_TOKENS"] = text_lemmas

In [165]:
# previewing the first rows of the dataframe with the new token columns
text_df.head()

Unnamed: 0,ID,CORE_TEXT,EXT_TEXT,he wrote,Van Gogh wrote,complementary colours,The Potato Eaters,this torso of Venus,drew,standing torso of Venus,...,drawing,1884,heads,Antwerp,1886,nude,1881,Brussels,TOKENS,PREP_TOKENS
0,s0005V1962,The Potato Eaters Vincent van Gogh (1853 - 189...,localhost F0082 JH0764 s0005V1962,"Van Gogh saw the Potato Eaters as a showpiece,...",localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,,,"[the, potato, eaters, vincent, van, gogh, 1853...","[potato, eater, vincent, van, gogh, 1853, 1890..."
1,s0019V1962,Garden with Courting Couples: Square Saint-Pie...,localhost F0314 JH1258 s0019V1962,Van Gogh called this sunny park scene the pain...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,,,"[garden, with, courting, couples, square, sain...","[garden, courting, couple, square, saint, pier..."
2,s0022V1962,Self-Portrait as a Painter Vincent van Gogh (1...,localhost F0522 JH1356 s0022V1962,Van Gogh presented himself in this self-portra...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,,,"[self, portrait, as, a, painter, vincent, van,...","[self, portrait, painter, vincent, van, gogh, ..."
3,s0027V1962,The Langlois Bridge Vincent van Gogh (1853 - 1...,localhost F0400 JH1371 s0027V1962,The sky was grey when Van Gogh painted this br...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,,,"[the, langlois, bridge, vincent, van, gogh, 18...","[langlois, bridge, vincent, van, gogh, 1853, 1..."
4,s0029V1962,"The Sower Vincent van Gogh (1853 - 1890), Arle...",localhost F0451 JH1629 s0029V1962,Van Gogh had a special interest in sowers thro...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,,,"[the, sower, vincent, van, gogh, 1853, 1890, a...","[sower, vincent, van, gogh, 1853, 1890, arles,..."


In [166]:
# building the gensim dictionary from the lemmatized texts
vvg_dict = gensim.corpora.Dictionary(text_lemmas)
print(vvg_dict)

# dictionary file name: the working file name with its extension replaced by dext
# splitext only strips the last extension, so a name with extra dots is not
# truncated at the first one
work_dict = os.path.splitext(work_fn)[0] + "." + dext
dict_pfn = os.path.join(dataf, stdf, work_dict)
print(dict_pfn)
vvg_dict.save(dict_pfn)
# pprint.pprint(vvg_dict.token2id)

Dictionary(1403 unique tokens: ['100', '114', '133', '1853', '1885']...)
Data\Std\VVG-Gallery-Text-Data-Large.dict


In [167]:
# text representation to numeric representation

# bow loses the order/semantic
# allow_update stays at its default (False): vvg_dict was already built from
# these same texts, so updating it here would count every text a second time
# in the dictionary statistics
text_bows = [vvg_dict.doc2bow(lemmas) for lemmas in text_lemmas]

# idx keeps the order/semantic
text_idxs = [vvg_dict.doc2idx(lemmas) for lemmas in text_lemmas]

print(len(text_bows), len(text_idxs), len(text_lemmas), len(text_nsw_tokens), len(text_tokens), len(text_clean), len(text_corpus))

964 964 964 964 964 964 964


In [168]:
# train the model
# the statistics come from vvg_dict alone: gensim ignores the corpus argument
# when a dictionary is supplied, and text_idxs (token id lists) was not a
# bag-of-words corpus in the first place, so it is no longer passed
tfidf = gensim.models.TfidfModel(dictionary=vvg_dict, normalize=True)

# applying the model to the bag-of-words of every text
corpus_tfidf = tfidf[text_bows]
print(len(corpus_tfidf), len(text_bows), len(text_idxs), len(text_lemmas), len(text_nsw_tokens), len(text_tokens), len(text_clean), len(text_corpus))

964 964 964 964 964 964 964 964


In [169]:
# printing the dataframe summary before adding the numeric columns
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 964 entries, 0 to 963
Data columns (total 52 columns):
 #   Column                                                                                              Non-Null Count  Dtype  
---  ------                                                                                              --------------  -----  
 0   ID                                                                                                  964 non-null    object 
 1   CORE_TEXT                                                                                           964 non-null    object 
 2   EXT_TEXT                                                                                            964 non-null    object 
 3   he wrote                                                                                            257 non-null    object 
 4   Van Gogh wrote                                                                                      964 non-null    

In [170]:
# adding the numeric representations to the dataframe, one entry per row:
# bag-of-words, ordered token ids and tfidf weights
text_df["BOWS_TOKENS"] = text_bows
text_df["IDX_TOKENS"] = text_idxs
text_df["TFIDF_TOKENS"] = corpus_tfidf

In [171]:
# checking everything is okay: previewing the first rows with the new columns
text_df.head()

Unnamed: 0,ID,CORE_TEXT,EXT_TEXT,he wrote,Van Gogh wrote,complementary colours,The Potato Eaters,this torso of Venus,drew,standing torso of Venus,...,Antwerp,1886,nude,1881,Brussels,TOKENS,PREP_TOKENS,BOWS_TOKENS,IDX_TOKENS,TFIDF_TOKENS
0,s0005V1962,The Potato Eaters Vincent van Gogh (1853 - 189...,localhost F0082 JH0764 s0005V1962,"Van Gogh saw the Potato Eaters as a showpiece,...",localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,,,"[the, potato, eaters, vincent, van, gogh, 1853...","[potato, eater, vincent, van, gogh, 1853, 1890...","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[18, 11, 21, 20, 13, 3, 5, 16, 8, 14, 4, 10, 1...","[(0, 0.40324268615259634), (1, 0.4484853309269..."
1,s0019V1962,Garden with Courting Couples: Square Saint-Pie...,localhost F0314 JH1258 s0019V1962,Van Gogh called this sunny park scene the pain...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,,,"[garden, with, courting, couples, square, sain...","[garden, courting, couple, square, saint, pier...","[(3, 1), (5, 1), (7, 1), (9, 4), (10, 1), (12,...","[30, 29, 28, 34, 33, 32, 21, 20, 13, 3, 5, 31,...","[(3, 0.0016704315719060568), (5, 0.00167043157..."
2,s0022V1962,Self-Portrait as a Painter Vincent van Gogh (1...,localhost F0522 JH1356 s0022V1962,Van Gogh presented himself in this self-portra...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,,,"[self, portrait, as, a, painter, vincent, van,...","[self, portrait, painter, vincent, van, gogh, ...","[(3, 1), (5, 1), (7, 1), (9, 4), (10, 1), (12,...","[44, 43, 42, 21, 20, 13, 3, 5, 31, 40, 25, 41,...","[(3, 0.0021413680169537126), (5, 0.00214136801..."
3,s0027V1962,The Langlois Bridge Vincent van Gogh (1853 - 1...,localhost F0400 JH1371 s0027V1962,The sky was grey when Van Gogh painted this br...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,,,"[the, langlois, bridge, vincent, van, gogh, 18...","[langlois, bridge, vincent, van, gogh, 1853, 1...","[(3, 1), (5, 1), (7, 1), (9, 4), (10, 1), (12,...","[51, 50, 21, 20, 13, 3, 5, 49, 52, 35, 10, 17,...","[(3, 0.0019726989955169234), (5, 0.00197269899..."
4,s0029V1962,"The Sower Vincent van Gogh (1853 - 1890), Arle...",localhost F0451 JH1629 s0029V1962,Van Gogh had a special interest in sowers thro...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,,,"[the, sower, vincent, van, gogh, 1853, 1890, a...","[sower, vincent, van, gogh, 1853, 1890, arles,...","[(3, 1), (5, 1), (7, 1), (9, 4), (10, 1), (12,...","[58, 21, 20, 13, 3, 5, 49, 57, 35, 10, 17, 19,...","[(3, 0.002496978648614098), (5, 0.002496978648..."


In [172]:
# creating the dense vector standard representation of the text
text_dvector = list()

# iterating in each text with the tfidf word bag
for t_idtokens, tfidf_tokens in zip(text_idxs, corpus_tfidf):
    # token id -> tfidf weight lookup for this text; it is built once per text
    # (it used to be rebuilt for every token of the text)
    tokens_dict = dict(tfidf_tokens)

    # dense representation: one weight per token occurrence, in text order;
    # token ids with no entry in the tfidf bag are skipped
    tdvect = [
        tokens_dict[t_token]
        for t_token in t_idtokens
        if t_token in tokens_dict
    ]
    text_dvector.append(tdvect)

# checking the size of all columns
print(len(text_dvector), len(corpus_tfidf), len(text_bows), len(text_idxs), len(text_lemmas), len(text_nsw_tokens), len(text_tokens), len(text_clean), len(text_corpus))

# adding the dense representation into the dataframe
text_df["STD_DVEC_TOKENS"] = text_dvector

964 964 964 964 964 964 964 964 964


In [173]:
# checking everything is okay: previewing the first rows with the dense vector column
text_df.head()

Unnamed: 0,ID,CORE_TEXT,EXT_TEXT,he wrote,Van Gogh wrote,complementary colours,The Potato Eaters,this torso of Venus,drew,standing torso of Venus,...,1886,nude,1881,Brussels,TOKENS,PREP_TOKENS,BOWS_TOKENS,IDX_TOKENS,TFIDF_TOKENS,STD_DVEC_TOKENS
0,s0005V1962,The Potato Eaters Vincent van Gogh (1853 - 189...,localhost F0082 JH0764 s0005V1962,"Van Gogh saw the Potato Eaters as a showpiece,...",localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,,,"[the, potato, eaters, vincent, van, gogh, 1853...","[potato, eater, vincent, van, gogh, 1853, 1890...","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[18, 11, 21, 20, 13, 3, 5, 16, 8, 14, 4, 10, 1...","[(0, 0.40324268615259634), (1, 0.4484853309269...","[0.2562978554088656, 0.35800004137819563, 0.00..."
1,s0019V1962,Garden with Courting Couples: Square Saint-Pie...,localhost F0314 JH1258 s0019V1962,Van Gogh called this sunny park scene the pain...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,,,"[garden, with, courting, couples, square, sain...","[garden, courting, couple, square, saint, pier...","[(3, 1), (5, 1), (7, 1), (9, 4), (10, 1), (12,...","[30, 29, 28, 34, 33, 32, 21, 20, 13, 3, 5, 31,...","[(3, 0.0016704315719060568), (5, 0.00167043157...","[0.17370519076038676, 0.3630472868005495, 0.25..."
2,s0022V1962,Self-Portrait as a Painter Vincent van Gogh (1...,localhost F0522 JH1356 s0022V1962,Van Gogh presented himself in this self-portra...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,,,"[self, portrait, as, a, painter, vincent, van,...","[self, portrait, painter, vincent, van, gogh, ...","[(3, 1), (5, 1), (7, 1), (9, 4), (10, 1), (12,...","[44, 43, 42, 21, 20, 13, 3, 5, 31, 40, 25, 41,...","[(3, 0.0021413680169537126), (5, 0.00214136801...","[0.26248956382121424, 0.2017948634248733, 0.41..."
3,s0027V1962,The Langlois Bridge Vincent van Gogh (1853 - 1...,localhost F0400 JH1371 s0027V1962,The sky was grey when Van Gogh painted this br...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,,,"[the, langlois, bridge, vincent, van, gogh, 18...","[langlois, bridge, vincent, van, gogh, 1853, 1...","[(3, 1), (5, 1), (7, 1), (9, 4), (10, 1), (12,...","[51, 50, 21, 20, 13, 3, 5, 49, 52, 35, 10, 17,...","[(3, 0.0019726989955169234), (5, 0.00197269899...","[0.4287413085585919, 0.3073207033196817, 0.003..."
4,s0029V1962,"The Sower Vincent van Gogh (1853 - 1890), Arle...",localhost F0451 JH1629 s0029V1962,Van Gogh had a special interest in sowers thro...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,,,"[the, sower, vincent, van, gogh, 1853, 1890, a...","[sower, vincent, van, gogh, 1853, 1890, arles,...","[(3, 1), (5, 1), (7, 1), (9, 4), (10, 1), (12,...","[58, 21, 20, 13, 3, 5, 49, 57, 35, 10, 17, 19,...","[(3, 0.002496978648614098), (5, 0.002496978648...","[0.36082600862610664, 0.004993957297228196, 0...."


In [174]:
# saving the pandas dataframe into a CSV file
# the target is <current working directory>/<dataf>/<stdf>/std-<work_fn>
target_fn = f"std-{work_fn}"
fn_tpath = os.path.join(os.getcwd(), dataf, stdf, target_fn)
print(fn_tpath)

# comma separated, UTF-8 encoded, without the index column, opened in write mode
text_df.to_csv(fn_tpath, sep=",", index=False, encoding="utf-8", mode="w")

c:\Users\Felipe\Documents\GitHub\sa-artea\VVG-Gallery-StdDataProcessor\Notebooks\Data\Std\std-VVG-Gallery-Text-Data-Large.csv


In [175]:
# I don't remember what I did this for
# sim_index = gensim.similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=len(vvg_dict))