# VINCENT VAN GOGH STANDARD DATA TEXT PROCESS

This script takes the gallery's text from the local data folder and processes it into a standard representation with Natural Language Processing and Word2Vec methods.

The process goes as follows:

1. Load the CSV into a pandas DataFrame.
2. Transform text columns into word lists.

    2.1 Clean the words in each text list.
    
3. Remove unnecessary words from the word lists with the stop word dictionary.
4. Lemmatize the remaining words.
5. Transform URL columns into standard categories.

https://statsmaths.github.io/stat289-f18/solutions/tutorial19-gensim.html


1.	Loading each TXT page as a record in the input representation of the model.
2.	Removing unnecessary words with the Spanish stop dictionary.
3.	Recognizing a set of unique words in documents.
4.	Transforming the unique words into columns of the input model.
5.	Vectorizing each word of the document by frequency of appearance (word2vec).

**NOTE:** Because GitHub has limited storage capabilities and the digital archive data is private, the data in the folder _\\Data\\_ is just a sample for the code to work without errors.

In [1]:
"""
* Copyright 2020, Maestria de Humanidades Digitales,
* Universidad de Los Andes
*
* Developed for the Msc graduation project in Digital Humanities
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# ===============================
# native python libraries
# ===============================
import os
import copy
import sys
import csv
import re
import pprint

# ===============================
# extension python libraries
# ===============================
import pandas as pd
import numpy as np
import gensim
from gensim import models
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# downloading the NLTK data packages: 'stopwords' feeds the stop-word
# filter and 'wordnet' feeds the WordNetLemmatizer used further below;
# 'punkt' is fetched as well
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# ===============================
# developed python libraries
# ===============================


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# notebook variable definitions
# root data folder
dataf = "Data"

# subfolder the working CSV file is read from
prepf = "Prep"

# subfolder with the CSV files containing the ML pandas dataframe
stdf = "Std"

# dataframe file extension
fext = "csv"

# dataframe file names
small_fn = "VVG-Gallery-Text-Data-Small" + "." + fext
large_fn = "VVG-Gallery-Text-Data-Large" + "." + fext

# NOTE: every pattern below is a raw string, so regex escapes such as \w
# and \b reach the `re` module untouched (in a plain string "\b" is a
# backspace character). The patterns are meant to be used with re.match,
# which anchors them at the start of the column name.

# regex for the *_TEXT columns
text_re = r"\w+_TEXT"

# regex for the ID column
id_re = r"ID{1}"

# regex for the other columns (URLs|Categories): names made of words
# separated by single spaces that are neither the ID nor a *_TEXT column.
# Names holding other punctuation (e.g. "Unnamed: 0") are not matched.
cat_re = r"(?!ID|\w+_TEXT)\w+( \w+)*$"

# default values
work_fn = small_fn

In [3]:
# file names of the stop-word lists
basicStopWords = "mlt-uniandes-spanish-stop-words.txt"
compositeStopWords = "composite-nltk-spanish-stop-words.txt"

# names of the negative and positive data subfolders
negativeFolder = "00-Others"
positiveFolder = "01-Mechas"

# default dataframe schema: one entry per column, in column order
dfSchema = [
    # --- identification columns ---
    "ID",               # unique key for the text file
    "FILE_PATH",        # text file local path
    "DOC_NAME",         # name of the text original document
    # --- raw content and learning target ---
    "TEXT",             # OCR extracted text
    "AUTHOR",           # author of the document
    "LABEL",            # learning target label, associated with the AUTHOR
    # --- derived text features ---
    "CLEAN_TEXT",       # cleaned text extracted from the document
    "SENTENCES",        # text divided by sentences
    "NUM_SENTENCES",    # number of sentences in the text
    "WORDS",            # text divided by words
    "NUM_WORDS",        # number of words in the text
    "TOKENS",           # unique tokens extracted from the text
    "NUM_TOKENS",       # number of unique tokens in the text
]

# the first three schema columns (ID, FILE_PATH, DOC_NAME)
initColumns = dfSchema[:3]


In [4]:
# load the working CSV file into a pandas DataFrame

# path layout: <current working dir>/<dataf>/<prepf>/<work_fn>
fn_path = os.path.join(os.getcwd(), dataf, prepf, work_fn)
print(fn_path)

# parser settings handed to pandas.read_csv
csv_options = {
    "sep": ",",
    "encoding": "utf-8",
    "engine": "python",
}
text_df = pd.read_csv(fn_path, **csv_options)

c:\Users\Felipe\Documents\GitHub\sa-artea\VVG-Gallery-StdDataProcessor\Notebooks\Data\Prep\VVG-Gallery-Text-Data-Small.csv


In [5]:
# names of every column in the dataframe
df_cols = list(text_df.columns)

# each group keeps the column names accepted by its compiled pattern;
# match() only anchors the pattern at the start of the name

# text columns
text_r = re.compile(text_re)
text_cols = [col for col in df_cols if text_r.match(col)]

# ID column
id_r = re.compile(id_re)
id_cols = [col for col in df_cols if id_r.match(col)]

# URLs/Category columns
cat_r = re.compile(cat_re)
cat_cols = [col for col in df_cols if cat_r.match(col)]

In [6]:
# the working corpus is the content of the first text column
core_column = text_cols[0]
text_corpus = list(text_df[core_column])
print(len(text_corpus))

59


In [7]:
# working copy of the corpus, with every document lower-cased
text_clean = [document.lower() for document in text_corpus]

print(len(text_clean), len(text_corpus))

59 59


In [8]:
# cleaning and preprocessing the text before tokenising it

# the patterns are loop invariants, so they are compiled once up front
non_word_r = re.compile(r"\W")
split_number_r = re.compile(r"(\d{1,3}) (\d{1,2})")
spaces_r = re.compile(r"\s+")

# enumerate() supplies the index; no manual counter is needed
for i, text in enumerate(text_clean):
    # removing special characters (every non-word character becomes a space)
    text = non_word_r.sub(" ", text)
    # restoring the points lost between numbers: digits separated by a
    # single space are joined with a "." (e.g. "43 5" -> "43.5")
    # NOTE(review): this also joins other digit pairs that were separated by
    # one character, e.g. "1884-1885" ends up as "1884.1885" - confirm this
    # is acceptable for year ranges
    text = split_number_r.sub(r"\1.\2", text)
    # removing excessive spaces
    text = spaces_r.sub(" ", text)
    text_clean[i] = text

print(len(text_clean), len(text_corpus))

59 59


In [9]:
# tokenising: each cleaned document is split on whitespace into a word list
text_tokens = [document.split() for document in text_clean]

print(len(text_tokens), len(text_clean), len(text_corpus))

59 59 59


In [10]:
# removing stopwords

# the English stop-word list is loaded a single time and kept as a set:
# calling stopwords.words() for every token re-reads the corpus each time
# and a list makes every membership test a linear scan
english_stopwords = set(stopwords.words('english'))

# one filtered token list per document, keeping the original token order
text_nsw_tokens = [
    [token for token in tokens if token not in english_stopwords]
    for tokens in text_tokens
]

print(len(text_nsw_tokens), len(text_tokens), len(text_clean), len(text_corpus))

59 59 59 59


In [11]:
# lemmatization of the text: every remaining token of every document is
# passed through the WordNet lemmatizer, document order is preserved
token_lematizer = WordNetLemmatizer()

text_lemmas = [
    [token_lematizer.lemmatize(token) for token in tokens]
    for tokens in text_nsw_tokens
]

print(len(text_lemmas), len(text_nsw_tokens), len(text_tokens), len(text_clean), len(text_corpus))

59 59 59 59 59


In [12]:
# store the raw tokens and the preprocessed (stop-word free, lemmatized)
# tokens as two new dataframe columns
for column, values in (("TOKENS", text_tokens), ("PREP_TOKENS", text_lemmas)):
    text_df[column] = values

In [13]:
# preview the first five rows, including the two new token columns
text_df.head(n=5)

Unnamed: 0,ID,CORE_TEXT,EXT_TEXT,complementary colours,this torso of Venus,drew,Van Gogh wrote,standing torso of Venus,he wrote,The Potato Eaters,...,1884,1887,animal art,drawing,1890,cityscape,1881,Brussels,TOKENS,PREP_TOKENS
0,s0004V1962r,Head of a Woman Vincent van Gogh (1853 - 1890)...,F0388r JH0782 s0004V1962r 43.5 cm x 36.2 cm,localhost,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,localhost,localhost,"[head, of, a, woman, vincent, van, gogh, 1853,...","[head, woman, vincent, van, gogh, 1853, 1890, ..."
1,s0006V1962,Head of a Woman Vincent van Gogh (1853 - 1890)...,"F0160 JH0722 s0006V1962 43.2 cm x 30.0 cm, 2.2...",https://www.vangoghmuseum.nl/en/stories/lookin...,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,localhost,localhost,"[head, of, a, woman, vincent, van, gogh, 1853,...","[head, woman, vincent, van, gogh, 1853, 1890, ..."
2,s0010V1962,Portrait of an Old Woman Vincent van Gogh (185...,"F0174 JH0978 s0010V1962 50.5 cm x 39.8 cm, 68....",localhost,localhost,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,localhost,localhost,"[portrait, of, an, old, woman, vincent, van, g...","[portrait, old, woman, vincent, van, gogh, 185..."
3,s0056V1962,"Torso of Venus Vincent van Gogh (1853 - 1890),...","F0216a JH1054 s0056V1962 46.0 cm x 38.0 cm, 55...",localhost,https://www.vangoghmuseum.nl/en/collection/s01...,localhost,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,localhost,localhost,"[torso, of, venus, vincent, van, gogh, 1853, 1...","[torso, venus, vincent, van, gogh, 1853, 1890,..."
4,s0058V1962,Woman with a Mourning Shawl Vincent van Gogh (...,"F0161 JH0788 s0058V1962 45.5 cm x 33.0 cm, 60 ...",localhost,localhost,https://www.vangoghmuseum.nl/en/collection/d00...,localhost,localhost,localhost,localhost,...,localhost,localhost,localhost,localhost,localhost,localhost,localhost,localhost,"[woman, with, a, mourning, shawl, vincent, van...","[woman, mourning, shawl, vincent, van, gogh, 1..."


In [19]:
# build the gensim dictionary (token <-> integer id) from the lemmatized texts
vvg_dict = gensim.corpora.Dictionary(text_lemmas)
print(vvg_dict)

# persist the dictionary inside the data folder
dict_path = os.path.join("Data", "VVG-gallery-text.dict")
vvg_dict.save(dict_path)

# show the complete token -> id mapping
pprint.pprint(vvg_dict.token2id)

Dictionary(660 unique tokens: ['1', '11', '16', '1853', '1885']...)
{'1': 0,
 '10': 75,
 '11': 1,
 '12': 622,
 '13.0': 599,
 '14': 520,
 '16': 2,
 '17': 613,
 '1853': 3,
 '1880': 623,
 '1881': 624,
 '1884': 273,
 '1884.1885': 426,
 '1885': 4,
 '1886': 160,
 '1887': 327,
 '1888': 521,
 '1889': 522,
 '1890': 5,
 '1891': 6,
 '19.8': 580,
 '1902': 523,
 '1902.03': 524,
 '1903.04': 525,
 '1914': 625,
 '1920': 626,
 '1925': 7,
 '1930': 8,
 '1931': 572,
 '1952': 9,
 '1956': 526,
 '1960': 10,
 '1962': 11,
 '1964': 527,
 '1965': 528,
 '1970': 195,
 '1972': 614,
 '1973': 12,
 '1981': 627,
 '1989': 628,
 '1990': 629,
 '1994': 13,
 '1st': 14,
 '2': 15,
 '2.2': 76,
 '20.8': 585,
 '2004': 630,
 '2005': 631,
 '21': 16,
 '22': 573,
 '22.1': 615,
 '24.0': 328,
 '24.4': 218,
 '24.6': 597,
 '24.9': 576,
 '25': 17,
 '254': 632,
 '26.6': 616,
 '26.8': 398,
 '26.9': 611,
 '27.0': 421,
 '27.1': 396,
 '27.2': 496,
 '27.3': 609,
 '28': 18,
 '29.1': 600,
 '29.5': 300,
 '2nd': 19,
 '30.0': 77,
 '30.5': 568,
 '30

In [82]:
# text representation to numeric representation

# vvg_dict was built from these same documents, so every token already has
# an id. allow_update is deliberately left off: turning it on would count
# each document into the dictionary statistics again on every run.

# bag-of-words: (token id, count) pairs, the word order is lost
text_bows = [vvg_dict.doc2bow(lemmas) for lemmas in text_lemmas]

# index sequence: one token id per word, the word order is kept
text_idxs = [vvg_dict.doc2idx(lemmas) for lemmas in text_lemmas]

print(len(text_bows), len(text_idxs), len(text_lemmas), len(text_nsw_tokens), len(text_tokens), len(text_clean), len(text_corpus))

59 59 59 59 59 59 59


In [93]:
# train the model
# the document frequencies come from the dictionary, so no corpus is passed:
# when both are given gensim ignores the corpus and logs a warning, and
# text_idxs (plain id sequences) is not a bag-of-words corpus anyway
tfidf = gensim.models.TfidfModel(dictionary=vvg_dict, normalize=True)

# lazily apply the TF-IDF weighting to the bag-of-words documents
corpus_tfidf = tfidf[text_bows]

# weights of the first document
corpus_tfidf[0]

[(0, 0.024662054234269034),
 (1, 3.5607149544744794),
 (2, 3.8826430493618416),
 (4, 3.425436095839058),
 (6, 0.04975303519709973),
 (7, 0.04975303519709973),
 (8, 3.8826430493618416),
 (9, 3.5607149544744794),
 (10, 3.0752881273042374),
 (11, 0.04932410846853807),
 (12, 0.04932410846853807),
 (13, 0.024662054234269034),
 (14, 7.121429908948959),
 (15, 0.0739861627028071),
 (16, 0.09950607039419947),
 (17, 0.04975303519709973),
 (18, 2.712718047919529),
 (19, 3.5607149544744794),
 (20, 5.882643049361842),
 (21, 5.882643049361842),
 (22, 0.04975303519709973),
 (23, 0.04975303519709973),
 (25, 0.09950607039419947),
 (26, 0.04975303519709973),
 (27, 0.04975303519709973),
 (28, 1.634715535918256),
 (30, 0.024662054234269034),
 (31, 0.04975303519709973),
 (33, 3.7652860987236823),
 (34, 6.59536109728137),
 (35, 0.04975303519709973),
 (36, 0.1479723254056142),
 (37, 0.04932410846853807),
 (39, 2.0752881273042374),
 (42, 0.09950607039419947),
 (43, 0.0739861627028071),
 (44, 0.049324108468538

In [92]:
# similarity index over the TF-IDF corpus; the number of features is the
# size of the dictionary vocabulary
vocab_size = len(vvg_dict)
sim_index = gensim.similarities.SparseMatrixSimilarity(
    corpus_tfidf,
    num_features=vocab_size,
)

In [77]:
# exploratory: wrap the id sequences with the model through the private
# _apply helper and list the attribute names of the returned object
a = tfidf._apply(text_idxs)
vars(a).keys()

dict_keys(['obj', 'corpus', 'chunksize', 'metadata'])

In [78]:
# exploratory: list the attribute names stored on the trained TF-IDF model
vars(tfidf).keys()

dict_keys(['id2word', 'wlocal', 'wglobal', 'normalize', 'num_docs', 'num_nnz', 'idfs', 'smartirs', 'slope', 'pivot', 'eps', 'cfs', 'dfs', 'term_lens'])

In [79]:
# exploratory: show the global weighting function stored on the model
# (alternative kept for reference: the model's `idfs` attribute)
# tfidf.idfs
tfidf.wglobal

# leftover debug print; `sims` is not defined in the visible cells
# print(list(sims))

<function gensim.models.tfidfmodel.df2idf(docfreq, totaldocs, log_base=2.0, add=0.0)>