In [100]:
import ast
import sddk
import pandas as pd
pd.options.display.max_columns = 1000 # to see all columns
import json
import re
import geopandas as gpd
from shapely.geometry import Point
from functools import partial
from shapely.ops import transform
import pyproj
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

import nltk
from nltk.util import bigrams

In [2]:
EDHCSg = sddk.read_file("EDHCSg.geojson", "gdf", "a9237c5ea642d4714bcdefb03f70a1f4")

reading file located in a public folder


In [59]:
# to train the model, we will work with the subset of rows that have both an EDH-ID and an EDCS-ID
# (i.e. both identifier columns are non-null); we will call it EDHg
EDHg = EDHCSg[(EDHCSg["EDH-ID"].notnull()) & (EDHCSg["EDCS-ID"].notnull())] 

In [214]:
# what are the most common types of inscriptions
EDHg.groupby("type_of_inscription_clean").size().sort_values(ascending=False)

type_of_inscription_clean
epitaph                            21520
votive inscription                 11728
NULL                                3745
owner/artist inscription            3340
honorific inscription               3003
building/dedicatory inscription     2561
mile-/leaguestone                   1307
identification inscription           850
acclamation                          287
defixio                              269
list                                 246
military diploma                     209
label                                194
boundary inscription                 175
elogium                              132
letter                               119
public legal inscription             109
seat inscription                      42
private legal inscription             36
prayer                                18
assignation inscription               15
calendar                              10
adnuntiatio                            1
dtype: int64

Now we can focus on some EDCS attributes (i.e. the first 28 columns) which might be good predictors of `type_of_inscription_clean` in EDH. First, look at `status_list`:

# Based on `status_list`



In [217]:
EDHg["status_list"].tolist()[:10]

["['Augusti/Augustae', 'litterae erasae', 'ordo equester', 'tituli honorarii', 'tria nomina']",
 "['Augusti/Augustae', 'miliaria', 'viri']",
 "['Augusti/Augustae', 'leges', 'viri']",
 "['litterae erasae', 'tituli sacri']",
 {},
 {},
 {},
 {},
 "['Augusti/Augustae', 'tituli fabricationis', 'viri']",
 {}]

In [219]:
# among other information, status list also contains an information about inscription type:
EDHg["inscr_type"].tolist()[:10]

['tituli honorarii',
 'miliaria',
 'leges',
 'tituli sacri',
 {},
 {},
 {},
 {},
 'tituli fabricationis',
 {}]

In [121]:
# For some applications it is useful to turn every status list into one string in which each
# multi-word status is a single token (spaces inside a status are replaced by underscores).
status_lists = []
for el in EDHg["status_list"].tolist():
    if isinstance(el, str):
        # Lists are stored as their string repr, e.g. "['viri', 'tria nomina']".
        # ast.literal_eval only parses Python literals, so unlike eval() it can neither run
        # arbitrary code nor resolve a status word to a Python name by accident.
        try:
            el = ast.literal_eval(el)
        except (ValueError, SyntaxError):
            pass  # not a Python literal: keep the plain string as a single status
    if isinstance(el, list):
        statuses = el
    elif isinstance(el, str):
        statuses = [el]
    else:
        # anything else (e.g. the {} placeholders seen in the data) carries no status
        statuses = []
    status_lists.append(" ".join(status.replace(" ", "_") for status in statuses))

In [102]:
status_lists[:10]

['Augusti/Augustae litterae_erasae ordo_equester tituli_honorarii tria_nomina',
 'Augusti/Augustae miliaria viri',
 'Augusti/Augustae leges viri',
 'litterae_erasae tituli_sacri',
 '',
 '',
 '',
 '',
 'Augusti/Augustae tituli_fabricationis viri',
 '']

# Build the model

In [122]:
y = EDHg["type_of_inscription_clean"].tolist()
y[:10]

['honorific inscription',
 'mile-/leaguestone',
 'public legal inscription',
 'votive inscription',
 'owner/artist inscription',
 'public legal inscription',
 'honorific inscription',
 'building/dedicatory inscription',
 'building/dedicatory inscription',
 'NULL']

In [123]:
# Sanity check of the token pattern: because \w also matches "_", an underscore-joined phrase is
# matched as one token. The pattern is a raw string so that \w, \/ and \_ reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences.
re.match(r"\w+\/?|\_\w+", "Augusti_Augustae")

<_sre.SRE_Match object; span=(0, 16), match='Augusti_Augustae'>

In [124]:
# prepare the vector model
# sklearn does not like some special characters, therefore it is important to define what is accepted as a token_pattern:
# a token is a run of word characters optionally followed by "/", or an underscore followed by word characters.
# Since \w also matches "_", underscore-joined statuses such as "tria_nomina" stay single tokens.
# NOTE(review): a status containing "/" (e.g. "Augusti/Augustae") is presumably split into two tokens by this pattern - confirm this is intended.
vectorizer = CountVectorizer(token_pattern=r"\w+\/?|\_\w+")
X = vectorizer.fit_transform(status_lists)

In [204]:
# define training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [127]:
def train_svm(X, y, C=1000000.0, gamma='auto', kernel='rbf'):
    """
    Create and train a Support Vector Machine classifier.

    Parameters
    ----------
    X : feature matrix handed to ``SVC.fit``.
    y : target labels handed to ``SVC.fit``.
    C, gamma, kernel : forwarded unchanged to ``SVC``. The defaults reproduce the
        previously hard-coded configuration, so ``train_svm(X, y)`` behaves as before.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    classifier = SVC(C=C, gamma=gamma, kernel=kernel)
    classifier.fit(X, y)
    return classifier

In [128]:
%%time
svm = train_svm(X_train, y_train)

In [129]:
pred = svm.predict(X_test)
print(svm.score(X_test, y_test))
# confusion_matrix takes (y_true, y_pred): rows are the true classes, columns the predicted ones.
# Passing the predictions first would silently transpose the matrix.
print(confusion_matrix(y_test, pred))

In [132]:
len(pred)

4992

In [136]:
# number of test rows, read from the sparse matrix shape (no need to build a dense copy)
X_test.shape[0]

4992

In [140]:
# only where the predictor is present
# X_test holds token counts, which are non-negative, so "any non-zero entry in the row" is the
# same as "row sum > 0". Summing the sparse matrix avoids densifying it and looping in Python.
predictors_present = np.asarray(X_test.sum(axis=1)).ravel() > 0
predictors_present[:10]

[True, True, True, True, False, True, True, True, True, True]

In [148]:
# filter for the ones with predictors
len(np.array(y_test)[predictors_present])

4135

In [146]:
# check the performance on datapoints for which we have something in the "status_list"
# Select the rows on the sparse matrix itself: .todense() returns an np.matrix, which newer
# scikit-learn releases refuse as input, and the dense copy is not needed anyway.
rows_with_predictors = np.flatnonzero(np.asarray(predictors_present, dtype=bool))
svm.score(X_test[rows_with_predictors], np.asarray(y_test)[rows_with_predictors])

0.8773881499395405

# Combine `status_list` & `clean_text_interpretive_word`

The next step is to build a model combining the information from the `status_list` with information from the text of the inscription (i.e. `clean_text_interpretive_word`).

In [150]:
inscr_texts = EDHg["clean_text_interpretive_word"].tolist()

The language of inscriptions is characterized by a number of standardized phrases, which often determine the type of inscription. Treated separately, "Dis" and "Manibus" might be confusing, but treated as a bigram, they carry crucial information about the type of inscription.



In [220]:
def get_bigrams_underscore(inscr_text):
    """
    Return the word bigrams of an inscription text, each joined by an underscore.

    The text is split on whitespace and every pair of neighbouring words becomes one
    token, e.g. "Dis Manibus sacrum" -> ["Dis_Manibus", "Manibus_sacrum"].

    Parameters
    ----------
    inscr_text : the inscription text; anything that is not a string (e.g. None or NaN
        for a missing text) is treated as having no words.

    Returns
    -------
    list of str : the underscore-joined bigrams; empty for missing, empty or one-word texts.
    """
    # explicit guard instead of a bare except, so that real errors are not silently hidden
    if not isinstance(inscr_text, str):
        return []
    words = inscr_text.split()
    # pairing the word list with itself shifted by one yields consecutive word pairs
    return [first + "_" + second for first, second in zip(words, words[1:])]

In [221]:
EDHg_bigrams_ = [get_bigrams_underscore(inscr_text) for inscr_text in inscr_texts]
EDHg_bigrams_[:5]

[['Fortissimo_et',
  'et_piissimo',
  'piissimo_Caesari',
  'Caesari_domino',
  'domino_nostro',
  'nostro_Galerio',
  'Galerio_Valerio',
  'Valerio_Maximiano',
  'Maximiano_Pio',
  'Pio_Felici',
  'Felici_Invicto',
  'Invicto_Coranius',
  'Coranius_Titianus',
  'Titianus_vir',
  'vir_perfectissimus',
  'perfectissimus_praeses',
  'praeses_provinciae',
  'provinciae_veteris',
  'veteris_Epiri',
  'Epiri_numini',
  'numini_eorum',
  'eorum_dicatissimus'],
 ['Imperatori_Caesari',
  'Caesari_Marco',
  'Marco_Annio',
  'Annio_Floriano',
  'Floriano_Pio',
  'Pio_Felici',
  'Felici_Augusto',
  'Augusto_patri',
  'patri_patriae',
  'patriae_milia',
  'milia_passuum',
  'passuum_III',
  'III_Imperatori',
  'Imperatori_Caesari',
  'Caesari_Marco',
  'Marco_Aurelio',
  'Aurelio_Probo',
  'Probo_Pio',
  'Pio_Felici',
  'Felici_Augusto',
  'Augusto_milia',
  'milia_passuum',
  'passuum_II'],
 ['Tiberius_Claudius',
  'Claudius_Caesar',
  'Caesar_Augustus',
  'Augustus_Germanicus',
  'Germanicus_pon

In [222]:
bigrams_list = [el for sublist in EDHg_bigrams_ for el in sublist]
bigrams_list[:10]

['Fortissimo_et',
 'et_piissimo',
 'piissimo_Caesari',
 'Caesari_domino',
 'domino_nostro',
 'nostro_Galerio',
 'Galerio_Valerio',
 'Valerio_Maximiano',
 'Maximiano_Pio',
 'Pio_Felici']

In [223]:
# how many unique bigrams do we have?
len(list(set(bigrams_list)))

258720

In [231]:
# ok, let's keep only the N most common bigrams (N = 100 here)
N = 100
# FreqDist(...).most_common(N) yields (bigram, count) tuples; keep only the bigram strings
bigrams_N = nltk.FreqDist(bigrams_list).most_common(N)
bigrams_N = [tup[0] for tup in bigrams_N]
bigrams_N[:10]

['Dis_Manibus',
 'vixit_annos',
 'votum_solvit',
 'solvit_libens',
 'libens_merito',
 'Iovi_Optimo',
 'Optimo_Maximo',
 'hic_situs',
 'situs_est',
 'tribunicia_potestate']

In [232]:
# Build, per inscription, (a) the list of underscore-joined statuses and (b) one document string
# holding those statuses followed by the text bigrams.
status_bigrams_list = []
status_list = []
for el_status, el_bigrams in zip(EDHg["status_list"].tolist(), EDHg_bigrams_):
    if isinstance(el_status, str):
        # Lists are stored as their string repr; ast.literal_eval parses literals only,
        # so unlike eval() it cannot execute arbitrary code.
        try:
            el_status = ast.literal_eval(el_status)
        except (ValueError, SyntaxError):
            pass  # not a Python literal: keep the plain string as a single status
    if isinstance(el_status, list):
        statuses = el_status
    elif isinstance(el_status, str):
        # must test the current row's value (el_status), not a variable left over from another cell
        statuses = [el_status]
    else:
        statuses = []
    statuses = [status.replace(" ", "_") for status in statuses]
    status_list.append(statuses)
    status_bigrams_list.append(" ".join(statuses + el_bigrams))

In [233]:
# number of distinct statuses, computed directly from status_list so that this cell does not
# depend on status_list_flat, which is only created in the following cell
len({status for statuses in status_list for status in statuses})

37

In [234]:
# flatten the per-inscription status lists into one list and rank the statuses by frequency
status_list_flat = [el for sublist in status_list for el in sublist]
status_N = [tup[0] for tup in nltk.FreqDist(status_list_flat).most_common(100)] # keep at most the 100 most frequent; NOTE(review): an earlier note said "228 in total", but the recorded outputs show 37 distinct statuses - confirm
status_N[:10]

['viri',
 'tria_nomina',
 'tituli_sepulcrales',
 'mulieres',
 'tituli_sacri',
 'milites',
 'nomen_singulare',
 'Augusti/Augustae',
 'tituli_operum',
 'tituli_fabricationis']

In [235]:
len(status_N)

37

In [236]:
vocab = status_N + bigrams_N

In [237]:
len(vocab)

137

In [238]:
# The fixed vocabulary is case-sensitive (e.g. 'Dis_Manibus', 'Augusti/Augustae'), so lowercasing
# has to be switched off: with the default lowercase=True the documents are lowercased before
# tokenisation and no vocabulary entry containing an upper-case letter can ever match.
# The documents are whitespace-joined tokens, so r"\S+" recovers exactly those tokens; a pattern
# like \w+\/? would cut 'Augusti/Augustae' into 'Augusti/' and 'Augustae', neither of which is
# in the vocabulary.
vectorizer = CountVectorizer(token_pattern=r"\S+", lowercase=False, vocabulary=vocab)
X = vectorizer.fit_transform(status_bigrams_list)

In [239]:
X.todense().shape

(49916, 137)

In [240]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [241]:
# NOTE(review): this repeats the train_svm definition from the earlier model-building cell;
# consider deleting this cell and reusing that definition.
def train_svm(X, y, C=1000000.0, gamma='auto', kernel='rbf'):
    """
    Create and train a Support Vector Machine classifier.

    Parameters
    ----------
    X : feature matrix handed to ``SVC.fit``.
    y : target labels handed to ``SVC.fit``.
    C, gamma, kernel : forwarded unchanged to ``SVC``. The defaults reproduce the
        previously hard-coded configuration, so ``train_svm(X, y)`` behaves as before.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    classifier = SVC(C=C, gamma=gamma, kernel=kernel)
    classifier.fit(X, y)
    return classifier

In [None]:
%%time
svm = train_svm(X_train, y_train)

In [None]:
pred = svm.predict(X_test)
print(svm.score(X_test, y_test))