In [1]:
import sddk
import pandas as pd
import tabulate
pd.options.display.max_columns = 1000 # to see all columns
import json
import re
import geopandas as gpd
from shapely.geometry import Point
from functools import partial
from shapely.ops import transform
import pyproj
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

import pickle

import nltk
from nltk.util import bigrams

In [66]:
LIST = gpd.read_parquet("../data/large_data/LIST_contexts.parquet") # point out to your local version...

In [67]:
# remove? - NOTE(review): `EDH_overlap` does not appear to be used by any later cell shown here; confirm before deleting
# inscriptions which have both an EDH and an EDCS identifier
EDH_overlap_all = LIST[(LIST["EDH-ID"].notnull()) & (LIST["EDCS-ID"].notnull())]
# of those, keep only the rows whose cleaned EDH type does not contain the "NULL" placeholder string
EDH_overlap = EDH_overlap_all[~EDH_overlap_all["type_of_inscription_clean"].str.contains("NULL")]

Now we can focus on some EDCS attributes (i.e. the first 28 columns) which might be good predictors of `type_of_inscription_clean` in EDH. First, look at `status_list`:

# Applying the model

In [68]:
# load the model and vectorizer back
# NOTE: pickle.load executes code stored in the file - only load artefacts you produced yourself.
# `with` closes each file handle as soon as the object has been read.
with open('../data/large_data/et_model_v2.0.sav', 'rb') as model_file:
    classifier = pickle.load(model_file)
with open('../data/large_data/et_vectorizer_v2.0.sav', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [69]:
# we will apply the model to inscriptions which are only in EDCS
# (no EDH identifier, but an EDCS identifier).
# .copy() makes EDCS_unique an independent frame: later cells add columns to it,
# and this notebook silences all warnings on import, so a chained-assignment
# warning would never be shown.
edcs_only_mask = LIST["EDH-ID"].isnull() & LIST["EDCS-ID"].notnull()
EDCS_unique = LIST[edcs_only_mask].copy()
EDCS_unique.head(5)

Unnamed: 0,LIST-ID,EDCS-ID,EDH-ID,trismegistos_uri,pleiades_id,transcription,inscription,clean_text_conservative,clean_text_interpretive_sentence,clean_text_interpretive_word,clean_text_interpretive_word_EDCS,diplomatic_text,province,place,inscr_type,status_notation,inscr_process,status,partner_link,last_update,letter_size,type_of_inscription,work_status,year_of_find,present_location,text_edition,support_objecttype,support_material,support_decoration,keywords_term,people,type_of_inscription_clean,type_of_inscription_certainty,height_cm,width_cm,depth_cm,material_clean,type_of_monument_clean,type_of_monument_certainty,province_label_clean,province_label_certainty,country_clean,country_certainty,findspot_ancient_clean,findspot_ancient_certainty,modern_region_clean,modern_region_certainty,findspot_modern_clean,findspot_modern_certainty,findspot_clean,findspot_certainty,language,language_EDCS,raw_dating,not_after,not_before,Longitude,Latitude,geotemporal?,geometry,withinRE?,large_city_pop,medium_city_pop,small_city_pop,urban_context,urban_context_city,urban_context_pop_est
0,LIST-1,EDCS-31400030,,,,,Leius,Leius,,Leius,Leius,,Achaia,?,,,,,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,,,False,,False,,,,,,
1,LIST-2,EDCS-55701593,,,,,Cn(aeus) Atei(us),Cn Atei,,Cnaeus Ateius,Cnaeus Ateius,,Achaia,Agios Donatos / Photike,tituli fabricationis,"praenomen et nomen, viri",sigilla impressa,praenomen et nomen; sigilla impressa; tituli...,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,20.506908,39.475976,False,POINT (20.50691 39.47598),True,41221.0,27923.0,1000.0,rural,,
2,LIST-3,EDCS-55701594,,,,,Avill(ius),Avill,,Avillius,Avillius,,Achaia,Agios Donatos / Photike,tituli fabricationis,"nomen singulare, viri",sigilla impressa,nomen singulare; sigilla impressa; tituli fa...,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,20.506908,39.475976,False,POINT (20.50691 39.47598),True,41221.0,27923.0,1000.0,rural,,
3,LIST-4,EDCS-55701595,,,,,L(ucius) Av(illius),L Av,,Lucius Avillius,Lucius Avillius,,Achaia,Agios Donatos / Photike,tituli fabricationis,"praenomen et nomen, viri",sigilla impressa,praenomen et nomen; sigilla impressa; tituli...,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,20.506908,39.475976,False,POINT (20.50691 39.47598),True,41221.0,27923.0,1000.0,rural,,
4,LIST-5,EDCS-55701596,,,,,Camurius,Camurius,,Camurius,Camurius,,Achaia,Agios Donatos / Photike,tituli fabricationis,"nomen singulare, viri",sigilla impressa,nomen singulare; sigilla impressa; tituli fa...,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,20.506908,39.475976,False,POINT (20.50691 39.47598),True,41221.0,27923.0,1000.0,rural,,


In [70]:
# first create bigrams
def get_bigrams_underscore(inscr_text):
    """Return the word bigrams of an inscription text, joined by underscores.

    Parameters
    ----------
    inscr_text : str or missing value
        Whitespace-separated text of one inscription. Anything that is not a
        string (None, NaN, ...) is treated as "no text".

    Returns
    -------
    list of str
        One "word1_word2" item per pair of neighbouring words, in text order.
        Empty for missing input and for texts with fewer than two words.
    """
    # Explicit type check instead of a bare `except:` so that genuine errors
    # (e.g. a missing import) are no longer silently turned into "no bigrams".
    if not isinstance(inscr_text, str):
        return []
    tokens = inscr_text.split()
    # split() never leaves whitespace inside a token, so joining each
    # neighbouring pair with "_" directly gives the underscore form.
    return ["_".join(pair) for pair in zip(tokens, tokens[1:])]

# one list of underscore-joined word bigrams per inscription, built from the interpretive (word-level) clean text
EDCS_unique["bigrams"] = EDCS_unique["clean_text_interpretive_word"].apply(get_bigrams_underscore)

In [71]:
# split the semicolon-separated `status` string into a list of terms
# (a missing status becomes the empty string first, i.e. it yields [""])
EDCS_unique["status_list"] = EDCS_unique["status"].fillna(value="").apply(lambda x: x.split(";"))
# Turn the "NULL" placeholder into a real missing value. The dict form states the
# replacement explicitly; calling replace("NULL") without a value pad-fills from the
# previous row on older pandas and is rejected by newer releases.
# NOTE(review): assumes "NULL" is meant to become a missing value - confirm this
# matches the preprocessing used when the model was trained.
EDCS_unique["material_clean"] = EDCS_unique["material_clean"].replace({"NULL": None})

In [72]:
def combine_status_list_and_bigrams(el_status, el_bigrams, el_material):
    """Merge the status terms, bigrams and materials of one inscription.

    Parameters
    ----------
    el_status : list of str, str, or anything else
        Status terms. A string holding a Python list literal (e.g.
        "['viri', 'tria nomina']") is parsed into that list; any other string
        counts as one single term; every other value yields no status terms.
    el_bigrams : list of str or None
        Bigram tokens of the inscription text; None counts as no bigrams.
    el_material : str or missing value
        Material description. Only the part before the first ": " is used and
        it is split on ", ". Any non-string value (None, NaN) yields no material.

    Returns
    -------
    tuple
        ``(combined, status_terms, material_terms)`` where ``combined`` is all
        tokens joined by single spaces (status, then bigrams, then material)
        and the other two items are the cleaned token lists.
    """
    import ast  # local import: keeps this cell independent of the notebook's import cell

    # preprocess status:
    # literal_eval only parses Python literals, so (unlike eval) text coming
    # from the data can never execute code.
    if isinstance(el_status, str):
        try:
            el_status = ast.literal_eval(el_status)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass  # not a literal -> keep the raw string as one status term
    if isinstance(el_status, list):
        new_el_status = el_status
    elif isinstance(el_status, str):
        new_el_status = [el_status]
    else:
        new_el_status = []
    # spaces inside a term become underscores so that each term is one token
    new_el_status = [el.strip().replace(" ", "_") for el in new_el_status]

    # preprocess material:
    if isinstance(el_material, str):
        el_material = el_material.partition(": ")[0]  # drop everything after the first ": "
        el_material = [el.replace(" ", "_").replace("?", "") for el in el_material.split(", ")]
    else:
        # None as well as NaN / other non-string placeholders
        el_material = []

    el_bigrams = [] if el_bigrams is None else list(el_bigrams)

    # combine status, bigrams and material
    new_el = " ".join(new_el_status + el_bigrams + el_material)
    return new_el, new_el_status, el_material

In [73]:
# Collect the model input for every EDCS-only inscription:
#   status_bigrams_list_apply - one space-joined predictor string per row
#   status_list               - the cleaned status terms per row
#   material_list             - all material terms, flattened over all rows
status_bigrams_list_apply, status_list, material_list = [], [], []
predictor_rows = zip(
    EDCS_unique["status_list"].tolist(),
    EDCS_unique["bigrams"].tolist(),
    EDCS_unique["material_clean"].tolist(),
)
for el_status, el_bigrams, el_material in predictor_rows:
    new_el, new_status, el_material = combine_status_list_and_bigrams(el_status, el_bigrams, el_material)
    status_bigrams_list_apply += [new_el]
    status_list += [new_status]
    material_list += el_material

In [74]:
# turn the combined predictor strings into the feature matrix, using the vectorizer loaded from disk above
X_applied = vectorizer.transform(status_bigrams_list_apply)

In [75]:
# predicted inscription type for every row of X_applied; show the first ten as a sanity check
prediction = classifier.predict(X_applied)
prediction[:10]

array(['identification inscription', 'owner/artist inscription',
       'owner/artist inscription', 'owner/artist inscription',
       'owner/artist inscription', 'owner/artist inscription',
       'votive inscription', 'identification inscription',
       'honorific inscription', 'building/dedicatory inscription'],
      dtype='<U31')

In [76]:
classifier.classes_

array(['acclamation', 'adnuntiatio', 'assignation inscription',
       'boundary inscription', 'building/dedicatory inscription',
       'calendar', 'defixio', 'elogium', 'epitaph',
       'honorific inscription', 'identification inscription', 'label',
       'letter', 'list', 'mile-/leaguestone', 'military diploma',
       'owner/artist inscription', 'prayer', 'private legal inscription',
       'public legal inscription', 'seat inscription',
       'votive inscription'], dtype='<U31')

In [77]:
# highest class probability per inscription
# NOTE(review): predict_proba is evaluated again in the next cell; the result could be computed once and reused
probs = [np.max(pred) for pred in classifier.predict_proba(X_applied)]
probs[:10]

[0.6666666666666659,
 0.9031917526917528,
 0.8401048945136412,
 0.9031917526917528,
 0.8401048945136412,
 0.9031917526917528,
 0.36523809523809525,
 0.6666666666666659,
 0.62,
 0.3782555555555555]

In [78]:
# (label, probability) of the most probable class for every inscription
prediction_probs = [(classifier.classes_[np.argmax(pred)], pred[np.argmax(pred)]) for pred in classifier.predict_proba(X_applied)]
prediction_probs[:10]

[('identification inscription', 0.6666666666666659),
 ('owner/artist inscription', 0.9031917526917528),
 ('owner/artist inscription', 0.8401048945136412),
 ('owner/artist inscription', 0.9031917526917528),
 ('owner/artist inscription', 0.8401048945136412),
 ('owner/artist inscription', 0.9031917526917528),
 ('votive inscription', 0.36523809523809525),
 ('identification inscription', 0.6666666666666659),
 ('honorific inscription', 0.62),
 ('building/dedicatory inscription', 0.3782555555555555)]

In [79]:
len(prediction_probs)

448585

In [80]:
len(prediction_probs)

448585

In [81]:
# For the cut-offs 0.40, 0.45, ..., 0.95: print the share of predictions whose
# top-class probability reaches the cut-off.
for threshold in map(lambda percent: percent / 100, range(40, 100, 5)):
    above_threshold = list(filter(lambda pair: pair[1] >= threshold, prediction_probs))
    proportion = len(above_threshold) / len(prediction_probs)
    print("prob. threshold: {0}, proportion of inscr.: {1}".format(threshold, proportion))

prob. threshold: 0.4, proportion of inscr.: 0.9790808876801498
prob. threshold: 0.45, proportion of inscr.: 0.966093382525051
prob. threshold: 0.5, proportion of inscr.: 0.9429115998082861
prob. threshold: 0.55, proportion of inscr.: 0.9070633213326348
prob. threshold: 0.6, proportion of inscr.: 0.8814093204186497
prob. threshold: 0.65, proportion of inscr.: 0.8632388510538694
prob. threshold: 0.7, proportion of inscr.: 0.6805911923046915
prob. threshold: 0.75, proportion of inscr.: 0.6581784945996857
prob. threshold: 0.8, proportion of inscr.: 0.5933189919413266
prob. threshold: 0.85, proportion of inscr.: 0.5478627238984808
prob. threshold: 0.9, proportion of inscr.: 0.5202313942731032
prob. threshold: 0.95, proportion of inscr.: 0.4698173144443082


In [82]:
# Same summary as a table, on a finer grid (0.30, 0.32, ..., 0.98): for each
# cut-off, the rounded share and the absolute number of predictions reaching it.
threshold_results = []
for threshold in map(lambda percent: percent / 100, range(30, 100, 2)):
    above_threshold = list(filter(lambda pair: pair[1] >= threshold, prediction_probs))
    proportion = len(above_threshold) / len(prediction_probs)
    threshold_results += [{
        "threshold (=/>)": threshold,
        "proportion": np.round(proportion, 2),
        "N": len(above_threshold),
    }]
model_results_df = pd.DataFrame(threshold_results)
model_results_df

Unnamed: 0,threshold (=/>),proportion,N
0,0.3,0.99,445898
1,0.32,0.99,445147
2,0.34,0.99,444183
3,0.36,0.99,443094
4,0.38,0.98,440821
5,0.4,0.98,439201
6,0.42,0.97,436929
7,0.44,0.97,434828
8,0.46,0.96,431418
9,0.48,0.95,427592


In [83]:
#print(model_results_df.to_latex(index=False))

In [84]:
#model_results_df.to_csv("../data/classifier_EDCS_results.csv")

# Combining the data back

In [85]:
# attach the predicted label and its probability to the EDCS-only rows;
# both are assigned by position, so this relies on EDCS_unique still having the
# same row order as when the predictors were extracted from it
EDCS_unique["type_of_inscription_auto"] = prediction
EDCS_unique["type_of_inscription_auto_prob"] = probs

In [86]:
EDCS_unique.head(5)

Unnamed: 0,LIST-ID,EDCS-ID,EDH-ID,trismegistos_uri,pleiades_id,transcription,inscription,clean_text_conservative,clean_text_interpretive_sentence,clean_text_interpretive_word,clean_text_interpretive_word_EDCS,diplomatic_text,province,place,inscr_type,status_notation,inscr_process,status,partner_link,last_update,letter_size,type_of_inscription,work_status,year_of_find,present_location,text_edition,support_objecttype,support_material,support_decoration,keywords_term,people,type_of_inscription_clean,type_of_inscription_certainty,height_cm,width_cm,depth_cm,material_clean,type_of_monument_clean,type_of_monument_certainty,province_label_clean,province_label_certainty,country_clean,country_certainty,findspot_ancient_clean,findspot_ancient_certainty,modern_region_clean,modern_region_certainty,findspot_modern_clean,findspot_modern_certainty,findspot_clean,findspot_certainty,language,language_EDCS,raw_dating,not_after,not_before,Longitude,Latitude,geotemporal?,geometry,withinRE?,large_city_pop,medium_city_pop,small_city_pop,urban_context,urban_context_city,urban_context_pop_est,bigrams,status_list,type_of_inscription_auto,type_of_inscription_auto_prob
0,LIST-1,EDCS-31400030,,,,,Leius,Leius,,Leius,Leius,,Achaia,?,,,,,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,,,False,,False,,,,,,,[],[],identification inscription,0.666667
1,LIST-2,EDCS-55701593,,,,,Cn(aeus) Atei(us),Cn Atei,,Cnaeus Ateius,Cnaeus Ateius,,Achaia,Agios Donatos / Photike,tituli fabricationis,"praenomen et nomen, viri",sigilla impressa,praenomen et nomen; sigilla impressa; tituli...,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,20.506908,39.475976,False,POINT (20.50691 39.47598),True,41221.0,27923.0,1000.0,rural,,,[Cnaeus_Ateius],"[praenomen et nomen, sigilla impressa, tit...",owner/artist inscription,0.903192
2,LIST-3,EDCS-55701594,,,,,Avill(ius),Avill,,Avillius,Avillius,,Achaia,Agios Donatos / Photike,tituli fabricationis,"nomen singulare, viri",sigilla impressa,nomen singulare; sigilla impressa; tituli fa...,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,20.506908,39.475976,False,POINT (20.50691 39.47598),True,41221.0,27923.0,1000.0,rural,,,[],"[nomen singulare, sigilla impressa, tituli...",owner/artist inscription,0.840105
3,LIST-4,EDCS-55701595,,,,,L(ucius) Av(illius),L Av,,Lucius Avillius,Lucius Avillius,,Achaia,Agios Donatos / Photike,tituli fabricationis,"praenomen et nomen, viri",sigilla impressa,praenomen et nomen; sigilla impressa; tituli...,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,20.506908,39.475976,False,POINT (20.50691 39.47598),True,41221.0,27923.0,1000.0,rural,,,[Lucius_Avillius],"[praenomen et nomen, sigilla impressa, tit...",owner/artist inscription,0.903192
4,LIST-5,EDCS-55701596,,,,,Camurius,Camurius,,Camurius,Camurius,,Achaia,Agios Donatos / Photike,tituli fabricationis,"nomen singulare, viri",sigilla impressa,nomen singulare; sigilla impressa; tituli fa...,,,,,,,,,,,,,[ ],,False,,,,,,False,,False,,False,,False,,False,,False,,False,L,,,,,20.506908,39.475976,False,POINT (20.50691 39.47598),True,41221.0,27923.0,1000.0,rural,,,[],"[nomen singulare, sigilla impressa, tituli...",owner/artist inscription,0.840105


In [87]:
# prepare last part of the data - inscriptions we have only in EDH
# in this case, we can also use the information we already have
EDH_unique = LIST[(LIST["EDH-ID"].notnull()) & (LIST["EDCS-ID"].isnull())]
# the "auto" type is simply the cleaned EDH type ...
EDH_unique["type_of_inscription_auto"] = EDH_unique["type_of_inscription_clean"]
# ... and, since it is not a model prediction, it is given probability 1
EDH_unique["type_of_inscription_auto_prob"] = [1] * len(EDH_unique)

In [88]:
# inscriptions present in both EDH and EDCS: reuse the cleaned EDH type as well,
# again with probability 1 because it is not a model prediction
EDH_overlap_all = LIST[(LIST["EDH-ID"].notnull()) & (LIST["EDCS-ID"].notnull())]
EDH_overlap_all["type_of_inscription_auto"] = EDH_overlap_all["type_of_inscription_clean"]
EDH_overlap_all["type_of_inscription_auto_prob"] = [1] * len(EDH_overlap_all)

In [89]:
print("the dataset we used for training and testing (+NULLs): ", EDH_overlap_all.shape)
print("inscriptions in EDH only: ", EDH_unique.shape)
print("inscriptions in EDCS only, we wanted to classify: ", EDCS_unique.shape)

the dataset we used for training and testing (+NULLs):  (76583, 69)
inscriptions in EDH only:  (3316, 69)
inscriptions in EDCS only, we wanted to classify:  (448585, 71)


In [90]:
# combine together
# EDCS_unique carries two extra helper columns (`bigrams`, `status_list`);
# after the concat they are empty for the rows coming from the two EDH frames
LIST_autotypes = pd.concat([EDH_overlap_all, EDH_unique, EDCS_unique])
LIST_autotypes.head()

Unnamed: 0,LIST-ID,EDCS-ID,EDH-ID,trismegistos_uri,pleiades_id,transcription,inscription,clean_text_conservative,clean_text_interpretive_sentence,clean_text_interpretive_word,clean_text_interpretive_word_EDCS,diplomatic_text,province,place,inscr_type,status_notation,inscr_process,status,partner_link,last_update,letter_size,type_of_inscription,work_status,year_of_find,present_location,text_edition,support_objecttype,support_material,support_decoration,keywords_term,people,type_of_inscription_clean,type_of_inscription_certainty,height_cm,width_cm,depth_cm,material_clean,type_of_monument_clean,type_of_monument_certainty,province_label_clean,province_label_certainty,country_clean,country_certainty,findspot_ancient_clean,findspot_ancient_certainty,modern_region_clean,modern_region_certainty,findspot_modern_clean,findspot_modern_certainty,findspot_clean,findspot_certainty,language,language_EDCS,raw_dating,not_after,not_before,Longitude,Latitude,geotemporal?,geometry,withinRE?,large_city_pop,medium_city_pop,small_city_pop,urban_context,urban_context_city,urban_context_pop_est,type_of_inscription_auto,type_of_inscription_auto_prob,bigrams,status_list
448585,LIST-448586,EDCS-24900077,HD056163,https://www.trismegistos.org/text/177366,570485.0,Q(uinto) Caecilio C(ai) f(ilio) Metelo / imper...,Q(uinto) Caecilio C(ai) f(ilio) Metel(l)o / im...,Q Caecilio C f Metelo imperatori Italici quei ...,Quinto Caecilio Cai filio Metelo imperatori It...,Quinto Caecilio Cai filio Metelo imperatori It...,Quinto Caecilio Cai filio Metello imperatori I...,Q CAECILIO C F METELO / IMPERATORI ITALICI / Q...,Achaia,Agia Triada / Merbaka / Midea,tituli honorarii,"officium/professio, ordo senatorius, tria nomi...",,officium/professio; ordo senatorius; tituli ...,http://db.edcs.eu/epigr/partner.php?s_language...,2011-11-11,,honorific inscription,no image,,,\n Quinto Caecilio Cai filio Metelo imperatori...,,,1000.0,69.0,[ ],honorific inscription,False,,,,,,False,Achaia,False,Greece,False,Midea,False,Pelopónissos,False,Midhéa,False,,False,L,,-68 to -68,-68.0,,22.8412,37.6498,True,POINT (22.84120 37.64980),True,49319.0,9938.0,1000.0,rural,,,honorific inscription,1.0,,
448586,LIST-448587,EDCS-03700724,HD052964,https://www.trismegistos.org/text/121715,531064.0,Fortissimo et piis/simo Caesari d(omino) n(ost...,Fortissimo et Piis/simo Caesari d(omino) n(ost...,Fortissimo et piissimo Caesari d n Gal Val P F...,Fortissimo et piissimo Caesari domino nostro G...,Fortissimo et piissimo Caesari domino nostro G...,Fortissimo et Piissimo Caesari domino nostro G...,FORTISSIMO ET PIIS / SIMO CAESARI D N / GAL VA...,Achaia,Agios Athanasios / Photike,tituli honorarii,"Augusti/Augustae, ordo equester, tria nomina",litterae erasae,Augusti/Augustae; litterae erasae; ordo eque...,http://db.edcs.eu/epigr/partner.php?s_language...,2014-09-16,3-5.3 cm,honorific inscription,checked with photo,,Fragma Kalama,\n Fortissimo et piissimo Caesari domino nostr...,57.0,,1000.0,69.0,"[ { ""persname"": { ""name"": [ { ""@type"": ""nomen""...",honorific inscription,False,99.0,67.0,67.0,,statue base,False,Epirus,False,Greece,False,Photike,False,Ípeiros,False,Paramythía,False,{Agios Athanasios},False,L,,309 to 313,309.0,313.0,20.7668,39.4512,True,POINT (20.76680 39.45120),True,41221.0,27923.0,1000.0,rural,,,honorific inscription,1.0,,
448587,LIST-448588,EDCS-13800065,HD017714,https://www.trismegistos.org/text/177100,570049.0,Italicei / quei Aegei negotiantur / P(ublium) ...,Italicei / quei Aegei negotiantur / P(ublium) ...,Italicei quei Aegei negotiantur P Rutilium P f...,Italicei quei Aegei negotiantur Publium Rutili...,Italicei quei Aegei negotiantur Publium Rutili...,Italicei quei Aegei negotiantur Publium Rutili...,ITALICEI / QVEI AEGEI NEGOTIANTVR / P RVTILIVM...,Achaia,Aigio / Egio / Aiyion / Aegeum,tituli honorarii,"officium/professio, ordo senatorius, tria nomi...",,officium/professio; ordo senatorius; tituli ...,http://db.edcs.eu/epigr/partner.php?s_language...,2011-03-29,3.5-3.7 cm,votive inscription,checked with photo,,,\n Italicei quei Aegei negotiantur Publium Rut...,257.0,,1000.0,372.0,[ ],votive inscription,False,58.0,61.0,16.0,,tabula,False,Achaia,False,Greece,False,Aegeum,False,Dytikí Elláda,False,Aígion,False,,False,L,,-74 to -74,-74.0,,22.0845,38.2487,True,POINT (22.08450 38.24870),True,41538.0,9498.0,1000.0,small,Aegium,1000.0,votive inscription,1.0,,
448588,LIST-448589,EDCS-03300852,HD051000,https://www.trismegistos.org/text/177273,240855.0,[Imp(eratori) Caes(ari) M(arco)] / An[nio] Flo...,Imp(eratori) / Floriano / P(io) F(elici) Aug(u...,An Floriano P F Aug p p m p III Imp Caes M Aur...,Imperatori Caesari Marco Annio Floriano Pio Fe...,Imperatori Caesari Marco Annio Floriano Pio Fe...,Imperatori Floriano Pio Felici Augusto patri p...,[ ] / AN[ ] FLORIANO / P F AVG / P P / M P III...,Achaia,Alea / Tegea,miliaria,"Augusti/Augustae, viri",,Augusti/Augustae; miliaria; viri,http://db.edcs.eu/epigr/partner.php?s_language...,2011-05-24,3.3-6 cm,mile-/leaguestone,checked with photo,,"Tegea, Mus.",\n Imperatori Caesari Marco Annio Floriano Pio...,89.0,,1000.0,,"[ { ""persname"": { ""name"": [ { ""@type"": ""praeno...",mile-/leaguestone,False,44.0,24.0,,,mile-/leaguestone,False,Achaia,False,Greece,False,Tegea,False,Pelopónissos,False,Alea,False,"Stringu, bei",False,L,,a: 276 to 276; b: 276 to 282,276.0,,22.4171,37.4319,True,POINT (22.41710 37.43190),True,46362.0,25371.0,1000.0,large,Tegea,46362.0,mile-/leaguestone,1.0,,
448589,LIST-448590,EDCS-28500283,HD021396,https://www.trismegistos.org/text/177131,,T[i(berius)] Claudius Caesar Aug(ustus) / G[er...,T[(iberius)] Claudius Caesar Aug(ustus) / G[er...,T Claudius Caesar Aug Ganicus pontif max trib ...,Tiberius Claudius Caesar Augustus Germanicus p...,Tiberius Claudius Caesar Augustus Germanicus p...,Tiberius Claudius Caesar Augustus Germanicus p...,T[ ] CLAVDIVS CAESAR AVG / G[ ]ANICVS PONTIF M...,Achaia,Alea / Tegea,leges,"Augusti/Augustae, viri",,Augusti/Augustae; leges; viri,http://db.edcs.eu/epigr/partner.php?s_language...,2011-05-10,,public legal inscription,checked with photo,,,\n Tiberius Claudius Caesar Augustus Germanicu...,257.0,,1000.0,5.0,[ ],public legal inscription,False,160.0,58.0,17.0,,tabula,False,Achaia,False,Greece,False,Tegea,False,Pelopónissos,False,Alea,False,,False,L,,49 to 50,49.0,50.0,,,False,,False,,,,,,,public legal inscription,1.0,,


In [91]:
# drop the helper columns that were only needed to build the model input;
# reassigning (instead of inplace=True) with errors="ignore" keeps this cell
# re-runnable: a second run no longer fails with a KeyError
LIST_autotypes = LIST_autotypes.drop(columns=["bigrams", "status_list"], errors="ignore")

In [92]:
LIST_autotypes.shape

(528484, 69)

In [93]:
LIST_autotypes.head(5)

Unnamed: 0,LIST-ID,EDCS-ID,EDH-ID,trismegistos_uri,pleiades_id,transcription,inscription,clean_text_conservative,clean_text_interpretive_sentence,clean_text_interpretive_word,clean_text_interpretive_word_EDCS,diplomatic_text,province,place,inscr_type,status_notation,inscr_process,status,partner_link,last_update,letter_size,type_of_inscription,work_status,year_of_find,present_location,text_edition,support_objecttype,support_material,support_decoration,keywords_term,people,type_of_inscription_clean,type_of_inscription_certainty,height_cm,width_cm,depth_cm,material_clean,type_of_monument_clean,type_of_monument_certainty,province_label_clean,province_label_certainty,country_clean,country_certainty,findspot_ancient_clean,findspot_ancient_certainty,modern_region_clean,modern_region_certainty,findspot_modern_clean,findspot_modern_certainty,findspot_clean,findspot_certainty,language,language_EDCS,raw_dating,not_after,not_before,Longitude,Latitude,geotemporal?,geometry,withinRE?,large_city_pop,medium_city_pop,small_city_pop,urban_context,urban_context_city,urban_context_pop_est,type_of_inscription_auto,type_of_inscription_auto_prob
448585,LIST-448586,EDCS-24900077,HD056163,https://www.trismegistos.org/text/177366,570485.0,Q(uinto) Caecilio C(ai) f(ilio) Metelo / imper...,Q(uinto) Caecilio C(ai) f(ilio) Metel(l)o / im...,Q Caecilio C f Metelo imperatori Italici quei ...,Quinto Caecilio Cai filio Metelo imperatori It...,Quinto Caecilio Cai filio Metelo imperatori It...,Quinto Caecilio Cai filio Metello imperatori I...,Q CAECILIO C F METELO / IMPERATORI ITALICI / Q...,Achaia,Agia Triada / Merbaka / Midea,tituli honorarii,"officium/professio, ordo senatorius, tria nomi...",,officium/professio; ordo senatorius; tituli ...,http://db.edcs.eu/epigr/partner.php?s_language...,2011-11-11,,honorific inscription,no image,,,\n Quinto Caecilio Cai filio Metelo imperatori...,,,1000.0,69.0,[ ],honorific inscription,False,,,,,,False,Achaia,False,Greece,False,Midea,False,Pelopónissos,False,Midhéa,False,,False,L,,-68 to -68,-68.0,,22.8412,37.6498,True,POINT (22.84120 37.64980),True,49319.0,9938.0,1000.0,rural,,,honorific inscription,1.0
448586,LIST-448587,EDCS-03700724,HD052964,https://www.trismegistos.org/text/121715,531064.0,Fortissimo et piis/simo Caesari d(omino) n(ost...,Fortissimo et Piis/simo Caesari d(omino) n(ost...,Fortissimo et piissimo Caesari d n Gal Val P F...,Fortissimo et piissimo Caesari domino nostro G...,Fortissimo et piissimo Caesari domino nostro G...,Fortissimo et Piissimo Caesari domino nostro G...,FORTISSIMO ET PIIS / SIMO CAESARI D N / GAL VA...,Achaia,Agios Athanasios / Photike,tituli honorarii,"Augusti/Augustae, ordo equester, tria nomina",litterae erasae,Augusti/Augustae; litterae erasae; ordo eque...,http://db.edcs.eu/epigr/partner.php?s_language...,2014-09-16,3-5.3 cm,honorific inscription,checked with photo,,Fragma Kalama,\n Fortissimo et piissimo Caesari domino nostr...,57.0,,1000.0,69.0,"[ { ""persname"": { ""name"": [ { ""@type"": ""nomen""...",honorific inscription,False,99.0,67.0,67.0,,statue base,False,Epirus,False,Greece,False,Photike,False,Ípeiros,False,Paramythía,False,{Agios Athanasios},False,L,,309 to 313,309.0,313.0,20.7668,39.4512,True,POINT (20.76680 39.45120),True,41221.0,27923.0,1000.0,rural,,,honorific inscription,1.0
448587,LIST-448588,EDCS-13800065,HD017714,https://www.trismegistos.org/text/177100,570049.0,Italicei / quei Aegei negotiantur / P(ublium) ...,Italicei / quei Aegei negotiantur / P(ublium) ...,Italicei quei Aegei negotiantur P Rutilium P f...,Italicei quei Aegei negotiantur Publium Rutili...,Italicei quei Aegei negotiantur Publium Rutili...,Italicei quei Aegei negotiantur Publium Rutili...,ITALICEI / QVEI AEGEI NEGOTIANTVR / P RVTILIVM...,Achaia,Aigio / Egio / Aiyion / Aegeum,tituli honorarii,"officium/professio, ordo senatorius, tria nomi...",,officium/professio; ordo senatorius; tituli ...,http://db.edcs.eu/epigr/partner.php?s_language...,2011-03-29,3.5-3.7 cm,votive inscription,checked with photo,,,\n Italicei quei Aegei negotiantur Publium Rut...,257.0,,1000.0,372.0,[ ],votive inscription,False,58.0,61.0,16.0,,tabula,False,Achaia,False,Greece,False,Aegeum,False,Dytikí Elláda,False,Aígion,False,,False,L,,-74 to -74,-74.0,,22.0845,38.2487,True,POINT (22.08450 38.24870),True,41538.0,9498.0,1000.0,small,Aegium,1000.0,votive inscription,1.0
448588,LIST-448589,EDCS-03300852,HD051000,https://www.trismegistos.org/text/177273,240855.0,[Imp(eratori) Caes(ari) M(arco)] / An[nio] Flo...,Imp(eratori) / Floriano / P(io) F(elici) Aug(u...,An Floriano P F Aug p p m p III Imp Caes M Aur...,Imperatori Caesari Marco Annio Floriano Pio Fe...,Imperatori Caesari Marco Annio Floriano Pio Fe...,Imperatori Floriano Pio Felici Augusto patri p...,[ ] / AN[ ] FLORIANO / P F AVG / P P / M P III...,Achaia,Alea / Tegea,miliaria,"Augusti/Augustae, viri",,Augusti/Augustae; miliaria; viri,http://db.edcs.eu/epigr/partner.php?s_language...,2011-05-24,3.3-6 cm,mile-/leaguestone,checked with photo,,"Tegea, Mus.",\n Imperatori Caesari Marco Annio Floriano Pio...,89.0,,1000.0,,"[ { ""persname"": { ""name"": [ { ""@type"": ""praeno...",mile-/leaguestone,False,44.0,24.0,,,mile-/leaguestone,False,Achaia,False,Greece,False,Tegea,False,Pelopónissos,False,Alea,False,"Stringu, bei",False,L,,a: 276 to 276; b: 276 to 282,276.0,,22.4171,37.4319,True,POINT (22.41710 37.43190),True,46362.0,25371.0,1000.0,large,Tegea,46362.0,mile-/leaguestone,1.0
448589,LIST-448590,EDCS-28500283,HD021396,https://www.trismegistos.org/text/177131,,T[i(berius)] Claudius Caesar Aug(ustus) / G[er...,T[(iberius)] Claudius Caesar Aug(ustus) / G[er...,T Claudius Caesar Aug Ganicus pontif max trib ...,Tiberius Claudius Caesar Augustus Germanicus p...,Tiberius Claudius Caesar Augustus Germanicus p...,Tiberius Claudius Caesar Augustus Germanicus p...,T[ ] CLAVDIVS CAESAR AVG / G[ ]ANICVS PONTIF M...,Achaia,Alea / Tegea,leges,"Augusti/Augustae, viri",,Augusti/Augustae; leges; viri,http://db.edcs.eu/epigr/partner.php?s_language...,2011-05-10,,public legal inscription,checked with photo,,,\n Tiberius Claudius Caesar Augustus Germanicu...,257.0,,1000.0,5.0,[ ],public legal inscription,False,160.0,58.0,17.0,,tabula,False,Achaia,False,Greece,False,Tegea,False,Pelopónissos,False,Alea,False,,False,L,,49 to 50,49.0,50.0,,,False,,False,,,,,,,public legal inscription,1.0


In [94]:
len(LIST_autotypes[LIST_autotypes["type_of_inscription_auto_prob"]>=0.6])

475286

In [95]:
len(LIST_autotypes[LIST_autotypes["type_of_inscription_auto"]=="NULL"])

20388

In [96]:
def apply_threshold(inscr_type, prob, threshold=0.6):
    """Keep an inscription type only if it is usable and probable enough.

    Parameters
    ----------
    inscr_type : str or missing value
        Candidate inscription type; the string "NULL" and missing values
        (None, NaN) both count as "no type".
    prob : float
        Probability attached to ``inscr_type``.
    threshold : float, default 0.6
        Minimum probability (inclusive) required to keep the type.

    Returns
    -------
    str or None
        ``inscr_type`` if it is a real label with ``prob >= threshold``,
        otherwise None.
    """
    # A missing label is normalised to None so that every "no type" case looks
    # the same to the code that consumes the result.
    if pd.isna(inscr_type) or inscr_type == "NULL" or prob < threshold:
        return None
    return inscr_type
# blank out (None) every automatic type that is "NULL" or falls below the probability cut-off of apply_threshold
LIST_autotypes["type_of_inscription_auto"] =  LIST_autotypes.apply(lambda row: apply_threshold(row["type_of_inscription_auto"], row["type_of_inscription_auto_prob"]), axis=1)

In [97]:
def prob_to_zero(inscr_type, prob):
    """Return 0 when there is no inscription type, otherwise the given probability.

    Parameters
    ----------
    inscr_type : str or missing value
        Inscription type of the row; None and NaN both count as missing.
    prob : float
        Probability currently attached to the row.

    Returns
    -------
    float or int
        0 if ``inscr_type`` is missing, else ``prob`` unchanged.
    """
    # pd.isna covers None as well as NaN; a plain `== None` comparison would let
    # a NaN label keep its probability.
    if pd.isna(inscr_type):
        return 0
    return prob
# rows left without an automatic type get probability 0
LIST_autotypes["type_of_inscription_auto_prob"] =  LIST_autotypes.apply(lambda row: prob_to_zero(row["type_of_inscription_auto"], row["type_of_inscription_auto_prob"]), axis=1)

In [98]:
len(LIST_autotypes[LIST_autotypes["type_of_inscription_auto"].notnull()])

454898

# Save

In [100]:
# write the dataset, now including the two `type_of_inscription_auto*` columns, to a parquet file
LIST_autotypes.to_parquet("../data/large_data/LIST_v0-2.parquet")