In [1]:
import pandas as pd

In [2]:
# Read the two input CSV files. df2 (works and authors) feeds the cells below;
# df1 (variant names) is not referenced in the cells shown here.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('input/variant-names.csv', 'input/dll-works-and-authors.csv')
)

In [3]:
# Keep only the title, author-name and identifier columns
wanted_columns = ['Title', 'DLL Identifier (Work)', 'Author Name Latin', 'DLL Identifier (Author)']
names_works = df2.loc[:, wanted_columns]

In [7]:
# Drop every row that has no value in "Author Name Latin".
# reset_index() renumbers the rows and keeps the old labels in an "index" column.
la_names_works = names_works.dropna(subset=['Author Name Latin']).reset_index()

In [8]:
# Report how many rows survived the filter, next to the original row count
print(
    f"Original number of records: {len(names_works)}",
    f"Number of records with Latin author names:  {len(la_names_works)}",
    sep="\n",
)

Original number of records: 5279
Number of records with Latin author names:  4753


In [9]:
# import Spacy and LatinCy model
import spacy

In [48]:
# Bind `nlp` to LatinCy's large Latin pipeline; get_lemmas() and the
# fine-tuning cells below read this module-level name
nlp = spacy.load(name='la_core_web_lg')

In [37]:
# Show the first five rows of the filtered frame.
# NOTE(review): the saved output lists columns that are only created by cells
# further down, so this cell appears to have been executed out of order.
la_names_works.head(n=5)

Unnamed: 0,index,Title,DLL Identifier (Work),Author Name Latin,DLL Identifier (Author),Title_Lemmatized,Title Label,Author Label
0,0,De signis et symptomatibus aegritudinum,W10655,Aegidius Corbeiensis,A3919,de signum et symptomatis aegritudo,TITLE,PERSON
1,2,Alda,W10653,Guilelmus Blasensis,A4844,alda,TITLE,PERSON
2,3,De Viris Illustribus,W10652,Gaius Suetonius Tranquillus,A4799,de uir illustris,TITLE,PERSON
3,4,De Philosophis,W10651,Gaius Suetonius Tranquillus,A4799,de philosophus,TITLE,PERSON
4,5,Epigrammata super exilio,W10650,Lucius Annaeus Seneca,A4655,epigramma super exilium,TITLE,PERSON


In [None]:
# Attach a constant NER label column for titles and one for authors
for label_column, label_value in (('Title Label', "TITLE"), ('Author Label', "PERSON")):
    la_names_works[label_column] = label_value

In [34]:
# Function to get lemmas from a text string
def get_lemmas(text):
    """Return the space-joined lemmas of a text string.

    The text is run through the module-level ``nlp`` pipeline twice: once to
    collect each token's ``norm_`` form, and once more on the re-joined
    normalized text, from which the lemmas are read.

    Args:
        text: The string to lemmatize. Values that are not ``str`` (for
            example ``None`` or the float NaN pandas uses for missing cells)
            and the empty string are accepted.

    Returns:
        The lemmas joined by single spaces, or ``""`` when there is nothing
        to lemmatize.
    """
    # Missing cells reach .apply() as non-string values; there is nothing to
    # lemmatize in that case, so skip the pipeline instead of failing on them.
    # An empty string yields no tokens, so it is short-circuited as well.
    if not isinstance(text, str) or not text:
        return ""

    # First pass: normalize every token
    normalized = ' '.join(token.norm_ for token in nlp(text))

    # Second pass: lemmatize the normalized text
    return ' '.join(token.lemma_ for token in nlp(normalized))

In [35]:
# Lemmatize every title and store the result in a new 'Title_Lemmatized' column
title_lemmas = la_names_works['Title'].map(get_lemmas)
la_names_works['Title_Lemmatized'] = title_lemmas

In [36]:
# Preview the frame now that the lemmatized titles have been added
la_names_works.head(n=5)

Unnamed: 0,index,Title,DLL Identifier (Work),Author Name Latin,DLL Identifier (Author),Title_Lemmatized,Title Label,Author Label
0,0,De signis et symptomatibus aegritudinum,W10655,Aegidius Corbeiensis,A3919,de signum et symptomatis aegritudo,TITLE,PERSON
1,2,Alda,W10653,Guilelmus Blasensis,A4844,alda,TITLE,PERSON
2,3,De Viris Illustribus,W10652,Gaius Suetonius Tranquillus,A4799,de uir illustris,TITLE,PERSON
3,4,De Philosophis,W10651,Gaius Suetonius Tranquillus,A4799,de philosophus,TITLE,PERSON
4,5,Epigrammata super exilio,W10650,Lucius Annaeus Seneca,A4655,epigramma super exilium,TITLE,PERSON


In [38]:
# Run the same lemmatization over the Latin author names
author_lemmas = la_names_works['Author Name Latin'].map(get_lemmas)
la_names_works['Author_Lemmatized'] = author_lemmas

In [39]:
# Preview the frame now that the lemmatized author names have been added
la_names_works.head(n=5)

Unnamed: 0,index,Title,DLL Identifier (Work),Author Name Latin,DLL Identifier (Author),Title_Lemmatized,Title Label,Author Label,Author_Lemmatized
0,0,De signis et symptomatibus aegritudinum,W10655,Aegidius Corbeiensis,A3919,de signum et symptomatis aegritudo,TITLE,PERSON,aegide corbeiensis
1,2,Alda,W10653,Guilelmus Blasensis,A4844,alda,TITLE,PERSON,guilelmus blasendo
2,3,De Viris Illustribus,W10652,Gaius Suetonius Tranquillus,A4799,de uir illustris,TITLE,PERSON,gaius suetonius tranquillus
3,4,De Philosophis,W10651,Gaius Suetonius Tranquillus,A4799,de philosophus,TITLE,PERSON,gaius suetonius tranquillus
4,5,Epigrammata super exilio,W10650,Lucius Annaeus Seneca,A4655,epigramma super exilium,TITLE,PERSON,lucius annaeus senecus


In [41]:
# Narrow the frame to the lemmatized text columns and their NER labels
ner_columns = ['Author_Lemmatized', 'Author Label', 'Title_Lemmatized', 'Title Label']
ner_frame = la_names_works.loc[:, ner_columns]

In [40]:
from spacy.tokens import DocBin

In [44]:
# Build NER training data: one Doc per row, whose text is "<author> <title>".
# The blank Latin pipeline is used only as a tokenizer. It is bound to its own
# name so that `nlp` keeps pointing at the pipeline loaded earlier, which later
# cells expect to contain an "ner" component.
blank_nlp = spacy.blank("la")
doc_bin = DocBin()  # Collects the annotated Doc objects for serialization

skipped_spans = 0
span_columns = ['Author_Lemmatized', 'Author Label', 'Title_Lemmatized', 'Title Label']
for author, author_label, title, title_label in ner_frame[span_columns].itertuples(index=False, name=None):
    # Author and title are separated by exactly one space
    text = f"{author} {title}"

    # Create a SpaCy Doc object (tokenization only)
    doc = blank_nlp.make_doc(text)

    # The offsets follow from how `text` was assembled. Searching for them with
    # text.find() would land inside the author whenever the title string also
    # occurs there, producing a wrong or overlapping TITLE span.
    title_start = len(author) + 1
    candidates = (
        (0, len(author), author_label),
        (title_start, title_start + len(title), title_label),
    )

    entities = []
    for start, end, label in candidates:
        if start == end:
            continue  # Empty author or title: nothing to annotate
        # char_span returns None when the offsets do not line up with token boundaries
        span = doc.char_span(start, end, label=label)
        if span is None:
            skipped_spans += 1
        else:
            entities.append(span)

    # Set annotations
    doc.ents = entities

    # Add the doc to the DocBin
    doc_bin.add(doc)

# Save the DocBin to disk
output_file = 'train_data.spacy'
doc_bin.to_disk(output_file)

print(f"Training data saved to {output_file}")
if skipped_spans:
    # Surface dropped annotations instead of losing them silently
    print(f"Skipped {skipped_spans} span(s) that did not align with token boundaries")

Training data saved to train_data.spacy


In [45]:
# Load the saved training data
doc_bin = DocBin().from_disk("train_data.spacy")

# Convert the DocBin to a list of Doc objects
docs = list(doc_bin.get_docs(nlp.vocab))

# Spot-check only the first few docs: there is one doc per input row, so
# printing all of them would flood the notebook output
PREVIEW_COUNT = 5
for doc in docs[:PREVIEW_COUNT]:
    print(f"Text: {doc.text}")
    for ent in doc.ents:
        print(f" - Entity: {ent.text}, Label: {ent.label_}, Start: {ent.start_char}, End: {ent.end_char}")
    print("-" * 40)  # Separator for readability


Text: aegide corbeiensis de signum et symptomatis aegritudo
 - Entity: aegide corbeiensis, Label: PERSON, Start: 0, End: 18
 - Entity: de signum et symptomatis aegritudo, Label: TITLE, Start: 19, End: 53
----------------------------------------
Text: guilelmus blasendo alda
 - Entity: guilelmus blasendo, Label: PERSON, Start: 0, End: 18
 - Entity: alda, Label: TITLE, Start: 19, End: 23
----------------------------------------
Text: gaius suetonius tranquillus de uir illustris
 - Entity: gaius suetonius tranquillus, Label: PERSON, Start: 0, End: 27
 - Entity: de uir illustris, Label: TITLE, Start: 28, End: 44
----------------------------------------
Text: gaius suetonius tranquillus de philosophus
 - Entity: gaius suetonius tranquillus, Label: PERSON, Start: 0, End: 27
 - Entity: de philosophus, Label: TITLE, Start: 28, End: 42
----------------------------------------
Text: lucius annaeus senecus epigramma super exilium
 - Entity: lucius annaeus senecus, Label: PERSON, Start: 0, End: 22

In [49]:
# List the names of the components in the current pipeline
component_names = nlp.pipe_names
print(component_names)

['senter', 'normer', 'tok2vec', 'tagger', 'morphologizer', 'trainable_lemmatizer', 'parser', 'lookup_lemmatizer', 'ner']


In [50]:
# Fetch the pipeline's "ner" component and register the new TITLE label on it.
# add_label is deliberately the last expression so that the cell displays its
# return value.
ner = nlp.get_pipe("ner")
ner.add_label("TITLE")

1

In [51]:
import random
from spacy.training.example import Example

# Load your training data
doc_bin = DocBin().from_disk("train_data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

# Seed the shuffle so repeated runs visit the examples in the same order
random.seed(0)

# Train only the NER component. select_pipes(enable=...) disables every other
# pipe for the duration of the block; it replaces disable_pipes, which is
# deprecated in spaCy v3.
with nlp.select_pipes(enable="ner"):
    optimizer = nlp.resume_training()
    for iteration in range(10):  # Adjust the number of iterations as needed
        random.shuffle(docs)
        losses = {}
        for gold_doc in docs:
            # The predicted side must be an unannotated doc. Reusing the gold
            # doc would hand the component a doc whose entities are already set.
            example = Example(nlp.make_doc(gold_doc.text), gold_doc)
            nlp.update([example], drop=0.5, losses=losses, sgd=optimizer)
        print(f"Iteration {iteration}: Losses {losses}")

# Persist the fine-tuned pipeline; the test cells below load it from this path
nlp.to_disk("path_to_save_your_model")


Iteration 0: Losses {'ner': 3083.321723178891}
Iteration 1: Losses {'ner': 1267.1095341879095}
Iteration 2: Losses {'ner': 852.5200421897015}
Iteration 3: Losses {'ner': 715.2959123770378}
Iteration 4: Losses {'ner': 603.6691058241373}
Iteration 5: Losses {'ner': 499.82435926955674}
Iteration 6: Losses {'ner': 544.877397764566}
Iteration 7: Losses {'ner': 406.6156816887558}
Iteration 8: Losses {'ner': 447.4768492653248}
Iteration 9: Losses {'ner': 399.5619176952475}


In [52]:
# Reload the fine-tuned pipeline from the directory the training cell wrote
nlp = spacy.load("path_to_save_your_model")

# Run it on an "author title" string and list each entity with its label
doc = nlp("Virgil Aeneis")
for entity in doc.ents:
    print(entity.text, entity.label_)

Virgil PERSON
Aeneis TITLE


In [56]:
# Reload the fine-tuned pipeline from the directory the training cell wrote
nlp = spacy.load("path_to_save_your_model")

# Run it on a longer author name followed by a one-word title
doc = nlp("Publius Statinius Statius Silva")
for entity in doc.ents:
    print(entity.text, entity.label_)

Publius Statinius Statius Silva PERSON


In [54]:
# Reload the fine-tuned pipeline from the directory the training cell wrote
nlp = spacy.load("path_to_save_your_model")

# Run it on the title word on its own, without an author in front
doc = nlp("Silva")
for entity in doc.ents:
    print(entity.text, entity.label_)

Silva TITLE


In [59]:
# Reload the fine-tuned pipeline from the directory the training cell wrote
nlp = spacy.load("path_to_save_your_model")

# Run it on a three-part author name followed by a title
doc = nlp("Publius Papinius Statius Silvae")
for entity in doc.ents:
    print(entity.text, entity.label_)

Publius Papinius Statius PERSON
Silvae TITLE
