# Inference Testing

The purpose of this notebook is to get a better sense of the performance of the author identification model.

In the cells below, I load the author identification module, read in the data from HathiTrust in the `latin_authors.csv` that was created when I ran the Greek-Latin classification model in `python/hybrid.ipynb`, and generate a dataframe with the following information:

- the original author name from the HathiTrust data
- the normalized form of that name
- the model's inference for that name
- the model's confidence about its inference
- the Jaro similarity between the normalized form of the name and the model's inference (stored in the `Jaro Distance` column; higher means more similar, and 1.0 is an exact match)
- the deterministic match of the normalized name form
- the fuzzy match of the normalized name form

The goal is to see how many of the correct deterministic and fuzzy matches are included among the model's correct matches. 

In [1]:
import pandas as pd

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Identifier handed to `from_pretrained` for both the tokenizer and the model.
# NOTE(review): this looks like a Hugging Face Hub repo ID ("user/model") rather
# than a local path — confirm.
author_matching_repo = 'sjhuskey/distilbert_multilingual_cased_latin_author_identifier'

# Load the tokenizer and the sequence-classification model from the same identifier
author_matching_tokenizer = AutoTokenizer.from_pretrained(author_matching_repo)
author_matching = AutoModelForSequenceClassification.from_pretrained(author_matching_repo)

print("Model loaded successfully!")

# Pull the label mappings off the model config; `id2label` is what
# `distilbert_author_match` later uses to turn a predicted class index into an author ID.
label2id = author_matching.config.label2id
id2label = author_matching.config.id2label

# NOTE(review): these mappings hold thousands of entries, so printing them in full
# produces a very long cell output.
print("Label-to-ID Mapping:", label2id)
print("ID-to-Label Mapping:", id2label)

Model loaded successfully!
Label-to-ID Mapping: {'A1868': 1960, 'A1870': 2246, 'A2181': 2618, 'A2491': 14, 'A2492': 2290, 'A2493': 2215, 'A2494': 1752, 'A2495': 1554, 'A2508': 2051, 'A2755': 152, 'A2868': 365, 'A2870': 2274, 'A2871': 2455, 'A2872': 2372, 'A2873': 2917, 'A2874': 2558, 'A2875': 2821, 'A2876': 2690, 'A2877': 2536, 'A2878': 2419, 'A2879': 1455, 'A2880': 3074, 'A2881': 2952, 'A2882': 2626, 'A2883': 91, 'A2884': 2846, 'A2885': 1127, 'A2886': 2669, 'A2887': 1630, 'A2888': 733, 'A2889': 2798, 'A2890': 2396, 'A2891': 465, 'A2892': 2437, 'A2893': 3078, 'A2894': 2814, 'A2895': 2550, 'A2896': 1013, 'A2897': 498, 'A2898': 3053, 'A2901': 2435, 'A2902': 2147, 'A2903': 1443, 'A2904': 2325, 'A2905': 2570, 'A2906': 2606, 'A2907': 1235, 'A2908': 1622, 'A2909': 1236, 'A2910': 765, 'A2911': 330, 'A2912': 3049, 'A2913': 2385, 'A2914': 3077, 'A2915': 764, 'A2916': 2894, 'A2917': 2795, 'A2918': 2788, 'A2919': 2465, 'A2920': 127, 'A2921': 1530, 'A2922': 2941, 'A2923': 1462, 'A2924': 1205, 'A29

In [3]:
# Import the utility functions for making dictionaries
import utilities as utilities

# Load the author and work tables, renaming the spreadsheet-style headers to snake_case
authors = pd.read_csv(
    '../data/authors_db.csv', encoding='utf-8', quotechar='"'
).rename(columns={
    'Variant': 'variant_name',
    'Authorized Name': 'authorized_name',
    'DLL Identifier (Author)': 'dll_id_author',
})
works = pd.read_csv(
    '../data/works_db.csv', encoding='utf-8', quotechar='"'
).rename(columns={
    'Title': 'title',
    'DLL Identifier (Work)': 'dll_id_work',
    'DLL Identifier (Author)': 'dll_id_author',
})

# Map each normalized variant spelling to its authorized name and DLL author ID.
# If two variants normalize to the same key, the later row overwrites the earlier one.
variant_to_authorized = {
    utilities.normalize_author_name(author_row["variant_name"]): {
        "authorized_name": author_row["authorized_name"],
        "author_id": author_row["dll_id_author"],
    }
    for _, author_row in authors.iterrows()
}


In [4]:
type(variant_to_authorized)

dict

In [5]:
import torch
import torch.nn.functional as F
import pandas as pd
from rapidfuzz.distance import Jaro
from rapidfuzz import fuzz, process

# Function to match author using deterministic method
def deterministic_author_match(input_author):
    """Exact-key lookup of a normalized author name in ``variant_to_authorized``.

    Returns the stored ``{"authorized_name": ..., "author_id": ...}`` dict for
    ``input_author``, or ``None`` when the name is not a key of the lookup table.
    """
    return variant_to_authorized.get(input_author)

# Function to match author using fuzzy matching
def fuzzy_author_match(input_author):
    """Find the closest known variant name using token-sort fuzzy matching.

    The best candidate among the keys of ``variant_to_authorized`` is accepted
    only when its ``fuzz.token_sort_ratio`` score is strictly greater than 90.

    Returns:
        A ``(author_info, score)`` tuple, where ``author_info`` is the dict
        stored in ``variant_to_authorized`` and ``score`` is the similarity
        divided by 100. When nothing clears the threshold the result is
        ``(None, 0.0)``.
    """
    candidates = list(variant_to_authorized.keys())
    top_hit = process.extractOne(input_author, candidates, scorer=fuzz.token_sort_ratio)
    if not top_hit:
        return None, 0.0

    matched_variant, similarity = top_hit[0], top_hit[1]
    # Strict comparison: a score of exactly 90 is rejected.
    if not similarity > 90:
        return None, 0.0
    return variant_to_authorized.get(matched_variant), similarity / 100

# Function to calculate Jaro distance
def jaro_distance(s1, s2):
    """Return ``Jaro.normalized_similarity`` for the two strings.

    Despite the function name, the value is a similarity rather than a distance:
    in this notebook's results identical strings score 1.0 and lower values mean
    the strings are less alike.
    """
    similarity = Jaro.normalized_similarity(s1, s2)
    return similarity

# Function to match author using DistilBERT model
def distilbert_author_match(input_author):
    """Predict the author ID for a name string with the fine-tuned classifier.

    Args:
        input_author: The author name to classify. Anything that is not a
            ``str`` is rejected without touching the model.

    Returns:
        A ``(predicted_author_id, confidence)`` tuple. ``predicted_author_id``
        is the label stored in ``author_matching.config.id2label`` for the
        highest-scoring class, and ``confidence`` is the largest softmax
        probability over the logits. Non-string input yields ``(None, 0.0)``.
    """
    if not isinstance(input_author, str):
        return None, 0.0

    inputs = author_matching_tokenizer(input_author, return_tensors="pt", truncation=True, padding=True)

    # Inference only: disable autograd so no computation graph is built for the
    # forward pass (saves memory and time; the predictions are unchanged).
    with torch.no_grad():
        logits = author_matching(**inputs).logits

    predicted_class = torch.argmax(logits, dim=-1).item()
    confidence = torch.softmax(logits, dim=-1).max().item()

    predicted_author_id = author_matching.config.id2label[predicted_class]
    return predicted_author_id, confidence

# Function to translate ID to authorized name
def translate_id_to_authorized_name(predicted_author_id, lookup_df):
    """Resolve an author ID to its authorized name.

    Args:
        predicted_author_id: Value to look for in ``lookup_df['dll_id_author']``.
        lookup_df: DataFrame with ``dll_id_author`` and ``authorized_name`` columns.

    Returns:
        The ``authorized_name`` of the first row whose ``dll_id_author`` equals
        ``predicted_author_id``, or ``None`` when no row matches.
    """
    hits = lookup_df[lookup_df['dll_id_author'] == predicted_author_id]
    if hits.empty:
        return None
    # Several variant rows can share one ID; the first one is used.
    return hits['authorized_name'].iloc[0]

# Function to generate DataFrame
def generate_inference_dataframe(input_df, lookup_df, verbose=True):
    """Run the model, deterministic and fuzzy matchers over every input row.

    Args:
        input_df: DataFrame with an ``author`` column (original name) and a
            ``normalized_author`` column (normalized name).
        lookup_df: DataFrame used by ``translate_id_to_authorized_name`` to turn
            the predicted author ID into an authorized name.
        verbose: When true (the default, matching the original behaviour) a
            ``Processing: ...`` line is printed for every row. Pass ``False``
            to avoid thousands of lines of output on large inputs.

    Returns:
        A DataFrame with one row per input row and the columns
        ``Original Input``, ``Model Inference``, ``Jaro Distance``,
        ``Model Confidence``, ``Deterministic Match`` and ``Fuzzy Match``.
        ``Model Inference`` is an empty string when the predicted ID has no
        entry in ``lookup_df``. ``Fuzzy Match`` holds the ``(info, score)``
        tuple returned by ``fuzzy_author_match``.
    """
    data = {
        'Original Input': [],
        'Model Inference': [],
        'Jaro Distance': [],
        'Model Confidence': [],
        'Deterministic Match': [],
        'Fuzzy Match': []
    }

    # The same name often appears on many rows, and every matcher depends only on
    # the (original, normalized) pair, so compute each distinct pair once.
    # Assumes the cell values are hashable (strings, as read from CSV).
    cache = {}

    for _, row in input_df.iterrows():
        input_author_original = row["author"]
        input_author_normalized = row["normalized_author"]
        if verbose:
            print(f'Processing: {input_author_original}')

        key = (input_author_original, input_author_normalized)
        if key not in cache:
            predicted_author_id, distilbert_author_score = distilbert_author_match(input_author_original)
            distilbert_author = translate_id_to_authorized_name(predicted_author_id, lookup_df)
            if distilbert_author is None:
                distilbert_author = ""
            # NOTE(review): the authorized name keeps its punctuation while the
            # normalized input appears to have it stripped, so an exact match can
            # score below 1.0 (e.g. 0.976 for "caesar, julius") — confirm whether
            # both sides should be normalized before comparing.
            distance = jaro_distance(input_author_normalized, distilbert_author)
            cache[key] = (
                distilbert_author,
                distance,
                distilbert_author_score,
                deterministic_author_match(input_author_normalized),
                fuzzy_author_match(input_author_normalized),
            )

        distilbert_author, distance, distilbert_author_score, deterministic, fuzzy = cache[key]
        data['Original Input'].append(input_author_original)
        data['Model Inference'].append(distilbert_author)
        data['Jaro Distance'].append(distance)
        data['Model Confidence'].append(distilbert_author_score)
        data['Deterministic Match'].append(deterministic)
        data['Fuzzy Match'].append(fuzzy)

    return pd.DataFrame(data)

# Function to clean input
def clean_input(df):
    """Make every ``author`` value a string, substituting 'Unknown' for missing ones.

    Only the ``author`` column is touched. The frame passed in is modified in
    place and the same object is returned.
    """
    authors_filled = df["author"].fillna("Unknown")
    df["author"] = authors_filled.astype(str)
    return df

In [6]:
# Load the HathiTrust author list (preprocessed, deduplicated hathi2.csv) and fill in missing authors
input_df = clean_input(
    pd.read_csv('../output/latin_authors.csv', encoding='utf-8', quotechar='"')
)

# Add the normalized form of each author name
input_df['normalized_author'] = input_df['author'].apply(utilities.normalize_author_name)

# Run the model, deterministic and fuzzy matchers over every row
output_df = generate_inference_dataframe(input_df, authors)
print("Done with processing authors and titles.")
display(output_df.head())

Processing: Du Creux, François, 1596?-1666.
Processing: Meyer, Ernst H. F. 1791-1858.
Processing: Laet, Joannes de, 1593-1649.
Processing: Caesar, Julius
Processing: Unknown
Processing: Drexel, Jeremias, 1581-1638,
Processing: Kircher, Athanasius, 1602-1680
Processing: Hincmar, Archbishop of Reims, approximately 806-882
Processing: Acosta, José de, 1540-1600,
Processing: Lessius, Leonardus, 1554-1623
Processing: Riccioli, Giovanni Battista, 1598-1671,
Processing: Guazzo, Francesco Maria,
Processing: Kircher, Athanasius, 1602-1680.
Processing: Mersenne, Marin, 1588-1648,
Processing: Virgil.
Processing: Kepler, Johannes, 1571-1630.
Processing: Roothaan, Joannes Philippus, 1785-1853
Processing: Suarez, Francisco, 1548-1617.
Processing: Alvares, Manuel, 1526-1583.
Processing: Suarez, Francisco, 1548-1617.
Processing: Huygens, Constantijn, 1596-1687
Processing: Nieremberg, Juan Eusebio, 1595-1658.
Processing: Bellarmino, Roberto Francesco Romolo, Saint, 1542-1621.
Processing: Reiske, Johan

Unnamed: 0,Original Input,Model Inference,Jaro Distance,Model Confidence,Deterministic Match,Fuzzy Match
0,"Du Creux, François, 1596?-1666.","cruz, luís da, 1543-1604",0.592308,0.467436,,"(None, 0.0)"
1,"Meyer, Ernst H. F. 1791-1858.","meyer, wilhelm, 1845-1917",0.685556,0.999939,,"(None, 0.0)"
2,"Laet, Joannes de, 1593-1649.","larroumet, gustave",0.513889,0.494394,,"(None, 0.0)"
3,"Caesar, Julius","caesar, julius",0.97619,0.999999,,"({'authorized_name': 'caesar, julius', 'author..."
4,Unknown,stephanus abbas 4. or 6th century,0.395382,0.177454,,"(None, 0.0)"


In [7]:
import csv
# Save the output DataFrame to a CSV file
output_df.to_csv('../output/inference-testing2.csv',index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)

In [8]:
output_df.describe()

Unnamed: 0,Jaro Distance,Model Confidence
count,13491.0,13491.0
mean,0.73131,0.790084
std,0.204544,0.298975
min,0.0,0.082294
25%,0.559135,0.568069
50%,0.676806,0.999766
75%,0.966667,0.999998
max,1.0,1.0


In [9]:
# Function to get value counts in percentiles
def value_counts_in_percentiles(df, column):
    """Count how many values of ``df[column]`` fall into each quartile band.

    The 25th, 50th and 75th percentiles are taken from ``describe()``. Each band
    is open on the left and closed on the right, except the first band, which
    includes everything up to and including the 25th percentile. Missing values
    are not counted in any band.

    Returns:
        A dict mapping the four band labels to their counts.
    """
    values = df[column]
    summary = values.describe(percentiles=[.25, .5, .75])
    q1, q2, q3 = summary['25%'], summary['50%'], summary['75%']

    # Comparisons against NaN are False, so summing the masks counts only real values.
    return {
        'Below 25th percentile': (values <= q1).sum(),
        '25th to 50th percentile': ((values > q1) & (values <= q2)).sum(),
        '50th to 75th percentile': ((values > q2) & (values <= q3)).sum(),
        'Above 75th percentile': (values > q3).sum(),
    }

# Get value counts in percentiles
result = value_counts_in_percentiles(output_df, 'Model Confidence')

# Print the results
for percentile_range, count in result.items():
    print(f"{percentile_range}: {count}")

Below 25th percentile: 3373
25th to 50th percentile: 3373
50th to 75th percentile: 3519
Above 75th percentile: 3226


In [10]:
value_counts_in_percentiles(output_df, 'Jaro Distance')

{'Below 25th percentile': 3374,
 '25th to 50th percentile': 3372,
 '50th to 75th percentile': 3407,
 'Above 75th percentile': 3338}

In [11]:
output_df.sort_values(by='Jaro Distance', ascending=False).to_csv('../output/inference-testing2_output.csv',index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)  

In [12]:
# Function to extract values from the tuple
def extract_author_info(author_info):
    """Unpack a fuzzy-match result into ``(authorized_name, score)``.

    ``author_info`` is expected to be a 2-tuple of ``(info_dict, score)``.
    Anything else, or a tuple whose first element is not a dict, gives
    ``(None, None)``.
    """
    if not (isinstance(author_info, tuple) and len(author_info) == 2):
        return None, None

    payload, score = author_info
    if not isinstance(payload, dict):
        return None, None
    return payload.get('authorized_name', None), score

# Apply the function to create new columns
output_df[['fuzzy_author', 'score']] = output_df['Fuzzy Match'].apply(lambda x: pd.Series(extract_author_info(x)))

# Display the DataFrame
print(output_df)

                                          Original Input  \
0                        Du Creux, François, 1596?-1666.   
1                          Meyer, Ernst H. F. 1791-1858.   
2                           Laet, Joannes de, 1593-1649.   
3                                         Caesar, Julius   
4                                                Unknown   
...                                                  ...   
13486                      Thomas, à Kempis, 1380-1471.   
13487                                            Unknown   
13488                                           Persius.   
13489  Jaʻfarī, Ṣāliḥ ibn al-Ḥusayn, d. 1269 or ...   
13490                                            Unknown   

                         Model Inference  Jaro Distance  Model Confidence  \
0              cruz, luís da, 1543-1604       0.592308          0.467436   
1              meyer, wilhelm, 1845-1917       0.685556          0.999939   
2                     larroumet, gustave       0

In [13]:
def get_deterministic_author(author_info):
    """Return the ``authorized_name`` from a deterministic-match dict.

    Gives ``None`` when ``author_info`` is not a dict or has no
    ``authorized_name`` key.
    """
    return author_info.get('authorized_name', None) if isinstance(author_info, dict) else None

# Apply the function to create a new column
output_df['deterministic_author'] = output_df['Deterministic Match'].apply(get_deterministic_author)

In [14]:
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13491 entries, 0 to 13490
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Original Input        13491 non-null  object 
 1   Model Inference       13491 non-null  object 
 2   Jaro Distance         13491 non-null  float64
 3   Model Confidence      13491 non-null  float64
 4   Deterministic Match   4339 non-null   object 
 5   Fuzzy Match           13491 non-null  object 
 6   fuzzy_author          5321 non-null   object 
 7   score                 5321 non-null   float64
 8   deterministic_author  4339 non-null   object 
dtypes: float64(3), object(6)
memory usage: 948.7+ KB


In [27]:
results = output_df[['Original Input', 'Model Inference', 'Model Confidence', 'Jaro Distance', 'deterministic_author', 'fuzzy_author', 'score']].sort_values(by=['Model Confidence','Jaro Distance'], ascending=False)

In [28]:
results.to_csv('../output/inference-testing2_sorted.csv',index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)

## How many of the deterministic results were confidently identified by the model?

In [29]:
deterministic_results = results[results['deterministic_author'].notna() & (results['Model Confidence'] >= 0.9)]
deterministic_results

Unnamed: 0,Original Input,Model Inference,Model Confidence,Jaro Distance,deterministic_author,fuzzy_author,score
566,Persius.,persius,1.000000,1.000000,persius,persius,1.0
2632,Persius.,persius,1.000000,1.000000,persius,persius,1.0
3212,Persius.,persius,1.000000,1.000000,persius,persius,1.0
3457,Persius.,persius,1.000000,1.000000,persius,persius,1.0
3899,Persius.,persius,1.000000,1.000000,persius,persius,1.0
...,...,...,...,...,...,...,...
7038,"Sánchez de las Brozas, Francisco, 1523-1601","sánchez de las brozas, francisco, 1523-1601",0.999674,0.969697,"sánchez de las brozas, francisco, 1523-1601","sánchez de las brozas, francisco, 1523-1601",1.0
6044,"Dübner, Fr. 1802-1867,","dübner, fr. (friedrich), 1802-1867",0.999501,0.725926,"dübner, fr. (friedrich), 1802-1867","dübner, fr. (friedrich), 1802-1867",1.0
1190,"Andreä, Johann Valentin, 1586-1654.",valentinus,0.996520,0.519355,"andreä, johann valentin, 1586-1654","andreä, johann valentin, 1586-1654",1.0
8032,"Filopón, Juan.","philoponus, john, active 6th century",0.989659,0.619529,"philoponus, john, active 6th century","philoponus, john, active 6th century",1.0


How many unique authors are among the deterministic matches?

In [39]:
deterministic_results['deterministic_author'].nunique()

441

In [30]:
deterministic_results['Model Confidence'].describe()

count    4331.000000
mean        0.999992
std         0.000229
min         0.989659
25%         0.999997
50%         0.999998
75%         1.000000
max         1.000000
Name: Model Confidence, dtype: float64

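The model assigned a confidence of 0.9 or higher to 4,331 of the 4,339 records that have a deterministic match. Within that group, the minimum confidence was 0.989659 and the maximum was 1.00. The remaining eight records fell below the 0.9 cutoff applied above, so they are excluded from these statistics. Note that high confidence does not guarantee agreement with the deterministic match: "Andreä, Johann Valentin, 1586-1654." was inferred as "valentinus" with a confidence of 0.996520.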

## How many of the fuzzy matches with high confidence were matched by the model with high confidence?

I'll filter the `NaN` values out of the `fuzzy_author` column in the original dataframe to see all the positive results from fuzzy matching. Then I'll compare their scores to see whether the model successfully matched all the records that fuzzy matching caught.

In [31]:
# Filter the NaN values and sort by score and model confidence
fuzzy_results = results[results['fuzzy_author'].notna()].sort_values(by=['score','Model Confidence'], ascending=False)

In [32]:
# Check the distribution for the Model Confidence and the score for the fuzzy results
fuzzy_results.describe()

Unnamed: 0,Model Confidence,Jaro Distance,score
count,5321.0,5321.0,5321.0
mean,0.998504,0.938805,0.992274
std,0.027389,0.088003,0.020102
min,0.264327,0.430556,0.901961
25%,0.999997,0.92674,1.0
50%,0.999998,0.979167,1.0
75%,1.0,0.986111,1.0
max,1.0,1.0,1.0


The two look very similar. The difference in the minimum score can be explained by the fact that the fuzzy matching function only keeps matches with a score strictly above 90 (that is, above 0.90 after rescaling).

Is there a significant correlation to be found here?

In [33]:
# Replace 'None' values with NaN to enable correlation calculation
fuzzy_results = fuzzy_results.replace('None', float('nan'))

# Calculate correlation
fuzzy_results[['Model Confidence', 'Jaro Distance', 'score']].corr()

Unnamed: 0,Model Confidence,Jaro Distance,score
Model Confidence,1.0,0.192542,0.074188
Jaro Distance,0.192542,1.0,0.30133
score,0.074188,0.30133,1.0


Not really. Model Confidence is only weakly positively correlated with Jaro Distance (0.19) and barely correlated with score (0.07), so there is no strong relationship to speak of.

In [34]:
# Add a column that calculates the Jaro distance between the model's inference and the fuzzy author match
fuzzy_results['fuzzy_jaro_distance'] = fuzzy_results.apply(lambda x: jaro_distance(x['Model Inference'], x['fuzzy_author']), axis=1)
# Sort the DataFrame by the fuzzy Jaro distance
fuzzy_results = fuzzy_results.sort_values(by='fuzzy_jaro_distance', ascending=True)


In [35]:
fuzzy_results.to_csv('../output/inference-testing2_fuzzy_results.csv',index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)

In [36]:
# Count the number of fuzzy matches with a Jaro distance of less than 1.0
fuzzy_results[fuzzy_results['fuzzy_jaro_distance'] < 1.0]['fuzzy_jaro_distance'].count()

10

The model matched the performance of the fuzzy matching algorithm with the exception of 10 records, where there are big differences. But that's 10 out of 5,321 records. Not bad!

Let's see where the differences are.

In [37]:
fuzzy_results[fuzzy_results['fuzzy_jaro_distance'] < 1.0]

Unnamed: 0,Original Input,Model Inference,Model Confidence,Jaro Distance,deterministic_author,fuzzy_author,score,fuzzy_jaro_distance
2882,Apponius,egeria,0.351009,0.430556,apponius,apponius,1.0,0.430556
10212,"Crotto, Giovanni.","fonte, bartolommeo, 1445-1513",0.264327,0.488633,"cotta, giovanni","cotta, giovanni",1.0,0.478065
6195,Terence.,"correia, thome, 1536-1595",0.302905,0.504762,terence,terence,1.0,0.504762
4487,Terence.,"correia, thome, 1536-1595",0.302905,0.504762,terence,terence,1.0,0.504762
3383,Terence.,"correia, thome, 1536-1595",0.302905,0.504762,terence,terence,1.0,0.504762
2958,Terence,"correia, thome, 1536-1595",0.415654,0.504762,terence,terence,1.0,0.504762
1190,"Andreä, Johann Valentin, 1586-1654.",valentinus,0.99652,0.519355,"andreä, johann valentin, 1586-1654","andreä, johann valentin, 1586-1654",1.0,0.509524
8853,"Monachus, Antonius.","giovanni, di san vincenzo al volturno",0.997184,0.526907,,"antonius, monk, disciple of simeon stylites, a...",1.0,0.636235
5548,"Öhler, Franciscus.","noel, françois, 1651-1729",0.973839,0.64292,,"oehler, franz, 1817-1866",0.969697,0.708962
4776,"Plinius Caecilus Secundus, C.","pliny, the elder",0.933532,0.63179,,"pliny, the younger",0.981818,0.844907


Fuzzy matching outperformed the model in nine of the ten instances where the Jaro distance score between the model inference and the fuzzy match was less than 1.0. Most telling is that the fuzzy matching algorithm successfully identified "Plinius Caecilus Secundus, C." as "Pliny, the Younger," but the model identified it as "Pliny, the Elder".

In [38]:
len(input_df)

13491

How many unique authors are in the fuzzy matched results?

In [40]:
fuzzy_results['fuzzy_author'].nunique()

518

In [41]:
differences = fuzzy_results[fuzzy_results['fuzzy_jaro_distance'] < 1.0]
differences = differences[['Original Input', 'Model Inference', 'Model Confidence', 'deterministic_author', 'fuzzy_author', 'score', 'fuzzy_jaro_distance']]

In [43]:
differences = differences.rename(columns={'deterministic_author':'Deterministic Author', 'fuzzy_author': 'Fuzzy Author', 'score': 'Fuzzy Score', 'fuzzy_jaro_distance': 'Jaro Distance'})

In [45]:
differences.to_csv('../output/inference-testing2_fuzzy_differences.csv',index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)

In [1]:
import pandas as pd

results = pd.read_csv('../output/inference-testing2_sorted.csv', encoding='utf-8', quotechar='"')

In [2]:
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13491 entries, 0 to 13490
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Original Input        13491 non-null  object 
 1   Model Inference       13491 non-null  object 
 2   Model Confidence      13491 non-null  float64
 3   Jaro Distance         13491 non-null  float64
 4   deterministic_author  4339 non-null   object 
 5   fuzzy_author          5321 non-null   object 
 6   score                 5321 non-null   float64
dtypes: float64(3), object(4)
memory usage: 737.9+ KB


In [3]:
model_results = results[results['Model Confidence'] == 1.0]

In [4]:
len(model_results)

286

In [13]:
# Keep the rows whose model confidence is at least 0.9901.
# NOTE(review): high confidence is not the same as a correct match — e.g. the
# "Andreä, Johann Valentin, 1586-1654." record was inferred as "valentinus" with
# confidence 0.99652 — so the "accurate" in this name is an assumption to confirm.
# The 0.9901 cutoff is unexplained; presumably chosen by inspection — verify.
model_results_accurate = results[results['Model Confidence'] >= 0.9901]

In [14]:
model_results_accurate

Unnamed: 0,Original Input,Model Inference,Model Confidence,Jaro Distance,deterministic_author,fuzzy_author,score
0,Persius.,persius,1.000000,1.00000,persius,persius,1.000000
1,Apicius.,apicius,1.000000,1.00000,,,
2,Apicius.,apicius,1.000000,1.00000,,,
3,Persius.,persius,1.000000,1.00000,persius,persius,1.000000
4,Persius.,persius,1.000000,1.00000,persius,persius,1.000000
...,...,...,...,...,...,...,...
7360,"Gellius, Aulus.","gellius, aulus",0.990122,0.97619,,"gellius, aulus",0.916667
7361,"Gellius, Aulus.","gellius, aulus",0.990122,0.97619,,"gellius, aulus",0.916667
7362,"Gellius, Aulus.","gellius, aulus",0.990122,0.97619,,"gellius, aulus",0.916667
7363,"Gellius, Aulus.","gellius, aulus",0.990122,0.97619,,"gellius, aulus",0.916667


In [15]:
model_results_accurate['Model Inference'].nunique()

919

In [16]:
titles = pd.read_csv('../output/compare_all_matched_titles.csv',encoding='utf-8',quotechar='"')

In [17]:
titles

Unnamed: 0,normalized_author,title,overall_match,fuzzy_title_score
0,boethius 524,Anicii Manlii Torquati Severini Boethii De con...,euclidis megarensis geometriae libri duo ab a ...,0.855
1,vitruvius pollio,Vitruvii De architectura libri decem / edidit ...,de architectura,0.855
2,thomas aquinas saint 12251274,Summa philosophiæ : ex variis libris D. Thomæ ...,commentari in boethii de consolatione philosop...,0.855
3,unknown,Liber precum : in quo variae et multae egregia...,stephani abbatis epistola ad sanctum aunarium,0.855
4,cicero marcus tullius,M. Tullii Ciceronis opera philosophica / ex ed...,singulae uoces ex incertis libris,0.855
...,...,...,...,...
2657,pliny the younger,C. Plini Caecili Secundi Epistolarum libri nov...,panegyricus,0.855
2658,cicero marcus tullius,M. Tullii Ciceronis De oratore libri tres. wit...,orator,0.000
2659,nepos cornelius,Cornelii Neoptis liber De excellentibus ducibu...,cato,0.855
2660,virgil,P. Vergili Maronis Aeneidos libri [I-IX] / rec...,aeneis,0.000


In [1]:
import pandas as pd
df = pd.read_csv('../output/inference-testing2_sorted.csv', encoding='utf-8', quotechar='"')

In [2]:
# Select every record whose model confidence is below 0.97619 and label it "incorrect".
# NOTE(review): 0.97619 also appears in the results as a Jaro Distance value
# (e.g. "Caesar, Julius" and "Gellius, Aulus."); confirm that it is intended as a
# 'Model Confidence' threshold, and that low confidence really implies a wrong match.
incorrect = df[df['Model Confidence'] < 0.97619]

In [3]:
len(incorrect)

5869

In [4]:
incorrect['Original Input'].nunique()

3501

In [5]:
incorrect['Original Input'].apply(lambda x: x.lower().strip('.')).nunique()

3247

In [6]:
import utilities as utilities
normalized = incorrect['Original Input'].apply(lambda x: utilities.normalize_author_name(x))
normalized.nunique()

3159

In [8]:
for author in sorted(normalized.unique()):
    print(author)

abad diego jose 17271779
abadia de santillana del mar
abati baldo angelo
abaunza pedro de 15991649
abbatius baldus angelus 16th cent
abbeloos j b 18361896
abbeloos jean baptiste 18361906
abicht rudolf 18501921
abrahams nicolai christian levin 17981870
abril pedro simon ca 1530 ca 1595
abu alfaraj alisbahani 897 or 8967
abu alfaraj alisbahani 897 or 898967
abu alrabi sulayman ibn abd allah almuwahhid
abu mihjan althaqafi active 629637
abu tammam habib ibn aws altai active 808842
abu tammam habib ibn aws altai fl 808842
abu ubayd alqasim ibn sallam approximately 773approximately 837
academia molshemensis francia
accademia degli occulti brescia
acevedo alfonso de 15181598
achillini alessandro
achillini alessandro 14631512
acidalius valens 15671595
ackermann johann christian gottlieb 17561801
ackermann petrus fouerius 17711831
aconcio iacopo 1566
adam ludwig
ader guillaume
ader guillaume b 1578
adriano vi papa 14591523
agahd reinhold hermann august
aguilon francois de 15671617
aguilon fran