In [58]:
import os
import re

import pandas as pd
import numpy as np

import books_utils as bu

In [2]:
# Load the tab-separated annotation table (pandas infers the compression from the file name).
annotations = pd.read_csv('../data/character_relation_annotations.txt.gz', sep='\t')
# dropping rows whose affinity or either character is 'NR' - might transform this later based on category
annotations = annotations[(annotations['affinity'] != 'NR') & (annotations['character_1'] != 'NR') & (annotations['character_2'] != 'NR')].copy()
# book_name = "<title> <author>" with every whitespace character turned into "_".
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would leave the spaces in place.
annotations['book_name'] = (annotations['title'] + ' ' + annotations['author']).str.replace(r"\s", "_", regex=True)
print(annotations.shape)
# making sure no NR in character_1/character_2/affinity
annotations.describe()

(2137, 11)


Unnamed: 0,annotator,change,title,author,character_1,character_2,affinity,coarse_category,fine_category,detail,book_name
count,2137,2137,2137,2137,2137,2137,2137,2137,2137,2137,2137
unique,14,3,109,49,1005,825,3,4,30,528,109
top,annotator_1,no,Othello,William Shakespeare,Joseph K.,Timon,positive,social,friend,NR,The_Taming_of_the_Shrew_William_Shakespeare
freq,760,1712,20,613,15,17,1120,886,342,1591,20


In [12]:
# Show only the first annotation row to check the column layout.
annotations.iloc[:1]

Unnamed: 0,annotator,change,title,author,character_1,character_2,affinity,coarse_category,fine_category,detail,book_name
0,annotator_1,yes,Don Quixote,Miguel de Cervantes,Sancho Panza,Don Quixote,positive,professional,servant,he ends up taking more authority and advising ...,Don_Quixote_Miguel_de_Cervantes


In [17]:
# Distinct names that appear on either side of an annotated pair for Don Quixote.
don_quixote_rows = annotations['book_name'] == 'Don_Quixote_Miguel_de_Cervantes'
np.unique(annotations.loc[don_quixote_rows, ['character_1', 'character_2']].values)

array(['Altisidora', 'Cervantes', 'Cide Hamete Benengeli', 'Dapple',
       'Don Quixote', 'Dulcinea del Toboso', 'Rocinante',
       'Sampson Carrasco', 'Sancho Panza', 'The Duke and Duchess',
       'The barber', 'The priest'], dtype=object)

In [48]:
# Rows where either character cell holds several names joined by ' and ' or ';'.
multi_name_mask = (
    annotations['character_1'].str.contains(' and ')
    | annotations['character_2'].str.contains(' and ')
    | annotations['character_1'].str.contains(';')
    | annotations['character_2'].str.contains(';')
)
annotations[multi_name_mask]

Unnamed: 0,annotator,change,title,author,character_1,character_2,affinity,coarse_category,fine_category,detail,book_name
4,annotator_1,no,Don Quixote,Miguel de Cervantes,The Duke and Duchess,Don Quixote,negative,social,enemy,NR,Don_Quixote_Miguel_de_Cervantes
5,annotator_1,no,Don Quixote,Miguel de Cervantes,The Duke and Duchess,Sancho Panza,negative,social,enemy,NR,Don_Quixote_Miguel_de_Cervantes
7,annotator_1,no,Don Quixote,Miguel de Cervantes,Altisidora,The Duke and Duchess,positive,professional,servant,NR,Don_Quixote_Miguel_de_Cervantes
14,annotator_2,no,Don Quixote,Miguel de Cervantes,The Duke and Duchess,Don Quixote,neutral,social,rivals,NR,Don_Quixote_Miguel_de_Cervantes
15,annotator_2,no,Don Quixote,Miguel de Cervantes,The Duke and Duchess,Sancho Panza,neutral,social,rivals,NR,Don_Quixote_Miguel_de_Cervantes
16,annotator_2,no,Don Quixote,Miguel de Cervantes,Altisidora,The Duke and Duchess,neutral,professional,employee,Altisidora is The Duchess' maid.,Don_Quixote_Miguel_de_Cervantes
119,annotator_1,no,Richard II,William Shakespeare,Henry Percy Earl of Northumberland; Lord Ross;...,Henry Bolingbroke Duke of Herford,positive,professional,colleague,NR,Richard_II_William_Shakespeare
552,annotator_10,no,Henry IV Part 2,William Shakespeare,King Henry IV,Duke of Lancaster; Humphrey,positive,familial,NR,NR,Henry_IV_Part_2_William_Shakespeare
553,annotator_10,no,Henry IV Part 2,William Shakespeare,King Henry IV,Duke of Gloucester; and Thomas,positive,familial,NR,NR,Henry_IV_Part_2_William_Shakespeare
558,annotator_10,yes,Henry IV Part 2,William Shakespeare,Prince Hal (later King Henry V),Mowbray and Hastings,negative,social,NR,Enemies,Henry_IV_Part_2_William_Shakespeare


In [73]:
# https://git.io/vpzth
def splitDataFrameList(df,target_column,separator):
    ''' Split a delimited string column into one row per element.

    df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a new dataframe (fresh 0..n-1 index, same columns as df) with each entry
    of the target column separated, with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.

    Each element is stripped of surrounding whitespace and empty elements are dropped,
    so "A; B" yields "A" and "B", and a trailing separator ("A;") yields only "A".
    Rows whose target value is not a string (e.g. NaN) are copied through unchanged.
    '''
    row_accumulator = []

    # Iterate explicitly rather than abusing DataFrame.apply for its side effects
    # (older pandas releases could call the applied function twice on the first row).
    for _, row in df.iterrows():
        value = row[target_column]
        if not isinstance(value, str):
            # Nothing to split; keep the row as it is instead of failing on .split.
            row_accumulator.append(row.to_dict())
            continue

        pieces = [piece.strip() for piece in value.split(separator)]
        pieces = [piece for piece in pieces if piece]
        if not pieces:
            # The value consisted only of separators/whitespace: keep it verbatim
            # so the row is not silently lost.
            pieces = [value]

        # super custom hack for Mr & Mrs. case: "Mr. and Mrs. X" -> "Mr. X", "Mrs. X".
        # The surname is taken from the last element with its own title removed.
        surname = re.sub(r'^Mrs?\.\s?', '', pieces[-1])
        last_position = len(pieces) - 1
        for position, piece in enumerate(pieces):
            new_row = row.to_dict()
            substitution = piece
            is_bare_title = piece.endswith('Mr.') or piece.endswith('Mrs.')
            # Only complete a bare title from a *later* element, and only when
            # there is a surname to add (avoids producing "Mr. " with a dangling space).
            if is_bare_title and position < last_position and surname:
                substitution = piece + ' ' + surname
            new_row[target_column] = substitution
            row_accumulator.append(new_row)

    # Passing the columns keeps the schema intact even when df has no rows.
    new_df = pd.DataFrame(row_accumulator, columns=df.columns)
    return new_df

# Split multi-name cells in both character columns: first on ' and ', then on ';'.
new_df = annotations
for split_column, split_separator in [
    ('character_1', ' and '),
    ('character_2', ' and '),
    ('character_1', ';'),
    ('character_2', ';'),
]:
    new_df = splitDataFrameList(new_df, split_column, split_separator)

In [75]:
book_names = new_df['book_name'].unique()
# Per book: the sorted set of names appearing in either character column.
chars = {
    name: np.unique(new_df.loc[new_df['book_name'] == name, ['character_1', 'character_2']].values)
    for name in book_names
}
# Per book: whether its source text exists on disk.
existing_files = [os.path.isfile('../data/books/{}.txt'.format(name)) for name in book_names]
len(book_names), len(existing_files), all(existing_files)

(109, 109, True)

In [76]:
# One bu.Book per annotated book, in the same order as book_names.
books = [
    bu.Book(
        book_name,
        book_NLP_folder="../data/bookNLP_output",
        source_folder="../data/books",
    )
    for book_name in book_names
]

In [77]:
# Access `characters.all` on every book; the computed length is discarded.
# NOTE(review): presumably this forces each book's character list to be
# loaded/parsed up front — confirm against books_utils.
for book in books:
    len(book.characters.all)

In [80]:
# Counters over every (book, annotated character) pair:
#   total          - annotated characters examined
#   total_found    - annotated characters with at least one matching book character
#   no_name        - annotated characters with no match
#   multiple_chars - extra matches beyond the first, summed over all characters
total = no_name = total_found = multiple_chars = 0

# Running record of successive maxima of matches per character; the first
# entry is a sentinel and is skipped when printing.
mosts = [{'count': -1, 'book': books[0], 'char': ''}]

for book in books:
    print(f'Book: {book.name}')
    for char in chars[book.name]:
        total += 1
        print(f'{char} <- ', end = '')

        times_found = 0
        for bchar in book.characters.all:
            if not bu.book_name_to_annotated_name(book.name, bchar, [char], False):
                continue
            times_found += 1
            print(f'({bu.longest_name(bchar)} {bchar["id"]}), ', end='')

        if times_found == 0:
            no_name += 1
        else:
            total_found += 1
            # Every match after the first counts as an ambiguous extra.
            multiple_chars += times_found - 1

        if times_found > mosts[-1]['count']:
            mosts.append({'count': times_found, 'book': book.name, 'char': char})
        print('')
    print('')

print(f'Total chars annotated: {total}.\nTotal chars from annotations in books {total_found}.\nNot found: {no_name}.\nMultiple chars corresponding: {multiple_chars}')
print(mosts[1:])

Book: Don_Quixote_Miguel_de_Cervantes
Altisidora <- (ALTISIDORA 65), 
Cervantes <- (Miguel de Cervantes 149), 
Cide Hamete Benengeli <- (Cide Hamete Benengeli 162), 
Dapple <- (Dapple 185), 
Don Quixote <- (Don Quixotes 25), (Senor Don Quixote 66), (lord Don Quixote 103), 
Duchess <- 
Dulcinea del Toboso <- (lady Dona Dulcinea del Toboso 247), 
Rocinante <- (Rocinante 118), 
Sampson Carrasco <- (SAMSON CARRASCO 102), 
Sancho Panza <- (Sancho Panzas 174), (Senor Don Sancho Panza 236), 
The Duke <- (Duke of Sesa 119), (Duke Ricardo 157), 
The barber <- 
The priest <- 

Book: Little_Women_Louisa_May_Alcott
Amy March <- (Amy March 42), 
Aunt Carrol <- (Mrs. Carrol 84), 
Beth March <- (Beth 120), 
Florence <- (Flo 55), 
Frederick Bhaer <- (Mother Bhaer 30), (Fred Vaughn 126), 
Hannah <- (Hannah 53), 
Josephine March <- (Miss Jo 37), (Josephine 131), 
Laurie Laurence <- (Mr. Laurence 17), (Mr. Laurie 80), 
Marmee <- (Marmee 14), 
Meg March <- (Meg 113), 
Mr. Brooke <- (Mrs. John Brooke 39), 

Jane Gradgrind <- (Jane 13), (Mr. Thomas Gradgrind 25), 
Josiah Bounderby <- (MR. BOUNDERBY 9), (Josiah Bounderby 16), 
Jr <- 
Louisa <- (Miss Louisa 31), 
Mrs. Sparsit <- (Mrs. Sparsit 24), 
Rachael <- (Rachael 34), 
Stephen Blackpool <- (STEPHEN BLACKPOOL 27), 
Thomas Gradgrind <- (Mr. Thomas Gradgrind 25), 

Book: The_Republic_Plato
Aporia <- 
Appetite <- 
Auxiliary <- 
Belief <- 
Empirical <- 
Guardian <- 
Instrumental reason <- 
Kallipolis <- 
Knowledge <- 
Philosopher-king <- 
Producers <- 
Reason <- 
Sophist <- 
Spirit <- 

Book: Uncle_Tom&rsquo;s_Cabin_Harriet_Beecher_Stowe
Arthur Shelby <- (Mr. Shelby 90), 
Augustine St. Clare <- (Augustine St. Clare 98), 
Aunt Chloe <- (Aunt Chloe 53), 
Eliza Harris <- (GEORGE HARRIS 65), (Eliza Harris 113), 
Emily Shelby <- (Mr. Shelby 90), (Emily 94), 
Eva <- (Miss Eva 41), (Evangeline 82), 
George Harris <- (GEORGE HARRIS 65), 
George Shelby <- (George Shelby 61), (GEORGE HARRIS 65), (Mr. Shelby 90), 
Mr. Haley <- (Mr. Haley 71), 
The Quak


Book: Dubliners_James_Joyce
Corley <- (Corley 89), 
Farrington <- (Farrington 44), 
Father Flynn <- (Nosey Flynn 29), 
Gallaher <- (Ignatius Gallaher 122), 
Joe Donnelly <- (Joe Dillon 88), 
Lenehan <- (Lenehan 16), 
Little Chandler <- (Little Chandler 102), 
Mahony <- (Mahony 25), 
Mangan’s sister <- (Mangan 98), 
Maria <- (Maria 126), 
Mr. Alleyne <- (Mr. Alleyne 104), 
Mr. Doran <- (Mr. Doran 129), 
Mr. Duffy <- (Mr. Duffy 118), 
Mr. Holohan <- (Mr. Holohan 7), 
Mrs. Kearney <- (Mrs. Kearney 15), 
Mrs. Mooney <- (MRS. MOONEY 74), 
Mrs. Sinico <- (Mrs. Sinico 11), 
Old Cotter <- (Mr. Cotter 52), 
“An Encounter” narrator <- 
“Araby” narrator <- 
“The Sisters” narrator <- 

Book: As_You_Like_It_William_Shakespeare
Adam <- (Adam 14), 
Celia <- (Celia 0), 
Corin <- (Corin 5), 
Duke Frederick <- (Duke 1), (Frederick 25), 
Duke Senior <- (Duke 1), 
Jaques <- (Monsieur Jaques 24), 
Lord Amiens <- 
Oliver <- (OLIVER MARTEXT 26), 
Orlando <- (Orlando 11), 
Phoebe <- 
Rosalind <- (Rosalinde 9

In [81]:
# probable people https://github.com/datamade/probablepeople