In [546]:
import os
import re
import baseline_utils
import pandas as pd
import collections as col
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import spacy
from tqdm import tqdm
from sklearn import metrics

In [498]:
# Load the spaCy 'en_core_web_md' pipeline once; get_relation() calls it as nlp(...).
nlp = spacy.load('en_core_web_md')

In [579]:
# Read the tab-separated annotation file and preview its first three rows.
annotations = pd.read_csv('../data/character_relation_annotations.txt.gz', sep='\t')
annotations.head(3)

Unnamed: 0,annotator,change,title,author,character_1,character_2,affinity,coarse_category,fine_category,detail
0,annotator_1,yes,Don Quixote,Miguel de Cervantes,Sancho Panza,Don Quixote,positive,professional,servant,he ends up taking more authority and advising ...
1,annotator_1,no,Don Quixote,Miguel de Cervantes,Rocinante,Don Quixote,positive,professional,person offering service to client,Rocinante is his horse
2,annotator_1,no,Don Quixote,Miguel de Cervantes,Dulcinea del Toboso,Don Quixote,positive,social,unrequited love interest,NR


In [500]:
# Per-column summary (count / unique / top / freq) of the annotation table.
annotations.describe()

Unnamed: 0,annotator,change,title,author,character_1,character_2,affinity,coarse_category,fine_category,detail
count,2170,2170,2170,2170,2170,2170,2170,2170,2170,2170
unique,14,3,109,49,1008,827,4,4,30,530
top,annotator_1,no,War and Peace,William Shakespeare,NR,NR,positive,social,friend,NR
freq,760,1714,20,620,26,29,1120,887,342,1622


In [501]:
# Drop the rows whose affinity is 'NR': affinity_mapping (defined in a later cell) has no
# numeric value for that label. These rows might be recovered later based on their category.
annotations = annotations[annotations['affinity'] != 'NR'].copy()

In [502]:
# Per-book key "<title>_<author>" with every whitespace character replaced by "_".
# The pattern is a raw string (a plain "\s" is an invalid string escape) and regex=True is
# explicit: pandas 2.0 switched Series.str.replace to literal matching by default, which
# would leave the spaces in place, so these keys would no longer equal the names that a
# later cell builds with re.sub.
annotations['book_name'] = (annotations['title'] + ' ' + annotations['author']).str.replace(r"\s", "_", regex=True)

In [503]:
# Numeric encoding of the affinity labels: positive -> 1, neutral -> 0.5, negative -> 0.
affinity_mapping = dict(positive=1, neutral=0.5, negative=0)
# Direct dictionary lookup per row: a label that is not in the mapping raises KeyError.
annotations['num_affinity'] = [affinity_mapping[label] for label in annotations['affinity']]
annotations['num_affinity'].head(3)

0    1.0
1    1.0
2    1.0
Name: num_affinity, dtype: float64

In [504]:
def avg(numbers):
    """Return the arithmetic mean of `numbers` as a float; an empty input yields 0.0."""
    count = len(numbers)
    if count == 0:
        # Nothing to average: same value as dividing the empty sum by one.
        return 0.0
    return float(sum(numbers)) / count

In [505]:
# Collapse the annotations into one row per (book, unordered character pair): the two
# names are sorted so that A/B and B/A share a key, and the numeric affinities recorded
# for that pair are averaged.
by_book_annotations = col.defaultdict(lambda: col.defaultdict(list))

def add_books_annotations(row):
    """Record one annotation row's numeric affinity under its book and character pair.

    row: mapping-like with 'book_name', 'character_1', 'character_2' and 'num_affinity'.
    Side effect: appends to by_book_annotations[book_name]['<char_1>:<char_2>'].
    """
    char_1, char_2 = sorted([row['character_1'], row['character_2']])
    by_book_annotations[row['book_name']][char_1 + ':' + char_2].append(row['num_affinity'])

# Plain iteration instead of DataFrame.apply: apply is meant for functions without side
# effects, and older pandas releases may evaluate the first row twice, which would record
# that annotation twice.
for _, annotation_row in annotations.iterrows():
    add_books_annotations(annotation_row)

# Collect every row first and build the frame once; growing a DataFrame with append()
# inside the loop copies it on each iteration, and append() is gone in pandas 2.x.
pair_records = []
for book, pairs in by_book_annotations.items():
    for pair, affinities in pairs.items():
        # Keys were built as '<char_1>:<char_2>'; a name that itself contains ':' makes
        # this unpacking raise rather than silently produce wrong names.
        char_1, char_2 = pair.split(':')
        pair_records.append({
            'book_name': book, 'char_1': char_1, 'char_2': char_2, 'affinity': avg(affinities)
        })
all_df = pd.DataFrame(pair_records, columns=['book_name', 'char_1', 'char_2', 'affinity'])

all_y = all_df['affinity'].copy()
all_X = all_df.drop('affinity', axis=1)

In [506]:
# One (title, author) pair per distinct title, in order of first appearance; the author is
# the one on the first annotation row of that title.
titles = annotations['title'].unique()
authors = annotations.drop_duplicates(subset='title')['author'].tolist()
existing_files = []
names = []
for title, author in zip(titles, authors):
    # "<title>_<author>" with whitespace replaced by "_". Raw string so that \s reaches the
    # regex engine as a character class instead of being an invalid string escape.
    name = re.sub(r"\s", "_", '{} {}'.format(title, author))
    names.append(name)
    file = './books/{}.txt'.format(name)
    # Remember, per book, whether its source text is available on disk.
    existing_files.append(os.path.isfile(file))

In [507]:
titles # distinct titles of all annotated books

array(['Don Quixote', 'Little Women', 'Antony and Cleopatra',
       'Sense and Sensibility', 'Henry VIII', 'Richard II', 'Lord Jim',
       'The Count of Monte Cristo', 'The House of the Seven Gables',
       'Typee', 'Timon of Athens', 'The Three Musketeers', 'Emma',
       'Persuasion', 'Oliver Twist', 'The Adventures of Tom Sawyer',
       'A Portrait of the Artist as a Young Man', 'Troilus and Cressida',
       'Hard Times', 'The Republic', 'Uncle Tom&rsquo;s Cabin',
       'Silas Marner', 'King John', 'The American',
       'The Merry Wives of Windsor', "Swann's Way", 'Anna Karenina',
       'The Comedy of Errors', 'Henry IV Part 2', 'The Aeneid',
       'Henry VI Part 3', 'Ulysses', 'The Iliad', 'Paradise Lost',
       'The Merchant of Venice', 'Adam Bede', 'Romeo and Juliet',
       'O Pioneers!', 'Narrative of the Life of Frederick Douglass',
       'Bleak House', 'Henry VI Part 2', 'Madame Bovary', 'War and Peace',
       'Middlemarch', 'Pride and Prejudice', 'Candide', 'The 

In [508]:
# Number of books, and whether './books/<name>.txt' was found for every one of them
# (True => the source text is available for each book in the dataset).
len(existing_files), all(existing_files)

(109, True)

In [509]:
# One baseline_utils.Book per "<title>_<author>" name. The cells below read its `name`,
# `tokens` and `characters.meaningful` attributes (presumably loaded from preprocessed
# files by baseline_utils — confirm there).
books = [baseline_utils.Book(name) for name in names]

In [510]:
# Sanity check: number of "meaningful" characters available for each book.
for book in books:
    print(book.name, len(book.characters.meaningful))

Don_Quixote_Miguel_de_Cervantes 238
Little_Women_Louisa_May_Alcott 136
Antony_and_Cleopatra_William_Shakespeare 36
Sense_and_Sensibility_Jane_Austen 61
Henry_VIII_William_Shakespeare 31
Richard_II_William_Shakespeare 24
Lord_Jim_Joseph_Conrad 45
The_Count_of_Monte_Cristo_Alexandre_Dumas 217
The_House_of_the_Seven_Gables_Nathaniel_Hawthorne 32
Typee_Herman_Melville 51
Timon_of_Athens_William_Shakespeare 17
The_Three_Musketeers_Alexandre_Dumas 117
Emma_Jane_Austen 81
Persuasion_Jane_Austen 54
Oliver_Twist_Charles_Dickens 65
The_Adventures_of_Tom_Sawyer_Mark_Twain 39
A_Portrait_of_the_Artist_as_a_Young_Man_James_Joyce 102
Troilus_and_Cressida_William_Shakespeare 28
Hard_Times_Charles_Dickens 45
The_Republic_Plato 33
Uncle_Tom&rsquo;s_Cabin_Harriet_Beecher_Stowe 107
Silas_Marner_George_Eliot 48
King_John_William_Shakespeare 23
The_American_Henry_James 53
The_Merry_Wives_of_Windsor_William_Shakespeare 27
Swann's_Way_Marcel_Proust 98
Anna_Karenina_Leo_Tolstoy 195
The_Comedy_of_Errors_William

In [577]:
def longest_name(character):
    """Return the longest alias stored in character['names'] (each entry's 'n' value).

    Among aliases of equal length the one listed last wins; an empty alias list raises
    IndexError.
    """
    aliases = [entry['n'] for entry in character['names']]
    # list.sort is stable, so equally long aliases keep their listed order.
    aliases.sort(key=len)
    return aliases.pop()
# Longest alias of the first ten meaningful characters of the first book.
[longest_name(char) for char in books[0].characters.meaningful[0:10]]

['Pope',
 'Don Jeronimo',
 'Don Juan',
 'Alfonso VI',
 'Quiteria',
 'Trifaldin',
 'CANON',
 'Helen',
 'Dona Christina',
 'Gil Polo']

In [514]:
# Peek at rows 1000-1049 of the first book's token table.
books[0].tokens[1000:1050]

Skipping line 105415: field larger than field limit (131072)
Skipping line 107383: field larger than field limit (131072)


Unnamed: 0,paragraphId,sentenceID,tokenId,beginOffset,endOffset,whitespaceAfter,headTokenId,originalWord,normalizedWord,lemma,pos,ner,deprel,inQuotation,characterId
1000,56,1,1012,5850,5859,NN,1010,INCIDENTS,INCIDENTS,incident,NNS,O,pobj,False,-1
1001,57,1,1013,5861,5868,S,1014,CHAPTER,CHAPTER,CHAPTER,NNP,O,nn,False,-1
1002,57,1,1014,5869,5871,N,1012,LI,LI,LI,NNP,O,dep,False,-1
1003,57,1,1015,5872,5877,S,1016,WHICH,WHICH,which,WDT,O,det,False,-1
1004,57,1,1016,5878,5883,S,1014,DEALS,DEALS,deal,NNS,O,dep,False,-1
1005,57,1,1017,5884,5888,S,1021,WITH,WITH,with,IN,O,rel,False,-1
1006,57,1,1018,5889,5893,S,1017,WHAT,WHAT,what,WDT,O,pobj,False,-1
1007,57,1,1019,5894,5897,S,1020,THE,THE,the,DT,O,det,False,-1
1008,57,1,1020,5898,5906,S,1021,GOATHERD,GOATHERD,goatherd,NN,O,nsubj,False,-1
1009,57,1,1021,5907,5911,S,1016,TOLD,TOLD,tell,VBD,O,rcmod,False,-1


In [515]:
# Example of a single character record: 'id', alias list under 'names', and more.
books[0].characters.meaningful[13]

{'NNPcount': 10,
 'agent': [{'i': 8136, 'w': 'knew'},
  {'i': 8213, 'w': 'tried'},
  {'i': 8306, 'w': 'sent'},
  {'i': 8366, 'w': 'demanded'},
  {'i': 8419, 'w': 'agreed'}],
 'g': 0,
 'id': 15,
 'mod': [],
 'names': [{'c': 10, 'n': 'Dey'}],
 'patient': [],
 'poss': [],
 'speaking': []}

In [540]:
def book_name_to_X_name(book_name, book_char, all_X):
    """Map a character found in a book onto the name used for it in the annotated pairs.

    book_name: book key, compared against all_X['book_name'].
    book_char: character record whose 'names' entry is a list of {'n': alias, ...} dicts.
    all_X: frame with 'book_name', 'char_1' and 'char_2' columns.

    Returns an annotated name when one matches: first by exact equality with an alias,
    otherwise by substring containment in either direction. When nothing matches, returns
    "_" followed by the character's longest alias.

    The choice among several candidates is deterministic. It used to depend on set
    iteration order, which for strings can change from one interpreter run to the next.
    """
    def by_length(text):
        # Longer names are treated as more specific; the text itself breaks ties.
        return (len(text), text)

    subset = all_X[all_X['book_name'] == book_name]
    present_chars = pd.concat([subset['char_1'], subset['char_2']]).unique()
    aliases = {alias['n'] for alias in book_char['names']}
    exact_matches = set(present_chars) & aliases
    name = None
    if exact_matches:
        if len(exact_matches) > 1:
            print("WARNING: {} might have multiple aliases: {}".format(longest_name(book_char), sorted(exact_matches)))
        name = max(exact_matches, key=by_length)
    else:
        # Aliases are tried from shortest to longest and a later match replaces an earlier
        # one, so the match found for the longest alias wins.
        for alias in sorted(aliases, key=by_length):
            for present_char in present_chars:
                if alias in present_char or present_char in alias:
                    name = present_char
    return name or "_" + longest_name(book_char)

In [541]:
def extract_relations_from(book, paragraphIds, meaningful_ids):
    """Collect, for every pair of meaningful characters, the paragraphs mentioning both.

    book: object whose `tokens` frame has 'paragraphId' and 'characterId' columns.
    paragraphIds: paragraph ids to scan; results are recorded in this order.
    meaningful_ids: character ids to consider; tokens with any other characterId are ignored.

    Returns a defaultdict(list) mapping '<smaller id><=><larger id>' to the ids of the
    paragraphs in which both characters occur.
    """
    relations = col.defaultdict(list)

    # Single pass over the character tokens, instead of filtering the whole token table
    # once per paragraph.
    tokens = book.tokens
    character_tokens = tokens[tokens['characterId'].isin(meaningful_ids)]
    chars_by_paragraph = {}
    for paragraph, char_id in zip(character_tokens['paragraphId'], character_tokens['characterId']):
        chars_by_paragraph.setdefault(paragraph, set()).add(char_id)

    for paragraphId in paragraphIds:
        present = chars_by_paragraph.get(paragraphId)
        if present is None or len(present) < 2:
            continue
        sorted_chars = sorted(present)
        # Every unordered pair once, smaller id first.
        for position, char1 in enumerate(sorted_chars):
            for char2 in sorted_chars[position + 1:]:
                key = str(char1) + '<=>' + str(char2)
                relations[key].append(paragraphId)
    return relations

def senti_pos(pos):
    """Translate a part-of-speech tag into the matching WordNet constant.

    'VERB', 'NOUN', 'ADJ' and 'ADV' map to wn.VERB, wn.NOUN, wn.ADJ and wn.ADV; any
    other value maps to None.
    """
    if pos in ('VERB', 'NOUN', 'ADJ', 'ADV'):
        # The WordNet constant has the same attribute name as the tag.
        return getattr(wn, pos)
    return None

def get_avg_sent(word, pos = None):
    """Average SentiWordNet polarity of `word`.

    Looks at no more than the first five senti-synsets returned for `word` (restricted to
    the WordNet part of speech that senti_pos derives from `pos`) and returns
    (sum of positive scores - sum of negative scores) / number of synsets used.
    Returns 0 when no synset is found.
    """
    senses = list(swn.senti_synsets(word, senti_pos(pos)))[:5]
    if not senses:
        return 0
    positive = sum(sense.pos_score() for sense in senses)
    negative = sum(sense.neg_score() for sense in senses)
    return (positive - negative) / len(senses)


def get_relation(book, paragraphIds):
    """Mean sentiment of the text in the given paragraphs.

    book: object whose `tokens` frame has 'paragraphId' and 'lemma' columns.
    paragraphIds: ids of the paragraphs to include.

    The lemmas of the selected paragraphs (missing ones become empty strings) are joined
    with spaces, re-tokenized by the spaCy pipeline `nlp`, and every resulting token is
    scored with get_avg_sent using its spaCy POS tag. Returns the mean token score, or 0
    when the text yields no tokens at all (previously a ZeroDivisionError).
    """
    in_paragraphs = book.tokens['paragraphId'].isin(paragraphIds)
    lemmas = book.tokens[in_paragraphs]['lemma'].fillna('')
    doc = nlp(' '.join(lemmas))
    tks = list(doc)
    if not tks:
        return 0
    sent = sum(get_avg_sent(str(token), token.pos_) for token in tks)
    return sent / len(tks)

def analyze_book(book, min_paragraphs=6, scale=50, offset=0.5):
    """Score the relation between every frequently co-occurring character pair of a book.

    book: object exposing `name`, `tokens` (frame with a 'paragraphId' column) and
        `characters.meaningful` (list of character records with an 'id' entry).
    min_paragraphs: pairs sharing fewer paragraphs than this are dropped.
    scale, offset: the affinity is get_relation(...) * scale + offset.

    Returns (results, map_id_to_char):
        results: frame with columns 'bookname', 'char_1', 'char_2' (character ids) and
            'affinity', ordered from the most to the least frequently co-occurring pair.
        map_id_to_char: character id -> character record.
    """
    columns = ['bookname', 'char_1', 'char_2', 'affinity']
    meaningful = book.characters.meaningful
    meaningful_ids = [char['id'] for char in meaningful]
    paragraphIds = book.tokens['paragraphId'].unique()

    # If an id occurs more than once, the first record wins.
    map_id_to_char = {}
    for char in meaningful:
        map_id_to_char.setdefault(char['id'], char)

    relations = extract_relations_from(book, paragraphIds, meaningful_ids)
    frequent = {key: parIds for key, parIds in relations.items() if len(parIds) >= min_paragraphs}

    row_frames = []
    for key, parIds in sorted(frequent.items(), key=lambda pair: -len(pair[1])):
        id1, id2 = [int(s) for s in key.split("<=>")]
        relation = get_relation(book, parIds) * scale + offset
        row_frames.append(pd.DataFrame([{
            'bookname': book.name, 'char_1': id1, 'char_2': id2, 'affinity': relation
        }], columns=columns))

    if not row_frames:
        return pd.DataFrame(columns=columns), map_id_to_char
    # One concat instead of growing the frame with append() per row (quadratic, and
    # removed in pandas 2.x). Each one-row frame carries index label 0, so the result has
    # the same repeated index as the row-by-row append it replaces; callers that select
    # rows by label keep working.
    results = pd.concat(row_frames)
    return results, map_id_to_char

In [542]:
def predict(pr, Xs, default=0.5):
    """Look up the predicted affinity for every annotated character pair.

    pr: predictor frame with 'bookname', 'char_1', 'char_2' and 'affinity' columns.
    Xs: frame with 'book_name', 'char_1' and 'char_2' columns.
    default: value used for pairs that have no row in `pr`.

    A pair matches in either orientation; when several predictor rows match, the first
    one in `pr` order is used. Returns the result of Xs.apply(..., axis=1), one value
    per row of Xs.

    The match no longer depends on the index labels of `pr`. The previous label-based
    `[0]` lookup inside a bare `except` turned every match on a row whose index label
    was not 0 into the default, and hid any other error as well.
    """
    lookup = {}
    for book, char_1, char_2, affinity in zip(pr['bookname'], pr['char_1'], pr['char_2'], pr['affinity']):
        # setdefault keeps the first predictor row seen for a pair.
        lookup.setdefault((book, char_1, char_2), affinity)
        lookup.setdefault((book, char_2, char_1), affinity)

    def get_val(row):
        return lookup.get((row['book_name'], row['char_1'], row['char_2']), default)
    return Xs.apply(get_val, axis=1)
def create_predictor_for(books):
    """Analyze every book and gather the pair affinities into one predictor frame.

    books: iterable of book objects accepted by analyze_book.

    Returns a frame with columns 'bookname', 'char_1', 'char_2' and 'affinity', where the
    character ids produced by analyze_book have been replaced by names via
    book_name_to_X_name (which reads the module-level `all_X`).
    """
    per_book_results = []
    for book in tqdm(books, desc='Looping over books'):
        single_book_res, map_id_to_char = analyze_book(book)
        for column in ('char_1', 'char_2'):
            single_book_res[column] = single_book_res[column].apply(
                lambda char_id: book_name_to_X_name(book.name, map_id_to_char[char_id], all_X))
        per_book_results.append(single_book_res)

    if not per_book_results:
        return pd.DataFrame(columns=['bookname', 'char_1', 'char_2', 'affinity'])
    # Concatenate once instead of append() per book (quadratic, and removed in pandas
    # 2.x). The per-book index labels are kept unchanged, exactly as append() kept them.
    return pd.concat(per_book_results)

In [543]:
# Run the analysis over every book. This is the expensive step: the run recorded in the
# output below took about 48 minutes for the 109 books.
predictor = create_predictor_for(books)


Looping over books:   0%|          | 0/109 [00:00<?, ?it/s][A
Looping over books:   4%|▎         | 4/109 [02:59<1:20:22, 45.93s/it]



Looping over books:   6%|▌         | 6/109 [03:24<49:54, 29.07s/it]  Skipping line 39112: field larger than field limit (131072)
Looping over books:   6%|▋         | 7/109 [03:53<49:18, 29.01s/it]Skipping line 46957: field larger than field limit (131072)
Looping over books:  10%|█         | 11/109 [05:04<26:10, 16.03s/it]Skipping line 4798: field larger than field limit (131072)
Looping over books:  11%|█         | 12/109 [05:20<25:54, 16.02s/it]Skipping line 2497: field larger than field limit (131072)
Skipping line 81788: field larger than field limit (131072)
Skipping line 92344: unexpected end of data
Looping over books:  12%|█▏        | 13/109 [06:26<49:28, 30.92s/it]Skipping line 2842: field larger than field limit (131072)
Looping over books:  14%|█▍        | 15/109 [07:28<48:11, 30.76s/it]Skipping line 44550: unexpected end of data
Looping over books:  17%|█▋        | 19/109 [08:56<41:23, 27.59s/it]Skipping line 14804: field larger than field limit (131072)
Looping over books:



Looping over books:  32%|███▏      | 35/109 [13:13<24:19, 19.73s/it]Skipping line 34053: field larger than field limit (131072)
Skipping line 75137: field larger than field limit (131072)
Looping over books:  35%|███▍      | 38/109 [14:21<23:09, 19.57s/it]Skipping line 44495: unexpected end of data
Looping over books:  38%|███▊      | 41/109 [14:54<15:26, 13.62s/it]Skipping line 65566: unexpected end of data
Looping over books:  39%|███▊      | 42/109 [15:02<13:26, 12.04s/it]Skipping line 284551: field larger than field limit (131072)
Looping over books:  39%|███▉      | 43/109 [16:18<34:09, 31.05s/it]Skipping line 27375: field larger than field limit (131072)
Skipping line 182087: unexpected end of data
Looping over books:  41%|████▏     | 45/109 [18:00<43:30, 40.79s/it]Skipping line 24012: unexpected end of data
Looping over books:  42%|████▏     | 46/109 [18:01<30:19, 28.88s/it]Skipping line 82917: field larger than field limit (131072)
Skipping line 129427: unexpected end of data
L



Looping over books:  51%|█████▏    | 56/109 [19:44<06:50,  7.75s/it]



Skipping line 1810: field larger than field limit (131072)
Looping over books:  52%|█████▏    | 57/109 [19:46<05:15,  6.06s/it]Skipping line 58447: unexpected end of data
Looping over books:  54%|█████▍    | 59/109 [19:56<04:28,  5.37s/it]Skipping line 12426: field larger than field limit (131072)
Looping over books:  55%|█████▌    | 60/109 [20:00<03:52,  4.75s/it]Skipping line 99714: field larger than field limit (131072)
Looping over books:  56%|█████▌    | 61/109 [20:22<07:53,  9.87s/it]Skipping line 76720: field larger than field limit (131072)
Skipping line 80034: field larger than field limit (131072)
Looping over books:  58%|█████▊    | 63/109 [20:53<08:56, 11.67s/it]Skipping line 63897: unexpected end of data
Looping over books:  63%|██████▎   | 69/109 [21:31<03:44,  5.61s/it]Skipping line 66943: field larger than field limit (131072)
Skipping line 68239: field larger than field limit (131072)
Skipping line 84704: unexpected end of data
Looping over books:  65%|██████▌   | 71/1



Skipping line 53969: field larger than field limit (131072)
Skipping line 67972: unexpected end of data
Looping over books:  67%|██████▋   | 73/109 [24:17<13:23, 22.33s/it]Skipping line 111575: unexpected end of data
Looping over books:  70%|██████▉   | 76/109 [24:55<07:56, 14.44s/it]Skipping line 28824: field larger than field limit (131072)
Looping over books:  72%|███████▏  | 78/109 [25:56<12:50, 24.84s/it]Skipping line 33035: unexpected end of data
Looping over books:  76%|███████▌  | 83/109 [43:08<1:08:15, 157.51s/it]Skipping line 12473: field larger than field limit (131072)
Skipping line 128921: field larger than field limit (131072)
Skipping line 155824: unexpected end of data
Looping over books:  77%|███████▋  | 84/109 [45:40<1:05:00, 156.03s/it]Skipping line 103611: unexpected end of data
Looping over books:  78%|███████▊  | 85/109 [46:01<46:09, 115.39s/it]  Skipping line 13516: field larger than field limit (131072)
Looping over books:  81%|████████  | 88/109 [46:23<15:24, 4



Looping over books:  95%|█████████▌| 104/109 [48:18<00:41,  8.21s/it]Skipping line 10692: field larger than field limit (131072)
Looping over books:  96%|█████████▋| 105/109 [48:22<00:27,  6.92s/it]Skipping line 50498: unexpected end of data
Looping over books:  99%|█████████▉| 108/109 [48:30<00:03,  3.95s/it]Skipping line 83431: unexpected end of data
Looping over books: 100%|██████████| 109/109 [48:41<00:00,  6.07s/it]


In [545]:
# Predicted affinity for every annotated pair (0.5 when the predictor has no row for it).
y_predicted = predict(predictor, all_X)

In [549]:
# Clamp the predictions to [0, 1], the range of the numeric affinity encoding
# (modifies y_predicted in place).
y_predicted.clip(0, 1, inplace=True)

In [551]:
# Mean squared error between the averaged annotations and the predictions.
metrics.mean_squared_error(y_true=all_y, y_pred=y_predicted)

0.19005082472622473

In [565]:
def score_to_label(val):
    """Bucket a numeric affinity score into 'negative', 'neutral' or 'positive'.

    Scores up to and including 0.33 are 'negative', scores up to and including 0.66 are
    'neutral', and everything else is 'positive'.
    """
    for upper_bound, label in ((0.33, 'negative'), (0.66, 'neutral')):
        if val <= upper_bound:
            return label
    return 'positive'

In [572]:
# Per-class precision / recall / F1 after bucketing both the averaged annotations
# (first argument) and the predictions (second argument) into labels.
print(metrics.classification_report(all_y.map(score_to_label), y_predicted.map(score_to_label)))

             precision    recall  f1-score   support

   negative       0.34      0.11      0.17       397
    neutral       0.22      0.87      0.36       313
   positive       0.61      0.09      0.16       759

avg / total       0.46      0.26      0.20      1469



In [576]:
# Confusion matrix over the same labels: rows follow the first argument (annotations),
# columns the second (predictions), both in the order negative, neutral, positive.
print(metrics.confusion_matrix(all_y.map(score_to_label), y_predicted.map(score_to_label), labels=('negative', 'neutral', 'positive')))

[[ 44 325  28]
 [ 24 273  16]
 [ 62 627  70]]
