### CODE FOR DIALOGUE ATTRIBUTION ALGORITHM

In [1]:
# These are the versions of numpy and pandas that the code below is running on
# If any functions don't work, please uninstall and install the correct version

import pandas as pd
import numpy as np

# Show the numpy version first and the pandas version second, one per line
print(np.__version__, pd.__version__, sep='\n')

1.26.4
2.2.2


In [3]:
# Loading the annotated novel database (the annotations will be later removed)

def _quoted_chunks(text, unwanted, keep_last_char=True):
    """Collect the text that follows each single quote in *text*.

    For every ``'`` in *text*, the characters up to (not including) the next
    ``'`` form one chunk.  When there is no further quote the chunk runs to
    the end of *text*, or stops one character early when *keep_last_char* is
    false.  Chunks equal to an entry of *unwanted* are dropped.

    Non-string input (for example a NaN cell) yields an empty list.
    """
    if not isinstance(text, str):
        return []

    limit = len(text) if keep_last_char else len(text) - 1
    chunks = []

    for start, char in enumerate(text):
        if char != "'":
            continue
        stop = text.find("'", start + 1)
        if stop == -1:
            stop = limit
        # slicing never raises, even when the quote is the final character
        chunks.append(text[start + 1:stop])

    # filter into a new list rather than removing while iterating,
    # which skips elements and can leave unwanted chunks behind
    return [chunk for chunk in chunks if chunk not in unwanted]


def book_read(book_name):
    """Load one annotated novel together with its character table.

    Reads ``annotated_books/<book_name>annotated.csv`` and
    ``pdnc_dataset/data/<book_name>/character_info.csv``.

    For every row whose ``dialogue`` is not ``'NO QUOTE'`` the ``addressee``
    cell is replaced by a list of the single-quoted names found in it
    (presumably the cell is a printed list such as "['A', 'B']" - verify
    against the annotated CSVs).

    Returns:
        tuple: ``(book, Characters, Aliases, Characters_genders)`` where
        ``book`` is the DataFrame, ``Characters`` the ``Main Name`` column as
        a list, ``Aliases`` one list of alias strings per character row and
        ``Characters_genders`` the ``Gender`` column as a list.
    """
    book = pd.read_csv('annotated_books/' + book_name + 'annotated.csv')

    # make addressees lists, needed to deal with multiple addressees;
    # the column must be object dtype so a cell can hold a Python list
    book['addressee'] = book['addressee'].astype(object)

    for row in book.index:
        if book.at[row, 'dialogue'] != 'NO QUOTE':
            # the chunk after a closing quote is either the ', ' separator
            # or the trailing ']' - neither is a name
            book.at[row, 'addressee'] = _quoted_chunks(
                book.at[row, 'addressee'], (', ', ']'))

    # getting all character and alias names
    character_info = pd.read_csv(
        'pdnc_dataset/data/' + book_name + '/character_info.csv')

    Characters = list(character_info['Main Name'])
    Characters_genders = list(character_info['Gender'])

    # the final character of the cell (the closing bracket) is never part
    # of an alias, hence keep_last_char=False; separators and empty chunks
    # are discarded
    Aliases = [
        _quoted_chunks(raw, (', ', ''), keep_last_char=False)
        for raw in character_info['Aliases']
    ]

    return (book, Characters, Aliases, Characters_genders)


In [5]:
# Populating the subject column, needed for identifying anaphoric and implicit speakers

def subject_population(book, Characters, Aliases):
    """Fill the ``subject`` cell of every ``'NO QUOTE'`` row, in place.

    The row's ``append`` text is searched for every alias.  Each hit
    contributes the owning character's main name, ordered by the position of
    the alias's first occurrence in the text.  A character found through
    several aliases is kept once, at the position of its latest hit, so the
    end of the list holds the character mentioned last.

    Args:
        book: DataFrame with ``dialogue``, ``append`` and ``subject`` columns;
            rows are addressed by position, so a 0..n-1 index is assumed.
        Characters: main character names.
        Aliases: one list of alias strings per entry of ``Characters``.

    Returns:
        None. ``book`` is modified in place.
    """
    for row in range(len(book)):

        if book.at[row, 'dialogue'] != 'NO QUOTE':
            continue

        # get the text in the appends; a missing (non-string) cell is
        # treated as empty text instead of raising on the `in` test
        text = book.at[row, 'append']
        if not isinstance(text, str):
            text = ''

        # (position, main name) for every alias found in the text
        mentions = []
        for name, aliases in zip(Characters, Aliases):
            for alias in aliases:
                if not alias:
                    continue  # '' is "found" in every string - not a mention
                position = text.find(alias)
                if position != -1:
                    mentions.append((position, name))

        mentions.sort()  # order of appearance in the text

        # Keep only the last hit of each character.  Walking backwards and
        # remembering what was seen avoids removing from a list while
        # iterating over it, which could leave duplicates behind.
        seen = set()
        subject_final = []
        for _, name in reversed(mentions):
            if name not in seen:
                seen.add(name)
                subject_final.append(name)
        subject_final.reverse()

        # populating subject column
        book.at[row, 'subject'] = subject_final



In [7]:
# Explicit Dialogue Function 


def _match_character(expression, Characters, clean_aliases):
    """Return the main name whose alias best matches *expression*, or None.

    A character matches when one of its aliases occurs in *expression*
    (case-insensitive).  When a second character matches, it replaces the
    current candidate unless one of its aliases is contained in an alias of
    the candidate - in that case it is the weaker, shorter match.
    """
    lowered = expression.lower()
    candidate = None

    for i, (_, aliases) in enumerate(zip(Characters, clean_aliases)):
        for alias in aliases:
            if alias.lower() not in lowered:
                continue

            if candidate is None:
                candidate = i
                continue

            # Compare dot-stripped aliases on both sides; mixing the raw
            # and the stripped forms makes 'Mr. X' miss 'Mr X Sr'.
            contained = any(
                own in other
                for own in aliases
                for other in clean_aliases[candidate]
            )
            if not contained:
                candidate = i

    return None if candidate is None else Characters[candidate]


def explicit_speaker(index, book, Characters, Aliases):
    """Set ``speaker`` of row *index* from its ``referringExpression``.

    Dots are stripped from every alias before matching (the aliases contain
    'Mr.' / 'Mrs.' where the text has 'Mr' / 'Mrs').  If a character is
    matched, its main name is written to ``book.at[index, 'speaker']``; if
    nothing matches the cell is left untouched.  When the referring
    expression is not a string (e.g. a missing value) the speaker is set to
    an empty list.

    Args:
        index: row label of the quote to attribute.
        book: DataFrame with ``referringExpression`` and ``speaker`` columns.
        Characters: main character names.
        Aliases: one list of alias strings per entry of ``Characters``.

    Returns:
        None. ``book`` is modified in place.
    """
    # there is a slight issue with Aliases where (Mr and Mrs appears instead of Mr. and Mrs.)
    new_Aliases = [
        [alias.replace('.', '') for alias in aliases] for aliases in Aliases
    ]

    expression = book.at[index, 'referringExpression']

    # some referringExpression cells are not usable strings
    if not isinstance(expression, str):
        book.at[index, 'speaker'] = []
        return

    speaker = _match_character(expression, Characters, new_Aliases)

    if speaker is not None:
        book.at[index, 'speaker'] = speaker


In [9]:
# Anaphoric Dialogue Function 

# Needs input for a gender identified in the anaphoric dialogue's appends, 'M' or 'F', and the index of the row


def anaphoric_speaker(gender, i, book, Characters, Aliases, Characters_genders,
                      max_lookback=5):
    """Attribute row *i* to the nearest earlier subject of the given gender.

    The ``'NO QUOTE'`` rows before row *i* are visited from the nearest one
    backwards, at most *max_lookback* of them.  Within a row the ``subject``
    list is read from its end to its start.  The first subject that equals
    an alias of a character whose gender equals *gender* becomes
    ``book.at[i, 'speaker']``.  If nothing matches, the cell is left as is.

    The ``subject`` lists are only read; they are not consumed, so later
    look-ups on the same paragraph still see all subjects.

    Args:
        gender: gender code to match against ``Characters_genders``
            (the caller passes 'M' or 'F').
        i: row label of the quote to attribute.
        book: DataFrame with ``dialogue``, ``subject`` and ``speaker``.
        Characters: main character names.
        Aliases: one list of alias strings per entry of ``Characters``.
        Characters_genders: one gender code per entry of ``Characters``.
        max_lookback: number of preceding narration rows to inspect.

    Returns:
        None. ``book`` is modified in place.
    """
    if max_lookback <= 0:
        return

    # all monologue rows prior to the current row
    earlier_monologue = (
        (book['dialogue'] == 'NO QUOTE').to_numpy() & (book.index < i)
    )
    candidate_rows = list(book.index[earlier_monologue][-max_lookback:])

    for row in reversed(candidate_rows):

        subjects = book.at[row, 'subject']
        if not isinstance(subjects, list):
            continue  # nothing usable recorded for this paragraph

        # last-mentioned subject first; iterate instead of pop() so the
        # list stored in the DataFrame is not emptied as a side effect
        for name in reversed(subjects):
            for main_name, aliases, char_gender in zip(
                    Characters, Aliases, Characters_genders):
                if name in aliases and char_gender == gender:
                    book.at[i, 'speaker'] = main_name
                    return

    


In [11]:
# Implicit Speaker search 

# Requires the index of the row to search for implicit dialogue

# Considers three subcases

# - Row directly prior is monologue -> Takes the last-mentioned character from the monologue paragraph
# - Row is second from monologue
# -1 Looks for vocative in previous row
# -2 Looks for character in previous monologue

def implicit_speaker(index, book, Characters, Aliases):
    """Attribute an implicit quote at row *index*, in place.

    Two situations are handled:

    * The previous row is narration (``'NO QUOTE'``): the last entry of its
      ``subject`` list becomes the speaker.  Only when that list is empty
      does the speaker fall back to the speaker of row ``index - 2``,
      provided that row is a quote.
    * The row before the previous one is narration: if the previous quote
      contains an alias directly after ``', '`` or directly before ``', '``
      (a vocative), that character becomes the speaker.  Otherwise the
      speaker is the first entry of the narration's ``subject`` list that
      differs from the previous row's speaker.

    In every other situation the row is left untouched.  The ``subject``
    lists are only read, never consumed.

    Args:
        index: row label (integer position) of the quote to attribute.
        book: DataFrame with ``dialogue``, ``subject`` and ``speaker``.
        Characters: main character names.
        Aliases: one list of alias strings per entry of ``Characters``.

    Returns:
        None. ``book`` is modified in place.
    """
    if index < 1:
        return  # there is no previous row to look at

    # --- directly following a monologue paragraph -------------------------
    if book.at[index - 1, 'dialogue'] == 'NO QUOTE':

        subjects = book.at[index - 1, 'subject']

        if isinstance(subjects, list) and subjects:
            # last-mentioned character; read without pop() so the cell
            # keeps its content and the fallback below is not triggered
            # for a paragraph that had exactly one subject
            book.at[index, 'speaker'] = subjects[-1]

        elif index >= 2 and book.at[index - 2, 'dialogue'] != 'NO QUOTE':
            # no subject available: continue with the speaker from before
            # the narration (row index-2 exists as soon as index >= 2)
            book.at[index, 'speaker'] = book.at[index - 2, 'speaker']

        return

    # --- second paragraph after a monologue -------------------------------
    if index < 2 or book.at[index - 2, 'dialogue'] != 'NO QUOTE':
        return

    # Look for vocative in previous row
    prev_dialogue = book.at[index - 1, 'dialogue']
    vocative = None

    if isinstance(prev_dialogue, str):
        for name, aliases in zip(Characters, Aliases):
            for alias in aliases:
                # has to have a comma prior or post word to be vocative;
                # a later match overrides an earlier one
                if (', ' + alias) in prev_dialogue or (alias + ', ') in prev_dialogue:
                    vocative = name

    if vocative is not None:
        book.at[index, 'speaker'] = vocative
        return

    # Else, take a speaker from the previous monologue, but only if it is
    # not the previous speaker.
    # NOTE(review): the earliest-mentioned eligible subject wins here, while
    # the first case prefers the last-mentioned one - confirm this is intended.
    subjects = book.at[index - 2, 'subject']
    if not isinstance(subjects, list):
        return

    previous_speaker = book.at[index - 1, 'speaker']
    for name in subjects:
        if name != previous_speaker:
            book.at[index, 'speaker'] = name
            break
    
            

In [13]:
# Final Speaker attribution framework function

# Strategy:

# 1 - Find A monologue paragraph
# 2 - Identify two of the following paragraphs based on their classification
# 3 - Identify all subsequent rows based on looking at prior 2 speakers

def speaker_attribution(book, Characters, Characters_genders, Aliases):
    """Fill the ``speaker`` column of *book* in place.

    The frame is scanned for ``'NO QUOTE'`` (narration) rows.  For each one:

    1. The two rows that follow are attributed according to their
       ``syntactic`` value: ``'Implicit'`` -> ``implicit_speaker``,
       ``'Anaphoric'`` -> ``anaphoric_speaker`` with 'M' when the referring
       expression contains ``' he '`` and 'F' when it contains ``' she '``
       (case-insensitive), ``'Explicit'`` -> ``explicit_speaker``.
    2. Every further quote row up to the next narration row (the last row
       of the frame is not visited) is attributed with ``explicit_speaker``
       when it is ``'Explicit'``, otherwise it receives the speaker of the
       row two positions earlier (alternating speakers).

    Rows are addressed by position, so a 0..n-1 index is assumed.

    Args:
        book: DataFrame with ``dialogue``, ``syntactic``,
            ``referringExpression``, ``subject`` and ``speaker`` columns.
        Characters: main character names.
        Characters_genders: one gender code per entry of ``Characters``.
        Aliases: one list of alias strings per entry of ``Characters``.

    Returns:
        None. ``book`` is modified in place.
    """
    total_rows = len(book.index)
    index = 0

    while index < total_rows - 2:

        if book.at[index, 'dialogue'] != 'NO QUOTE':
            # not a monologue paragraph, keep scanning
            index += 1
            continue

        count = index + 1

        # the 2 paragraphs found directly post monologue
        for _ in range(2):

            syntactic = book.at[count, 'syntactic']

            if syntactic == 'Implicit':
                implicit_speaker(count, book, Characters, Aliases)

            elif syntactic == 'Anaphoric':
                try:
                    # lower-case the expression itself; calling .lower()
                    # on the result of `in` always raised, so this branch
                    # never attributed anything
                    expression = book.at[count, 'referringExpression'].lower()

                    if ' he ' in expression:  # Male anaphoric speaker
                        anaphoric_speaker('M', count, book, Characters,
                                          Aliases, Characters_genders)
                    elif ' she ' in expression:  # Female anaphoric speaker
                        anaphoric_speaker('F', count, book, Characters,
                                          Aliases, Characters_genders)

                except (AttributeError, TypeError, KeyError, IndexError,
                        ValueError):
                    # unusable cell (e.g. missing referring expression or
                    # subject): mark the row as unattributed, best effort
                    book.at[count, 'speaker'] = []

            elif syntactic == 'Explicit':
                explicit_speaker(count, book, Characters, Aliases)

            count += 1

        # Rest of the passage: explicit rows are matched by name, all other
        # rows simply retrieve the speaker from 2 steps prior
        while count < total_rows - 1 and book.at[count, 'dialogue'] != 'NO QUOTE':

            if book.at[count, 'syntactic'] == 'Explicit':
                explicit_speaker(count, book, Characters, Aliases)
            else:
                book.at[count, 'speaker'] = book.at[count - 2, 'speaker']

            count += 1

        # continue the scan where this passage ended
        index = count




In [15]:
# Final Dialogue Attribution function. Start with speaker attribution, then simply search for 
# passages of dialogue, and then make addressees the previous speaker

def dialogue_attribution(book, Characters, Characters_genders, Aliases):
    """Attribute speakers, then addressees, and return the frame.

    ``speaker_attribution`` is run first.  Afterwards every passage - a
    ``'NO QUOTE'`` row directly followed by at least two quote rows - is
    processed: the distinct speakers of the passage are collected and each
    quote row's ``addressee`` becomes the list of those speakers minus the
    row's own speaker.

    NOTE(review): as in the original loop bounds, a passage stops before the
    last row of the frame, so that row never receives an addressee list -
    confirm whether this is intended.

    Args:
        book: DataFrame with ``dialogue``, ``speaker`` and ``addressee``
            columns (plus whatever ``speaker_attribution`` needs); rows are
            addressed by position, so a 0..n-1 index is assumed.
        Characters: main character names.
        Characters_genders: one gender code per entry of ``Characters``.
        Aliases: one list of alias strings per entry of ``Characters``.

    Returns:
        The same ``book`` object, modified in place.
    """
    # Complete the speaker_attribution() function, populate all speakers
    speaker_attribution(book, Characters, Characters_genders, Aliases)

    total_rows = len(book.index)
    if total_rows == 0:
        return book

    last_row = max(book.index)  # constant for the whole scan
    count = 0

    while count < total_rows - 2:

        # See if there exists a passage starting after this row
        starts_passage = (
            book.at[count, 'dialogue'] == 'NO QUOTE'
            and book.at[count + 1, 'dialogue'] != 'NO QUOTE'
            and book.at[count + 2, 'dialogue'] != 'NO QUOTE'
        )
        if not starts_passage:
            count += 1
            continue

        # rows that belong to the passage
        passage_rows = []
        index = count + 1
        while book.at[index, 'dialogue'] != 'NO QUOTE':
            passage_rows.append(index)
            index += 1
            if index >= last_row:
                break

        # Distinct speakers, each kept at its last occurrence.  Speakers may
        # be unhashable (an empty list marks "unattributed"), so membership
        # is tested on a list instead of a set.
        speakers = []
        for row in reversed(passage_rows):
            speaker = book.at[row, 'speaker']
            if speaker not in speakers:
                speakers.append(speaker)
        speakers.reverse()

        # assigning addressees: everyone in the passage but the speaker
        for row in passage_rows:
            others = list(speakers)
            try:
                others.remove(book.at[row, 'speaker'])
            except ValueError:
                pass  # speaker not comparable to itself (e.g. NaN)
            book.at[row, 'addressee'] = others

        # update count
        count = index

    return book  # getting back the annotated book


In [19]:
# Complete the dialogue_attribution framework

import csv

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# One row per novel: speaker accuracy, share of explicit quotes, accuracy on
# single-addressee rows and joint speaker+addressee accuracy on those rows.
performance = [['book_name','speaker_ac','explcit_prop','accuracy_ac','full_ac']]

# novel names

Novel_names = []

with open('pdnc_dataset/PDNC-Novel-index.csv', 'r', newline='') as file:

    reader = csv.reader(file)

    for row in reader:
        # second column with spaces, apostrophes and hyphens removed
        novel_name = "".join(ch for ch in row[1] if ch not in (" ", "'", "-"))
        Novel_names.append(novel_name)

    # the header row of the index file
    Novel_names.remove('NovelTitle')


# checking the accuracy of the baseline algorithm

for book_name in Novel_names:

    book, Characters, Aliases, Characters_genders = book_read(book_name)

    # removing the correct speakers: this has to be an assignment (a bare
    # `==` only compares and leaves the gold speakers in place).  An object
    # column of None can later hold names and lists without dtype changes.
    book['speaker'] = pd.Series([None] * len(book.index),
                                index=book.index, dtype=object)

    subject_population(book, Characters, Aliases) # populating subjects for monologue

    book = dialogue_attribution(book, Characters, Characters_genders, Aliases) # running the baseline algorithm

    # writing the predicted book to csv

    book.to_csv('predicted_books/baseline/'+book_name+'baselinepredicted.csv')

    # accuracy test, re-reading book file, in order to check real quotees

    new_book, Characters, Aliases, Characters_genders = book_read(book_name)

    real_speakers = list(new_book[new_book['dialogue'] != 'NO QUOTE']['speaker'])
    # built once instead of once per compared row
    predicted_speakers = list(book[book['dialogue'] != 'NO QUOTE']['speaker'])

    count = sum(
        1 for predicted, real in zip(predicted_speakers, real_speakers)
        if predicted == real
    )

    accuracy_sp = _safe_ratio(count, len(real_speakers)) # speaker attribution accuracy

    prop_explicit = _safe_ratio(len(book[book['syntactic'] == 'Explicit']),
                                len(real_speakers)) # proportion of speakers that are explicit

    # looking at accuracy for single addressees

    count_sa_correct = 0
    count_sa = 0
    count_full = 0

    for i in book.index:

        if book.at[i,'dialogue'] == 'NO QUOTE':
            continue

        if len(book.at[i,'addressee']) != 1:
            continue

        count_sa += 1 # single addressee found

        if book.at[i,'addressee'] == new_book.at[i,'addressee']:

            count_sa_correct += 1 # correct single addressee

            if book.at[i,'speaker'] == new_book.at[i,'speaker']:

                count_full += 1

    # NaN instead of ZeroDivisionError when a novel has no such rows
    accuracy_sa = _safe_ratio(count_sa_correct, count_sa)

    accuracy_final = _safe_ratio(count_full, count_sa)

    performance.append([book_name,accuracy_sp,prop_explicit,accuracy_sa,accuracy_final])

# newline='' lets the csv module control line endings
with open('results_baseline.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(performance)
