### CODE FOR DIALOGUE ATTRIBUTION ALGORITHM

In [22]:
# These are the versions of numpy and pandas that the code below is running on
# If any functions don't work, please uninstall and install the correct version

import pandas as pd
import numpy as np

print(np.__version__)
print(pd.__version__)

1.26.4
2.2.2


In [26]:
# Loading the jeyrewide csv file from the data cleaning R file

jeyre = pd.read_csv('jeyrewide.csv')


In [28]:
# Extracting known characters from spark-notes, simply using the known characteres

Characters = ['Jane Eyre','Edward Rochester','St. John Rivers','John Reed','Mrs. Reed','Bessie Lee','Mr. Lloyd','Georgiana Reed','Eliza Reed',
              'John Burns','Helen Burns','Mr. Brocklehurst','Miss Temple','Miss Scatcherd','Alice Fairfax','Bertha Mason',
              'Grace Poole','Adéle Varens','Celine Varens','Sophie','Richard Mason','Mr. Briggs','Blanche Ingram','Diana Rivers',
              'Mary Rivers','Rosamond Oliver','John Eyre','Uncle Reed']



In [30]:
# Finding Aliases of Characters, completed Manually

Aliases = [['I ','Jane','Jane Eyre', 'Mrs. Eyre','Miss Eyre'],['Mr Rochester','Edward','Rochester','Edward Rochester'],
                      ['St. John','St. John Rivers'],['John Reed','Mr. Reed'],['Mrs. Reed'],['Bessie','Bessie Lee','Miss Lee','Mrs. Lee'],['Mr. Lloyd'],['Georgiana','Georgiana Reed'],
              ['Eliza','Eliza Reed'],['John Burns','Mr. Burns','Sir Burns','Mr. Burns'],['Helen Burns','Mrs. Burns','Miss Burns'],['Mr. Brocklehurst'],['Miss Temple'],
              ['Miss Scatcherd'],['Alice','Alice Fairfax','Mrs. Fairfax','Miss Fairfax'],['Bertha','Bertha Mason','Miss Mason','Mrs. Mason'],
              ['Grace','Grace Poole','Miss Poole','Mrs. Poole'],['Adéle','Adéle Varens','Mrs Varens'],['Celine','Celine Varens','Miss Varens'],
              ['Sophie'],['Richard','Richard Mason','Mr. Mason','Sir Mason'],['Mr Briggs'],['Blanche','Blanche Ingram','Mrs Ingram','Miss Ingram'],
                      ['Diana','Diana Rivers'],
              ['Mary','Mary Rivers'],['Rosamond','Rosamond Oliver','Miss Oliver','Mrs. Oliver'],['John Eyre','Mr. Eyre'],['Uncle','Uncle Reed']]


In [32]:
for i in Aliases:
    for j in i:
        for k in Aliases:
            if k != i:
                for l in k:
                    if j == l:
                        print(f' match in aliases {i} and {k}') # alias in full matching another alias
                    z = l.split() # splitting a name into subnames
                    for v in z:
                        if j == v:
                            print(f' match in aliases {i} and {k}') # alias in full matching sub name of another alias

# No Matches in aliases so good to go

In [34]:
# Populating the subject row

def subject_population():
    
    for i in range(len(jeyre)):
    
        if jeyre.at[i,'dialogue'] == 'NO QUOTE':

            # get the text in the appends

            text = jeyre.at[i,'append']
        
            # search the appends for any characters mentioned

            subjects = []
            
            for index in range(len(Characters)):

                if Characters[index] != 'Jane Eyre': # book is first person so no way she refers to herself as a subject
            
                    for j in Aliases[index]:

                        if j in text:

                            # appending the word, aswell as the index which the word appears. Need this because we need to prioritise
                            # words that appear towards the end of the subject. Mainly for calculating new speakers
                        
                            subjects.append((text.index(j),Characters[index]))


            subjects.sort() # sorting values based on appearance of alias in text

            subject_final = []

            for word in subjects:
                subject_final.append(word[1])

            for character in subject_final:

                while subject_final.count(character) > 1:

                    subject_final.remove(character)

            # populating subject column

            jeyre.at[i,'subject'] = subject_final



In [36]:
# Explicit Dialogue Function (Alogrithm 1 in report)

# Works on each index of the Jeyre database

# Algorithm 1 

def explicit_speaker(index):

    text = jeyre.at[index,'append']

    # initialising speakers list

    speakers = []

    # searching for the character that appears in appends

    
    for i in range(len(Characters)):
            
        for j in Aliases[i]:

            if j in text:

                speakers.append(Characters[i])


    # Try make some modifications specifically for Jane

    if (speakers == []) and (' I ' in text):

        speakers.append('Jane Eyre')
        

    # remove duplicates

    for character in speakers:

         while speakers.count(character) > 1:

            speakers.remove(character)

    # populating speaker column

    jeyre.at[index,'speaker'] = speakers


In [38]:
# Anaphoric Dialogue Function (Alogrithm 2 in report)

# Needs input for a gender identified in the anaphoric dialogue's appends, 'M' or 'F', and the index of the row

def anaphoric_speaker(gender,i):

    # The genders of characters
    
    Characters_genders = ['F','M','M','M','F','F','M','F','F','M','F','M','F','F','F','F','F','F','F','F','M','M','F','F','F','M','M','M']
    
    jeyre_monologue = jeyre[jeyre['dialogue'] == 'NO QUOTE'] # database of monologue paragraphs

    jeyre_monologue = jeyre_monologue[jeyre_monologue.index < i] # all monologue rows prior to current row

    index = len(jeyre_monologue) - 1  # final index of monologue paragraph

    steps = 5 # check a maximum of 5 prior monologue paragraphs
    
    while (steps > 0) and (index >= 0):  

        count = jeyre_monologue.index[index]
                
        characters = jeyre_monologue.at[count,'subject'] # looks at all characters identified within the subject column of monologue paragraph

        while (characters != []):

            first = characters.pop()

            for ind in range(len(Characters)):

                for al in Aliases[ind]:

                    if first == al:

                        # index match, check gender

                        if Characters_genders[ind] == gender:

                            # end function found match

                            jeyre.at[i,'speaker'] = [Characters[ind]]

                            # function ended
                        
                            return 
                    
        index -= 1 # keep going back until you get a monologue paragraph with gender match

        steps -= 1

    


In [46]:
# Implicit Speaker search (Alogrithm 3 in report)

# Requires the index of the row to search for implicit dialogue

# Considers three subcases

# - Row directly prior is monologue -> Takes first speaker from monologue paragraph
# - Row is second from monologue
# -1 Looks for vocative in previous row
# -2 Looks for character in previous monologue


def implicit_speaker(index):

    if jeyre.at[index-1,'dialogue'] == 'NO QUOTE': # directly following a monologue paragraph

        # search for characters in monologue
        
        character = jeyre.at[index-1,'subject']

            
        if character != []:
            
            speaker = character.pop()

            jeyre.at[index,'speaker'] = [character]


        # Ad some modifications in this case, mainly to improve the chances of matching Jane's character. Also, if a monologue
        # paragraph is simply breaking up a line of dialogue

        if character == []:

            if (' my ' in jeyre.at[index-1,'subject']) or (' I ' in jeyre.at[index-1,'subject']):

                jeyre.at[index,'speaker'] = ['Jane Eyre']

            elif (jeyre.at[index-2,'dialogue'] != 'NO QUOTE'):

                jeyre.at[index,'speaker'] = jeyre.at[index-2,'speaker']
                
                
    # check now for case if second paragraph post monologue
    
    elif jeyre.at[index-2,'dialogue'] == 'NO QUOTE':

        # Look for vocative in previous row

        prev_dialogue = jeyre.at[index-1,'dialogue']

        # initialise vocative

        vocative = ''

        # search for vocative in prev_dialogue

        for i in range(len(Characters)):

            for aliases in Aliases[i]:

                if (', ' + aliases) in prev_dialogue: # has to have a comma prior or post word to be vocative

                    vocative = Characters[i]

                elif (aliases + ', ') in prev_dialogue:

                    vocative = Characters[i]

        # if found a vocative, make it the speaker. Else, take a speaker from previous monologue, but only if not the previous speaker

        if vocative != '':

            jeyre.at[index,'speaker'] = [vocative]

        else:

            characters = jeyre.at[index-2,'subject']
            
            while (characters != []):

                character = characters.pop()

                if character != jeyre.at[index-1,'speaker']:

                    jeyre.at[index,'speaker'] = [character]

            

In [44]:
# Final Speaker attribution framework function

# Follow following strategy

# 1 - Find A monologue paragraph
# 2 - Identify two of the following paragraphs based on their classification
# 3 - Identify all subsequent rows based on looking at prior 2 speakers

def speaker_attribution():

    index = 0

    while index < 4046:

        if jeyre.at[index,'dialogue'] == 'NO QUOTE': # Found a Monologue paragraph
            #print(index)
            count = index + 1

            for i in range(2): # 2 Dialogue paragraphs found directly post monologue

                if jeyre.at[count,'dialogue'] != 'NO QUOTE':
                    #print(count)
                    if jeyre.at[count,'append'] != 'NO APPEND': # Not implicit

                        if (' he ' in jeyre.at[count,'append']): # Male anaphoric speaker

                            anaphoric_speaker('M',count)
                            #print(f'{count} Male match')
                        elif (' she ' in jeyre.at[count,'append']): # Female anaphoric speaker

                            anaphoric_speaker('F',count)
                            #print(f'{count} Female Match')
                        # Do explicit speaker last, let it override any anaphoric speaker, as it should have precedence
                    
                        else:
                            explicit_speaker(count)
                            #print(f'{count} did explicit match')
                    else: # implicit row


                        implicit_speaker(count)
                        #print(f'{count} Implicit Match')

                    count += 1


            # Check now if its still monologue. In this case, simply retrieve the speaker from 2 steps prior
                
            
            while ((count < 4047) and (jeyre.at[count,'dialogue'] != 'NO QUOTE')):

                jeyre.at[count,'speaker'] = jeyre.at[count-2,'speaker']
                #print(f'{count} did alternate speaker')

                count += 1
            index = count
            

        # Completed both while loops now, shift up index and look at next monologue
        else:
            
            index = count

            index += 1




In [48]:
# Final Dialogue Attribution function. Start with speaker attribution, then simply search for 
# passages of dialogue, and then make addressees the previous speaker

def dialogue_attribution():

    # Complete the speaker_attribution() function, populate all speakers
    
    speaker_attribution()

    # Do the addressee population

    count = 0

    while count < 4048:

        if jeyre.at[count,'dialogue'] == 'NO QUOTE':

            # Do a next speaker adressee match
            temp = count + 1
            
            while ((temp < 4047) and (jeyre.at[temp,'dialogue'] != 'NO QUOTE')):

                jeyre.at[temp,'addressee'] = jeyre.at[temp-1,'speaker']

                temp += 1  


            count = temp

        else:

            count += 1


In [50]:
# Complete the dialogue_attribution framework

subject_population()

dialogue_attribution()

  jeyre.at[i,'speaker'] = [Characters[ind]]
  jeyre.at[temp,'addressee'] = jeyre.at[temp-1,'speaker']


In [52]:
# Do a final bit of data cleaning, simply remove any na's

for i in jeyre.index:

    if type(jeyre.at[i,'speaker']) == type(2.0):

        jeyre.at[i,'speaker'] = []

    elif type(jeyre.at[i,'addressee']) == type(2.0):

        jeyre.at[i,'addressee'] = []

## Extracting suitable rows for Sentiment Analysis

In [54]:
jeyre_good = jeyre[jeyre['dialogue'] != 'NO QUOTE']


jeyre_good['speaker_test'] = np.zeros(3021)

for i in jeyre_good.index:
    if ((len(jeyre_good.at[i,'speaker']) == 1) and (len(jeyre_good.at[i,'addressee']) == 1)):
        jeyre_good.at[i,'speaker_test'] = 1
    elif (len(jeyre_good.at[i,'subject']) == 1) and (jeyre_good.at[i,'subject'] != 'NULL'): # including monologue about certain characters
        jeyre_good.at[i,'speaker_test'] = 1
        

jeyre_good = jeyre_good[jeyre_good['speaker_test'] == 1]
jeyre_good

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jeyre_good['speaker_test'] = np.zeros(3021)


Unnamed: 0,paragraph,append,dialogue,speaker,addressee,subject,sentiment,chapter,speaker_test
53,53,"at last said Bessie, turning to the Abigail.","She never did so before,",[Bessie Lee],[Mrs. Reed],NO MONOLOGUE,0,2,1.0
54,54,was the reply.,"But it was always in her, /newquote I’ve told ...",[Mrs. Reed],[Bessie Lee],NO MONOLOGUE,0,2,1.0
59,59,"added Bessie, in no harsh voice,","What we tell you is for your good, /newquote y...",[Bessie Lee],[Mrs. Reed],NO MONOLOGUE,0,2,1.0
60,60,"said Miss Abbot,","Besides, /newquote God will punish her: He mig...",[Mrs. Reed],[Bessie Lee],NO MONOLOGUE,0,2,1.0
90,90,"I pronounced his name, offering him at the sam...",We shall do very well by-and-by.,[John Reed],[Mr. Lloyd],NO MONOLOGUE,0,3,1.0
...,...,...,...,...,...,...,...,...,...
4028,4028,he said: /newquote (John was an old servant...,"I telled Mary how it would be, /newquote I kne...",[Mary Rivers],[Mary Rivers],NO MONOLOGUE,0,38,1.0
4029,4029,I put into his hand a five-pound note. Withou...,"Thank you, John. Mr. Rochester told me to give...",[Mary Rivers],[Mary Rivers],NO MONOLOGUE,0,38,1.0
4030,4030,"And again,",She’ll happen do better for him nor ony o’ t’ ...,[Mary Rivers],[Mary Rivers],NO MONOLOGUE,0,38,1.0
4039,4039,I had a gold watch-chain: I answered,Yes.,[Jane Eyre],[Edward Rochester],NO MONOLOGUE,0,38,1.0


In [56]:
# Removing list entries from speaker / addressee columns, making them the element of list

for i in jeyre_good.index:

    jeyre_good.at[i,'speaker'] = jeyre_good.at[i,'speaker'][0]

    jeyre_good.at[i,'addressee'] = jeyre_good.at[i,'addressee'][0]

# Saving to csv

jeyre_good.to_csv('jeyre_fix.csv')



### CODE FOR GENERATION OF FIGURES

In [58]:
## TABLE 4 - Different Types of Speakers in Jane Eyre

# initialising variables

implicit_count = 0
explicit_count = 0
anaphoric_count = 0

for i in range(len(jeyre)):

    if jeyre.at[i,'dialogue'] != 'NO QUOTE': # Extracting a Dialogue paragraph

        if jeyre.at[i,'append'] != 'NO APPEND':

            # Either explicit or anaphoric speakers

            if (' he ' in jeyre.at[i,'append']) or (' she ' in jeyre.at[i,'append']): # anaphoric speaker

                anaphoric_count += 1

            else: # has to be explicit

                explicit_count += 1

        else: # implicit speaker

            implicit_count += 1


# displaying results

print(f' There are {implicit_count} implicit speakers')
print(f' There are {anaphoric_count} anaphoric speakers')
print(f' There are {explicit_count} explicit speakers')

            

 There are 1881 implicit speakers
 There are 468 anaphoric speakers
 There are 672 explicit speakers


In [60]:
## TABLE 5 - Different Types of Speakers Extracted in Jane Eyre

# initialising variables

implicit_found = 0
explicit_found = 0
anaphoric_found = 0

for i in range(len(jeyre)):

    if jeyre.at[i,'dialogue'] != 'NO QUOTE': # Extracting a Dialogue paragraph

        if jeyre.at[i,'append'] != 'NO APPEND':

            # Either explicit or anaphoric speakers

            if (' he ' in jeyre.at[i,'append']) or (' she ' in jeyre.at[i,'append']): # anaphoric speaker

                if jeyre.at[i,'speaker'] != []:
                    
                    anaphoric_found += 1

            else: # has to be explicit

                if jeyre.at[i,'speaker'] != []:

                    explicit_found += 1

        else: # implicit speaker

            if jeyre.at[i,'speaker'] != []:

                    implicit_found += 1


# displaying results

print(f' {implicit_found} implicit speakers were found')
print(f' {anaphoric_found} anaphoric speakers were found')
print(f' {explicit_found} explicit speakers were found')

            

 856 implicit speakers were found
 234 anaphoric speakers were found
 375 explicit speakers were found


In [64]:
# Table 6 - Speakers extracted in bigraphs

# initialising variables

implicit_count = 0
explicit_count = 0
anaphoric_count = 0

implicit_found = 0
explicit_found = 0
anaphoric_found = 0

# need to final all indices in jeyre database that are monologue paragraphs

jeyre_monologue = jeyre[jeyre['dialogue'] == 'NO QUOTE']


for i in jeyre_monologue.index:

    if jeyre.at[i+1,'dialogue'] != 'NO QUOTE': # dialogue paragraph post monologue

            # Either explicit or anaphoric speakers

            if jeyre.at[i+1,'append'] != 'NO APPEND':

                if (' he ' in jeyre.at[i+1,'append']) or (' she ' in jeyre.at[i+1,'append']): # anaphoric speaker

                    if jeyre.at[i+1,'speaker'] != []:
                    
                        anaphoric_found += 1
                        anaphoric_count += 1
                    else:
                        anaphoric_count += 1

                else: # has to be explicit

                    if jeyre.at[i+1,'speaker'] != []:

                        explicit_found += 1
                        explicit_count += 1
                    else:
                        explicit_count += 1

            else: # implicit speaker

                if jeyre.at[i+1,'speaker'] != []:

                    implicit_found += 1
                    implicit_count += 1

                else:
                    implicit_count += 1

    elif jeyre.at[i+2,'dialogue'] != 'NO QUOTE': # dialogue paragraph second after monologue
            
                        # Either explicit or anaphoric speakers

            if jeyre.at[i+2,'append'] != 'NO APPEND':

                if (' he ' in jeyre.at[i+2,'append']) or (' she ' in jeyre.at[i+2,'append']): # anaphoric speaker

                    if jeyre.at[i+2,'speaker'] != []:
                    
                        anaphoric_found += 1
                        anaphoric_count += 1
                    else:
                        anaphoric_count += 1

                else: # has to be explicit

                    if jeyre.at[i+2,'speaker'] != []:

                        explicit_found += 1
                        explicit_count += 1
                    else:
                        explicit_count += 1

            else: # implicit speaker

                if jeyre.at[i+2,'speaker'] != []:

                    implicit_found += 1
                    implicit_count += 1
                else:
                    implicit_count += 1


# displaying results
print(f' {implicit_count} implicit speakers are in bigraphs')
print(f' {anaphoric_count} anaphoric speakers are in bigraphs')
print(f' {explicit_count} explicit speakers are in bigraphs')

print(f' {implicit_found} implicit speakers were found in bigraphs')
print(f' {anaphoric_found} anaphoric speakers were found in bigraphs')
print(f' {explicit_found} explicit speakers were found in bigraphs')



 243 implicit speakers are in bigraphs
 255 anaphoric speakers are in bigraphs
 306 explicit speakers are in bigraphs
 108 implicit speakers were found in bigraphs
 142 anaphoric speakers were found in bigraphs
 188 explicit speakers were found in bigraphs
