In [20]:
#all the required imports
import spacy
import pandas as pd
import os
import re, string
import warnings
import plotly.express as px
import random
import plotly.graph_objs as go
from plotly.offline import iplot
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.plotting.backend = "plotly"

In [21]:
#load the small English spaCy model (supplies the tag_, pos_ and dep_ values the rule functions read)
nlp = spacy.load('en_core_web_sm')

In [22]:
#read in the three acts
def _read_act(csv_name):
    """Load one act's CSV with the 'unicode_escape' encoding used for all three files."""
    return pd.read_csv(csv_name, encoding='unicode_escape')

act_one_df = _read_act('TwelveAngrySceneOne.csv')
act_two_df = _read_act('TwelveAngrySceneTwo.csv')
act_three_df = _read_act('TwelveAngrySceneThree.csv')

In [23]:
#strip leading whitespace from each utterance (str.lstrip trims the start of the text only, not the end)
#NOTE(review): the saved output of this cell is "AttributeError: 'DataFrame' object has no attribute 'Utterance'",
#so the CSVs as loaded apparently had no column named exactly 'Utterance' - confirm the header names before re-running
act_one_df.Utterance = act_one_df.Utterance.str.lstrip()
act_two_df.Utterance = act_two_df.Utterance.str.lstrip()
act_three_df.Utterance = act_three_df.Utterance.str.lstrip()

AttributeError: 'DataFrame' object has no attribute 'Utterance'

In [None]:
#empty Speaker/Utterance column template (the same layout as the frames returned by split_text)
#NOTE(review): this name shadows the builtin format() and is reassigned with different keys further down the notebook
format = {'Speaker': [], 'Utterance': []}

In [None]:
#empty module-level frame built from the column template
#NOTE(review): nothing in the visible notebook reads this module-level frame - confirm before removing it
split_frame = pd.DataFrame(data=format)

In [None]:
#reformat the data so that each sentence is on a new line.
def split_text(df):
    """Split every utterance in ``df`` into one row per sentence.

    The text is cut immediately after each '?', '.' or '!', so the delimiter stays
    attached to the sentence it ends. Because the cut is a zero-width match, an
    utterance that ends with a delimiter also yields a trailing empty piece; those
    empty pieces (and any leading spaces) are kept here and are expected to be
    cleaned up by the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``Speaker`` and ``Utterance`` columns. Rows are read in order, so
        the frame's index labels do not matter. A missing utterance is treated as ''.

    Returns
    -------
    pandas.DataFrame
        Columns 'Speaker' and 'Utterance', one row per piece, with a fresh 0..n-1 index.
    """
    delimiters = "?", ".", "!"
    #zero-width lookbehinds: split *after* a delimiter instead of consuming it
    sentence_end = re.compile('|'.join('(?<={})'.format(re.escape(delim)) for delim in delimiters))
    #collect plain records and build the frame once (DataFrame.append no longer exists in
    #pandas 2.x and re-copied the whole frame for every row)
    rows = []
    for speaker, utterance in zip(df.Speaker, df.Utterance):
        text = '' if pd.isna(utterance) else str(utterance)
        for sentence in sentence_end.split(text):
            rows.append({'Speaker': speaker, 'Utterance': sentence})
    return pd.DataFrame(rows, columns=['Speaker', 'Utterance'])

In [None]:
#above method modified from https://stackoverflow.com/questions/4998629/split-string-with-multiple-delimiters-in-python

In [None]:
#split each act's utterances into one row per sentence
text_one_split = split_text(act_one_df)
text_two_split = split_text(act_two_df)
text_three_split = split_text(act_three_df)

In [None]:
#display the sentence-per-row frame for act one
text_one_split

In [None]:
#display the sentence-per-row frame for act two
text_two_split

In [None]:
#display the sentence-per-row frame for act three
text_three_split

In [None]:
#strip the leading whitespace that the sentence split leaves at the start of each piece
#(str.lstrip trims the start of the text only)
text_one_split.Utterance = text_one_split.Utterance.str.lstrip()
text_two_split.Utterance = text_two_split.Utterance.str.lstrip()
text_three_split.Utterance = text_three_split.Utterance.str.lstrip()
#remove the empty rows that are formed after the fullstops, then renumber the remaining rows 0..n-1;
#chaining reset_index returns a new frame instead of modifying the filtered result in place
text_one_formatted = text_one_split[text_one_split.Utterance != ''].reset_index(drop=True)
text_two_formatted = text_two_split[text_two_split.Utterance != ''].reset_index(drop=True)
text_three_formatted = text_three_split[text_three_split.Utterance != ''].reset_index(drop=True)

In [None]:
#wh/open question definition
def is_wh_question_text(doc):
    """Collect the tokens of ``doc`` whose fine-grained tag is a wh-tag.

    Returns the list of matching tokens (tags WDT, WP, WP$ or WRB), not a bool:
    an empty list is falsy, so callers can use the result directly in a condition.
    """
    wh_tags = ("WDT", "WP", "WP$", "WRB")
    return [token for token in doc if token.tag_ in wh_tags]

In [None]:
#polar questions
def is_polar_text(doc):
    """Return True when ``doc`` has the word order of a polar (yes/no) question.

    The check looks at the first token whose dependency label is "ROOT" and at the
    subject among its children (see is_subject). The text counts as polar when it has
    no wh-word and either an auxiliary left of the root, or the root verb itself, comes
    before the subject. Tokens are ordered with ``<``.

    Returns False for a doc with no ROOT token (for example an empty text) instead of
    raising IndexError.
    """
    roots = [t for t in doc if t.dep_ == "ROOT"]
    #an empty fragment (e.g. the '' left between the dashes of "--") parses to no tokens at all
    if not roots:
        return False
    root = roots[0]
    subj = [t for t in root.children if is_subject(t)]

    if is_wh_question_text(doc):
        return False
    #'is' is an aux
    aux = [t for t in root.lefts if t.dep_ == "aux"]
    if subj and aux:
        return aux[0] < subj[0]

    #'is' is the main verb
    #NOTE(review): this only fires when the root is tagged VERB; if the loaded model tags a
    #copular 'is' as AUX this branch is skipped - confirm against the installed model
    is_main_verb = root.pos_ == "VERB"
    if subj and is_main_verb:
        return root < subj[0]

    return False

In [None]:
#two methods above inspired and modified from https://towardsdatascience.com/linguistic-rule-writing-for-nlp-ml-64d9af824ee8

In [None]:
#opinion definition
def is_opinion_text(sent, doc):
    """Return True when ``sent`` contains both a first-person term and an opinion term.

    Full stops are removed and the text is split on whitespace. Every entry of the two
    term lists is then looked for as a run of consecutive whole words, so multi-word
    entries such as "my opinion" or "don't know" can match (testing them against
    single words, as before, could never succeed). Matching is case-sensitive and the
    lists are lower-case, so ``sent`` is expected to be lower-cased by the caller.

    ``doc`` is accepted for signature parity with the other rule functions and is not used.
    """
    personal = ["i", "we", "myself","i've","i have", "i'd", "me", "i'll", "we're", "we've", "we'd","our","i'm"]
    phrases = ["agree", "disagree", "believe","my opinion", "point of view","think", "should", "don't know", "thought","believed"
               ,"agreed", "disagreed", "should've","like","dislike","love"]
    #remove fullstop for string search
    words = sent.replace('.', '').split()

    def contains(term):
        #true when the words of term appear next to each other, in order, in the sentence
        term_words = term.split()
        span = len(term_words)
        return any(words[start:start + span] == term_words
                   for start in range(len(words) - span + 1))

    in_text_personal = any(contains(person) for person in personal)
    in_text_phrase = any(contains(phrase) for phrase in phrases)
    #if both true then return true
    return in_text_personal and in_text_phrase

In [None]:
#personal statement definition
def is_first_person_text(sent, doc):
    """Return True when ``sent`` mentions a first-person term and ``doc`` has a noun, verb or adjective.

    ``sent`` has its full stops removed and is split on whitespace; a first-person term
    counts only when it equals one of the resulting words. ``doc`` supplies the
    part-of-speech tags (``pos_``) of the same text.
    """
    first_person_terms = ["i", "we", "myself","i've","i have", "i'd", "me", "i'll", "we're", "we've", "we'd","our","i'm"]
    content_pos = ("NOUN", "VERB", "ADJ")
    #remove fullstop for string search
    without_stops = sent.replace('.', '')
    #look for nouns, verbs and adjectives in text
    has_content_word = any(token.pos_ in content_pos for token in doc)
    words = without_stops.split()
    #looks for personal words in text
    mentions_self = any(term in words for term in first_person_terms)
    #if personal word and one of the other then return true
    return mentions_self and has_content_word

In [None]:
#other statement definition
def is_other_statement_text(sent,doc):
    """Return True when no token of ``doc`` is a noun, pronoun or determiner.

    ``sent`` is accepted for signature parity with the other rule functions and is not read.
    """
    #any NOUN, PRON or DET means the text is about themselves, someone or something else
    referring_pos = ("NOUN", "PRON", "DET")
    return not any(token.pos_ in referring_pos for token in doc)

In [None]:
#statement about others definition
def is_non_first_person_text(sent, doc):
    """Return True when ``sent`` has no first-person term and ``doc`` has a noun, pronoun, adjective or determiner.

    ``sent`` has its full stops removed and is split on whitespace; a first-person term
    counts only when it equals one of the resulting words. Verbs are not part of the
    condition.
    """
    first_person_terms = ["i", "we", "myself","i've","i have", "i'd", "me", "i'll", "we're", "we've", "we'd","our","i'm"]
    without_stops = sent.replace('.', '')
    pos_seen = {token.pos_ for token in doc}
    words = without_stops.split()
    mentions_self = any(term in words for term in first_person_terms)
    #if not about themselves and has one of the others then return true
    if mentions_self:
        return False
    return bool(pos_seen & {"NOUN", "PRON", "ADJ", "DET"})

In [None]:
#not used but was experimentation for a potential commitment speech act
def commitment(doc):
    """Return True when ``doc`` has a pronoun, then a verb, then a noun, by first occurrence.

    The position of the first PRON, AUX, VERB and NOUN token is recorded. The text
    matches when a pronoun, a verb and a noun are all present, the first pronoun comes
    before the first verb, the first verb comes before the first noun and, if there is
    an auxiliary, the first auxiliary also comes before the first verb.

    Positions are stored only the first time a tag is seen. The earlier version used 0
    as its "not seen yet" marker, so a tag first seen at index 0 was overwritten by a
    later occurrence. Always returns a bool (False rather than None when there is no match).
    """
    first_seen = {}
    for position, token in enumerate(doc):
        #setdefault keeps the earliest position for each part-of-speech tag
        first_seen.setdefault(token.pos_, position)

    pron = first_seen.get("PRON")
    verb = first_seen.get("VERB")
    noun = first_seen.get("NOUN")
    aux = first_seen.get("AUX")
    if pron is None or verb is None or noun is None:
        return False
    #an auxiliary is optional, but when present it has to precede the verb
    aux_before_verb = aux is None or aux < verb
    return pron < verb and aux_before_verb and verb < noun

In [None]:
#is subject helper function for questions
def is_subject(tok):
    """Return True when the dependency label of ``tok`` is csubj, nsubj or nsubjpass."""
    return tok.dep_ in ("csubj", "nsubj", "nsubjpass")

In [None]:
#looks for punctuation in a sentence
def punct(sent):
    """Return True when ``sent`` contains a comma or a semicolon.

    ``sent`` is walked item by item (character by character for a string) and each
    item is tested for being contained in ',' or ';'.
    """
    marks = (',', ';')
    return any(piece in mark for piece in sent for mark in marks)

In [None]:
#empty Line/Speaker/Act column template
#NOTE(review): this reassigns the module-level name format (which also shadows the builtin), so any cell
#that reads format sees different keys depending on which assignment ran last
format = {'Line': [], 'Speaker': [], 'Act': []}

In [None]:
#empty column template: 'No.', 'Line', 'Speaker' and six columns act_1 .. act_6
d = {'No.': [],'Line': [],'Speaker': [], 'act_1': [],'act_2': [],'act_3': [],'act_4': [],'act_5': [],'act_6': []}

In [None]:
#initialise variables used in function below
#NOTE(review): apply_speech_act works on its own local values and does not read any of these
#module-level names - confirm nothing else needs them before deleting this cell
act = 0
speaker = ""
#a global statement at module level has no effect
global counter
classifiedActs = pd.DataFrame(data=format)
newActs = pd.DataFrame({'Line' : [],'Sentence': [], 'Act' : []})

In [None]:
#runner function for the all the methods
def apply_speech_act(data, verbose=False):
    """Label every sentence in ``data`` with a list of speech-act codes.

    Each sentence is lower-cased and split on ',', ';' and '-' so that more than one
    act can be found in it. Every fragment is parsed with the module-level spaCy
    pipeline ``nlp`` and run through the rule functions. The codes found for all
    fragments of a sentence are collected into one list, in order and without removing
    duplicates.

    Codes appended per fragment, in this order:
        1  the fragment contains '?' and a wh-word
        2  the fragment contains '?' and is_polar_text is true
        3  is_opinion_text is true
        1  the fragment contains '?' and was not labelled polar
        4  is_first_person_text is true
        5  is_non_first_person_text is true
        6  is_other_statement_text is true
    Codes 4-6 are only appended when the fragment has no '?' or punct(fragment) is true.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``Speaker`` and ``Utterance`` columns, one sentence per row. Rows are read
        in order, so the index labels do not matter.
    verbose : bool, default False
        When true, print each speaker and each finished row (the old per-row debug output).

    Returns
    -------
    pandas.DataFrame
        Columns 'Line' (1-based row position), 'Speaker', 'Sentence' (lower-cased text)
        and 'Act' (list of int codes), with a fresh 0..n-1 index.
    """
    #collect plain records and build the frame once; DataFrame.append no longer exists in pandas 2.x
    rows = []
    for line_number, (speaker, sentence) in enumerate(zip(data.Speaker, data.Utterance), start=1):
        sent = sentence.lower()
        act_list = []
        if verbose:
            print(speaker)
        #split sentence as , and ; to look for more than one act in a sentence
        for line in re.split('[,;-]', sent):
            doc = nlp(line)
            is_question = '?' in line
            #the question rules only matter when a '?' is present, so they are only run then
            #(this also keeps empty fragments, which have no root token, away from is_polar_text)
            is_wh = is_question and bool(is_wh_question_text(doc))
            is_polar = is_question and is_polar_text(doc)

            #check for wh-questions
            if is_wh:
                act_list.append(1)

            #check for polar questions
            if is_polar:
                act_list.append(2)

            #check for opinions
            if is_opinion_text(line, doc):
                act_list.append(3)

            #questions that were not labelled polar
            #NOTE(review): the original comment said these are assigned to polar, but the code has
            #always appended 1 (the wh code); kept as-is - confirm which code is intended
            if is_question and not is_polar:
                act_list.append(1)

            #statements are skipped for question fragments unless punct() finds a ',' or ';'
            #NOTE(review): the fragment was produced by splitting on ',' and ';', so punct(line)
            #cannot be true here - confirm whether the whole sentence was meant to be checked
            statement_allowed = (not is_question) or punct(line)
            if statement_allowed:
                #check for first-person statements
                if is_first_person_text(line, doc):
                    act_list.append(4)
                #check for non-first-person statements
                if is_non_first_person_text(line, doc):
                    act_list.append(5)
                #check for other statements
                if is_other_statement_text(line, doc):
                    act_list.append(6)
        new_row = {'Line' : line_number,'Speaker' : speaker,'Sentence' : sent, 'Act' : act_list}
        if verbose:
            print(new_row)
        rows.append(new_row)
    return pd.DataFrame(rows, columns=['Line', 'Speaker', 'Sentence', 'Act'])

In [None]:
#classify every sentence of each act; each 'Act' cell holds the list of codes found for that sentence
multiple_acts_one = apply_speech_act(text_one_formatted)
multiple_acts_two = apply_speech_act(text_two_formatted)
multiple_acts_three = apply_speech_act(text_three_formatted)

In [None]:
#change line numbers for act 2 so they continue on from the last line of act 1
act_two_first_line = 1 + len(multiple_acts_one)
for i in range(len(multiple_acts_two)):
    multiple_acts_two.at[i, 'Line'] = act_two_first_line + i

In [None]:
#change line numbers for act 3 so they continue on from the last line of act 2
act_three_first_line = 1 + len(multiple_acts_one) + len(multiple_acts_two)
for j in range(len(multiple_acts_three)):
    multiple_acts_three.at[j, 'Line'] = act_three_first_line + j

In [None]:
#concat all acts into one frame for the whole play, with a fresh 0..n-1 index (ignore_index)
full_play_acts = pd.concat([multiple_acts_one,multiple_acts_two,multiple_acts_three],ignore_index = True)

In [None]:
#remove duplicate speech act values, keeping the first occurrence of each code in its original order
#dict.fromkeys does an order-preserving de-duplication of a plain list; calling pd.unique on a
#python list is deprecated in recent pandas
full_play_acts['Act'] = full_play_acts['Act'].map(lambda acts: list(dict.fromkeys(acts)))

In [None]:
#get a random sample of 150 lines from the whole play (for precision analysis)
#a fixed seed makes the sample the same on every run, so the hand-checked rows stay the same
SAMPLE_SEED = 42
random_sample = full_play_acts.sample(n=150, random_state=SAMPLE_SEED)
random_sample.to_csv('sample.csv')

In [None]:
#get a random sample of 50 lines from each act (for precision analysis)
#a fixed seed makes the samples the same on every run, so the hand-checked rows stay the same
#NOTE(review): duplicate codes were only removed from full_play_acts, so the 'Act' lists written
#by this cell can still contain repeats - confirm that is intended
SAMPLE_SEED = 42
random_sample_one = multiple_acts_one.sample(n=50, random_state=SAMPLE_SEED)
random_sample_one.to_csv('sampleActOne.csv')
random_sample_two = multiple_acts_two.sample(n=50, random_state=SAMPLE_SEED)
random_sample_two.to_csv('sampleActTwo.csv')
random_sample_three = multiple_acts_three.sample(n=50, random_state=SAMPLE_SEED)
random_sample_three.to_csv('sampleActThree.csv')

In [None]:
#join acts together to create 6 more speech acts
def join_acts(acts, verbose=False):
    """Collapse each row's speech-act codes into a single code, written as a string.

    The 'Act' value of every row (a list of codes, or an already-joined code) is
    reduced by the first rule that applies:

        5 and 6 -> '7'      4 and 5 -> '8'      (1 or 2) and 3 -> '9'
        (1 or 2) and 6 -> '11'      (1 or 2) and 5 -> '10'      4 and 6 -> '12'
        otherwise the lowest of 1..6 that is present, and '6' when none is.

    A value that is already one of the combined codes 7-12 is left unchanged, so
    calling the function again on its own output (also after the column has been
    converted to int) does not alter it. Codes are read as whole numbers, so '10',
    '11' and '12' are not mistaken for code 1.

    Parameters
    ----------
    acts : pandas.DataFrame
        Frame with an 'Act' column. The column is replaced in place; rows are handled
        in order, so the index labels do not matter.
    verbose : bool, default False
        When true, print each original value with its row position (the old debug output).

    Returns
    -------
    pandas.DataFrame
        The same ``acts`` object, with 'Act' holding one string code per row.
    """
    combined_codes = set(range(7, 13))

    def joined_code(value):
        found = {int(number) for number in re.findall(r'\d+', str(value))}
        #already joined by an earlier call: keep it
        if len(found) == 1 and found <= combined_codes:
            return str(next(iter(found)))
        is_question = 1 in found or 2 in found
        if 5 in found and 6 in found:
            return '7'
        if 4 in found and 5 in found:
            return '8'
        if is_question and 3 in found:
            return '9'
        if is_question and 6 in found:
            return '11'
        if is_question and 5 in found:
            return '10'
        if 4 in found and 6 in found:
            return '12'
        for single in (1, 2, 3, 4, 5, 6):
            if single in found:
                return str(single)
        #no code at all (e.g. an empty list) counts as an other statement
        return '6'

    joined = []
    for position, value in enumerate(acts['Act']):
        joined.append(joined_code(value))
        if verbose:
            print(str(value) + " " + str(position))
    acts['Act'] = joined
    return acts
        

In [None]:
#collapse each row's list of codes into a single code 1-12 (see the list of speech acts below)
#join_acts rewrites the 'Act' column of the frame it is given and returns that same frame
multiple_acts_one = join_acts(multiple_acts_one)
multiple_acts_two = join_acts(multiple_acts_two)
multiple_acts_three = join_acts(multiple_acts_three)
full_play_acts = join_acts(full_play_acts)

In [None]:
#convert speech act column to ints
for acts_frame in (multiple_acts_one, multiple_acts_two, multiple_acts_three, full_play_acts):
    acts_frame['Act'] = acts_frame['Act'].astype(int)

In [None]:
#function to make graphs with custom data for hovering
def make_graph(data,title):
    """Build a plotly line graph of 'Act' against 'Line' for ``data``.

    ``data`` needs the columns 'Line', 'Act', 'Sentence' and 'Speaker'; the last two
    are attached as custom data so the hover box can show them. ``title`` is used as
    the figure title. Returns the figure without showing it.
    """
    hover_lines = [
        "Line: %{x}",
        "Act: %{y}",
        "Sentence: %{customdata[0]}",
        "Speaker: %{customdata[1]}",
    ]
    figure = px.line(data, x="Line", y='Act', custom_data=['Sentence', 'Speaker'], title=title)
    figure.update_traces(hovertemplate="<br>".join(hover_lines))
    return figure

List of Speech Acts:

1) Wh question. (What is the time?)

2) Polar question. (Am I correct?)

3) Opinion. (I think this is good.)

4) Personal statement. (I am going to the shop.)

5) Non-personal statement about someone or something else. (They aren't going to like this.)

6) Other statement. Single words and other short utterances, etc. ("Okay.", "All right.")

Combined acts:

7) 5 & 6

8) 4 & 5

9) (1 or 2) and 3

10) (1 or 2) and 5

11) (1 or 2) and 6

12) 4 and 6

The section below was used for analysing the outputted graphs.

In [None]:
#line graph of the speech act assigned to each line of act one
act_one_graph = make_graph(multiple_acts_one,"Act One")
act_one_graph.show()

In [None]:
#line graph of the speech act assigned to each line of act two
act_two_graph = make_graph(multiple_acts_two,"Act Two")
act_two_graph.show()

In [None]:
#line graph of the speech act assigned to each line of act three
act_three_graph = make_graph(multiple_acts_three,"Act Three")
act_three_graph.show()

In [None]:
#individual speaker graphs. To look at another speaker, change the name compared against 'Speaker' below.
#NOTE: despite the act_one_ prefix, each frame is filtered from full_play_acts, i.e. it holds that
#speaker's lines from the whole play, not only act one
act_one_three = full_play_acts.loc[full_play_acts['Speaker'] == 'THREE']
act_one_four = full_play_acts.loc[full_play_acts['Speaker'] == 'FOUR']
act_one_five = full_play_acts.loc[full_play_acts['Speaker'] == 'FIVE']
act_one_eight = full_play_acts.loc[full_play_acts['Speaker'] == 'EIGHT']
act_one_foreman = full_play_acts.loc[full_play_acts['Speaker'] == 'FOREMAN']

In [None]:
#speech acts of speaker THREE, plotted against line number
three_graph = make_graph(act_one_three,"Character Three Speech Acts")
three_graph.show()

In [None]:
#speech acts of speaker FOUR, plotted against line number
four_graph = make_graph(act_one_four,"Character Four Speech Acts")
four_graph.show()

In [None]:
#speech acts of speaker FIVE, plotted against line number
five_graph = make_graph(act_one_five,"Character Five Speech Acts")
five_graph.show()

In [None]:
#speech acts of speaker FOREMAN, plotted against line number
foreman_graph = make_graph(act_one_foreman,"Character Foreman Speech Acts")
foreman_graph.show()

In [None]:
#speech acts of speaker EIGHT, plotted against line number
eight_graph = make_graph(act_one_eight,"Character Eight Speech Acts")
eight_graph.show()

In [None]:
#code to show the graph which displays all the speakers in one
#trace order; the visibility list of every button below follows this order
speakers = ['FOREMAN', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX',
            'SEVEN', 'EIGHT', 'NINE', 'TEN', 'ELEVEN', 'TWELVE']
#format the graph
layout = go.Layout(
    autosize=True,
    xaxis=go.layout.XAxis(linecolor='black', title='Line', linewidth=1, mirror=True),
    yaxis=go.layout.YAxis(linecolor='black', title='Act', linewidth=1, mirror=True),
    margin=go.layout.Margin(l=50, r=50, b=100, t=100, pad=4),
)
fig = go.Figure(layout=layout)
#one trace per speaker. x and y are taken from the same filtered rows so that every act is
#drawn at its own line number; pairing the unfiltered Line column with a filtered Act column
#matched the values by position and put each speaker's acts on the first lines of the play
for speaker_name in speakers:
    speaker_rows = full_play_acts.loc[full_play_acts['Speaker'] == speaker_name]
    fig.add_trace(
        go.Scatter(x=speaker_rows.Line, y=speaker_rows.Act, name=speaker_name)
    )
#update which speaker to display: one button per speaker that shows only that speaker's trace
speaker_buttons = [
    dict(
        label=speaker_name.capitalize(),
        method="update",
        args=[
            {"visible": [other == speaker_name for other in speakers]},
            {"title": speaker_name.capitalize()},
        ],
    )
    for speaker_name in speakers
]
fig.layout.update(
    updatemenus=[
        go.layout.Updatemenu(
            type="buttons", direction="right", active=0, x=1.5, y=1.2,
            buttons=speaker_buttons,
        )
    ]
)
fig.update_traces(
    hovertemplate="<br>".join([
        "Line: %{x}",
        "Act: %{y}",
    ])
)
fig.show()