# 5 Argument Identification

This script provides a function to identify arguments per predicate <br>


*Input:*  
- executionMode_dict
- mode               -> ('production' / 'sample')
- model              -> ('train' / 'test')
- print_status       -> (True / False)
- sentence_limit = None  (limit of sentences to import (default: None)

*Output:* 
- executionMode_dict 


## Preparation

In [1]:
import pandas as pd
import numpy  as np

import warnings
warnings.filterwarnings('ignore')

## Reading data in

In [2]:
def identifyArguments(executionMode_dict,
                      mode,                   #('production' / 'sample')
                      model,                  #('train' / 'test')
                      print_status   = False,
                      sentence_limit = None):
    

    path_to_input = executionMode_dict[mode]['intermediate'][model]['04_FeaturesExtracted']
    path_to_save = '../data/intermediate/' + mode + '_' + model +'_05_identifiedArguments.csv'
    executionMode_dict[mode]['intermediate'][model]['05_identifiedArguments'] = path_to_save
    
    # read dataframe in
    df = pd.read_csv(path_to_input)
    
    # insert predicted predicates here
    df['predicate_prediction'] = False

    display(df.head(5))
    ## insert here
    
    
    ## variable
    #reference column for predicates :  prediction vs truth
    predicate_column = 'predicate_prediction'
    #predicate_column = 'predicate_gold'



    # assign only subset of dataframe with 
    # uncomment if you want to restrict for this or another sentence
    df = df[df.sentenceId == 1].copy()


    # loop through sentences
    for s_id in df.sentenceId.unique():

        # filter for only this sentence
        df_sentence = df[df.sentenceId == s_id]

        # initiate np array to insert preditioncs
        predicates = np.full(len(df_sentence.id.unique()), False)

        ### find all predicates of that sentence 

        # loop through each repetition
        for s_rep in df_sentence.sentenceRepetition.unique():

            # create new subframe for working within this repetition of sentence
            df_sentence_repetition = df_sentence[df_sentence.sentenceRepetition == s_rep]


            ###  identify arguments (their indices)

            ## 1. identify id of predicate
            predicate_id = int(df_sentence_repetition[df_sentence_repetition[predicate_column] == True].id)

            predicate_identification = np.where(df_sentence_repetition[predicate_column] == True)[0]
            if len(predicate_identification) == 1:
                index_of_pred = np.where(df_sentence_repetition[predicate_column] == True)[0][0]
            index_of_pred

            predicates[index_of_pred] = True


        # loop through each repetition
        for s_rep in df_sentence.sentenceRepetition.unique():

            # create new subframe for working within this repetition of sentence
            df_sentence_repetition = df_sentence[df_sentence.sentenceRepetition == s_rep]

            # initiate np array to insert preditioncs
            # pred = np.full(len(df_sentence_repetition), False)


            ###  identify arguments (their indices)

            ## 1. identify id of predicate
            predicate_id = int(df_sentence_repetition[df_sentence_repetition[predicate_column] == True].id)

            ## RULE 1
            # ->take subset which have predicate id as head
            #df_potentials = df_sentence_repetition[df_sentence_repetition['head'] == predicate_id]
            pred = np.array(df_sentence_repetition['head'] == predicate_id)

            # set label_intent prediction to true
            #df_potentials.label_ident_prediction = True
            #display(df_potentials)


            ## RULE 2
            # -> exclude punctuation
            punct_index = df_sentence_repetition[df_sentence_repetition.dep == 'punct'].id - 1
            pred[punct_index] = False

            ## RULE 3
            # -> exclude all the identified predicates from arguments
            for i in range(len(pred)):
                # if this token is a predicate
                if predicates[i] == True:
                    pred[i] = False


            # assign value to all features
            df_sentence_repetition.label_ident_prediction = pred
    
    
    
    
    #write dataframe out
    df.to_csv(path_to_save, index=False)
    
    
    if print_status == True:
        
        print('\n\n#### 5 Argument Identification ####\n\n')
        print(' - completed')
    
    return executionMode_dict

In [3]:
#path_to_input = executionMode_dict[mode]['intermediate'][model]['04_FeaturesExtracted']
path_to_input = '../data/intermediate/' + 'sample' + '_' + 'train' + '_04_ExtractedFeatures.csv'
#executionMode_dict[mode]['intermediate'][model]['05_identifiedArguments'] = path_to_save

# read dataframe in
df = pd.read_csv(path_to_input)

# insert predicted predicates here
df['predicate_prediction'] = False

display(df.head(15))
## insert here





Unnamed: 0,sentenceId,sentenceRepetition,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate_prediction,label_ident_prediction,label_prediction,predicate_gold,label_ident_gold,label_gold,passive,full_constituent
0,0,0,1,Really,really,ADV,RB,_,2,advmod,2:advmod,_,False,,,False,True,ARGM-EXT,,Really
1,0,0,2,enjoyed,enjoy,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,0,root,0:root,_,False,,,True,False,_,,Really enjoyed it.
2,0,0,3,it,it,PRON,PRP,Case=Nom|Gender=Neut|Number=Sing|Person=3|Pron...,2,obj,2:obj,SpaceAfter=No,False,,,False,True,ARG1,,it
3,0,0,4,.,.,PUNCT,.,_,2,punct,2:punct,_,False,,,False,False,_,,.
4,1,0,1,Compare,compare,VERB,VBN,Tense=Past|VerbForm=Part,8,advcl,8:advcl,_,False,,,True,False,_,,Compare to last decade this University is gain...
5,1,0,2,to,to,ADP,IN,_,4,case,4:case,_,False,,,False,False,_,,to last decade
6,1,0,3,last,last,ADJ,JJ,Degree=Pos,4,amod,4:amod,_,False,,,False,False,_,,last
7,1,0,4,decade,decade,NOUN,NN,Number=Sing,1,obl,1:obl:to,_,False,,,False,True,ARG2,,last decade
8,1,0,5,this,this,DET,DT,Number=Sing|PronType=Dem,6,det,6:det,_,False,,,False,False,_,,this
9,1,0,6,University,University,PROPN,NNP,Number=Sing,8,nsubj,8:nsubj,_,False,,,False,False,_,,this University


In [36]:
## variable
#reference column for predicates :  prediction vs truth
predicate_column = 'predicate_prediction'
predicate_column = 'predicate_gold'



# assign only subset of dataframe with 
# uncomment if you want to restrict for this or another sentence
df = df[df.sentenceId == 1].copy()


# loop through sentences
for s_id in df.sentenceId.unique():
    
    # filter for only this sentence
    df_sentence = df[df.sentenceId == s_id]
    
    # initiate np array to insert preditioncs
    predicates = np.full(len(df_sentence.id.unique()), False)
    
    ### find all predicates of that sentence 
    
    # loop through each repetition
    for s_rep in df_sentence.sentenceRepetition.unique():
        
        # create new subframe for working within this repetition of sentence
        df_sentence_repetition = df_sentence[df_sentence.sentenceRepetition == s_rep]
        
        
        ###  identify arguments (their indices)
        
        ## 1. identify id of predicate
        predicate_id = int(df_sentence_repetition[df_sentence_repetition[predicate_column] == True].id)
    
        predicate_identification = np.where(df_sentence_repetition[predicate_column] == True)[0]
        if len(predicate_identification) == 1:
            index_of_pred = np.where(df_sentence_repetition[predicate_column] == True)[0][0]
        index_of_pred
        
        predicates[index_of_pred] = True

    
    # loop through each repetition
    for s_rep in df_sentence.sentenceRepetition.unique():
        
        # create new subframe for working within this repetition of sentence
        df_sentence_repetition = df_sentence[df_sentence.sentenceRepetition == s_rep]
        
        # initiate np array to insert preditioncs
        # pred = np.full(len(df_sentence_repetition), False)
        
        
        ###  identify arguments (their indices)
        
        ## 1. identify id of predicate
        predicate_id = int(df_sentence_repetition[df_sentence_repetition[predicate_column] == True].id)
        
        ## RULE 1
        # ->take subset which have predicate id as head
        #df_potentials = df_sentence_repetition[df_sentence_repetition['head'] == predicate_id]
        pred = np.array(df_sentence_repetition['head'] == predicate_id)
        
        # set label_intent prediction to true
        #df_potentials.label_ident_prediction = True
        #display(df_potentials)
        
        
        ## RULE 2
        # -> exclude punctuation
        punct_index = df_sentence_repetition[df_sentence_repetition.dep == 'punct'].id - 1
        pred[punct_index] = False
        
        ## RULE 3
        # -> exclude all the identified predicates from arguments
        for i in range(len(pred)):
            # if this token is a predicate
            if predicates[i] == True:
                pred[i] = False

        
        # assign value to all features
        df_sentence_repetition.label_ident_prediction = pred
        
        
        
        
        
        print  ('\n** sentence : ', s_id, 
                '\n*  sentence repetition :', s_rep)
        display( df_sentence_repetition, '\n')
        
    break
    


** sentence :  1 
*  sentence repetition : 0


Unnamed: 0,sentenceId,sentenceRepetition,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate_prediction,label_ident_prediction,label_prediction,predicate_gold,label_ident_gold,label_gold,passive,full_constituent
4,1,0,1,Compare,compare,VERB,VBN,Tense=Past|VerbForm=Part,8,advcl,8:advcl,_,False,False,,True,False,_,,Compare to last decade this University is gain...
5,1,0,2,to,to,ADP,IN,_,4,case,4:case,_,False,False,,False,False,_,,to last decade
6,1,0,3,last,last,ADJ,JJ,Degree=Pos,4,amod,4:amod,_,False,False,,False,False,_,,last
7,1,0,4,decade,decade,NOUN,NN,Number=Sing,1,obl,1:obl:to,_,False,True,,False,True,ARG2,,last decade
8,1,0,5,this,this,DET,DT,Number=Sing|PronType=Dem,6,det,6:det,_,False,False,,False,False,_,,this
9,1,0,6,University,University,PROPN,NNP,Number=Sing,8,nsubj,8:nsubj,_,False,False,,False,False,_,,this University
10,1,0,7,is,be,AUX,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,8,aux,8:aux,_,False,False,,False,False,_,,is
11,1,0,8,gaining,gain,VERB,VBG,VerbForm=Ger,0,root,0:root,_,False,False,,False,False,_,,to last decade this University is gaining more...
12,1,0,9,more,more,ADJ,JJR,Degree=Cmp,10,amod,10:amod,_,False,False,,False,False,_,,more
13,1,0,10,prestige,prestige,NOUN,NN,Number=Sing,8,obj,8:obj,_,False,False,,False,False,_,,more prestige


'\n'


** sentence :  1 
*  sentence repetition : 1


Unnamed: 0,sentenceId,sentenceRepetition,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate_prediction,label_ident_prediction,label_prediction,predicate_gold,label_ident_gold,label_gold,passive,full_constituent
17,1,1,1,Compare,compare,VERB,VBN,Tense=Past|VerbForm=Part,8,advcl,8:advcl,_,False,False,,False,False,_,,Compare to last decade this University is gain...
18,1,1,2,to,to,ADP,IN,_,4,case,4:case,_,False,False,,False,False,_,,to last decade
19,1,1,3,last,last,ADJ,JJ,Degree=Pos,4,amod,4:amod,_,False,False,,False,False,_,,last
20,1,1,4,decade,decade,NOUN,NN,Number=Sing,1,obl,1:obl:to,_,False,False,,False,False,_,,last decade
21,1,1,5,this,this,DET,DT,Number=Sing|PronType=Dem,6,det,6:det,_,False,False,,False,False,_,,this
22,1,1,6,University,University,PROPN,NNP,Number=Sing,8,nsubj,8:nsubj,_,False,False,,False,False,_,,this University
23,1,1,7,is,be,AUX,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,8,aux,8:aux,_,False,False,,True,False,_,,is
24,1,1,8,gaining,gain,VERB,VBG,VerbForm=Ger,0,root,0:root,_,False,False,,False,False,_,,to last decade this University is gaining more...
25,1,1,9,more,more,ADJ,JJR,Degree=Cmp,10,amod,10:amod,_,False,False,,False,False,_,,more
26,1,1,10,prestige,prestige,NOUN,NN,Number=Sing,8,obj,8:obj,_,False,False,,False,False,_,,more prestige


'\n'


** sentence :  1 
*  sentence repetition : 2


Unnamed: 0,sentenceId,sentenceRepetition,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate_prediction,label_ident_prediction,label_prediction,predicate_gold,label_ident_gold,label_gold,passive,full_constituent
30,1,2,1,Compare,compare,VERB,VBN,Tense=Past|VerbForm=Part,8,advcl,8:advcl,_,False,False,,False,True,ARGM-ADV,,Compare to last decade this University is gain...
31,1,2,2,to,to,ADP,IN,_,4,case,4:case,_,False,False,,False,False,_,,to last decade
32,1,2,3,last,last,ADJ,JJ,Degree=Pos,4,amod,4:amod,_,False,False,,False,False,_,,last
33,1,2,4,decade,decade,NOUN,NN,Number=Sing,1,obl,1:obl:to,_,False,False,,False,False,_,,last decade
34,1,2,5,this,this,DET,DT,Number=Sing|PronType=Dem,6,det,6:det,_,False,False,,False,False,_,,this
35,1,2,6,University,University,PROPN,NNP,Number=Sing,8,nsubj,8:nsubj,_,False,True,,False,True,ARG0,,this University
36,1,2,7,is,be,AUX,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,8,aux,8:aux,_,False,False,,False,False,_,,is
37,1,2,8,gaining,gain,VERB,VBG,VerbForm=Ger,0,root,0:root,_,False,False,,True,False,_,,to last decade this University is gaining more...
38,1,2,9,more,more,ADJ,JJR,Degree=Cmp,10,amod,10:amod,_,False,False,,False,False,_,,more
39,1,2,10,prestige,prestige,NOUN,NN,Number=Sing,8,obj,8:obj,_,False,True,,False,True,ARG1,,more prestige


'\n'

In [26]:
predicate_id = int(df_sentence_repetition[df_sentence_repetition[predicate_column] == True].id)
predicate_id

8

In [27]:
df_sentence_repetition[df_sentence_repetition['head'] == predicate_id].label_ident_prediction = True
df_sentence_repetition[df_sentence_repetition.dep == 'punct'].index[0]


IndexError: index 0 is out of bounds for axis 0 with size 0

In [28]:
predicate_identification = np.where(df_sentence_repetition[predicate_column] == True)[0]
if len(predicate_identification) == 1:
    index_of_pred = np.where(df_sentence_repetition[predicate_column] == True)[0][0]
index_of_pred

7

In [29]:
predicates[0]
predicates

array([ True, False, False, False, False, False,  True,  True, False,
       False, False, False, False])

In [30]:
pred

array([ True, False, False, False, False,  True,  True, False, False,
        True, False, False,  True])

In [33]:
for i in range(len(pred)):
    # if this token is a predicate
    if predicates[i] == True:
        pred[i] = False
        print(i)
    #else:
        #pred[i] = 'BLA'

0
6
7


In [34]:
pred

array([False, False, False, False, False,  True, False, False, False,
        True, False, False,  True])