# Data Exploration

In [123]:
import pandas as pd
import numpy as np

In [451]:
# Locations of the input files (relative to the notebook's working directory).
input_dir = '../data/input/'

# small sample file used for developing and checking the cells below
path_example = input_dir + 'srl_univprop_en.example.conll'

# en_ewt-up splits
# (the srl_univprop_en.train.conll / srl_univprop_en.dev.conll variants are currently disabled)
path_train = input_dir + 'en_ewt-up-train.conllu'
path_test = input_dir + 'en_ewt-up-test.conllu'
path_dev = input_dir + 'en_ewt-up-dev.conllu'

In [470]:

max_line_length = 15


def printLines(path):
    """Print the first token line of a CoNLL file, raw and as a padded row.

    Lines starting with '# text' are counted as sentence starts; all other
    comment lines and blank lines are skipped. For the first token line found,
    two things are printed and the function stops:

    1. the tab-separated values of the line,
    2. the row as it is stored later on: the sentence counter, followed by
       the values, padded with '_' up to ``max_line_length + 1`` entries.

    Parameters
    ----------
    path : str
        Path to the CoNLL / CoNLL-U file.

    Returns
    -------
    None
    """
    sentences = 0
    with open(path) as file:
        for line in file:

            # count sentences, pass all other comment / blank lines
            if line.startswith('# text'):
                sentences += 1
                continue
            if line.startswith('#') or line.startswith('\n'):
                continue

            # token line: strip the line break and split into columns
            values = np.array(line.rstrip('\n').split('\t'))
            print(values)

            # `np.str` no longer exists in NumPy >= 1.24, so the row is created
            # as an object array that is pre-filled with the '_' placeholder.
            # The row is widened when a line has more columns than expected
            # instead of failing on the slice assignment below.
            width = max(max_line_length, len(values)) + 1
            array = np.full(width, '_', dtype=object)

            # add sentenceId
            array[0] = sentences
            # add retrieved information from conll file
            array[1:len(values) + 1] = values

            print(array)

            # only the first token line is of interest here
            break

In [471]:
import os

# List the files available in the input directory.
# The path is passed as bytes, so os.listdir returns bytes names as well;
# os.fsdecode turns each name back into a str before printing.
directory = os.fsencode('../data/input/')
    
# NOTE: os.listdir returns the entries in arbitrary order.
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print(filename)

srl_univprop_en.dev.conll
.DS_Store
srl_univprop_en.example.conll
en_ewt-up-test.conllu
srl_univprop_en.train.conll
en_ewt-up-dev.conllu
en_ewt-up-train.conllu


In [472]:
# Check the parsing on the small example file: prints the split values of the
# first token line and the padded row that is built from them.
printLines(path_example)

# visual separator between the two runs
print('\n#######\n')

# disabled: same check on the training split
#printLines(path_train)

['1' 'Really' 'really' 'ADV' 'RB' '_' '2' 'advmod' '2:advmod' '_' '_'
 'ARGM-EXT']
[1 '1' 'Really' 'really' 'ADV' 'RB' '_' '2' 'advmod' '2:advmod' '_' '_'
 'ARGM-EXT' '_' '_' '_']

#######



## CoNLL-U Format Description

"Sentences consist of one or more word lines, and word lines contain the following fields:

ID: Word index, integer starting at 1 for each new sentence; may be a range for multiword tokens; may be a decimal number for empty nodes (decimal numbers can be lower than 1 but must be greater than 0). <br>
FORM: Word form or punctuation symbol. <br>
LEMMA: Lemma or stem of word form. <br>
UPOS: Universal part-of-speech tag. <br>
XPOS: Language-specific part-of-speech tag; underscore if not available. <br>
FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available. <br>
HEAD: Head of the current word, which is either a value of ID or zero (0). <br>
DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one. <br>
DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs. <br>
MISC: Any other annotation.

The fields DEPS and MISC replace the obsolete fields PHEAD and PDEPREL of the CoNLL-X format. In addition, we have modified the usage of the ID, FORM, LEMMA, XPOS, FEATS and HEAD fields as explained below.

The fields must additionally meet the following constraints:

Fields must not be empty.
Fields other than FORM, LEMMA, and MISC must not contain space characters.
Underscore (_) is used to denote unspecified values in all fields except ID. Note that no format-level distinction is made for the rare cases where the FORM or LEMMA is the literal underscore – processing in such cases is application-dependent. Further, in UD treebanks the UPOS, HEAD, and DEPREL columns are not allowed to be left unspecified except in multiword tokens, where all must be unspecified, and empty nodes, where UPOS is optional and HEAD and DEPREL must be unspecified. The enhanced DEPS annotation is optional in UD treebanks, but if it is provided, it must be provided for all sentences in the treebank. "


*Source: https://universaldependencies.org/format.html*

In [473]:
# header retrieved according to the CoNLL-U documentation (the ten standard fields).
# Kept under its own name for reference: it used to be assigned to `conll_header`
# and was silently overwritten by the lecture header right below.
conll_header_ud = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc']

# header from lecture from 25.02. -> this is the one used in the rest of the notebook
conll_header = ['id', 'form', 'lemma', 'upos', 'xpos', 'morph', 'head', 'dep', 'head_dep', 'space', 'predicate', 'label']

# same header with a leading 'sentenceId' column; derived from conll_header so
# that the two lists cannot drift apart
conll_header_adapted = ['sentenceId'] + conll_header


In [474]:
# retrieve longest line
# -> required for the creation of the dataframe later
def retrieveLength(path_to_file):
    """Scan a CoNLL file once and report its basic dimensions.

    Lines starting with '# text' count as sentences; other lines starting
    with '#' and blank lines are ignored; every remaining line counts as a
    token line. The three figures are printed.

    Parameters
    ----------
    path_to_file : str
        Path to the CoNLL / CoNLL-U file.

    Returns
    -------
    int
        The largest number of tab-separated columns found in a token line,
        or -1 when the file contains no token line.
    """
    sentences = 0
    tokens = 0
    max_line_length = -1

    with open(path_to_file) as handle:
        for raw_line in handle:
            if raw_line.startswith('# text'):
                sentences += 1
                continue
            if raw_line.startswith(('#', '\n')):
                continue

            # token line: remember the widest one seen so far
            tokens += 1
            max_line_length = max(max_line_length, len(raw_line.split('\t')))

    print(f'# Sentences in file: {sentences}')
    print(f'# Tokens in file: {tokens}')
    print(f'Maxium of columns in file: {max_line_length}')

    return max_line_length

In [520]:
# conversion into dataframe
def createDataFrame(path_to_file, sentence_limit=None):
    """Read a CoNLL file into a dataframe with one row per token line.

    The first column is a running 'sentenceId' (0-based, increased at every
    '# text' line). It is followed by the columns of the token line; the
    first ones are named after ``conll_header_adapted``, every further column
    is named '_'. Rows that are shorter than the widest line of the file are
    padded with '_'. All columns have dtype object.

    Parameters
    ----------
    path_to_file : str
        Path to the CoNLL / CoNLL-U file.
    sentence_limit : int, optional
        Stop reading as soon as this many sentences have been collected.
        Any non-integer value (default None) reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The collected token rows.
    """
    # widest token line of the file (also prints the file statistics)
    max_line_length = retrieveLength(path_to_file)

    ### create header

    # + 1 to add the sentence column; never narrower than the known header, so
    # that the named columns also exist for files with fewer columns
    n_columns = max(max_line_length + 1, len(conll_header_adapted))

    # known column names first, remaining column headers are filled with '_'
    headers_df = list(conll_header_adapted) + ['_'] * (n_columns - len(conll_header_adapted))

    ### collect rows

    sentences = -1
    limit_active = isinstance(sentence_limit, int)

    # rows are gathered in a plain list and converted once at the end;
    # concatenating a one-row dataframe per token is quadratic in the file size
    rows = []

    with open(path_to_file) as file:
        for line in file:

            if line.startswith('# text'):
                sentences += 1

            # pass all other comment lines and blank lines
            elif line.startswith('#') or line.startswith('\n'):
                pass

            # only go into token lines
            else:
                values = line.rstrip('\n').split('\t')
                padding = ['_'] * (n_columns - 1 - len(values))
                rows.append([sentences] + values + padding)

            # checked after every line: reading stops on the '# text' line of
            # the first sentence that exceeds the limit
            if limit_active and sentences >= sentence_limit:
                break

    ### create dataframe
    # dtype=object keeps every value exactly as collected (sentenceId as int,
    # all file columns as str)
    df = pd.DataFrame(rows, columns=headers_df, dtype=object)

    print(f'\n ## {len(df.sentenceId.unique())} sentences were added to dataframe.')

    return df

In [529]:
# Call the function with the path to the file and an integer that sets the
# limit of sentences to include.
df = createDataFrame(path_example, 4)  # e.g. use (path_example, 2) to insert only 2 sentences

# cross-check: number of distinct sentence ids that actually ended up in the frame
print('added true value: ', len(df.sentenceId.unique()))
df.tail(5)



# Sentences in file: 6
# Tokens in file: 59
Maxium of columns in file: 14

 ## 4 sentences were added to dataframe.
added true value:  4


Unnamed: 0,sentenceId,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate,label,_,_.1
27,2,11,ca,can,AUX,MD,VerbForm=Fin,13,aux,13:aux,SpaceAfter=No,_,_,_,ARGM-MOD
28,2,12,n't,not,PART,RB,_,13,advmod,13:advmod,_,_,_,_,ARGM-NEG
29,2,13,beat,beat,VERB,VB,VerbForm=Inf,0,root,0:root,_,beat.03,_,_,V
30,2,14,this,this,PRON,DT,Number=Sing|PronType=Dem,13,obj,13:obj,_,_,_,_,ARG1
31,3,1,ok,ok,INTJ,UH,_,0,root,0:root,_,_,,_,_


In [499]:
%%time
# call function with path to file and a integer that set the limit of sentences to include
#df = createDataFrame(path_train, 1)

#df.iloc[:,0:20].head(40)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


In [500]:
### Looping through sentences & assigning new values

'''# assign df to variable for this loop
df = df  

# creat dummy columns for each new variable
# use this format:  df['columnName'] = np.nan
df['aFeature'] = np.nan

# loop through sentences
for s_id in df.sentenceId.unique():
    
    # filter for only this sentence
    df_sentence = df[df.sentenceId == s_id]
    
    # assign value to all features
    #df_sentence.aFeature = ... # uncomment
    
    # loop through lines if necesarry
    
    #display(df_sentence)'''

"# assign df to variable for this loop\ndf = df  \n\n# creat dummy columns for each new variable\n# use this format:  df['columnName'] = np.nan\ndf['aFeature'] = np.nan\n\n# loop through sentences\nfor s_id in df.sentenceId.unique():\n    \n    # filter for only this sentence\n    df_sentence = df[df.sentenceId == s_id]\n    \n    # assign value to all features\n    #df_sentence.aFeature = ... # uncomment\n    \n    # loop through lines if necesarry\n    \n    #display(df_sentence)"

In [530]:
### split datasets for predicate prediction


# load the example file (6 is the sentence limit handed to createDataFrame)
df_full = createDataFrame(path_example, 6)

# features: the first 11 columns, i.e. everything before the predicate column
df_x = df_full.iloc[:, :11]

# gold: column 11 holds the predicate; every value other than '_' marks a predicate
df_y_true = df_full.iloc[:, 11]
y_true = np.array([value != '_' for value in df_y_true])

# assign false prediction array for test purpose
#y_true = [True, True, False, False,  # sentence 1
#          True, True, False, False, False, False, True, True, False, False, False, False, False] #sentence 2
y_true

# Sentences in file: 6
# Tokens in file: 59
Maxium of columns in file: 14

 ## 6 sentences were added to dataframe.


array([False,  True, False, False,  True, False, False, False, False,
       False,  True,  True, False, False, False, False, False, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False, False, False, False, False, False,  True,  True, False,
       False, False, False, False, False,  True, False, False, False,
        True,  True, False, False, False])

In [541]:
df_full.head(5)

Unnamed: 0,sentenceId,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate,label,_,_.1,predicate_prediction
0,0,1,Really,really,ADV,RB,_,2,advmod,2:advmod,_,_,ARGM-EXT,_,_,False
1,0,2,enjoyed,enjoy,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,0,root,0:root,_,enjoy.01,V,_,_,True
2,0,3,it,it,PRON,PRP,Case=Nom|Gender=Neut|Number=Sing|Person=3|Pron...,2,obj,2:obj,SpaceAfter=No,_,ARG1,_,_,False
3,0,4,.,.,PUNCT,.,_,2,punct,2:punct,_,_,_,_,_,False
4,1,1,Compare,compare,VERB,VBN,Tense=Past|VerbForm=Part,8,advcl,8:advcl,_,compare.01,V,_,ARGM-ADV,True


In [532]:
### helper functions for transformation of dataframe

def findPredicateUnion(predicateGold, predicatePredicted):
    """Tell whether a token counts as a predicate in the gold data or in the prediction.

    Meant to be applied row by row (e.g. via a lambda) to the dataframe of one
    sentence. The resulting boolean column says how often the sentence has to
    be repeated and at which index the predicate of each repetition is found.

    Parameters
    ----------
    predicateGold : str
        Value of the gold predicate column; '_' means "no predicate".
    predicatePredicted : bool
        Predicted predicate flag of the same row.

    Returns
    -------
    bool
        True if the gold value is not '_' or the prediction equals True.
    """
    is_gold_predicate = predicateGold != '_'
    if is_gold_predicate:
        return True
    return bool(predicatePredicted == True)

In [533]:
### function to retrieve arguments

def _goldLabelsForPredicate(df_sentence, row_position):
    '''
    Return the gold role labels of one sentence for the predicate in row `row_position`.

    The column that belongs to the predicate is the one that holds 'V' in the
    predicate's own row (all columns of the row are searched). If exactly one
    such column exists, a copy of it is returned with every 'V' replaced by '_'.
    Otherwise no (coherent) labels could be found and an array filled with '_'
    is returned.
    '''
    row = np.array(df_sentence.iloc[row_position, :], dtype=object)
    columns_with_V = np.flatnonzero(row == 'V')

    # sanity check -> columns found with V should be 1
    if len(columns_with_V) != 1:
        return np.full(len(df_sentence), '_', dtype=object)

    # np.array copies, so the input dataframe is not touched by the replacement
    labels = np.array(df_sentence.iloc[:, int(columns_with_V[0])], dtype=object)
    labels[labels == 'V'] = '_'
    return labels


def expandDataframe(input_df):

    '''
    Repeat every sentence once per predicate (gold or predicted) and attach
    the gold labels that belong to that predicate.

    input:  a dataframe containing the following columns:
                ['sentenceId', 
                 'id', 'form', 'lemma', 'upos', 'xpos', 'morph', 'head', 'dep', 'head_dep', 'space', 
                 'predicate', 'label', '_', '_', ... '_', 
                 'predicate_prediction']

                 -> note that 
                    - predicate_prediction has to be created beforehand
                    - a variable amount of '_' columns is possible (including none)
                    - the input dataframe is not modified


    output: the expanded dataframe containing the following columns
                ['sentenceId', 'sentenceRepetition', 
                 'id', 'form', 'lemma', 'upos', 'xpos', 'morph', 'head', 'dep', 'head_dep', 'space',
                 'predicate_prediction', 'label_ident_prediction', 'label_prediction',
                 'predicate_gold', 'label_ident_gold', 'label_gold']

                 -> note that
                    - each repetition marks exactly one row as predicate
                      (predicate_gold / predicate_prediction), all other rows are False
                    - sentences without any gold or predicted predicate do not
                      appear in the output
                    - label_prediction and label_ident_prediction are filled with NaN
                      for later use

    '''

    # columns that are copied unchanged into every repetition
    base_columns = ['sentenceId',
                    'id', 'form', 'lemma', 'upos', 'xpos', 'morph', 'head', 'dep', 'head_dep', 'space']

    output_columns = ['sentenceId', 'sentenceRepetition',
                      'id', 'form', 'lemma', 'upos', 'xpos', 'morph', 'head', 'dep', 'head_dep', 'space',
                      'predicate_prediction', 'label_ident_prediction', 'label_prediction',
                      'predicate_gold',       'label_ident_gold',       'label_gold']

    # all repetitions are collected here and concatenated once at the end
    # (concatenating inside the loop is quadratic)
    repetitions = []

    # loop through sentences
    for s_id in input_df['sentenceId'].unique():

        # filter for only this sentence
        df_sentence = input_df[input_df['sentenceId'] == s_id]
        n_rows = len(df_sentence)

        # predicate flags of this sentence only; the union decides how often
        # the sentence is repeated and which row is looked at in each repetition
        is_gold      = np.array(df_sentence['predicate'] != '_', dtype=bool)
        is_predicted = np.array(df_sentence['predicate_prediction'] == True, dtype=bool)
        indices_union = np.flatnonzero(is_gold | is_predicted)

        for repetition, row_position in enumerate(indices_union):
            row_position = int(row_position)

            # fresh copy of the basic features for this repetition of the sentence
            df_sentence_repetition = df_sentence[base_columns].copy()

            # id for repetition of sentence to be able to loop through afterwards
            df_sentence_repetition['sentenceRepetition'] = repetition

            ## predicates
            # False as default, only the row of this repetition can become True,
            # and only in the column(s) in which it actually is a predicate
            predicate_array_gold = np.full(n_rows, False)
            predicate_array_pred = np.full(n_rows, False)
            predicate_array_gold[row_position] = is_gold[row_position]
            predicate_array_pred[row_position] = is_predicted[row_position]

            df_sentence_repetition['predicate_gold']       = predicate_array_gold
            df_sentence_repetition['predicate_prediction'] = predicate_array_pred

            ## labels
            # -> transform labels from all label columns to this one column
            df_sentence_repetition['label_gold'] = _goldLabelsForPredicate(df_sentence, row_position)

            repetitions.append(df_sentence_repetition)

    if repetitions:
        df_expanded = pd.concat(repetitions, axis=0, ignore_index=True)
    else:
        # no predicate at all -> empty frame that still has every column
        df_expanded = pd.DataFrame(columns=base_columns + ['sentenceRepetition', 'predicate_gold',
                                                           'predicate_prediction', 'label_gold'])

    ### insert general columns for later use

    # for later insert of predicted label in classification task
    df_expanded['label_prediction']       = np.nan

    # for prediction of label identification
    df_expanded['label_ident_prediction'] = np.nan

    # gold of label identification (true/false)
    df_expanded['label_ident_gold']       = df_expanded['label_gold'] != '_'

    # reordering columns
    return df_expanded[output_columns]


In [534]:
# for the moment the gold flags stand in for real predictions -> replace by predictions here
predicate_pred = y_true

# assigning values for trying:
# NOTE: df is only another name for df_full (no copy), so the column added
#       below shows up in df_full as well
df = df_full
#df['prediction_true'] = df_full.prediction
#df['label_true']      = np.nan # needs to be filled


## prepare df to work with
# add predicate array to df
df['predicate_prediction'] = predicate_pred

# one repetition of each sentence per predicate
new_df = expandDataframe(df)

# store result without the index column
new_df.to_csv('../data/intermediate/expandedDataframe_smallExample', index=False)

In [540]:
# Load the expanded dataframe again from the path it was exported to.
df_read = pd.read_csv('../data/intermediate/expandedDataframe_smallExample')

# head
display(df_read.head(5))

Unnamed: 0,sentenceId,sentenceRepetition,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate_prediction,label_ident_prediction,label_prediction,predicate_gold,label_ident_gold,label_gold
0,0,0,1,Really,really,ADV,RB,_,2,advmod,2:advmod,_,False,,,False,True,ARGM-EXT
1,0,0,2,enjoyed,enjoy,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,0,root,0:root,_,True,,,True,False,_
2,0,0,3,it,it,PRON,PRP,Case=Nom|Gender=Neut|Number=Sing|Person=3|Pron...,2,obj,2:obj,SpaceAfter=No,False,,,False,True,ARG1
3,0,0,4,.,.,PUNCT,.,_,2,punct,2:punct,_,False,,,False,False,_
4,1,0,1,Compare,compare,VERB,VBN,Tense=Past|VerbForm=Part,8,advcl,8:advcl,_,True,,,True,False,_


In [539]:
# passive sentence
df_read.tail(39)

Unnamed: 0,sentenceId,sentenceRepetition,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate_prediction,label_ident_prediction,label_prediction,predicate_gold,label_ident_gold,label_gold
127,5,0,1,One,one,NUM,CD,NumType=Card,5,nsubj,5:nsubj,_,False,,,False,True,ARG0
128,5,0,2,of,of,ADP,IN,_,4,case,4:case,_,False,,,False,False,_
129,5,0,3,the,the,DET,DT,Definite=Def|PronType=Art,4,det,4:det,_,False,,,False,False,_
130,5,0,4,pictures,picture,NOUN,NNS,Number=Plur,1,nmod,1:nmod:of,_,False,,,False,False,_
131,5,0,5,shows,show,VERB,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,0,root,0:root,_,True,,,True,False,_
132,5,0,6,a,a,DET,DT,Definite=Ind|PronType=Art,7,det,7:det,_,False,,,False,False,_
133,5,0,7,flag,flag,NOUN,NN,Number=Sing,5,obj,5:obj|10:nsubj:pass,_,False,,,False,True,ARG1
134,5,0,8,that,that,PRON,WDT,PronType=Rel,10,nsubj:pass,7:ref,_,False,,,False,False,_
135,5,0,9,was,be,AUX,VBD,Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbF...,10,aux:pass,10:aux:pass,_,False,,,False,False,_
136,5,0,10,found,find,VERB,VBN,Tense=Past|VerbForm=Part|Voice=Pass,7,acl:relcl,7:acl:relcl,_,False,,,False,False,_
