# Convert input CoNLL to JSON file

This script converts the input data (already imported and stored as a CSV file, which is read back in as a pandas DataFrame) into the JSON input for the neural SRL scripts.

In [None]:
import pandas as pd
import numpy  as np
import json

In [None]:
path_to_input = '../data/intermediate/sample_train_01_importedData.csv'


def _bio_labels(df_sentence, predicate_row):
    """Return the BIO label list of one sentence for one predicate.

    The label column that belongs to the predicate is the single column
    holding 'V' in the predicate's own row. In that column every '_' (and
    every '0') becomes '0', and every other label gets the prefix 'B-'.
    If there is not exactly one column with 'V' in the predicate's row, no
    coherent label column can be identified and every token is labelled '0'.

    Parameters
    ----------
    df_sentence : pandas.DataFrame
        The rows of one sentence, in token order.
    predicate_row : int
        Positional index of the predicate's row within ``df_sentence``.

    Returns
    -------
    list of str
        One label per row of ``df_sentence``.
    """
    # positions of all columns whose value in the predicate's row is 'V'
    columns_with_v = np.where(df_sentence.iloc[predicate_row].to_numpy() == 'V')[0]

    # sanity check -> columns found with V should be 1
    if len(columns_with_v) != 1:
        return ['0'] * len(df_sentence)

    raw_labels = df_sentence.iloc[:, columns_with_v[0]].tolist()
    return ['0' if label in ('_', '0') else 'B-' + label for label in raw_labels]


def _sentence_to_elements(df_sentence):
    """Build one JSON element (dict) per predicate of a single sentence.

    A row counts as a predicate when its ``predicate`` value differs from
    '_'. Each element holds:

    * ``"seq_words"``  - the ``form`` values of all rows of the sentence,
    * ``"BIO"``        - the labels returned by ``_bio_labels``,
    * ``"pred_sense"`` - ``[row position of the predicate in the sentence,
      predicate value, '_', xpos value of the predicate row]``.

    Parameters
    ----------
    df_sentence : pandas.DataFrame
        The rows of one sentence; needs the columns ``form``, ``predicate``
        and ``xpos``.

    Returns
    -------
    list of dict
        The elements in the order in which the predicates occur.
    """
    seq_words = list(df_sentence.form)
    predicates = df_sentence.predicate.to_numpy()
    xpos = df_sentence.xpos.to_numpy()

    elements = []
    for predicate_row in np.where(predicates != '_')[0]:
        elements.append({
            # every element gets its own list so elements stay independent
            "seq_words": list(seq_words),
            "BIO": _bio_labels(df_sentence, predicate_row),
            "pred_sense": [
                int(predicate_row),  # plain int: numpy ints are not JSON serialisable
                predicates[predicate_row],
                '_',
                xpos[predicate_row],
            ],
        })
    return elements


# read dataframe in
df = pd.read_csv(path_to_input)

# list to append dicts as json element
x = []

## do conversion

# loop through sentences; sort=False keeps the order of first appearance,
# and a single groupby pass avoids re-filtering the whole frame per sentence
for s_id, df_sentence in df.groupby('sentenceId', sort=False):
    x.extend(_sentence_to_elements(df_sentence))


In [None]:
# serialise the collected elements into a pretty-printed JSON string
json_string = json.dumps(
    x,
    indent=4,
)

In [None]:
# store the JSON string as the input file for the neural SRL scripts
output_path = '../data/intermediate/neuralSRL_sample_train_input.json'
with open(output_path, 'w') as json_file:
    json_file.write(json_string)