In [None]:
import pandas as pd
import numpy as np
import csv
import os
import re
import time
from tqdm import tqdm_notebook as tqdm



# Show long text cells (e.g. full sentence strings) without truncation.
# The fully qualified option name is used because pandas resolves abbreviated
# keys by pattern matching, which stops working as soon as a second option
# matches the same pattern.
pd.set_option('display.max_colwidth', 400)

In [None]:
def _sentence_metadata(sentence_number, sentence, modal_count):
    """Build the four header rows that precede a sentence's token rows.

    The first row is entirely empty (a visual separator); the next three are
    marked with '#' in ``ix`` and carry a label in ``token`` and the matching
    value in ``is_modal``.
    """
    return pd.DataFrame(data={'ix': [None, '#', '#', '#'],
                              'token': [None, 'Sent_number = ', 'sentence_text = ', 'modal_count = '],
                              'is_modal': [None, sentence_number, sentence, modal_count],
                              'is_prej': [None, None, None, None],
                              'modal_type': [None, None, None, None],
                              'sentence_number': [None, None, None, None]})


def add_metadata_and_sent_number(df, last_sentence):
    """Split a token table into sentences and prepend metadata rows to each.

    A sentence ends at a token equal to '.', '?' or '!'. Every token row gets
    its sentence number written into ``df['sentence_number']`` (the input frame
    is updated in place, as before), and each sentence is emitted as four
    metadata rows followed by its token rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table with at least the columns ``token`` and ``is_modal``.
    last_sentence : int
        Number to assign to the first sentence of this table.

    Returns
    -------
    (pandas.DataFrame, int)
        The table with metadata rows inserted (index reset), and the sentence
        number the next document should start from.

    Notes
    -----
    * The sentence-final punctuation row is kept with its sentence; the
      end-exclusive slice used previously dropped it from the output.
    * Tokens following the last terminator are emitted as a final sentence
      instead of being discarded.
    * The modal count is the number of rows whose ``is_modal`` is 'S' or 'B',
      i.e. one per modal expression.
    """
    sentence_number = last_sentence
    sentence = ''
    modal_count = 0
    start = 0  # position of the first token of the sentence being built
    pieces = []

    # `position` is used for iloc slicing so the function does not depend on
    # the frame having a default 0..n-1 index; `label` is used for .at.
    for position, (label, row) in enumerate(df.iterrows()):
        sentence += row['token'] + ' '
        df.at[label, 'sentence_number'] = sentence_number
        if row['is_modal'] in ['S', 'B']:
            modal_count += 1
        if row['token'] in ['.', '?', '!']:
            pieces.append(_sentence_metadata(sentence_number, sentence, modal_count))
            pieces.append(df.iloc[start:position + 1])
            sentence_number += 1
            sentence = ''
            modal_count = 0
            start = position + 1

    # Flush a trailing sentence that has no terminating punctuation.
    if start < len(df):
        pieces.append(_sentence_metadata(sentence_number, sentence, modal_count))
        pieces.append(df.iloc[start:])
        sentence_number += 1

    if not pieces:
        empty = pd.DataFrame(columns=['ix', 'token', 'is_modal', 'is_prej', 'modal_type', 'sentence_number'])
        return empty, sentence_number

    # Concatenate once instead of growing the frame inside the loop.
    new_df = pd.concat(pieces, sort=False).reset_index(drop=True)
    return new_df, sentence_number

def get_elements_ids_and_tokens(file_content):
    """Map every annotated element id to the token numbers it references.

    Parameters
    ----------
    file_content : iterable of str
        Lines of a mark file. Only lines containing ``mark id=`` are examined.

    Returns
    -------
    dict
        ``{element_id: [token_number, ...]}`` where the token numbers are the
        digit strings captured from ``#sTok<digits>`` references on the line
        (an empty list when the line has none).

    Raises
    ------
    ValueError
        If a ``mark`` line references tokens but its id cannot be parsed.
        Lines with an unparsable id and no token references are skipped.
    """
    id_pattern = re.compile(r'id="(\w+_*\d+)"')
    token_pattern = re.compile(r'#sTok(\d+)')

    tokens = {}
    for row in file_content:
        if 'mark id=' not in row:
            continue
        token_refs = token_pattern.findall(row)
        id_match = id_pattern.search(row)
        if id_match is None:
            if token_refs:
                # The row is reported directly; the function no longer relies
                # on a `file` variable leaking in from the calling cell.
                raise ValueError(
                    'mark element references tokens but has no parsable id: {!r}'.format(row))
            continue
        tokens[id_match.group(1)] = token_refs
    return tokens

In [None]:
def tag_BIOSE(token_list, col, frame=None):
    """Write B/I/E/S span tags for groups of tokens into a column.

    A one-token group is tagged 'S'. In longer groups the first token is
    tagged 'B', the last 'E' and everything in between 'I'. Rows are selected
    by comparing the ``ix`` column with the integer value of each token.

    Parameters
    ----------
    token_list : iterable of sequences
        Each group holds token numbers (anything ``int()`` accepts), in order.
        Empty groups are ignored.
    col : str
        Name of the column that receives the tags.
    frame : pandas.DataFrame, optional
        Table to tag in place. Defaults to the notebook-global ``df`` so that
        existing two-argument calls keep working.
    """
    if frame is None:
        frame = df  # notebook-global table of the document being processed

    for token_group in token_list:
        if not token_group:
            continue
        indices = [int(token) for token in token_group]
        # .loc is required for boolean-mask assignment; .at only addresses a
        # single row/column label pair.
        if len(indices) == 1:
            frame.loc[frame['ix'] == indices[0], col] = 'S'
            continue
        frame.loc[frame['ix'] == indices[0], col] = 'B'
        frame.loc[frame['ix'] == indices[-1], col] = 'E'
        inner = indices[1:-1]
        if inner:
            frame.loc[frame['ix'].isin(inner), col] = 'I'


In [None]:
def update_modal_type(element_ids, lines, frame=None):
    """Fill the ``modal_type`` column for the tokens of every modal element.

    For each element whose id contains 'modal', the lines mentioning that id
    are scanned for a ``value="..."`` attribute (the fine-grained type; the
    last matching line wins). The fine-grained type is mapped to a coarse
    class and ``str((coarse, fine))`` is written to ``modal_type`` for all of
    the element's tokens. Modals with no line or an unmapped type are left
    untouched.

    Parameters
    ----------
    element_ids : dict
        ``{element_id: [token_number, ...]}``.
    lines : iterable of str
        Lines of the subtype annotation file.
    frame : pandas.DataFrame, optional
        Table to update in place. Defaults to the notebook-global ``df`` so
        that existing two-argument calls keep working.
    """
    if frame is None:
        frame = df  # notebook-global table of the document being processed

    coarse_by_fine = {
        'deontic': 'priority',
        'buletic': 'priority',
        'teleological': 'priority',
        'buletic_teleological': 'priority',
        'epistemic_circumstantial': 'epistemic',
        'epistemic': 'epistemic',
        'ability_circumstantial': 'ability',
        'ability': 'ability',
        'circumstantial': 'circumstantial',
    }
    value_pattern = re.compile(r'value="(\w+)"')

    for modal_id, tokens in element_ids.items():
        if 'modal' not in modal_id:
            continue

        # Match the id as a whole word so that e.g. 'modal_1' does not also
        # pick up the lines belonging to 'modal_10'.
        id_pattern = re.compile(r'(?<!\w)' + re.escape(modal_id) + r'(?!\w)')

        # Reset per modal: values must never carry over from the previous one.
        modal_fine_grain = ''
        for line in lines:
            if id_pattern.search(line):
                values = value_pattern.findall(line)
                if values:
                    modal_fine_grain = values[0]

        modal_coarse_grain = coarse_by_fine.get(modal_fine_grain)
        if not modal_coarse_grain:
            continue

        label = str((modal_coarse_grain, modal_fine_grain))
        token_numbers = [int(token) for token in tokens]
        # .loc is required for boolean-mask assignment (.at is scalar-only).
        frame.loc[frame['ix'].isin(token_numbers), 'modal_type'] = label


In [None]:
def tag_span(element_ids, lines, frame=None):
    """Record, for every token linked to a modal, that modal's id in ``span``.

    For each element whose id contains 'modal', every line mentioning that id
    is expected to carry an ``href="#<element_id>"`` reference to another
    element; the tokens of the referenced element get the modal's id written
    to the ``span`` column.

    Parameters
    ----------
    element_ids : dict
        ``{element_id: [token_number, ...]}``.
    lines : iterable of str
        Lines of the modal-id annotation file.
    frame : pandas.DataFrame, optional
        Table to update in place. Defaults to the notebook-global ``df`` so
        that existing two-argument calls keep working.

    Raises
    ------
    ValueError
        If a line mentioning a modal has no ``href`` target, targets an
        unknown element, or the target has a non-numeric token. The exception
        args are the offending line and the list of href targets found on it.
    """
    if frame is None:
        frame = df  # notebook-global table of the document being processed

    href_pattern = re.compile(r'href="#(\w+)"')

    for element_id in element_ids:
        if 'modal' not in element_id:
            continue

        # Match the id as a whole word so that e.g. 'modal_1' does not claim
        # the span lines of 'modal_10'.
        id_pattern = re.compile(r'(?<!\w)' + re.escape(element_id) + r'(?!\w)')

        for line in lines:
            if not id_pattern.search(line):
                continue
            targets = href_pattern.findall(line)
            try:
                span_tokens = [int(token) for token in element_ids[targets[0]]]
            except (IndexError, KeyError, ValueError) as err:
                raise ValueError(line, targets) from err
            # .loc is required for boolean-mask assignment (.at is scalar-only).
            frame.loc[frame['ix'].isin(span_tokens), 'span'] = element_id

In [None]:
def get_prejacent_tokens(element_ids, lines=None) -> list:
    """Collect the token groups of elements labelled as prejacents.

    An element qualifies when its id contains 'other' and at least one line
    contains both the id immediately followed by a double quote and the word
    'prejacent'.

    Parameters
    ----------
    element_ids : dict
        ``{element_id: [token_number, ...]}``.
    lines : iterable of str, optional
        Lines of the label annotation file. When omitted, the notebook-global
        ``lines`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    list
        The qualifying token groups in ``element_ids`` order, without
        duplicate groups.
    """
    if lines is None:
        lines = globals()['lines']  # fall back to the most recently read file

    prejacent_tokens = []
    for other_id, tokens in element_ids.items():
        if 'other' not in other_id:
            continue
        # The trailing quote keeps 'other_1' from matching 'other_10'.
        quoted_id = other_id + '"'
        is_prejacent = any(quoted_id in line and 'prejacent' in line for line in lines)
        if is_prejacent and tokens not in prejacent_tokens:
            prejacent_tokens.append(tokens)
    return prejacent_tokens


In [None]:
# Tag every CoNLL document under ./gme-conll/ with the annotation layers found
# under ./xmls/ and write one tab-separated file per document.
#
# A document name is the CoNLL file name with its first and last dot-separated
# components removed; files are associated with a document by substring match.
docs = [files for root, dirs, files in os.walk('./gme-conll/')]
docs = ['.'.join(file.split('.')[1:-1]) for files in docs for file in files]

# DataFrame.to_csv does not create missing directories.
os.makedirs('./tagged_gme_conll/tmp/', exist_ok=True)

last_sentence = 0

for doc in tqdm(docs):
    if len(doc) < 3:
        continue

    # Reset per document so a missing file can never reuse the previous
    # document's table.
    df = None
    for root, dirs, files in os.walk('./gme-conll/'):
        for file in files:
            if doc in file:
                df = pd.read_csv(os.path.join(root, file), sep='\t', usecols=[0, 1, 2, 3, 4, 5, 6],
                                 names=['ix', 'token', 'is_modal', 'is_prej', 'modal_type', 'sentence_number', 'span'])
    if df is None:
        continue

    # Collect the annotation files first. os.walk yields file names in
    # arbitrary order, but the label / span / subtype layers all need the
    # element ids parsed from mark.xml, so the layers are applied in a fixed
    # order below rather than in directory order.
    mark_paths, label_paths, modal_id_paths, subtype_paths = [], [], [], []
    for root, dirs, files in os.walk('./xmls/'):
        for file in sorted(files):
            if doc not in file:
                continue
            path = os.path.join(root, file)
            if 'mark.xml' in file:
                mark_paths.append(path)
            if 'mark_label.xml' in file:
                label_paths.append(path)
            if 'mark_modal_id.xml' in file:
                modal_id_paths.append(path)
            if '.mark_subtype_' in file:
                subtype_paths.append(path)

    # The helper functions read the notebook globals `df`, `lines` and `file`,
    # so those names are (re)assigned here before each call.
    element_ids = {}
    for file in mark_paths:
        with open(file, 'r') as f:
            lines = f.readlines()
        element_ids = get_elements_ids_and_tokens(lines)
        modal_tokens = [v for k, v in element_ids.items() if 'modal' in k]
        tag_BIOSE(modal_tokens, 'is_modal')

    for file in label_paths:
        with open(file, 'r') as f:
            lines = f.readlines()
        prej_tokens = get_prejacent_tokens(element_ids)
        tag_BIOSE(prej_tokens, 'is_prej')

    for file in modal_id_paths:
        with open(file, 'r') as f:
            lines = f.readlines()
        tag_span(element_ids, lines)

    for file in subtype_paths:
        with open(file, 'r') as f:
            lines = f.readlines()
        update_modal_type(element_ids, lines)

    # Sentence numbering continues across documents.
    df, last_sentence = add_metadata_and_sent_number(df, last_sentence)

    df.to_csv('./tagged_gme_conll/tmp/{}.csv'.format(doc), sep='\t')