In [1]:
import pandas as pd
import numpy as np
import csv
import os
import re
import time
from tqdm import tqdm_notebook as tqdm
from random import shuffle


# Allow up to 400 characters per cell when a frame is displayed.
# The fully qualified option name is used rather than the 'max_colwidth' shorthand,
# which pandas only resolves through pattern matching on option names.
pd.set_option('display.max_colwidth', 400)

In [12]:
def _sentence_header(sentence_number, sentence_text, modal_count, source_document):
    """Build the five-row metadata block that is placed in front of one sentence.

    The first row is entirely empty; the other rows carry '#' in ``ix``, a label in
    ``token`` and the matching value in ``is_modal``.
    """
    return pd.DataFrame(data={
        'ix': [None, '#', '#', '#', '#'],
        'token': [None, 'Sent_number = ', 'sentence_text = ', 'modal_count = ', 'source_document = '],
        'is_modal': [None, sentence_number, sentence_text, modal_count, source_document],
        'is_prej': [None, None, None, None, None],
        'modal_type': [None, None, None, None, None],
        'sentence_number': [None, None, None, None, None],
    })


def add_metadata_and_sent_number(df, last_sentence, source_document=None):
    """Split a token frame into sentences and prepend a metadata header to each one.

    A sentence ends at a token equal to '.', '?' or '!'. Tokens left over after the last
    terminator are emitted as a final sentence instead of being dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per token with at least the columns ``token`` and ``is_modal``.
        The frame is modified in place: ``sentence_number`` is written for every row.
    last_sentence : int
        Number to give to the first sentence of this frame.
    source_document : optional
        Value stored in the 'source_document = ' header row. When omitted, the
        notebook-level variable ``doc`` is used, as the original implementation did.

    Returns
    -------
    (pandas.DataFrame, int)
        The frame with headers inserted (index reset) and the next free sentence number.
    """
    header_columns = ['ix', 'token', 'is_modal', 'is_prej', 'modal_type', 'sentence_number']
    sentence_number = last_sentence
    pieces = []            # header/token chunks, concatenated once at the end
    sentence_tokens = []
    modal_count = 0
    sentence_start = 0     # positional start of the sentence currently being read
    last_position = len(df) - 1

    rows = zip(df.index, df['token'], df['is_modal'])
    for position, (label, token, modal_tag) in enumerate(rows):
        sentence_tokens.append(token)
        df.at[label, 'sentence_number'] = sentence_number

        # Tags may be bare ('S', 'B') or already typed ('S-priority', 'B-epistemic');
        # either way the part before the first '-' tells whether a modal starts here.
        if str(modal_tag).split('-')[0] in ('S', 'B'):
            modal_count += 1

        if token in ('.', '?', '!') or position == last_position:
            if source_document is None:
                source_document = doc  # legacy fallback to the notebook global
            sentence_text = ''.join(piece + ' ' for piece in sentence_tokens)
            pieces.append(_sentence_header(sentence_number, sentence_text, modal_count, source_document))
            pieces.append(df.iloc[sentence_start:position + 1])

            sentence_number += 1
            sentence_tokens = []
            modal_count = 0
            sentence_start = position + 1

    if not pieces:
        return pd.DataFrame(columns=header_columns), sentence_number

    new_df = pd.concat(pieces, sort=False).reset_index(drop=True)
    return new_df, sentence_number

def get_elements_ids_and_tokens(file_content):
    """Map every ``mark`` element to the token numbers it references.

    Parameters
    ----------
    file_content : iterable of str
        Lines of a mark XML file.

    Returns
    -------
    dict
        ``{element id: [token number, ...]}``. Token numbers are the digit strings
        captured after ``#sTok``; the list is empty when the line references no token.

    Raises
    ------
    ValueError
        If a line contains ``mark id=`` and token references but no parsable id.
        Lines with neither a parsable id nor token references are skipped.
    """
    id_pattern = re.compile(r'id="(\w+_*\d+)"')
    token_pattern = re.compile(r'#sTok(\d+)')

    tokens = {}
    for row in file_content:
        if 'mark id=' not in row:
            continue
        id_match = id_pattern.search(row)
        token_numbers = token_pattern.findall(row)
        if id_match is None:
            if not token_numbers:
                continue
            # The offending line is reported directly; the previous version tried to
            # report a notebook-level `file` variable that is not an argument here.
            raise ValueError('mark element without a parsable id: {!r}'.format(row))
        tokens[id_match.group(1)] = token_numbers
    return tokens

In [3]:
def tag_BIOSE(token_list, col, frame=None):
    """Write position tags for groups of tokens into column ``col``.

    A one-token group is tagged 'S'. In a longer group the first token gets 'B',
    the last one 'E' and every token in between 'I'. Empty groups are ignored.

    Parameters
    ----------
    token_list : iterable of sequences
        Each group holds token numbers (anything ``int()`` accepts) that are compared
        with the ``ix`` column.
    col : str
        Column that receives the tags.
    frame : pandas.DataFrame, optional
        Frame to tag in place. When omitted, the notebook-level ``df`` is used, which
        is what the original implementation always did.
    """
    if frame is None:
        frame = df  # legacy fallback to the notebook global

    for token_group in token_list:
        if not token_group:
            continue  # a mark without token references has nothing to tag
        token_ids = [int(token) for token in token_group]

        # Rows are selected with a boolean mask, so .loc is required; .at is meant for
        # a single scalar row label.
        if len(token_ids) == 1:
            frame.loc[frame['ix'] == token_ids[0], col] = 'S'
            continue

        frame.loc[frame['ix'] == token_ids[0], col] = 'B'
        frame.loc[frame['ix'] == token_ids[-1], col] = 'E'
        inner_ids = token_ids[1:-1]
        if inner_ids:
            frame.loc[frame['ix'].isin(inner_ids), col] = 'I'


In [4]:
def tag_span(element_ids, lines, prejacents, frame=None):
    """Record in the ``span`` column which modal each linked token belongs to.

    For every element whose id contains 'modal', each line mentioning that id is
    inspected. The first ``href="#..."`` target on the line is followed when it is one
    of ``prejacents`` or is itself a modal; the tokens of that target then get the
    modal's id written into ``span``.

    Parameters
    ----------
    element_ids : dict
        ``{element id: [token number, ...]}``.
    lines : iterable of str
        Lines of the file that links modals to other elements.
    prejacents : container of str
        Ids of the elements that count as prejacents.
    frame : pandas.DataFrame, optional
        Frame to tag in place. When omitted, the notebook-level ``df`` is used, which
        is what the original implementation always did.

    Raises
    ------
    ValueError
        If a line mentioning a modal has no href target, or the target is unknown.
    """
    if frame is None:
        frame = df  # legacy fallback to the notebook global

    href_pattern = re.compile(r'href="#(\w+)"')

    for element_id in element_ids:
        if 'modal' not in element_id:
            continue
        # Literal match of the id followed by a closing quote.
        needle = element_id + '"'
        for line in lines:
            if needle not in line:
                continue

            href_match = href_pattern.search(line)
            if href_match is None:
                raise ValueError('no href target on line mentioning {}: {!r}'.format(element_id, line))

            particle = href_match.group(1)
            if particle not in prejacents and 'modal' not in particle:
                continue
            if particle not in element_ids:
                raise ValueError('unknown href target {!r} on line: {!r}'.format(particle, line))

            # Only the newly linked tokens need tagging: tokens linked by earlier lines
            # already carry this modal's id.
            span_ids = [int(token) for token in element_ids[particle]]
            frame.loc[frame['ix'].isin(span_ids), 'span'] = element_id

In [5]:
def get_prejacent_tokens(element_ids, label_lines=None) -> tuple:
    """Find the 'other' elements that are labelled as prejacents.

    An element qualifies when its id contains 'other' and some line contains both the
    id followed by a double quote and the word 'prejacent'. An element whose token list
    equals one that was already collected is skipped.

    Parameters
    ----------
    element_ids : dict
        ``{element id: [token number, ...]}``.
    label_lines : iterable of str, optional
        Lines of the label file. When omitted, the notebook-level ``lines`` is used,
        which is what the original implementation always did.

    Returns
    -------
    (list, list)
        The token lists of the prejacents and, in the same order, their element ids.
    """
    if label_lines is None:
        label_lines = lines  # legacy fallback to the notebook global

    prejacents = []
    prejacent_tokens = []
    for other_id, tokens in element_ids.items():
        if 'other' not in other_id or tokens in prejacent_tokens:
            continue
        needle = other_id + '"'
        if any(needle in line and 'prejacent' in line for line in label_lines):
            prejacent_tokens.append(tokens)
            prejacents.append(other_id)
    return prejacent_tokens, prejacents


In [6]:
def update_modal_type(element_ids, lines, frame=None):
    """Write 'coarse:fine' modal types into the ``modal_type`` column.

    For every element whose id contains 'modal', each line mentioning that id is
    searched for a ``value="..."`` attribute. A known fine-grained value is mapped to
    its coarse class and both are stored for all tokens of the modal. Lines without a
    value attribute, or with a value that has no coarse class, leave the frame untouched.

    Parameters
    ----------
    element_ids : dict
        ``{element id: [token number, ...]}``.
    lines : iterable of str
        Lines of the subtype file.
    frame : pandas.DataFrame, optional
        Frame to update in place. When omitted, the notebook-level ``df`` is used, which
        is what the original implementation always did.
    """
    coarse_by_fine = {
        'deontic': 'priority',
        'buletic': 'priority',
        'teleological': 'priority',
        'buletic_teleological': 'priority',
        'priority': 'priority',
        'epistemic_circumstantial': 'epistemic',
        'epistemic': 'epistemic',
        'ability_circumstantial': 'ability',
        'ability': 'ability',
        'circumstantial': 'circumstantial',
    }
    if frame is None:
        frame = df  # legacy fallback to the notebook global

    value_pattern = re.compile(r'value="(\w+)"')

    for element_id, tokens in element_ids.items():
        if 'modal' not in element_id:
            continue
        needle = element_id + '"'
        for line in lines:
            if needle not in line:
                continue
            value_match = value_pattern.search(line)
            if value_match is None:
                continue  # no subtype on this line

            modal_fine_grain = value_match.group(1)
            # Looked up per line: an unknown value must not inherit the coarse class
            # of whichever modal happened to be processed before it.
            modal_coarse_grain = coarse_by_fine.get(modal_fine_grain)
            if modal_coarse_grain is None:
                continue

            token_ids = [int(token) for token in tokens]
            frame.loc[frame['ix'].isin(token_ids), 'modal_type'] = '{}:{}'.format(
                modal_coarse_grain, modal_fine_grain)



In [8]:
def add_type_to_bio(row):
    """Return the final modal tag for one token row.

    '_' becomes 'O'. A position tag ('S', 'B', 'I' or 'E') gets the part of
    ``modal_type`` before the first ':' appended after a dash. Any other value of
    ``is_modal`` is returned unchanged.
    """
    tag = row['is_modal']
    if tag == '_':
        return 'O'
    if tag not in ('S', 'B', 'I', 'E'):
        return tag
    coarse_type = row['modal_type'].split(':')[0]
    return '-'.join([tag, coarse_type])



In [12]:
# Document ids are taken from the CoNLL file names: the first and the last
# dot-separated part of each name are dropped and the rest is the id.
# The ids are sorted and then shuffled with a fixed seed so that the order is the same
# after every kernel restart; os.walk gives no ordering guarantee and an unseeded
# shuffle cannot be reproduced.
SHUFFLE_SEED = 0
docs = sorted(
    '.'.join(file.split('.')[1:-1])
    for root, dirs, files in os.walk('./gme-conll/')
    for file in files
)
np.random.RandomState(SHUFFLE_SEED).shuffle(docs)
docs

['02.54-18922',
 '26.55-3999',
 '03.19-22060',
 '21.24-5526',
 '46.57-22041',
 '05.41-14545',
 '31.10-15710',
 '31.31-4544',
 '59.17-14271',
 '12.17-21564',
 '42.17-18974',
 '43.50-5456',
 '41.01-29272',
 '01.39-17844',
 '46.58-22510',
 '09.25-22686',
 '34.10-25509',
 '30.26-14869',
 '03.47-22424',
 '43.49-8525',
 '52.23-14674',
 '15.23-10154',
 '10.41-18948',
 '21.31-1735',
 '05.06-7601',
 '22.42-14586',
 '14.53-15562',
 '37.28-1540',
 '08.22-24562',
 '03.26-29353',
 '00.49-23712',
 '42.54-29681',
 '08.17-13301',
 '00.59-25256',
 '59.38-27990',
 '01.44-19040',
 '11.12-16761',
 '03.26-15373',
 '23.51-4342',
 '11.35-9355',
 '12.58-29108',
 '31.07-25924',
 '41.29-15150',
 '50.33-2917',
 '03.54-17435',
 '23.00-14747',
 '44.27-28756',
 '10.09-13801',
 '18.24-28147',
 '17.18-25137',
 '34.33-18786',
 '05.39-25211',
 '47.35-12690',
 '12.31-26764',
 '54.31-15463',
 '16.16-1134',
 '37.46-10374',
 '57.16-1319',
 '11.08-11611',
 '23.04-14788',
 '21.24-4640',
 '42.46-13715',
 '11.01-7259',
 '04.33

In [9]:
# Hard-coded document order used by the cells below. The first entries are identical to
# the shuffled listing displayed by the previous cell, so this looks like a pasted copy
# of one shuffle result -- NOTE(review): confirm the provenance before regenerating it.
docs = ['02.54-18922', '26.55-3999', '03.19-22060', '21.24-5526', '46.57-22041', '05.41-14545', '31.10-15710', '31.31-4544', '59.17-14271', '12.17-21564', '42.17-18974', '43.50-5456', '41.01-29272', '01.39-17844', '46.58-22510', '09.25-22686', '34.10-25509', '30.26-14869', '03.47-22424', '43.49-8525', '52.23-14674', '15.23-10154', '10.41-18948', '21.31-1735', '05.06-7601', '22.42-14586', '14.53-15562', '37.28-1540', '08.22-24562', '03.26-29353', '00.49-23712', '42.54-29681', '08.17-13301', '00.59-25256', '59.38-27990', '01.44-19040', '11.12-16761', '03.26-15373', '23.51-4342', '11.35-9355', '12.58-29108', '31.07-25924', '41.29-15150', '50.33-2917', '03.54-17435', '23.00-14747', '44.27-28756', '10.09-13801', '18.24-28147', '17.18-25137', '34.33-18786', '05.39-25211', '47.35-12690', '12.31-26764', '54.31-15463', '16.16-1134', '37.46-10374', '57.16-1319', '11.08-11611', '23.04-14788', '21.24-4640', '42.46-13715', '11.01-7259', '04.33-20423', '01.18-7143', '04.20-23621', '18.25-12166', '15.19-23507', '59.03-19180', '50.13-28912', '24.32-9824', '54.02-18235', '55.10-20068', '17.23-6711', '49.26-27556', '19.24-26050', '04.48-17941', '21.54-4382', '00.30-21204', '54.40-10484', '31.05-16359', '03.16-25474', '57.00-17276', '37.33-17834', '09.23-6361', '26.56-25086', '01.03-24107', '37.03-25968', '55.56-1089', '22.04-29634', '45.06-5529', '45.06-11781', '25.24-13030', '58.30-19320', '29.47-14352', '10.42-27754', '24.21-23558', '44.41-9757', '37.46-9337', '45.44-23455', '45.53-9610', '53.19-28892', '28.50-13504', '27.48-10130', '29.16-13721', '24.29-21670', '48.51-14201', '20.43-12807', '01.27-21386', '35.14-9737', '55.44-16289', '52.55-19163', '45.31-12608', '48.42-17806', '20.54-21851', '23.54-19638', '14.49-23456', '05.07-9115', '53.15-23595', '08.41-17418', '13.42-3998', '40.44-4958', '43.33-11456', '16.19-1897', '22.33-10622', '34.35-3190', '01.07-15764', '05.30-9348', '35.13-26224', '01.33-12919', '41.31-29293', '23.57-11596', '08.06-1812', '46.20-17835', '56.51-26264', 
'46.36-9539', '33.07-17094', '14.55-18533', '05.15-16517', '54.13-6261', '31.55-8089', '21.45-6259', '27.35-29256', '16.59-4123', '31.39-22620', '00.22-21144', '41.16-27812', '41.37-18755', '38.12-11637', '35.18-3709', '08.51-29183', '21.37-22256', '52.31-12963', '44.52-19992', '27.25-21759', '52.35-10118', '13.06-23605', '04.40-21590', '26.14-23928', '48.28-6852', '03.02-16670', '48.48-11084', '37.46-4752', '39.56-18704', '39.11-29583', '20.15-24907', '31.54-28680', '30.45-27115', '47.43-22487', '42.05-29788', '53.05-28623', '18.53-27931', '23.27-26526', '52.36-18982', '08.27-27397', '48.55-4975', '36.01-11616', '59.27-22386', '16.28-8800', '37.40-9205', '15.10-2135', '07.09-11180', '13.45-21190', '28.14-6829', '00.52-4167', '53.55-17301', '44.14-20223', '25.38-22700', '05.45-10422', '40.05-15087', '09.50-14694', '28.43-12622', '20.40-16093', '41.39-5995', '07.48-9357', '35.25-594', '24.50-2535', '30.32-24917', '33.21-17578', '39.09-12713', '55.32-6296', '47.22-10067', '35.03-16511', '07.32-18094', '45.56-26903', '42.01-25605', '24.49-8480', '24.00-10191', '45.32-26215', '33.16-3417', '05.18-8344', '04.32-17074', '21.48-16389', '22.14-2532', '27.54-12647', '31.28-6696', '17.41-25537', '09.32-24118', '56.20-863', '24.21-25298', '57.02-23111', '41.16-14236', '23.04-4326', '47.26-20990', '23.02-12197', '34.49-13286', '33.55-762', '04.31-28782', '55.24-19278', '51.18-1222', '56.31-3120', '47.41-28228', '06.40-17312', '16.50-28640', '45.17-5753', '12.52-21900', '37.57-3837', '07.32-24343', '02.19-8239', '25.24-12160', '13.14-22330', '42.51-22299', '11.01-23492', '49.12-24038', '21.23-8227', '35.22-9439', '05.55-10723', '57.16-2305', '22.38-4806', '59.09-4817', '58.35-24209', '34.58-23977', '58.34-17613', '11.16-17420', '28.09-24241', '49.56-27748', '58.51-26741', '13.43-28244', '55.20-21157', '40.05-5224', '04.50-29091', '26.01-7285', '29.08-21533', '27.12-885', '19.43-8352', '24.14-26782', '19.18-13373', '20.33-11163', '07.24-24231', '17.52-18926', '00.11-67', 
'20.20-11694', '31.22-12363', '35.06-27851', '26.05-29959', '37.23-17648', '22.56-6451', '20.33-11983', '29.55-10258', '43.16-10786', '37.54-21168', '20.46-20946', '31.45-20536', '03.28-26714', '59.31-21964', '35.30-7542', '36.00-16525', '03.23-18607', '58.08-21315', '37.28-19761', '27.34-26526', '03.25-11609', '36.15-7509', '42.07-7434', '11.06-28210', '19.38-789', '26.52-10078', '39.02-16166', '07.24-28603', '01.04-6923', '03.05-5625', '32.39-1919', '22.54-17837', '48.23-2528', '04.49-21742', '50.06-12228', '12.51-10332', '30.38-25095', '18.30-8033', '47.23-22498', '21.49-25548', '29.53-18099', '46.06-8042', '03.13-21992', '24.37-12857', '01.38-4843', '41.29-25820', '49.47-8044', '50.32-3597', '04.00-12904', '16.03-15717', '21.04-13527', '58.58-18302', '31.56-18015', '38.59-25700', '43.10-5176', '52.21-20248', '56.01-12278', '06.09-13335', '45.36-19604', '35.40-23372', '19.00-22108', '28.15-21486', '23.24-9583', '19.03-816', '37.52-21155', '47.00-17401', '18.15-25073', '15.47-5091', '01.08-20603', '47.21-9712', '12.58-1993', '38.49-16233', '45.48-7720', '26.48-19550', '23.33-17594', '55.28-22100', '32.22-8496', '15.53-1323', '37.48-18053', '15.19-21938', '05.17-7881', '46.28-13637', '20.02-17431', '11.57-16690', '20.40-8369', '19.35-18077', '24.22-5235', '16.15-1122', '12.40-1611', '46.37-70', '32.10-2528', '07.16-6586', '45.31-22188', '35.03-11178', '46.39-9348', '08.55-4179', '04.01-4695', '40.49-25157', '33.22-7140', '25.44-26373', '53.17-27187', '23.00-19213', '17.08-16542', '36.59-7133', '25.51-11532', '33.06-778', '09.12-23643', '09.35-13708', '58.06-10542', '42.26-19148', '37.46-24515', '48.58-26376', '38.27-3333', '29.33-28852', '57.35-19171', '58.47-19000', '27.21-24397', '24.42-23228', '45.03-5180', '05.30-9608', '59.39-4666', '03.10-21966', '49.23-18398', '36.20-26562', '50.30-97', '22.09-1493', '09.21-9865', '35.17-19238', '04.46-9278', '14.43-703', '34.02-2831', '53.38-28377', '18.05-14490', '45.09-11809', '53.09-11428', '33.44-18068', '38.16-13557', 
'01.07-18921', '42.08-18791', '07.25-26605', '41.36-1407', '26.05-8627', '23.00-9795', '36.22-2937', '12.01-14811', '49.24-6596', '36.23-26588', '02.46-24802', '09.24-151', '37.20-19607', '47.55-9546', '21.28-26118', '59.14-2538', '54.01-6168', '51.05-27505', '06.39-26143', '33.43-387', '38.10-27533', '23.24-6873', '47.06-11142', '31.14-23484', '36.10-18917', '44.41-4066', '20.14-9569', '42.42-156', '25.31-10464', '32.14-18152', '36.11-21342', '28.17-28408', '55.06-13183', '03.06-12522', '47.18-27024', '56.28-3485', '27.05-27044', '38.42-28823', '32.26-14925', '36.05-804', '28.56-2635', '18.38-22306', '32.32-8613', '29.29-13302', '54.29-27700', '09.29-14261', '55.53-20579', '52.48-16582', '58.35-21375', '11.18-10696', '16.31-13271', '36.18-4189', '41.17-4020', '43.51-9328', '26.43-14352', '40.56-18707', '53.10-15177', '07.05-24942', '51.31-14776', '12.57-28994', '13.16-13370', '22.13-11526', '28.56-23638', '45.53-22539', '17.57-23406', '00.23-1236', '39.51-8366', '12.22-6729', '50.57-15245', '19.21-15525', '00.37-23670', '55.54-27027', '54.07-4914', '01.41-17868', '21.31-27111', '11.15-6639', '22.23-26451', '53.09-20885', '59.38-14856', '16.53-1351', '41.01-8736', '40.04-14266', '03.37-20669', '10.04-18139', '46.47-22286', '58.59-16532', '14.07-203', '24.12-20558', '59.08-16874', '05.55-12013', '11.50-23748', '41.07-12494', '36.40-5626', '56.08-8888', '34.07-17177', '31.55-5725', '44.36-19236', '33.09-24778', '25.32-10485', '42.04-19290', '20.10-3414', '43.13-27926', '45.56-235', '28.11-375', '09.29-13319', '40.44-29118', '44.44-3823', '33.31-29984', '16.02-13111', '34.52-3092', '10.24-16924', '00.52-4729', '42.47-22260', '50.32-16604', '55.04-23296', '22.09-18839', '26.42-7148', '37.27-28157', '20.41-10497', '15.18-84', '48.00-11907', '10.31-12974', '19.13-15223', '55.29-11159', '21.46-15196', '16.57-11242']

In [13]:
last_sentence = 0
all_docs_dfs = []

# Both directory trees are listed once here instead of being re-walked for every document.
conll_files = [(root, name) for root, dirs, names in os.walk('./gme-conll/') for name in names]
xml_files = [(root, name) for root, dirs, names in os.walk('./xmls/') for name in names]

# The annotation layers depend on each other: the element ids are read from mark.xml,
# the prejacents from mark_label.xml need those ids, and the span layer needs both.
# os.walk gives no ordering guarantee, so the layers are applied explicitly in this order.
xml_layers = ['mark.xml', 'mark_label.xml', 'mark_modal_id.xml', '.mark_subtype_']

for doc in tqdm(docs):
    if len(doc) < 3:
        continue

    # NOTE(review): files are matched by substring, as before; a document id that is a
    # prefix of another id would also pick up the other document's files.
    # `df`, `lines`, `doc` and `file` stay notebook-level names because the helper
    # functions read them as globals.
    df = None
    for root, file in conll_files:
        if doc in file:
            df = pd.read_csv(os.path.join(root, file), sep='\t', usecols=[0, 1, 2, 3, 4, 5, 6],
                             names=['ix', 'token', 'is_modal', 'is_prej', 'modal_type', 'sentence_number', 'span'])
    if df is None:
        # Without this guard the frame of the previous document would be tagged again.
        print('No CoNLL file found for document {}; skipping it.'.format(doc))
        continue

    # Reset per document so nothing leaks over from the previous one when a layer is missing.
    element_ids = {}
    prejacents = []
    for layer in xml_layers:
        for root, file in xml_files:
            if doc not in file or layer not in file:
                continue
            with open(os.path.join(root, file), 'r') as f:
                lines = f.readlines()

            if layer == 'mark.xml':
                element_ids = get_elements_ids_and_tokens(lines)
                modal_tokens = [v for k, v in element_ids.items() if 'modal' in k]
                tag_BIOSE(modal_tokens, 'is_modal')
            elif layer == 'mark_label.xml':
                prej_tokens, prejacents = get_prejacent_tokens(element_ids)
                tag_BIOSE(prej_tokens, 'is_prej')
            elif layer == 'mark_modal_id.xml':
                tag_span(element_ids, lines, prejacents)
            else:
                update_modal_type(element_ids, lines)

    df['is_modal'] = df.apply(add_type_to_bio, axis=1)
    df, last_sentence = add_metadata_and_sent_number(df, last_sentence)
    all_docs_dfs.append(df)

tagged_gme = pd.concat(all_docs_dfs, sort=False).reset_index(drop=True)

HBox(children=(IntProgress(value=0, max=534), HTML(value='')))

In [14]:
# Preview the first rows only; the full frame holds every token of every document.
tagged_gme.head(10)

Unnamed: 0,ix,token,is_modal,is_prej,modal_type,sentence_number,span
0,,,,,,,
1,#,Sent_number =,0,,,,
2,#,sentence_text =,"The American Department of State , in its annual report on human rights , has accused a number of countries , including Iran , of human rights violations .",,,,
3,#,modal_count =,0,,,,
4,#,source_document =,02.54-18922,,,,
5,138,The,O,_,_,0,_
6,139,American,O,_,_,0,_
7,140,Department,O,_,_,0,_
8,141,of,O,_,_,0,_
9,142,State,O,_,_,0,_


In [None]:
def concat_all_docs(docs):
    """Read the tab-separated file of every document and stack them into one frame.

    Each file is loaded from ./tagged_gme_conll/tagged_for_modals/<doc>.csv with its
    first column as index and without converting any text to NaN. The stacked frame
    gets a fresh 0..n-1 index.
    """
    path_template = './tagged_gme_conll/tagged_for_modals/{}.csv'
    frames = [
        pd.read_csv(path_template.format(name), sep='\t', keep_default_na=False, index_col=0)
        for name in docs
    ]
    stacked = pd.concat(frames, sort=False)
    return stacked.reset_index(drop=True)

In [None]:
# Show the working directory that the relative paths in this notebook are resolved against.
os.getcwd()

In [None]:
gme_df = concat_all_docs(docs)
# Preview only; the frame contains the rows of every document.
gme_df.head()

In [None]:
def make_sets(df, proportions: list):
    """Count the distinct ``is_prej`` values on rows whose ``is_modal`` contains 'Sent_number'.

    ``proportions`` is accepted but not used yet: no split is produced, only the count
    is returned.
    NOTE(review): the tagging step above writes the 'Sent_number = ' label into the
    ``token`` column; confirm that the files loaded into ``df`` use the column layout
    this function expects.
    """
    is_header_row = df['is_modal'].str.contains('Sent_number')
    sentence_ids = df.loc[is_header_row, 'is_prej'].unique()
    return len(sentence_ids)
    
    
    
# The proportions are passed along, but make_sets does not use them and only returns a count.
make_sets(gme_df, [10,80,10])

In [15]:
# Create the output folder first: to_csv does not create missing directories.
output_path = './tagged_gme_conll/tmp/tokenized_and_tagged_gme_coarse_grained.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
tagged_gme.to_csv(output_path, sep='\t', index=False)

In [None]:
# Preview only; the frame contains the rows of every document.
gme_df.head()