In [1]:
import numpy as np
import re
import pandas as pd
pd.set_option('display.max_colwidth', -1)
import cPickle as pickle
wikipedia_pickle = '../Data/WikiAstronauts/WikiAstronauts-DBpedia.p'
medline_pickle = '../Data/MedlinePlus/MedlinePlus-SemRep.p'

In [2]:
wiki_list = pickle.load( open( wikipedia_pickle, "rb" ) )
medline_list = pickle.load( open( medline_pickle, "rb" ) )

In [3]:
def clean_annotations(string):
    formal = True
    annotations = re.findall(r'(\[\[[0-9a-zA-Z_.,\-\(\)\%\'\|\ ]+\]\])', string)
    if len(annotations) > 0:
        for annotation in annotations:
            if formal:
                replacement = re.findall(r'\|([0-9a-zA-Z_.,\-\(\)\%\']+)\]\]', annotation)
            else:
                replacement = re.findall(r'\[\[([0-9a-zA-Z_.,\-\(\)\%\'\ ]+)\|', annotation)
            string = string.replace(annotation, replacement[0])
        clean_annotations(string)
    return string

In [4]:
namespaces = ['(http\:\/\/dbpedia\.org/\w+\/)', 'http\:\/\/www\.w3\.org\/1999\/02\/22\-rdf\-syntax\-ns\#', 'http\:\/\/www\.w3\.org\/2000\/01\/rdf\-schema\#']
def clean_namespace(triples):
    for j in range(0, len(triples)):
        for namespace in namespaces:
            triples[j] = re.sub(re.compile(namespace, re.DOTALL), '', triples[j])\
              .replace('\t', '').replace('\n', '') 
    return triples
print clean_namespace(wiki_list[0]['triples'])

['Soyuz_TM-6 crewLaunching Abdul_Ahad_Mohmand']


In [5]:
def highlight(triples):
    for j in range(0, len(triples)):
        triples[j] = triples[j].split()
        triples[j][0] = '<b><font color="blue">' + triples[j][1] + '</font></b>'
        triples[j][1] = '<b><font color="red">' + triples[j][1] + '</font></b>'
        triples[j][-1] = '<b><font color="blue">' + triples[j][1] + '</font></b>'
        triples[j] = ' '.join(triples[j])
    return triples
print highlight((wiki_list[0]['triples']))[0]

<b><font color="blue">crewLaunching</font></b> <b><font color="red">crewLaunching</font></b> <b><font color="blue"><b><font color="red">crewLaunching</font></b></font></b>


In [6]:
def clean_annotations(original, triples, unlinked_entities):
    #print('Original')
    #print original, triples
    annotations = re.findall(r'(\[\[[0-9a-zA-Z_.,\-\(\)\%\'\|\ ]+\]\])', original)
    if len(annotations) > 0:
        for annotation in annotations:
            formal_part = re.findall(r'\|([0-9a-zA-Z_.,\-\(\)\%\']+)\]\]', annotation)[0]
            informal_part = re.findall(r'\[\[([0-9a-zA-Z_.,\-\(\)\%\'\ ]+)\|', annotation)[0]
            original = original.replace(annotation, '<b><font color="blue">' + informal_part + '</font></b>')
            #print simplification
            #print formal_part
            for i in range(0, len(triples)):
                triples[i] = triples[i].split()
                if formal_part == triples[i][0]:
                    triples[i][0] = informal_part
                    if formal_part in unlinked_entities:
                        unlinked_entities.remove(formal_part)
                if formal_part == triples[i][-1]:
                    triples[i][-1] = informal_part
                    if formal_part in unlinked_entities:
                        unlinked_entities.remove(formal_part)
                triples[i] = ' '.join(triples[i])
                """
                if formal_part in triples[i]:
                    #namespace = re.findall(r'([0-9a-zA-Z_.,\-\(\)\%\'\:\/])+{}'.format(re.escape(formal_part)), triples[i])[0]
                    namespace = re.findall(r'({})'.format(re.escape(formal_part)), triples[i])[0]
                    triples[i] = triples[i].replace(namespace, informal_part)
                """
        clean_annotations(original, triples, unlinked_entities)
    return original, triples, unlinked_entities

In [7]:
def exclude(triples, unlinked_entities):
    for entity in unlinked_entities:
        for i in range(len(triples) - 1, -1, -1):
            triples[i] = triples[i].split()
            if triples[i][0] == entity or triples[i][-1] == entity:
                triples[i] = ' '.join(triples[i])
                print('Removing triple-fact: %s ...' % (triples[i]))
                triples.pop(i)
            else: triples[i] = ' '.join(triples[i])
    return triples

def setup_experiment(list):
    num_sentences = 20
    incl_sentences = []
    while len(incl_sentences) < num_sentences:
        sentence = np.random.randint(0, len(list))
        entities = []
        init_triples = highlight(clean_namespace(list[sentence]['triples']))
        for i in range(0, len(init_triples)):
                init_triples[i] = init_triples[i].split()
                if init_triples[i][0] not in entities: entities.append(init_triples[i][0])
                if init_triples[i][-1] not in entities: entities.append(init_triples[i][-1])
                init_triples[i] = ' '.join(init_triples[i])
        annotated_sentence, triples, unlinked_entities = clean_annotations(list[sentence]['annotated_sentence'].replace('\t', '').replace('\n', ''), \
                                              init_triples, entities)
        triples = exclude(triples, unlinked_entities)
        if len(triples) > 0:
            incl_sentences.append({'Original Sentence': annotated_sentence})

            incl_sentences[len(incl_sentences) - 1]['Triple-Fact 1'] = triples[0]         
            for j in range(1, len(list[sentence]['triples'])):
                incl_sentences[len(incl_sentences) - 1][('Triple-Fact %d' % (j + 1))] = triples[j]
            list.pop(sentence)
    return incl_sentences    

In [8]:
#output_df = pd.DataFrame(setup_experiment(wiki_list + medline_list))
output_df = pd.DataFrame(setup_experiment(wiki_list))
output_df.to_csv('../Data/Shorten-Sentence.csv', index=False)

Removing triple-fact: <b><font color="blue">shipNamesake</font></b> <b><font color="red">shipNamesake</font></b> <b><font color="blue"><b><font color="red">shipNamesake</font></b></font></b> ...
Removing triple-fact: <b><font color="blue">shipNamesake</font></b> <b><font color="red">shipNamesake</font></b> <b><font color="blue"><b><font color="red">shipNamesake</font></b></font></b> ...
Removing triple-fact: <b><font color="blue">location</font></b> <b><font color="red">location</font></b> <b><font color="blue"><b><font color="red">location</font></b></font></b> ...
Removing triple-fact: <b><font color="blue">type</font></b> <b><font color="red">type</font></b> <b><font color="blue"><b><font color="red">type</font></b></font></b> ...
Removing triple-fact: <b><font color="blue">type</font></b> <b><font color="red">type</font></b> <b><font color="blue"><b><font color="red">type</font></b></font></b> ...
Removing triple-fact: <b><font color="blue">type</font></b> <b><font color="red">type

KeyboardInterrupt: 