In [1]:
import csv
import pandas as pd
import nltk
import re
from mosestokenizer import MosesTokenizer
# Single module-level Moses tokenizer, constructed with the 'en' language code.
# The entity/sentence matching helper below reuses this one instance for every
# label and sentence instead of building a tokenizer per call.
tokenizer = MosesTokenizer('en')

In [2]:
# Load the raw sentence table. The file is read without a header row, so the
# seven column names are supplied here; QUOTE_NONE makes the parser treat any
# quote characters in the file as ordinary text.
sent_data = pd.read_csv(
    '../valid_sentences.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    header=None,
    names=['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label', 'sent'],
)

In [3]:
# Display the frame as loaded from the TSV.
sent_data

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent
0,Q496290,Q884,P495,Pasta,South Korea,country of origin,Pasta (Hangul: 파스타) is a 2010 South Korean tel...
1,Q275740,Q869,P131,Kanchanaburi,Thailand,located in the administrative territorial entity,Kanchanaburi is the largest of the western pro...
2,Q1083043,Q215380,P31,Christie,band,instance of,Christie is an English rock band that formed a...
3,Q35666,Q23392,P186,glacier,ice,material used,A glacier is a persistent body of dense ice th...
4,Q1238748,Q41298,P31,Domus,magazine,instance of,Domus is an architecture and design magazine f...
5,Q1238748,Q41298,P31,Domus,magazine,instance of,"Published by Editoriale Domus, the magazine is..."
6,Q265,Q232,P47,Uzbekistan,Kazakhstan,shares border with,Uzbekistan is bordered by five landlocked coun...
7,Q159729,Q5372,P641,San Antonio Spurs,basketball,sport,The San Antonio Spurs are an American professi...
8,Q382389,Q28389,P106,Jacques Audiard,screenwriter,occupation,Jacques Audiard is a French film director and ...
9,Q2333456,Q3372216,P161,Spider-Man,Paul Soles,cast member,The show starred the voice of Paul Soles as Pe...


In [4]:
# Editorial markup removed by normalize_text, applied strictly in this order
# (each match is replaced by one space, so an earlier removal can expose a
# later marker). Literal markers are escaped so they are matched verbatim.
_MARKUP_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        re.escape('"'),
        re.escape('[note]'),
        # The misspelled form is kept for backward compatibility; the correctly
        # spelled marker is what actually appears in article titles.
        re.escape('(disambuguation)'),
        re.escape('(disambiguation)'),
        re.escape('[citation needed]'),
        re.escape('[update]'),
        re.escape('[contradictory]'),
        re.escape('[page needed]'),
        r'\[relevant\?\s*\–\s*discuss\]',
        re.escape('[clarification needed]'),
        r'\[[a-z]*\?\]',      # e.g. "[who?]", "[?]"
        r'\[nb\s[0-9]*\]',    # e.g. "[nb 3]"
        r'\[nb\]',
        r'\[[a-z]\]',         # single lower-case footnote letter
        r'\[[A-Z]\]',         # single upper-case footnote letter
        r'\[[0-9]+\]',        # numeric citation
        r'[a-z]\.\^',
        re.escape('[.'),
        re.escape('['),       # any remaining opening bracket
    )
)


def normalize_text(text):
    """Strip editorial markup from ``text`` and return its first sentence.

    Double quotes, bracketed editorial tags (citations, notes, "[who?]"-style
    queries, ...) and stray opening brackets are each replaced by a space.
    The result is split with ``nltk.sent_tokenize`` and only the first
    sentence is kept, with runs of whitespace collapsed to single spaces and
    a trailing '.' appended when the sentence does not already end with one.

    Args:
        text: Raw sentence text. Non-string values (for example the NaN that
            pandas produces for an empty cell) are treated as "no text".

    Returns:
        The cleaned first sentence, or an empty string when there is nothing
        left after cleaning (previously this case raised IndexError).
    """
    if not isinstance(text, str):
        return ''

    for pattern in _MARKUP_PATTERNS:
        text = pattern.sub(' ', text)

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # Input was empty or consisted only of markup/whitespace.
        return ''

    first_sentence = ' '.join(sentences[0].split())  # collapse repeated whitespace
    if not first_sentence:
        return ''
    if not first_sentence.endswith('.'):
        first_sentence += '.'
    return first_sentence
    

def is_consecutive_token_in_sent_list(tokens, sent_list):
    """Return True when ``tokens`` occurs as one contiguous run in ``sent_list``.

    Every possible start position in ``sent_list`` is checked, so the match is
    found even when one of the tokens also appears earlier in the sentence
    (e.g. ['New', 'York'] in ['New', 'cars', 'in', 'New', 'York']). Sequences
    whose tokens are all present but not adjacent and in order are rejected.

    Args:
        tokens: Sequence of tokens to look for (e.g. a tokenized entity label).
        sent_list: Sequence of tokens to search in (e.g. a tokenized sentence).

    Returns:
        bool: True if ``tokens`` is a contiguous sub-sequence of ``sent_list``.
        An empty ``tokens`` sequence is never considered contained.
    """
    needle = list(tokens)
    haystack = list(sent_list)
    width = len(needle)

    # An empty entity cannot be "mentioned"; the old code raised IndexError here.
    if width == 0:
        return False

    # range() is empty when the sentence is shorter than the entity.
    return any(
        haystack[start:start + width] == needle
        for start in range(len(haystack) - width + 1)
    )


def is_sent_contain_entities(e1, e2, sent):
    """Check that both entity labels appear in the sentence.

    Each label and the sentence are passed through the module-level
    ``tokenizer``; a label counts as present when
    ``is_consecutive_token_in_sent_list`` accepts its tokens against the
    sentence tokens.

    Args:
        e1: Label of the first entity.
        e2: Label of the second entity.
        sent: Sentence text to search.

    Returns:
        The result of ``contain_e1 and contain_e2`` for the two checks.
    """
    # Tokenize both labels first, then the sentence, and run both checks
    # unconditionally (no short-circuit) before combining them.
    label_tokens = [tokenizer(label) for label in (e1, e2)]
    sentence_tokens = tokenizer(sent)

    found = [
        is_consecutive_token_in_sent_list(one_label, sentence_tokens)
        for one_label in label_tokens
    ]
    return found[0] and found[1]

In [5]:
# Normalise every sentence, then record the positions of rows whose cleaned
# sentence does not contain both entity labels.
#
# The whole column is assigned in one step rather than cell by cell:
# `sent_data['sent'][i] = ...` is chained assignment, which pandas warns about
# and which does not write back to the frame when Copy-on-Write is enabled.
sent_data['sent'] = [normalize_text(raw_sent) for raw_sent in sent_data['sent']]

# Row positions (0-based) of the rows that fail the entity check.
remove_idxs = [
    position
    for position, (e1_label, e2_label, sent) in enumerate(
        zip(sent_data['e1_label'], sent_data['e2_label'], sent_data['sent'])
    )
    if not is_sent_contain_entities(e1_label, e2_label, sent)
]

In [6]:
# Number of rows flagged because the sentence does not contain both entities.
len(remove_idxs)

182

In [7]:
# Drop the flagged rows.
# The frame still has the default 0..n-1 index from read_csv at this point, so
# the collected row numbers are also the row labels. Dropping by label with
# errors='ignore' makes this cell safe to re-run: the positional, in-place form
# (`drop(sent_data.index[remove_idxs], inplace=True)`) would remove a second,
# unrelated set of rows on every re-execution.
# NOTE(review): errors='ignore' also hides labels that were never present;
# confirm the index has not been changed between the loop and this cell.
sent_data = sent_data.drop(index=remove_idxs, errors='ignore')

In [8]:
# Display the frame after the flagged rows were dropped.
sent_data

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent
1,Q275740,Q869,P131,Kanchanaburi,Thailand,located in the administrative territorial entity,Kanchanaburi is the largest of the western pro...
2,Q1083043,Q215380,P31,Christie,band,instance of,Christie is an English rock band that formed a...
3,Q35666,Q23392,P186,glacier,ice,material used,A glacier is a persistent body of dense ice th...
4,Q1238748,Q41298,P31,Domus,magazine,instance of,Domus is an architecture and design magazine f...
5,Q1238748,Q41298,P31,Domus,magazine,instance of,"Published by Editoriale Domus, the magazine is..."
6,Q265,Q232,P47,Uzbekistan,Kazakhstan,shares border with,Uzbekistan is bordered by five landlocked coun...
7,Q159729,Q5372,P641,San Antonio Spurs,basketball,sport,The San Antonio Spurs are an American professi...
8,Q382389,Q28389,P106,Jacques Audiard,screenwriter,occupation,Jacques Audiard is a French film director and ...
9,Q2333456,Q3372216,P161,Spider-Man,Paul Soles,cast member,The show starred the voice of Paul Soles as Pe...
10,Q465282,Q7850,P1412,Liu Xiang,Chinese,"languages spoken, written or signed","Liu Xiang, born Liu Gengsheng and bearing the ..."


In [9]:
# Persist the filtered table (no header, no index column).
# quoting=csv.QUOTE_NONE mirrors how this file is read back: every read_csv in
# this notebook uses QUOTE_NONE, so with the default writer a field containing
# a double quote would be wrapped and doubled on disk and come back altered.
# NOTE(review): with QUOTE_NONE the writer raises if a field contains a tab;
# none are expected, since the data was parsed from a TSV in the first place.
sent_data.to_csv(
    '../valid_sentences_filtered.tsv',
    sep='\t',
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
)

### Cap the Number of Sentences per Triple

In [10]:
# Reload the filtered table from disk; this also gives the frame a fresh
# default index. Same header-less, QUOTE_NONE layout as the raw file.
sent_data = pd.read_csv(
    '../valid_sentences_filtered.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    header=None,
    names=['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label', 'sent'],
)

In [11]:
# Display the frame re-read from the filtered TSV.
sent_data

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent
0,Q275740,Q869,P131,Kanchanaburi,Thailand,located in the administrative territorial entity,Kanchanaburi is the largest of the western pro...
1,Q1083043,Q215380,P31,Christie,band,instance of,Christie is an English rock band that formed a...
2,Q35666,Q23392,P186,glacier,ice,material used,A glacier is a persistent body of dense ice th...
3,Q1238748,Q41298,P31,Domus,magazine,instance of,Domus is an architecture and design magazine f...
4,Q1238748,Q41298,P31,Domus,magazine,instance of,"Published by Editoriale Domus, the magazine is..."
5,Q265,Q232,P47,Uzbekistan,Kazakhstan,shares border with,Uzbekistan is bordered by five landlocked coun...
6,Q159729,Q5372,P641,San Antonio Spurs,basketball,sport,The San Antonio Spurs are an American professi...
7,Q382389,Q28389,P106,Jacques Audiard,screenwriter,occupation,Jacques Audiard is a French film director and ...
8,Q2333456,Q3372216,P161,Spider-Man,Paul Soles,cast member,The show starred the voice of Paul Soles as Pe...
9,Q465282,Q7850,P1412,Liu Xiang,Chinese,"languages spoken, written or signed","Liu Xiang, born Liu Gengsheng and bearing the ..."


In [12]:
# Remove fully identical rows, then renumber the index 0..n-1 so that row
# labels and row positions coincide again.
sent_data = (
    sent_data
    .drop_duplicates()
    .reset_index(drop=True)
)

In [13]:
# Display the frame after de-duplication and index reset.
sent_data

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent
0,Q275740,Q869,P131,Kanchanaburi,Thailand,located in the administrative territorial entity,Kanchanaburi is the largest of the western pro...
1,Q1083043,Q215380,P31,Christie,band,instance of,Christie is an English rock band that formed a...
2,Q35666,Q23392,P186,glacier,ice,material used,A glacier is a persistent body of dense ice th...
3,Q1238748,Q41298,P31,Domus,magazine,instance of,Domus is an architecture and design magazine f...
4,Q1238748,Q41298,P31,Domus,magazine,instance of,"Published by Editoriale Domus, the magazine is..."
5,Q265,Q232,P47,Uzbekistan,Kazakhstan,shares border with,Uzbekistan is bordered by five landlocked coun...
6,Q159729,Q5372,P641,San Antonio Spurs,basketball,sport,The San Antonio Spurs are an American professi...
7,Q382389,Q28389,P106,Jacques Audiard,screenwriter,occupation,Jacques Audiard is a French film director and ...
8,Q2333456,Q3372216,P161,Spider-Man,Paul Soles,cast member,The show starred the voice of Paul Soles as Pe...
9,Q465282,Q7850,P1412,Liu Xiang,Chinese,"languages spoken, written or signed","Liu Xiang, born Liu Gengsheng and bearing the ..."


In [14]:
# Maximum number of consecutive rows allowed to share one
# (e1_id, rel_id, e2_id) triple; rows beyond that are marked for removal.
max_sent = 40

# Build the triple key for every row by iterating the columns directly.
# This avoids label lookups such as sent_data['e1_id'][0], which raise on an
# empty frame and only work while the index is exactly 0..n-1.
triples = [
    e1_id + ' ' + rel_id + ' ' + e2_id
    for e1_id, rel_id, e2_id in zip(
        sent_data['e1_id'], sent_data['rel_id'], sent_data['e2_id']
    )
]

# Row positions (0-based) of the rows that exceed the cap.
rem_ids = []
sent_num = 0          # length of the current run of identical triples
prev_triple = None    # no string key equals None, so row 0 starts a run
for i, curr_triple in enumerate(triples):
    if curr_triple == prev_triple:
        sent_num += 1
    else:
        # Only adjacent rows are grouped: the same triple appearing again
        # later in the file starts a new run with its own count.
        sent_num = 1
        prev_triple = curr_triple
    if sent_num > max_sent:
        rem_ids.append(i)

In [15]:
# Number of rows that exceed the max_sent cap for their triple.
len(rem_ids)

0

In [16]:
# Drop the rows that exceed the per-triple cap.
# The index was reset to 0..n-1 after de-duplication, so the collected row
# numbers are also the row labels. Dropping by label with errors='ignore' keeps
# this cell safe to re-run; the positional, in-place form
# (`drop(sent_data.index[rem_ids], inplace=True)`) would delete further rows on
# every re-execution.
# NOTE(review): errors='ignore' also hides labels that were never present;
# confirm the index is unchanged between the counting loop and this cell.
sent_data = sent_data.drop(index=rem_ids, errors='ignore')

In [17]:
# Display the frame after rows over the per-triple cap were removed.
sent_data

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent
0,Q275740,Q869,P131,Kanchanaburi,Thailand,located in the administrative territorial entity,Kanchanaburi is the largest of the western pro...
1,Q1083043,Q215380,P31,Christie,band,instance of,Christie is an English rock band that formed a...
2,Q35666,Q23392,P186,glacier,ice,material used,A glacier is a persistent body of dense ice th...
3,Q1238748,Q41298,P31,Domus,magazine,instance of,Domus is an architecture and design magazine f...
4,Q1238748,Q41298,P31,Domus,magazine,instance of,"Published by Editoriale Domus, the magazine is..."
5,Q265,Q232,P47,Uzbekistan,Kazakhstan,shares border with,Uzbekistan is bordered by five landlocked coun...
6,Q159729,Q5372,P641,San Antonio Spurs,basketball,sport,The San Antonio Spurs are an American professi...
7,Q382389,Q28389,P106,Jacques Audiard,screenwriter,occupation,Jacques Audiard is a French film director and ...
8,Q2333456,Q3372216,P161,Spider-Man,Paul Soles,cast member,The show starred the voice of Paul Soles as Pe...
9,Q465282,Q7850,P1412,Liu Xiang,Chinese,"languages spoken, written or signed","Liu Xiang, born Liu Gengsheng and bearing the ..."


In [18]:
# Overwrite the filtered file with the de-duplicated, capped table
# (no header, no index column).
# quoting=csv.QUOTE_NONE mirrors the read_csv calls in this notebook, which all
# use QUOTE_NONE; with the default writer a field containing a double quote
# would be wrapped and doubled on disk and come back altered.
# NOTE(review): with QUOTE_NONE the writer raises if a field contains a tab;
# none are expected, since the data was parsed from a TSV in the first place.
sent_data.to_csv(
    '../valid_sentences_filtered.tsv',
    sep='\t',
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
)

### Get Sentences Only

In [20]:
# Load a filtered sentence table (header-less TSV, quotes kept as literal text).
# NOTE(review): this reads the *train* file, whereas the cells above produced
# '../valid_sentences_filtered.tsv' - confirm which split is intended here.
sent_data_filtered = pd.read_csv(
    '../train_sentences_filtered.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    header=None,
    names=['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label', 'sent'],
)

In [21]:
# Display the loaded filtered frame.
sent_data_filtered

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent
0,Q18394838,Q482994,P31,Trick,album,instance of,Trick is the second solo album by Kele Okereke...
1,Q1324135,Q1563,P131,Floridita,Havana,located in the administrative territorial entity,Floridita (Spanish pronunciation: floɾiˈðita )...
2,Q1324135,Q1563,P131,Floridita,Havana,located in the administrative territorial entity,Hemingway's children also noted that in the ea...
3,Q68155,Q65453,P47,Schwyz,Illgau,shares border with,Illgau is a municipality in Schwyz District in...
4,Q1663240,Q11424,P31,Spun,film,instance of,Spun is a 2002 American black comedy crime dra...
5,Q130469,Q889,P17,Mazar-i-Sharif,Afghanistan,country,Mazar-i-Sharif (Dari/Pashto: مزار شریف‎; Mazâr...
6,Q130469,Q889,P17,Mazar-i-Sharif,Afghanistan,country,Mazar-i-Sharif is the regional hub of northern...
7,Q130469,Q889,P17,Mazar-i-Sharif,Afghanistan,country,Thus the ruler of North Central Afghanistan de...
8,Q130469,Q889,P17,Mazar-i-Sharif,Afghanistan,country,Afterwards Mazar-i-Sharif became the de facto ...
9,Q130469,Q889,P17,Mazar-i-Sharif,Afghanistan,country,The 209th Corps (Shaheen) of the Afghan Nation...


In [22]:
# Keep only the 'sent' column (the sentence text) for export.
sents_only = sent_data_filtered['sent']

In [23]:
# Write the sentence column on its own, without header or index.
# The target path is written without a file extension.
sents_only.to_csv(
    '../graphene/train_sentences_filtered',
    sep='\t',
    index=False,
    header=False,
)

### Split the Sentence File into Two Parts

In [3]:
# Load sentences into a frame with a single 'sent' column.
# NOTE(review): only one column name is given, so this presumably expects a
# one-sentence-per-line file under '../sentences/', not the seven-column table
# written to '../valid_sentences_filtered.tsv' earlier - confirm the input.
sents_only = pd.read_csv(
    '../sentences/valid_sentences_filtered.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    header=None,
    names=['sent'],
)

In [5]:
# Split the sentences into two consecutive parts at a fixed row offset.
# NOTE(review): 3847 is hard-coded; presumably chosen for this particular
# file - confirm it still matches the current input size.
split_at = 3847
sents_only_1 = sents_only[:split_at]
sents_only_2 = sents_only[split_at:]

In [8]:
# Write the first part of the split (no header, no index).
sents_only_1.to_csv(
    '../sentences/valid_sentences_filtered_1.tsv',
    sep='\t',
    index=False,
    header=False,
)

In [9]:
# Write the second part of the split (no header, no index).
sents_only_2.to_csv(
    '../sentences/valid_sentences_filtered_2.tsv',
    sep='\t',
    index=False,
    header=False,
)

In [60]:
# Scratch check: how does nltk split two sentences joined by a newline?
# The recorded output below is a one-element list, i.e. the boundary after
# "middle C." followed by '\n' is not detected as a sentence break.
first_sentence = 'Traditional trumpet repertoire rarely calls for notes beyond this range, and the fingering tables of most method books peak at the high C, two octaves above middle C.'
second_sentence = 'Several trumpeters have achieved fame for their proficiency in the extreme high register, among them Maynard Ferguson, Cat Anderson, Dizzy Gillespie, Doc Severinsen, and more recently Wayne Bergeron, Thomas Gansch, James Morrison, Jon Faddis and Arturo Sandoval.'
nltk.sent_tokenize(first_sentence + '\n' + second_sentence)

['Traditional trumpet repertoire rarely calls for notes beyond this range, and the fingering tables of most method books peak at the high C, two octaves above middle C.\nSeveral trumpeters have achieved fame for their proficiency in the extreme high register, among them Maynard Ferguson, Cat Anderson, Dizzy Gillespie, Doc Severinsen, and more recently Wayne Bergeron, Thomas Gansch, James Morrison, Jon Faddis and Arturo Sandoval.']