This notebook takes the OntoNotes NER dataset and creates templates (utterances with placeholders) that a PII synthetic data generator can use to create new sentences.

The notebook additionally introduces two new entities: TITLE and ROLE, in order to overcome cases like "UK David Scott called his wife", where the original sentence is "UK Prime Minister Boris Johnson called his wife" as "Prime Minister" was originally tagged as PER in the original dataset. Same logic goes for titles, like Mr., Mrs., Ms.

In [22]:
import pandas as pd
# Show up to 4000 rows when a frame is displayed.
pd.options.display.max_rows = 4000
# Never truncate cell contents. `None` is the supported "no limit" value
# (pandas >= 1.0); the former -1 sentinel is deprecated and rejected by
# newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [23]:
## Download OntoNotes data
# Placeholder: replace the empty string with the loaded corpus before running.
# The next cell iterates over this object and builds a DataFrame with columns
# ["word", "tag"] from every item, so each sentence is presumably a sequence of
# (word, tag) pairs -- TODO confirm against the loader that is used.
ontonotes = ""

### To pandas + add sentence_idx

In [24]:
# One frame per sentence, each carrying its running sentence number in
# "sentence_idx"; the frames are then stacked into one token-level table.
df_list = [
    pd.DataFrame(sentence, columns=["word", "tag"]).assign(sentence_idx=position)
    for position, sentence in enumerate(ontonotes)
]
sentence_id = len(df_list)
ner_dataset = pd.concat(df_list)
ner_dataset.head(10)

In [25]:
sentences = ner_dataset.groupby('sentence_idx')['word'].apply(lambda x: " ".join(x))

In [26]:
print(sentences[:5])

#### Example sentence:

In [27]:
ner_dataset[ner_dataset['sentence_idx']==3]

In [28]:
# Unique entities
ner_dataset['tag'].unique()

Replace tokenizer escape tokens (e.g. `-LRB-`, `-RRB-`) with the literal characters they stand for

In [29]:
# Whole-token substitutions: bracket/quote escape tokens are turned back into
# plain characters. Curly-bracket tokens are mapped to round parentheses too.
token_fixes = {
    '-LRB-': '(',
    '-RRB-': ')',
    '-LCB-': '(',
    '-RCB-': ')',
    '``': '"',
    "''": '"',
    '/.': '.',
}
ner_dataset['word'] = ner_dataset['word'].replace(token_fixes)

In [30]:
# helper columns: the neighbouring words/tags of every token.
# The shifts are done per sentence. A plain Series.shift() would hand the
# first token of a sentence the last token of the previous sentence (and vice
# versa), so rules such as "preceded by 'the'" or "followed by a different
# entity" could fire across sentence boundaries.
words_by_sentence = ner_dataset.groupby('sentence_idx')['word']
tags_by_sentence = ner_dataset.groupby('sentence_idx')['tag']

# Missing neighbours stay NaN for words (callers cast to str before use) ...
ner_dataset['prev-word'] = words_by_sentence.shift(1)
ner_dataset['prev-prev-word'] = words_by_sentence.shift(2)
ner_dataset['next-word'] = words_by_sentence.shift(-1)
ner_dataset['next-next-word'] = words_by_sentence.shift(-2)
# ... and become the outside tag 'O' for tags, so that a sentence edge is
# never mistaken for an adjacent entity by the `!= 'O'` checks further down.
ner_dataset['prev-tag'] = tags_by_sentence.shift(1).fillna('O')
ner_dataset['next-tag'] = tags_by_sentence.shift(-1).fillna('O')

#### Remove unneeded (non PII) entities:

In [31]:
# Entity types that are not treated as PII; their tags are collapsed to 'O'.
TAGS_TO_IGNORE = [
    'CARDINAL', 'FAC', 'LAW', 'LANGUAGE', 'TIME', 'DATE', 'ORDINAL',
    'EVENT', 'QUANTITY', 'WORK_OF_ART', 'MONEY', 'PRODUCT', 'PERCENT',
]


def remote_unwanted_tags(x):
    """Return 'O' for tags of ignored entity types, the tag itself otherwise.

    :param x: a tag string such as 'B-DATE', 'I-PERSON' or 'O'. The first two
        characters are taken to be the 'B-' / 'I-' prefix.
    :return: 'O' when the part after the prefix is listed in TAGS_TO_IGNORE,
        otherwise ``x`` unchanged (single-character and empty tags are always
        returned as they are).
    """
    has_prefix = len(x) > 1
    return 'O' if has_prefix and x[2:] in TAGS_TO_IGNORE else x

# Rewrite the tag column with the ignored entity types collapsed to 'O',
# then display sentence 3 to inspect the result.
ner_dataset['tag'] = ner_dataset['tag'].apply(remote_unwanted_tags)
ner_dataset[ner_dataset['sentence_idx']==3]

#### Remove PERSON tags if preceding word is 'the' (e.g. the Bush administration)

In [32]:
# removing PERSON tags from sentences with a 'the' preceding the person:

def remove_tag_if_the_person(row):
    """Drop the PERSON tag of a name that directly follows the word 'the'.

    Covers the first two tokens of the name, e.g. both "George" and "Bush"
    in "the George Bush administration". Tokens further into a longer name
    cannot be detected from the two-token look-back available in ``row``.

    :param row: mapping with the keys 'tag', 'prev-tag', 'prev-word' and
        'prev-prev-word' (a DataFrame row or a plain dict).
    :return: 'O' when the tag should be removed, otherwise ``row['tag']``.
    """
    tag = row['tag']
    # str() keeps the rule usable on rows whose neighbour is missing (NaN).
    prev_word = str(row['prev-word']).lower()
    prev_prev_word = str(row['prev-prev-word']).lower()

    # "the <B-PERSON>"
    if tag == 'B-PERSON' and prev_word == 'the':
        return 'O'
    # "the <B-PERSON> <I-PERSON>": the continuation token of the same name.
    # The earlier version tested prev-tag == 'I-PERSON' with tag == 'B-PERSON',
    # which never matches a name's second token, so that token kept a dangling
    # I-PERSON tag after its B-PERSON had been removed.
    if (tag == 'I-PERSON' and row['prev-tag'] == 'B-PERSON'
            and prev_prev_word == 'the'):
        return 'O'
    return tag

# The shifted helper columns hold NaN where a token has no neighbour; cast them
# to str so that the .lower() calls inside the row function do not fail.
ner_dataset['prev-word']=ner_dataset['prev-word'].astype('str')
ner_dataset['prev-prev-word']=ner_dataset['prev-prev-word'].astype('str')
# Row-wise pass: every row is judged on the tag/neighbour values it has now.
ner_dataset['tag'] = ner_dataset.apply(remove_tag_if_the_person,axis=1)

#### Remove tag from 's (Joe Wilson's cat)

In [33]:
def remove_tag_if_apostraphe_after_tag(row):
    """Untag a possessive "'s" token that comes right after a tagged token.

    :param row: mapping with the keys 'word', 'tag' and 'prev-tag'.
    :return: 'O' when the word is exactly "'s" and the previous tag is not
        'O'; otherwise the row's current tag.
    """
    if row['word'] != "'s" or row['prev-tag'] == 'O':
        return row['tag']
    return 'O'
# Apply the possessive rule defined in this cell. This line used to call
# remove_tag_if_the_person again (copy-paste slip), so "'s" tokens were never
# untagged.
ner_dataset['tag'] = ner_dataset.apply(remove_tag_if_apostraphe_after_tag,axis=1)

#### Re-tag words from dictionaries (countries, nationalities, roles, titles)

Nationalities and countries:

In [34]:
nationalities = pd.read_csv("../raw_data/nationalities.csv")
nationalities.head()

In [35]:
"algeria" in nationalities['country'].values

In [36]:

ner_dataset['metadata'] = None

def get_nationality_as_metadata(row):
    """Label a token by the column of the nationalities table it appears in.

    The lower-cased word is looked up in the columns 'country',
    'nationality', 'man', 'woman' and 'plural' of the ``nationalities``
    frame, in that order; the first column containing it decides the label.
    (Presumably the table stores lower-case values -- verify against the CSV.)

    :param row: mapping with the keys 'word' and 'metadata'.
    :return: the label of the first matching column, or the row's existing
        'metadata' value when no column contains the word.
    """
    lookup_order = (
        ('country', 'COUNTRY'),
        ('nationality', 'NATIONALITY'),
        ('man', 'NATION_MAN'),
        ('woman', 'NATION_WOMAN'),
        ('plural', 'NATION_PLURAL'),
    )
    lowered = row['word'].lower()
    for column, label in lookup_order:
        if lowered in nationalities[column].values:
            return label
    return row['metadata']

row = pd.Series({'word':'Frenchwoman','metadata':None})
print("Example: Frenchwoman -> ",get_nationality_as_metadata(row))

def update_tag_based_on_metadata(row):
    """Swap the entity type of a tagged token for its metadata label.

    :param row: mapping with the keys 'tag' and 'metadata'.
    :return: the first two characters of the tag (its 'B-' / 'I-' prefix)
        followed by the metadata label when the token is tagged and has
        metadata; otherwise the tag unchanged.
    """
    tag, metadata = row['tag'], row['metadata']
    if tag == 'O' or metadata is None:
        return tag
    return tag[:2] + metadata



In [37]:
ner_dataset['metadata'] = ner_dataset.apply(get_nationality_as_metadata, axis=1)


#### Titles

In [38]:
# Lower-case title words. Several entries appear in both lists; because the
# male list is consulted first, those shared entries always resolve to the
# male label.
# NOTE(review): matching is an exact, lower-cased comparison, so tokens with a
# trailing period (e.g. "Mr.") do not match -- confirm this is intended.
MALE_TITLES = ['mr', 'dr', 'professor', 'eng','prof','doctor']
FEMALE_TITLES = ['mrs', 'ms', 'miss', 'dr', 'professor', 'eng', 'prof','doctor']


def get_title_as_metadata(row):
    """Return the title label for a token, or its existing metadata.

    :param row: mapping with the keys 'word' and 'metadata'.
    :return: 'MALE_TITLE' or 'FEMALE_TITLE' when the lower-cased word is in
        the corresponding list, otherwise ``row['metadata']``.
    """
    lowered = row['word'].lower()
    if lowered in MALE_TITLES:
        return 'MALE_TITLE'
    if lowered in FEMALE_TITLES:
        return 'FEMALE_TITLE'
    return row['metadata']


def update_title_tag_if_missing(row):
    """Give an untagged title word a B-<title> tag.

    :param row: mapping with the keys 'word' and 'tag'.
    :return: 'B-MALE_TITLE' / 'B-FEMALE_TITLE' for a title word whose tag is
        'O'; the row's current tag in every other case.
    """
    lowered = row['word'].lower()
    if row['tag'] == 'O':
        if lowered in MALE_TITLES:
            return 'B-MALE_TITLE'
        if lowered in FEMALE_TITLES:
            return 'B-FEMALE_TITLE'
    return row['tag']

ner_dataset['metadata'] = ner_dataset.apply(get_title_as_metadata,axis=1)
ner_dataset['tag'] = ner_dataset.apply(update_title_tag_if_missing,axis=1)

In [39]:
ner_dataset[ner_dataset['sentence_idx']==18]

### Remove the NORP tag from 'the NORP' if the NORP is not in the nationalities list.

In [40]:
def remove_tag_if_the_norp(row):
    """Drop the NORP tag of a group name that directly follows 'the'.

    Only tokens without metadata are untagged, so tokens that were recognised
    by the dictionary look-ups keep their tag. Covers the first two tokens of
    the name, e.g. both words of "the Christian Democrats".

    :param row: mapping with the keys 'tag', 'prev-tag', 'prev-word',
        'prev-prev-word' and 'metadata'.
    :return: 'O' when the tag should be removed, otherwise ``row['tag']``.
    """
    tag = row['tag']
    if row['metadata'] is not None:
        return tag

    # str() keeps the rule usable on rows whose neighbour is missing (NaN).
    prev_word = str(row['prev-word']).lower()
    prev_prev_word = str(row['prev-prev-word']).lower()

    # "the <B-NORP>"
    if tag == 'B-NORP' and prev_word == 'the':
        return 'O'
    # "the <B-NORP> <I-NORP>": the continuation token of the same name.
    # The earlier version tested prev-tag == 'I-NORP' with tag == 'B-NORP',
    # which never matches a name's second token, so that token kept a
    # dangling I-NORP tag after its B-NORP had been removed.
    if (tag == 'I-NORP' and row['prev-tag'] == 'B-NORP'
            and prev_prev_word == 'the'):
        return 'O'
    return tag
ner_dataset['tag'] = ner_dataset.apply(remove_tag_if_the_norp,axis=1)

### Remove sentences with adjacent different entities (e.g calling from New York Larry King)

In [41]:
# Recompute 'next-tag' from the tags as they are now. The helper column built
# earlier predates the tag rewrites (ignored entity types, 'the' rules,
# titles), so using it would drop sentences because of entities that no longer
# exist. Shifting per sentence also keeps the first token of the following
# sentence out of the comparison; a missing neighbour counts as 'O'.
ner_dataset['next-tag'] = ner_dataset.groupby('sentence_idx')['tag'].shift(-1).fillna('O')

ner_dataset['entity'] = ner_dataset['tag'].str[2:]
ner_dataset['next-entity']=ner_dataset['next-tag'].str[2:]
# A token is "adjacent to a different entity" when both it and the next token
# are tagged and their entity types differ.
# NOTE(review): a title tag directly followed by a PERSON tag also satisfies
# this condition, so such sentences are dropped -- confirm that is intended.
adjacent_idc = (ner_dataset['tag'] != 'O') & (ner_dataset['next-tag'] != 'O') & (ner_dataset['entity'] != ner_dataset['next-entity'])
sentences_to_remove = ner_dataset[adjacent_idc]['sentence_idx'].unique()

# .copy() gives an independent frame, so the column assignments that follow do
# not write into a slice of the unfiltered data.
ner_dataset=ner_dataset[~ner_dataset['sentence_idx'].isin(sentences_to_remove)].copy()

#### Update tag for discovered metadata values (eg. nationalities)

In [42]:
ner_dataset['tag'] = ner_dataset.apply(update_tag_based_on_metadata, axis=1)

In [43]:
ner_dataset

### Create templates based on the NER dataset

In [331]:
import re
class SentenceGetter(object):
    """Group a token-level NER frame into sentences and build templates.

    A sentence is a list of ``(word, tag)`` tuples. A template is the
    sentence text in which every entity is replaced by a ``[PLACEHOLDER]``.

    Attributes:
        n_sent: 1-based number of the sentence ``get_next`` returns next.
        dataset: the frame passed to the constructor.
        grouped: per-``sentence_idx`` Series of ``(word, tag)`` lists.
        sentences: the values of ``grouped`` as a plain list.
    """

    # Square and curly brackets inside tokens are turned into parentheses so
    # that they cannot be confused with the [PLACEHOLDER] markers.
    _BRACKET_TABLE = str.maketrans("[]{}", "()()")

    def __init__(self, dataset):
        """Collect the (word, tag) pairs of every sentence in ``dataset``.

        :param dataset: DataFrame with the columns 'sentence_idx', 'word'
            and 'tag'.
        """
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False

        def agg_func(s):
            return list(zip(s["word"].values.tolist(), s["tag"].values.tolist()))

        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or None when all have been returned.

        The earlier version looked up the key "Sentence: <n>" in ``grouped``.
        The groups are keyed by ``sentence_idx``, so that key never existed
        and the bare ``except`` turned every call into ``None``. Sentences
        are now served from ``self.sentences`` in order.
        """
        position = self.n_sent - 1
        if position >= len(self.sentences):
            return None
        self.n_sent += 1
        return self.sentences[position]

    @staticmethod
    def cleanse_template(template, ents):
        """Tidy the spacing and known artefacts of a raw template string.

        :param template: template text with tokens separated by spaces.
        :param ents: placeholder names used in the template; directly
            repeated placeholders of these names are merged into one.
        :return: the cleaned template; an empty string when the template is
            considered incomplete (it contains "/-").
        """
        # Remove whitespace before certain punctuation marks. The whole run
        # of marks is captured so that "?!" or "..." survives intact (the
        # previous pattern kept only the last mark of a run).
        template = re.sub(r'\s([?,:.!]+)', r'\1', template)

        # Remove whitespaces within double quotes
        template = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', template)

        # Remove whitespaces within quotes
        template = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", template)

        # Remove whitespaces within parentheses
        template = re.sub(r'\(\s*([^\(]*?)\s*\)', r'(\1)', template)

        for ent in ents:
            # Turn PERSON PERSON into PERSON
            duplicates = "[{}] [{}]".format(ent, ent)
            template = template.replace(duplicates, "[{}]".format(ent))

        # Replace additional weird templates. Applied in insertion order.
        to_replace = {
            "[LOCATION] says": "[PERSON] says",
            "[LOCATION] said": "[PERSON] said",
            "[ORGANIZATION] of [ORGANIZATION]": "[ORGANIZATION]",
            "the [COUNTRY]": "[COUNTRY]",
            # Attach the possessive to the preceding word but keep the space
            # after it (it used to be dropped: "[PERSON] 's cat" became
            # "[PERSON]'scat").
            " 's ": "'s ",
            "] 's ": "]'s ",
            "] 's,": "]'s,",
            "] 's.": "]'s.",
            " n't": "n't",
            "/?": "?",
            "%u": "u",
            "%m": "m",
            "%e": "e",
            "%h": "h",
            "%a": "a",
            " %": "%",
            " ?": "?",
            " /?": "?",
            " ' .": "'.",
            "[ ": "(",
            " ]": ")",
            "[PERSON] -- [PERSON]": "[PERSON]",
            "[COUNTRY] -- [ORGANIZATION]": "[ORGANIZATION]",
            "Jews": "[NATIONALITY]",
            "Chinese": "[NATIONALITY]",
            "Dutch": "[NATIONALITY]",
            "[LOCATION], [LOCATION]": "[LOCATION]",
        }

        for weird, replacement in to_replace.items():
            template = template.replace(weird, replacement)

        template = template.replace(" -- ", " - ")

        # Ignore templates that are incomplete
        if "/-" in template:
            template = ""

        # A single, unpaired double quote is dropped.
        if template.count('"') == 1:
            template = template.replace('"', '')

        return template

    @staticmethod
    def get_template(grouped, entity_name_replace_dict):
        """Build the template of one sentence.

        Untagged tokens are copied as text. Every 'B-' token becomes one
        ``[PLACEHOLDER]``; the remaining tagged tokens (the continuation of
        an entity) add nothing, so a multi-token entity yields a single
        placeholder.

        :param grouped: the sentence as a list of ``(word, tag)`` tuples.
        :param entity_name_replace_dict: maps an entity type (the tag
            without its prefix) to the placeholder name.
        :return: the cleansed template, stripped of surrounding whitespace.
        :raises KeyError: when an entity type is missing from
            ``entity_name_replace_dict``.
        """
        pieces = []
        ents = []
        for word, token_tag in grouped:
            # Remove brackets as they interfere with the data generation
            # process. Both bracket kinds are handled in one pass; previously
            # the curly-bracket line started again from the raw token and so
            # discarded the square-bracket replacement.
            token_text = word.translate(SentenceGetter._BRACKET_TABLE)
            token_entity = token_tag[2:] if len(token_tag) > 1 else token_tag

            if token_entity == 'O':
                pieces.append(token_text)
            elif token_tag.startswith('B-') and token_entity not in TAGS_TO_IGNORE:
                ent = entity_name_replace_dict[token_entity]
                ents.append(ent)
                pieces.append("[" + ent + "]")

        # Every piece is preceded by one space, as cleanse_template expects.
        template = "".join(" " + piece for piece in pieces)
        template = SentenceGetter.cleanse_template(template, ents)

        return template.strip()
    
getter = SentenceGetter(ner_dataset)

In [321]:
# Maps the entity type found in a tag (the part after 'B-' / 'I-') to the
# placeholder name written into the template. It is passed to get_template as
# entity_name_replace_dict, which raises KeyError for a type missing here.
# NOTE(review): NORP is mapped to LOCATION and GPE to COUNTRY -- confirm these
# are the placeholders the data generator expects for those types.
ENTITIES_DICTIONARY = {"PERSON":"PERSON",
                       "GPE":"COUNTRY",
                       "NORP":"LOCATION",
                       "LOC":"LOCATION",
                       "ORG":"ORGANIZATION",
                       "MALE_TITLE":"MALE_TITLE",
                       "FEMALE_TITLE":"FEMALE_TITLE",
                       "COUNTRY":"COUNTRY",
                       "NATIONALITY":"NATIONALITY",
                       "NATION_WOMAN":"NATION_WOMAN",
                       "NATION_MAN":"NATION_MAN",
                       "NATION_PLURAL":"NATION_PLURAL"}
                      


sentences = getter.sentences

sent_id = 445

print("original:",sentences[sent_id])
print("template:", getter.get_template(sentences[sent_id],entity_name_replace_dict=ENTITIES_DICTIONARY))

In [322]:
all_templates = [getter.get_template(sentence,entity_name_replace_dict=ENTITIES_DICTIONARY) for sentence in sentences]

In [323]:
print("original length of templates: {}".format(len(all_templates)))
# dict.fromkeys() removes duplicates while keeping first-seen order, so the
# result is the same on every run (the order of list(set(...)) changes with
# string hash randomisation). Empty strings are dropped as well: they are the
# templates that were blanked out as incomplete.
all_templates = list(dict.fromkeys(t for t in all_templates if t))
print("length after duplicates removal: {}".format(len(all_templates)))

In [324]:
# save to file

# One template per line, UTF-8 encoded; an existing file is overwritten.
with open("../raw_data/ontonotes_based_templates.txt","w+",encoding='utf-8') as f:
    f.writelines("%s\n" % line for line in all_templates)
        

In [330]:
template = "[NATIONALITY]'s[MALE_TITLE]'"

template = getter.cleanse_template(template,[])
#template = re.sub('\(\s*([^\(]*?)\s*\)', r'(\1)', template)    
template

In [326]:
if template.count("'")==1:
    print(True)
    template = template.replace("'",'')

In [327]:
template