## Run this notebook to understand how to get keywords/phrases for the gazetteer features

In [1]:
import os
import json
from pathlib import Path
import spacy
from tqdm import tqdm

# Project root: the parent of the directory the notebook is executed from.
# Data files are resolved relative to this path.
root_directory = Path.cwd().parent

# spaCy pipeline used to tokenise the reconstructed paragraph text.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Load the training data. It is read below as a list whose first element has
# a 'paragraphs' list; each paragraph provides 'entities' and 'sentences'.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(root_directory / 'data/train.json', encoding='utf-8') as fname:
    data_train_json = json.load(fname)

# Distinct, lower-cased surface forms collected per entity label. Sets
# de-duplicate on insertion; labels not listed here are ignored.
keywords_by_label = {'CONST_DIR': set(), 'OBJ_DIR': set()}

# Number of entities whose character offsets could not be mapped to a span.
unaligned_entities = 0

for item in tqdm(data_train_json[0]['paragraphs']):
    entities = item['entities']

    # Rebuild the text from the first sentence's tokens, joined by single spaces.
    # NOTE(review): assumes entity offsets index into this joined string and
    # that only the first sentence is relevant -- confirm against the data.
    raw_string = " ".join([token['orth'] for token in item['sentences'][0]['tokens']])
    raw_string_nlp = nlp(raw_string)

    # Each entity is read as (start_char, end_char, ..., label).
    for entity in entities:
        label_keywords = keywords_by_label.get(entity[-1])
        if label_keywords is None:
            continue

        # char_span returns None when the offsets do not fall on token
        # boundaries; skip those instead of crashing on `.text`.
        span = raw_string_nlp.char_span(entity[0], entity[1])
        if span is None:
            unaligned_entities += 1
            continue

        label_keywords.add(span.text.lower())

# Longest phrases (by word count) first for const_dir; obj_dir stays unordered.
const_dir_keywords = sorted(
    keywords_by_label['CONST_DIR'],
    key=lambda x: len(x.split()),
    reverse=True,
)
obj_dir_keywords = list(keywords_by_label['OBJ_DIR'])

if unaligned_entities:
    print(f"Skipped {unaligned_entities} entities whose offsets do not align with token boundaries")

print("Unique const_dir keywords: ", const_dir_keywords)
print("\nUnique obj_dir keywords: ", obj_dir_keywords)