In [17]:
import json
from copy import deepcopy

import pandas as pd
from tqdm import tqdm

## Create a markup template for each NER entity type

In [18]:
# Entity type labels; every label has a highlight colour in `colours`.
ner_types = ['loc', 'per', 'fac', 'org', 'char']

# Highlight colour per entity type; the colour name becomes a CSS class suffix.
colours = dict(
    loc='green',
    per='blue',
    fac='yellow',
    org='red',
    char='grey',
)

# Base markup for a stretch of text that is NOT an entity.
# "contentLength" is filled in per segment when the markup is generated.
unselected = {"class": "text__unselected"}

# Base markup for a highlighted entity. Per entity type, `ner_template` adds:
#   - "text__input_{COLOUR}" to "textInputClasses"
#   - "text__selected-inner_{COLOUR}" to "tagSelectorClasses"
#   - "value": the entity type
pattern = {
    "class": "text__selected",
    "textInputClasses": ["text__input"],
    "tagSelectorClasses": ["text__selected-inner"],
}

In [19]:
def ner_template(pattern, ner_type):
    """Build the highlighted-entity markup for one entity type.

    Works on a deep copy of `pattern`, so the shared base template is left
    untouched. The colour registered for `ner_type` in the module-level
    `colours` mapping is appended (as a CSS class) to both class lists and
    the entity type itself is stored under 'value'.

    Raises KeyError if `ner_type` has no entry in `colours`.
    """
    template = deepcopy(pattern)
    class_prefixes = (
        ('textInputClasses', 'text__input_'),
        ('tagSelectorClasses', 'text__selected-inner_'),
    )
    for key, prefix in class_prefixes:
        template[key].append(f"{prefix}{colours[ner_type]}")
    template['value'] = ner_type
    return template

# Split the data

Split the text into NER phrases, their entity types and their start positions in the text.

In [4]:
def split_data(text: str, bio: str) -> tuple:
    """Extract entity phrases from a whitespace-tokenised text and its BIO tags.

    Args:
        text: tokens separated by whitespace.
        bio: one tag per token, separated by whitespace ('B-XXX', 'I-XXX', 'O').

    Returns:
        A ``(tokens, types, start_positions)`` tuple of three equally long lists:
        the entity phrases (their tokens joined by a single space), the
        lower-cased entity types (the tag without its 'B-'/'I-' prefix) and the
        offset of each phrase in the text. Offsets are computed as if every
        token were followed by exactly one separator character.
        All three lists are empty when the text contains no entity.

    Raises:
        ValueError: if the number of tokens and the number of tags differ.

    Notes:
        * Any tag that is not a continuation ('I-' while an entity is open)
          closes the open entity, so a phrase never spans an 'O' token.
        * An 'I-' tag with no open entity starts a new entity, which keeps
          phrases and start positions aligned on malformed tag sequences.
    """
    text_split = text.split()
    bio_split = bio.split()

    if len(text_split) != len(bio_split):
        raise ValueError(
            f"tokens and BIO tags differ in length: "
            f"{len(text_split)} tokens vs {len(bio_split)} tags"
        )

    tokens = []
    types = []
    start_positions = []

    phrase_parts = []  # tokens of the entity that is currently open
    ner_type = ''      # lower-cased type of the open entity
    left_context = 0   # offset of the current token in the text

    for token, tag in zip(text_split, bio_split):
        is_begin = tag.startswith('B-')
        is_inside = tag.startswith('I-')

        if is_inside and phrase_parts:
            # Continuation of the open entity.
            phrase_parts.append(token)
        else:
            # Everything else ends the open entity (if there is one).
            if phrase_parts:
                tokens.append(' '.join(phrase_parts))
                types.append(ner_type)
                phrase_parts = []

            if is_begin or is_inside:
                # 'B-' or a stray 'I-': open a new entity at this token.
                phrase_parts = [token]
                ner_type = tag[2:].lower()
                start_positions.append(left_context)

        # Every token advances the offset, whatever its tag is.
        left_context += len(token) + 1

    # Flush an entity that runs up to the end of the text.
    if phrase_parts:
        tokens.append(' '.join(phrase_parts))
        types.append(ner_type)

    return tokens, types, start_positions

# Create 'result' markup

In [5]:
def create_result(tokens: list, types: list, start_positions: list) -> dict:
    """Group entity phrases by entity type.

    Returns a dict that maps every type in `types` to a list of
    ``{'position': start, 'value': phrase}`` entries, in input order.
    """
    grouped = {}
    for entity_type in types:
        grouped[entity_type] = []

    for phrase, entity_type, position in zip(tokens, types, start_positions):
        grouped[entity_type].append({'position': position, 'value': phrase})

    return grouped

# Create 'text_review_mode' markup

In [6]:
def get_unselected(text: str, tokens: list, start_positions: list, template: dict = None) -> list:
    """Build the markup for the stretches of text that lie between entities.

    Args:
        text: the full text.
        tokens: entity phrases, in text order.
        start_positions: offset of each phrase in `text`.
        template: base dict for an unselected segment; defaults to the
            module-level `unselected` dict. It is copied, never modified.

    Returns:
        ``len(tokens) + 1`` dicts, each a copy of the template with a
        'contentLength' key: the text before the first entity, the gaps between
        consecutive entities and the text after the last one. When there is no
        entity, a single segment covering the whole text is returned.
    """
    base = unselected if template is None else template

    def segment(length):
        # Fresh copy per segment so the shared template stays untouched.
        item = deepcopy(base)
        item['contentLength'] = length
        return item

    if not tokens:
        return [segment(len(text))]

    segments = [segment(start_positions[0])]
    for i, token in enumerate(tokens):
        entity_end = start_positions[i] + len(token)
        # The gap runs up to the next entity, or to the end of the text.
        if i + 1 < len(start_positions):
            gap_end = start_positions[i + 1]
        else:
            gap_end = len(text)
        segments.append(segment(gap_end - entity_end))

    return segments

In [7]:
def get_selected(tokens: list, types: list, ner_templates: dict) -> list:
    """Build the markup for the highlighted entities.

    For every (phrase, type) pair the template registered for that type in
    `ner_templates` is deep-copied and its 'contentLength' is set to the
    length of the phrase. Raises KeyError for a type without a template.
    """
    return [
        {**deepcopy(ner_templates[entity_type]), 'contentLength': len(phrase)}
        for phrase, entity_type in zip(tokens, types)
    ]

In [8]:
def create_text_review_mode(
    text: str,
    tokens: list,
    types: list,
    start_positions: list,
    ner_templates: dict
) -> list:
    """Interleave unselected and selected segments into one markup list.

    The result alternates, starting with an unselected segment:
    unselected, selected, unselected, ..., unselected.
    """
    highlighted = get_selected(tokens, types, ner_templates)
    gaps = get_unselected(text, tokens, start_positions)

    # One gap goes in front of every entity, plus one after the last entity.
    pair_count = min(len(highlighted) + 1, len(gaps))

    markup = []
    for idx in range(pair_count):
        markup.append(gaps[idx])
        if idx < len(highlighted):
            markup.append(highlighted[idx])

    # Entities that have no gap in front of them keep their relative order.
    markup.extend(highlighted[pair_count:])
    return markup

# Main function

In [9]:
def bio_to_js(text, bio, ner_templates):
    """Convert one BIO-tagged text into the three values of an output row.

    Args:
        text: whitespace-separated tokens.
        bio: whitespace-separated BIO tags, one per token.
        ner_templates: entity type -> markup template for a highlighted entity.

    Returns:
        ``(text, result, text_review_mode)`` where
        * ``text`` is the input text, unchanged;
        * ``result`` is a JSON object string mapping each entity type to its
          ``{"position", "value"}`` entries;
        * ``text_review_mode`` is the JSON serialisation of the segment list
          without the enclosing square brackets.
    """
    tokens, types, start_positions = split_data(text, bio)
    result = create_result(tokens, types, start_positions)
    text_review_mode = create_text_review_mode(text, tokens, types, start_positions, ner_templates)

    # json.dumps escapes quotes and backslashes inside values correctly;
    # replacing "'" with '"' in the Python repr breaks on e.g. "O'Brien".
    # ensure_ascii=False keeps non-ASCII text readable, and the default
    # separators give the same layout as the repr for plain values.
    text_review_mode = json.dumps(text_review_mode, ensure_ascii=False)[1:-1]
    result = json.dumps(result, ensure_ascii=False)

    return text, result, text_review_mode

# Import the data

In [10]:
# Load the three splits, keeping only the token column and the BIO tag column.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)[['tokens', 'BIO_str']]
    for csv_path in ('train_df_new.csv', 'val_df_new.csv', 'test_df_new.csv')
)

# Annotate

In [11]:
# One ready-made highlighted-entity template per entity type.
ner_templates = {
    entity_type: ner_template(pattern, entity_type)
    for entity_type in ('loc', 'per', 'org', 'char', 'fac')
}

In [12]:
# Empty frame with the three output columns; the annotated rows are stored in it below.
df_js = pd.DataFrame(columns=['INPUT:input', 'INPUT:result', 'INPUT:text_review_mode'])

In [13]:
# Collect all rows first and build the frame once: growing a DataFrame row by
# row is quadratic, and re-running the cell would append duplicate rows.
records = []
for text, bio in zip(test_df['tokens'], test_df['BIO_str']):
    if set(bio.split()) != {'O'}:  # keep only rows that contain at least one NER tag
        records.append(bio_to_js(text, bio, ner_templates))

# Each record is a (text, result, text_review_mode) tuple, matching the three columns.
df_js = pd.DataFrame(records, columns=df_js.columns)

In [272]:
# Save the annotated rows as a tab-separated file, without the index column.
df_js.to_csv('full_pool.tsv', sep='\t', index=False)

# For tests

In [14]:
# Sample sentence (13 whitespace-separated tokens) and its BIO tags, one tag per token.
text = 'В нем говорилось : > « Уважаемые Александр Сергеевич и Людмила Алексеевна !'
bio = 'O O O O O O O B-PER I-PER O B-PER I-PER O'

In [16]:
# Split the sample into entity phrases, their types and their start offsets, then display them.
tokens, types, start_positions = split_data(text, bio)
tokens, types, start_positions

(['Александр Сергеевич', 'Людмила Алексеевна'], ['per', 'per'], [33, 55])

In [None]:
text_review_mode = create_text_review_mode(text, tokens, types, start_positions, ner_templates)
# Serialise with json.dumps (quote-safe, unlike replacing "'" in the repr) and
# drop the enclosing square brackets, as bio_to_js does.
text_review_mode = json.dumps(text_review_mode, ensure_ascii=False)[1:-1]


In [275]:
# End-to-end check on the sample: displays the (text, result, text_review_mode) tuple.
bio_to_js(text, bio, ner_templates)

('В нем говорилось : > « Уважаемые Александр Сергеевич и Людмила Алексеевна !',
 '{"per": [{"position": 33, "value": "Александр Сергеевич"}, {"position": 55, "value": "Людмила Алексеевна"}]}',
 '{"class": "text__unselected", "contentLength": 33}, {"class": "text__selected", "textInputClasses": ["text__input", "text__input_blue"], "tagSelectorClasses": ["text__selected-inner", "text__selected-inner_blue"], "value": "per", "contentLength": 19}, {"class": "text__unselected", "contentLength": 3}, {"class": "text__selected", "textInputClasses": ["text__input", "text__input_blue"], "tagSelectorClasses": ["text__selected-inner", "text__selected-inner_blue"], "value": "per", "contentLength": 18}, {"class": "text__unselected", "contentLength": 2}')