In [None]:
pip install datasets

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import numpy as np
import pandas as pd
from datasets import load_from_disk

In [None]:
# Name of the fine-tuned checkpoint directory under ner_research/models/.
MODEL_NAME = 'updated_roberta_1k_2'
OUTPUT_DIR = f'ner_research/models/{MODEL_NAME}'

# Token-classification pipeline: weights come from OUTPUT_DIR, while the
# tokenizer is loaded separately under the 'roberta-base' identifier.
pipe = pipeline(
    task='ner',
    model=OUTPUT_DIR,
    tokenizer='roberta-base',
)

In [None]:
# Display color for each integer NER tag id. Tag 0 is black and the remaining
# ids share one color per consecutive pair.
# NOTE(review): presumably 0 is the "outside" tag and each pair is the B-/I-
# variant of one entity type -- confirm against the dataset's label list.
color_map = {
    0: 'black',
    1: 'red', 2: 'red',
    3: 'blue', 4: 'blue',
    5: 'green', 6: 'green',
    7: 'purple', 8: 'purple',
}

In [None]:
def ner_tokens_to_html(tokens, colors=None):
    """Render tagged tokens as an HTML string with one colored span per run.

    Consecutive tokens that carry the same tag are merged into a single
    ``<span>``; a new span is opened whenever the tag changes.

    Args:
        tokens: Iterable of ``(text, tag)`` pairs, in display order.
        colors: Optional mapping from tag to a CSS color name. When omitted,
            the module-level ``color_map`` is used.

    Returns:
        The HTML markup as a string. An empty ``tokens`` iterable yields an
        empty string.

    Raises:
        KeyError: If a tag has no entry in the color mapping.
    """
    # Imported locally so this cell does not depend on the import cell.
    import html

    if colors is None:
        colors = color_map

    parts = []
    previous_tag = None
    for string, tag in tokens:
        # Escape markup characters so token text cannot inject HTML, then turn
        # every newline (not only a token that is exactly '\n') into a break.
        text = html.escape(string, quote=False).replace('\n', '<br>')
        if tag != previous_tag:
            if previous_tag is not None:
                parts.append('</span>')
            parts.append(f"<span style='color: {colors[tag]};'>")
        parts.append(text)
        previous_tag = tag

    # Only close a span if one was actually opened.
    if previous_tag is not None:
        parts.append('</span>')
    return ''.join(parts)

In [None]:
# Directory name (under ner_research/) of the saved dataset; the 'test' split
# of the loaded object is what the cells below read from.
TRAIN_DATASET = 'filtered_dataset_train_modif_2'
dataset_dict = load_from_disk(
    dataset_path=f'ner_research/{TRAIN_DATASET}/',
)

In [None]:
# Pick up to 5 distinct test-set row indices to inspect. The fixed seed makes
# the selection reproducible across kernel restarts, and replace=False
# prevents the same row from being shown twice.
num_test_rows = dataset_dict['test'].num_rows
samples = np.random.default_rng(42).choice(num_test_rows, size=min(5, num_test_rows), replace=False)

In [None]:
# Show the reference (gold) tags for each sampled test row.
for j in samples:
    j = int(j)  # convert the numpy integer to a plain int before indexing
    # Fetch the sampled row once; tokens and tags are paired position by
    # position from this same row.
    row = dataset_dict['test'][j]
    tokens = list(zip(row['tokens'], row['ner_tags']))
    displayHTML(ner_tokens_to_html(tokens))

In [None]:
# Show the model's predicted tags for the same sampled test rows.
for i in samples:
    i = int(i)
    text = ' '.join(dataset_dict['test'][i]['tokens'])
    input_data = pipe.tokenizer(text, return_tensors='tf', truncation=True)
    output_data = pipe.model(**input_data)
    # Compute the per-position argmax once instead of once per token.
    token_ids = input_data['input_ids'][0]
    predicted_tags = np.argmax(output_data['logits'], axis=2)[0]
    tokens = [(pipe.tokenizer.decode(token_ids[pos]), predicted_tags[pos])
              for pos in range(len(predicted_tags))]
    # Drop the first and last positions before rendering.
    # NOTE(review): presumably the tokenizer's start/end special tokens -- confirm.
    displayHTML(ner_tokens_to_html(tokens[1:-1]))

In [None]:
# Load the signatures table and keep a small, de-duplicated sample to inspect.
data = pd.read_parquet('/dbfs/mnt/ds-prod-assets/community_signatures/outputs/temp_1M_signatures_df.parquet')
# Keep rows with at least one extracted detail: a non-empty list in one of the
# list-valued columns, or a non-null address/company.
has_details_filter = (data['emails'].apply(len).gt(0) |
                      data['phones'].apply(len).gt(0) |
                      data['address'].notnull() |
                      data['company'].notnull() |
                      data['job_titles'].apply(len).gt(0) |
                      data['socials'].apply(len).gt(0))
# between() is inclusive on both ends, so spans of 4 through 10 are kept.
num_lines_filter = (data['end_pos'] - data['start_pos']).between(4, 10)
data = data[num_lines_filter & has_details_filter]
data = data.drop_duplicates(subset='body', keep='first')
# First 25 remaining rows, in file order.
sample = data[:25]

In [None]:
# Show the model's predicted tags for each body text in the signature sample.
for text in sample['body']:
    input_data = pipe.tokenizer(text, return_tensors='tf', truncation=True)
    output_data = pipe.model(**input_data)
    # Compute the per-position argmax once instead of once per token.
    token_ids = input_data['input_ids'][0]
    predicted_tags = np.argmax(output_data['logits'], axis=2)[0]
    tokens = [(pipe.tokenizer.decode(token_ids[pos]), predicted_tags[pos])
              for pos in range(len(predicted_tags))]
    # Drop the first and last positions before rendering.
    # NOTE(review): presumably the tokenizer's start/end special tokens -- confirm.
    displayHTML(ner_tokens_to_html(tokens[1:-1]))