In [67]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Ask Colab for TensorFlow 2.x before TensorFlow is imported. Any exception
# raised by the magic is deliberately ignored so the cell keeps running.
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf

In [68]:
!pip install -q transformers
!pip install -qU hazm

In [69]:
import math
import numpy as np
import pandas as pd

import hazm

import transformers 
from transformers import AutoTokenizer, AutoConfig
from transformers import TFAutoModelForTokenClassification

import os
from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout

import matplotlib.pyplot as plt

# Report the versions of the main libraries, framed by blank lines.
print()
print('tensorflow', tf.__version__)
print('transformers', transformers.__version__)
print('numpy', np.__version__)
print('pandas', pd.__version__)
print()

# Tell the user whether TensorFlow sees the first GPU; either message is
# preceded by one blank line.
gpu_device = tf.test.gpu_device_name()
print()
if gpu_device == '/device:GPU:0':
    print('SUCCESS: Found GPU: {}'.format(gpu_device))
else:
    print('WARNING: GPU device not found.')


tensorflow 2.3.0
transformers 4.0.1
numpy 1.18.5
pandas 1.1.5




# Setup Neural-Based Classifier

In [70]:
# Single hazm normalizer instance, created once and reused by cleanize().
normalizer = hazm.Normalizer()


def cleanize(text):
    """Return `text` after running it through the shared hazm normalizer.

    This is the one place where text clean-up happens; normalization is the
    only step applied at the moment.
    """
    normalized_text = normalizer.normalize(text)
    return normalized_text

def colorize(text, classes):
    """Wrap `text` in an HTML <span> whose background colour encodes `classes`.

    Args:
        text: Text to show inside the span. It is HTML-escaped before being
            embedded, so characters such as ``<`` or ``&`` coming from user
            input cannot break (or inject into) the generated markup.
        classes: Indexable holding three numbers. ``classes[1]`` scales the
            green channel and ``classes[2]`` the blue channel (each multiplied
            by 255); ``classes[0]`` lowers the alpha, computed as
            ``0.5 - classes[0] / 2``.

    Returns:
        The ``<span>`` element as a string.
    """
    # Local import keeps this helper usable without touching the notebook's
    # import cell.
    from html import escape

    green = str(255 * classes[1])
    blue = str(255 * classes[2])
    opacity = str(.5 - classes[0] / 2)
    style = "background-color: rgba(0, " + green + ", " + blue + ", " + opacity + ")"
    return "<span style='" + style + "'>" + escape(text) + "</span>"

# Successfully loaded (model, tokenizer, labels) triples keyed by model name,
# so repeated calls (for example one per widget click) do not reload the
# weights every time.
_PARSBERT_NER_CACHE = {}


def parsbert_ner_load_model(model_name):
    """Load the token-classification model, its tokenizer and its label list.

    Args:
        model_name: Model identifier passed to the ``from_pretrained`` loaders.

    Returns:
        ``(model, tokenizer, labels)`` on success, where ``labels`` is ordered
        by class id so that ``labels[i]`` names logit column ``i``. If anything
        goes wrong while loading, the error is printed and a list of three
        ``None`` values is returned so callers can still unpack the result.
    """
    cached = _PARSBERT_NER_CACHE.get(model_name)
    if cached is not None:
        return cached

    try:
        config = AutoConfig.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = TFAutoModelForTokenClassification.from_pretrained(model_name)
        # Sort by id instead of trusting the dict's key order: the labels are
        # later indexed with predicted class ids.
        labels = [
            label
            for label, _ in sorted(config.label2id.items(), key=lambda item: int(item[1]))
        ]
    except Exception as error:
        # `Exception` (not a bare except) so Ctrl-C still interrupts the load;
        # report the cause instead of hiding it.
        print('WARNING: could not load model "{}": {!r}'.format(model_name, error))
        return [None] * 3

    _PARSBERT_NER_CACHE[model_name] = (model, tokenizer, labels)
    return model, tokenizer, labels

# Stylesheet emitted ahead of the NER output.
_PARSBERT_NER_CSS = """<style>
    .ner-box {
        direction: rtl;
        font-size: 18px !important;
        line-height: 20px !important;
        margin: 0 0 15px;
        padding: 10px;
        text-align: justify;
        color: #343434 !important;
    }
    .token, .token span {
        display: inline-block !important;
        padding: 2px;
        margin: 2px 0;
    }
    .token.token-ner {
        background-color: #f6cd61;
        font-weight: bold;
        color: #000;
    }
    .token.token-ner .ner-label {
        color: #9a1f40;
        margin: 0px 2px;
    }
    </style>"""

# Amount subtracted from the "O" (no entity) logit before visualising.
# NOTE(review): presumably a hand-tuned bias that favours the entity classes
# in the colouring — confirm with the author.
_NONE_LOGIT_OFFSET = 4

# Label groups that are merged for the visualisation.
_DATE_TIME_LABELS = ("B_TIM", "B_DAT", "I_TIM", "I_DAT")
_LOCATION_LABELS = ("B_LOC", "B_ORG", "I_LOC", "I_ORG")


def _ner_summed_logits(logits, labels, wanted):
    """Sum, per token, the columns of `logits` whose label is in `wanted`."""
    columns = [labels.index(name) for name in wanted]
    return logits[:, columns].sum(axis=1)


def _plot_ner_scores(tokens, none_score, date_and_time, location):
    """Draw one group of three bars (none / date-time / location) per token."""
    index = np.arange(len(tokens))
    width = 0.2

    plt.figure(figsize=(15, 5))
    plt.bar(index, none_score, width, color="black", label="None")
    plt.bar(index + width, date_and_time, width, color="blue", label="Date And Time")
    plt.bar(index + 2 * width, location, width, color="seagreen", label="Location")

    # The three bars sit at index, index + width and index + 2 * width, so
    # the middle of each group is index + width.
    plt.xticks(index + width, tokens)

    plt.legend(loc="best")
    plt.show()


def _render_colorized_tokens(tokens, none_score, location, date_and_time):
    """Build the HTML paragraph in which each token is coloured by its scores."""
    colorized = []
    for i, token in enumerate(tokens):
        if token in ("[SEP]", "[CLS]"):
            continue
        scores = np.array([none_score[i], location[i], date_and_time[i]])
        # Softmax over the three groups; subtracting the maximum keeps exp()
        # from overflowing without changing the result.
        exp_scores = np.exp(scores - scores.max())
        colorized.append(colorize(token, exp_scores / exp_scores.sum()))

    return "<p style='direction: rtl; font-size: 20px;'>" + " ".join(colorized) + "</p>"


def parsbert_ner(texts, model_name, label_translate, visualize=False):
    """Run NER over `texts` and either return or visualise the predictions.

    Args:
        texts: Iterable of input strings; each one is passed through
            ``cleanize`` before tokenization.
        model_name: Model identifier handed to ``parsbert_ner_load_model``.
        label_translate: Mapping from model label (e.g. ``"B_LOC"``) to the
            value reported for a token with that label.
        visualize: Falsy -> collect and return the predictions.
            Truthy -> display a colourised HTML rendering of every text
            instead. A value greater than 1 additionally shows a bar chart of
            the grouped scores per token.

    Returns:
        A list with one entry per text, each a list of
        ``(token, translated_label)`` pairs. When `visualize` is truthy the
        predictions are not collected and the list is empty. If the model
        cannot be loaded the string ``'Something wrong has been happened!'``
        is returned instead.

    Raises:
        ValueError: when visualising with a model whose label set lacks one of
            the PEYMA labels used for the grouped scores.
    """
    global css_is_load

    # The stylesheet is emitted on every call (the previous "already loaded"
    # flag was reset at the top of each call, so it never skipped anything).
    # The module-level flag is still set for anything that reads it.
    display(HTML(_PARSBERT_NER_CSS))
    css_is_load = True

    model, tokenizer, labels = parsbert_ner_load_model(model_name)

    if model is None or tokenizer is None or not labels:
        return 'Something wrong has been happened!'

    output_predictions = []
    for sequence in texts:
        sequence = cleanize(sequence)
        inputs = tokenizer.encode(sequence, return_tensors="tf")
        # Take the tokens from the very ids fed to the model so they always
        # line up one-to-one with the predictions (an encode -> decode ->
        # tokenize round trip is not guaranteed to reproduce them).
        tokens = tokenizer.convert_ids_to_tokens(inputs[0].numpy().tolist())
        outputs = model(inputs)[0]
        predicted_ids = tf.argmax(outputs, axis=2)[0].numpy()
        predictions = [
            (token, label_translate[labels[label_id]])
            for token, label_id in zip(tokens, predicted_ids)
        ]

        if not visualize:
            output_predictions.append(predictions)
            continue

        display(HTML("<hr>"))

        # Columns are looked up by label name rather than by hard-coded
        # index, which also makes the ORG labels use their own columns (they
        # previously re-read the LOC columns).
        logits = outputs[0].numpy()
        none_score = logits[:, labels.index("O")] - _NONE_LOGIT_OFFSET
        date_and_time = _ner_summed_logits(logits, labels, _DATE_TIME_LABELS)
        location = _ner_summed_logits(logits, labels, _LOCATION_LABELS)

        if visualize > 1:
            _plot_ner_scores(tokens, none_score, date_and_time, location)

        display(HTML(_render_colorized_tokens(tokens, none_score, location, date_and_time)))

    return output_predictions

# Setup Rule-Based classifier

# Persian PEYMA NER

In [71]:
# Persian display name for every PEYMA label. The B_ (begin) and I_ (inside)
# variants of an entity type share one name; the "O" label maps to None.
peyma_translate = {
    prefix + "_" + tag: name
    for prefix in ("B", "I")
    for tag, name in (
        ("DAT", "تاریخ"),
        ("LOC", "موقعیت"),
        ("MON", "پول"),
        ("ORG", "سازمان"),
        ("PCT", "درصد"),
        ("PER", "شخص"),
        ("TIM", "زمان"),
    )
}
peyma_translate["O"] = None

In [72]:
#@title Live Playground { display-mode: "form" }
submit_wd = widgets.Button(description='Send', disabled=False, button_style='success', tooltip='Submit')
text_wd = widgets.Textarea(placeholder='Please enter you text ...', rows=5, layout=Layout(width='90%'))
output_wd = widgets.Output()

display(text_wd)
display(submit_wd)
display(output_wd)


def submit_text(sender):
    """Button callback: run NER on the textarea content and show it in `output_wd`.

    Args:
        sender: The widget that fired the click event (unused).
    """
    with output_wd:
        clear_output(wait=True)
        text = text_wd.value

        # Nothing to analyse: skip the model call instead of plotting an
        # input that holds no text.
        if not text.strip():
            print('Please enter some text first.')
            return

        model_name = 'HooshvareLab/bert-base-parsbert-peymaner-uncased'
        print("Please wait...   0-0")
        output = parsbert_ner([text], model_name, peyma_translate, visualize=2)

        # parsbert_ner reports a model-loading failure by returning a message
        # string; surface it rather than silently dropping it.
        if isinstance(output, str):
            print(output)


submit_wd.on_click(submit_text)


Textarea(value='', layout=Layout(width='90%'), placeholder='Please enter you text ...', rows=5)

Button(button_style='success', description='Send', style=ButtonStyle(), tooltip='Submit')

Output()

In [73]:
# Sample sentences passed to parsbert_ner below.
texts = [
         "حسین میخواهد وضعیت هوای تهران را بداند.",
         "اذان ظهر مسکو با اذان مغرب چقدر تفاوت دارد؟",
         "آب و هوای فردای تهران چطور است؟",
         "فردا ساعت ۶ بعد از ظهر هوای شیراز چگونه است؟",
         "دمای هوای مسکو در روز جمعه ۹ آبان چند درجه است؟",
         "اذان ظهر تهران چه موقعی است؟ ",
         "نیمه شب شرعی تورنتو چه زمانی است؟",
         "اذان مغرب فردای قم چه ساعتی است؟",
         "فاصله ی اذان مغرب تهران و اذان صبح مشهد",
         "ساعت چند است؟ ",
         "الان در نیویورک ساعت چند است؟ "
         ]

model_name = 'HooshvareLab/bert-base-parsbert-peymaner-uncased'
print("Please wait...   0-0")
# visualize=1 renders the colourised HTML for each sentence without the bar
# chart; in that mode parsbert_ner does not collect predictions, so `output`
# is an empty list (or the error string if the model failed to load).
output = parsbert_ner(texts, model_name, peyma_translate, visualize=1)


Please wait...   0-0


Some layers from the model checkpoint at HooshvareLab/bert-base-parsbert-peymaner-uncased were not used when initializing TFBertForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-peymaner-uncased and are newly initialized: ['dropout_911']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
