In [3]:
!pip install transformers
!pip install gradio

Collecting gradio
  Downloading gradio-2.3.9-py3-none-any.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 5.1 MB/s 
[?25hCollecting flask-cachebuster
  Downloading Flask-CacheBuster-1.0.0.tar.gz (3.1 kB)
Collecting markdown2
  Downloading markdown2-2.4.1-py2.py3-none-any.whl (34 kB)
Collecting pycryptodome
  Downloading pycryptodome-3.11.0-cp35-abi3-manylinux2010_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 55.1 MB/s 
[?25hCollecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting analytics-python
  Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)
Collecting Flask-Cors>=3.0.8
  Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Collecting Flask-Login
  Downloading Flask_Login-0.5.0-py2.py3-none-any.whl (16 kB)
Collecting paramiko
  Downloading paramiko-2.8.0-py2.py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 67

In [4]:

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from transformers import AutoConfig, TFAutoModelForTokenClassification
from transformers import AutoTokenizer
from tensorboard import notebook
import gradio as gr

In [5]:
!mkdir -p data/raw
!curl https://github.com/elenanereiss/Legal-Entity-Recognition/raw/master/data/dataset_courts.zip -L -o data/raw/raw.zip
!unzip data/raw/raw.zip -d data/raw

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   168  100   168    0     0    290      0 --:--:-- --:--:-- --:--:--   290
100 4289k  100 4289k    0     0  4436k      0 --:--:-- --:--:-- --:--:-- 4436k
Archive:  data/raw/raw.zip
  inflating: data/raw/bfh.conll      
  inflating: data/raw/bgh.conll      
  inflating: data/raw/bpatg.conll    
  inflating: data/raw/bsg.conll      
  inflating: data/raw/bverfg.conll   
  inflating: data/raw/bverwg.conll   
  inflating: data/raw/bag.conll      


In [6]:
def load_data(filename: str):
    with open(filename, 'r') as file:
        lines = [line[:-1].split() for line in file]
    samples, start = [], 0
    for end, parts in enumerate(lines):
        if not parts:
            sample = [(token, tag.split('-')[-1]) 
                          for token, tag in lines[start:end]]
            samples.append(sample)
            start = end + 1
    if start < end:
        samples.append(lines[start:end])
    return samples

train_samples = load_data('data/raw/bag.conll')
val_samples = load_data('data/raw/bgh.conll')
samples = train_samples + val_samples
schema = ['_'] + sorted({tag for sentence in samples 
                             for _, tag in sentence})
schema

['_',
 'AN',
 'EUN',
 'GRT',
 'GS',
 'INN',
 'LD',
 'LDS',
 'LIT',
 'MRK',
 'O',
 'ORG',
 'PER',
 'RR',
 'RS',
 'ST',
 'STR',
 'UN',
 'VO',
 'VS',
 'VT']

In [7]:
MODEL_NAME = 'bert-base-german-cased' 

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=len(schema))
model = TFAutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config)
model.summary()

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/474k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/508M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_token_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108490752 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  16149     
Total params: 108,506,901
Trainable params: 108,506,901
Non-trainable params: 0
_________________________________________________________________


In [8]:
def tokenize(sample):
    seq = [
               (subtoken, tag)
               for token, tag in sample
               for subtoken in tokenizer(token)['input_ids'][1:-1]
           ]
    return [(3, 'O')] + seq + [(4, 'O')]

def preprocessing(samples):
    tag_index = {tag: i for i, tag in enumerate(schema)}
    tokenized_samples = list(tqdm(map(tokenize, samples)))
    max_len = max(map(len, tokenized_samples))
    X = np.zeros((len(samples), max_len), dtype=np.int32)
    y = np.zeros((len(samples), max_len), dtype=np.int32)
    for i, sentence in enumerate(tokenized_samples):
        for j, (subtoken_id, tag) in enumerate(sentence):
            X[i, j] = subtoken_id
            y[i,j] = tag_index[tag]
    return X, y

X_train, y_train = preprocessing(train_samples[:500])
X_val, y_val = preprocessing(val_samples[:100])

500it [00:00, 569.23it/s]
100it [00:00, 455.38it/s]


In [9]:
NR_EPOCHS=3
BATCH_SIZE=32

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics='accuracy')

history = model.fit(tf.constant(X_train), tf.constant(y_train),
                    validation_split=0.2, epochs=NR_EPOCHS, 
                    callbacks=[tensorboard_callback],
                    batch_size=BATCH_SIZE, verbose=1)
  

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
def tokens_gen(text):
  return [(t, 'O') for t in text.split()]

def agg(sample, predictions):
    results = []
    i = 1
    for token in sample:
        nr_subtoken = len(tokenizer(token)['input_ids']) - 2
        pred = predictions[i:i+nr_subtoken]
        i += nr_subtoken
        y_pred = schema[np.argmax(np.sum(pred, axis=0))]
        results.append((token, y_pred))
    return results

input_sample = tokens_gen('Von einer unangemessenen Verfahrensdauer, wie sie der Entschädigungsanspruch nach §198 Abs 1 Satz 1 GVG voraussetzt')

input_sample_val = preprocess([input_sample])[0]
y_probs = model.predict([input_sample_val])[0]
predictions = [agg(sample, predictions) for sample, predictions in zip(input_sample, y_probs)]
predictions

1it [00:00, 220.38it/s]


[[('Von', 'O'), ('O', 'O')]]

In [11]:
def predeict(text):
  input_sample = text_to_token(text)
  input_sample_val = preprocess([input_sample])[0]
  y_probs = model.predict([input_sample_val])[0]
  predictions = [aggregate(sample, predictions) for sample, predictions in zip(input_sample, y_probs)]
  return str(predictions)

iface = gr.Interface(fn=predict, inputs="text", outputs=["text"])
iface.launch()


Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
This share link will expire in 72 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted
Running on External URL: https://22223.gradio.app


(<Flask 'gradio.networking'>,
 'http://127.0.0.1:7860/',
 'https://22223.gradio.app')