In [119]:
from flask import Flask

app = Flask(__name__)

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, valid_ids=None, label_mask=None):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.valid_ids = valid_ids
        self.label_mask = label_mask

def convert_examples_to_features(all_sentences, all_labels, label_list, max_seq_length, tokenizer):
    """
    :input: examples
    :return: list of features
    :input_ids, input_mask, segment_ids, label_ids, valid_ids, label_mask
    """
    label_map = {label : i for i, label in enumerate(label_list,1)}
    print('label_map', label_map)
    print('\n')

    features = []
    for index, sentence in enumerate(all_sentences):
        textlist = sentence.split(' ')
        print('textlist', textlist)
        labellist = all_labels[index]
        print('labellist', labellist)
        tokens = []
        labels = []
        valid = []
        label_mask = []
        for i, word in enumerate(textlist):
            token = tokenizer.tokenize(word)
            token = [ t for t in token if "##" not in t ]
            tokens.extend(token)
            label_1 = labellist[i]
            for m in range(len(token)):
                if m == 0:
                    labels.append(label_1)
                    valid.append(1)
                    label_mask.append(1)
                else:
                    valid.append(0)
        if len(tokens)>= max_seq_length -1 :
            tokens = tokens[0:(max_seq_length - 1)]
            labels = labels[0:(max_seq_length - 1)]
            valid = valid[0:(max_seq_length - 1)]
            label_mask = label_mask[0:(max_seq_length - 1)]
        
        segment_ids = []
        label_ids = []
        label_mask.insert(0,1)
        for i, token in enumerate(tokens):
            segment_ids.append(0)
            if len(labels) > i:
                label_ids.append(label_map[labels[i]])
        label_mask.append(1)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        label_mask = [1] * len(label_ids)

        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            label_ids.append(0)
            valid.append(0)
            label_mask.append(0)
        while len(label_ids) < max_seq_length:
            label_ids.append(0)
            label_mask.append(0)
        
        print('input_ids', input_ids)
        print('input_mask', input_mask)
        print('segment_ids', segment_ids)
        print('label_ids', label_ids)
        print('valid', valid)
        print('label_mask', label_mask)
        print('\n')
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        assert len(valid) == max_seq_length
        assert len(label_mask) == max_seq_length

        if index < 3:
            app.logger.info("*** Example ***")
            app.logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            app.logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            app.logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            app.logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            app.logger.info("valid: %s" % " ".join([str(x) for x in valid]))
#             app.logger.info("label: %s (id = %d)" % (labellist, label_ids))
            app.logger.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_ids,
                              valid_ids=valid,
                              label_mask=label_mask))
    return features



class IswPreprocessor:
    def __init__(self, filename='data/test-full-isw-release.tsv'):
        app.logger.info('------ Preprocssing ISW German corpus ------')
        self.file = open(filename, encoding='utf-8')
        self.sentences, self.labels, self.flat_labels = self.get_sentences_and_labels()
        self.label_list = list(map(lambda x: x if x != 'NONE' else 'O', set(self.flat_labels)))
        
        app.logger.info("Number of sentences: {0} ".format(len(self.sentences)))
        app.logger.info("Number of tags: {0} ".format(len(self.label_list)))

    def get_sentences_and_labels(self):
        """
        return : list of sentences : ['I have apple', 'I am here', 'hello ']
        return : list of labels : ['O', 'O', 'B-GPE', ...]
        """
        labels, label, sentences, sentence, flat_labels = [], [], [], [], []
        for line in self.file:
            if line.startswith("idx") or line.startswith("0") or line.startswith("NONE"):
                continue
            line = line.strip()
            splits = line.split("\t")
            if '?' in splits[2] or '.' in splits[2] :
                if len(label)>0 and len(sentence)>0:
                    sentences.append(" ".join(sentence))
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            if splits[3] != 'NONE':
                sentence.append(splits[3])
                label.append(splits[6])
                flat_labels.append(splits[6])

        if len(label)>0 and len(sentence)>0:
            sentences.append(" ".join(sentence))
            labels.append(label)

        labels = [list(map(lambda x: x if x != 'NONE' else 'O', i)) for i in labels]

        return sentences, labels, flat_labels

    def get_tag2idx_idx2tag(self):
        """
        return : dict of tag2idx : {'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3,'B-CREAT': 4, ...}
        return : dict of idx2tag : inverted
        """
        tag2idx = {t: i for i, t in enumerate(sorted(self.label_list), 0)}
        idx2tag = {i: t for t, i in tag2idx.items()}
        return tag2idx, idx2tag


class TweetPreprocessor:
    def __init__(self, filename='data/merged_headlines_annos.compact.tsv'):
        app.logger.info('------ Preprocssing Tweets corpus ------')
        self.file = open(filename, encoding='utf-8')
        self.ners_vals=[]

    def get_sentences_and_labels(self):
        """
        return : list of sentences : ['I have apple', 'I am here', 'hello ']
        return : list of labels : ['O', 'O', 'B-GPE', ...]
        """
        labels, label, sentences, sentence, flat_labels = [], [], [], [], []
        for line in self.file:
            if line.startswith("#"):
                continue
            line = line.strip()
            splits = line.split("\t")
            if line.startswith("NONE"):
                if len(label)>0 and len(sentence)>0:
                    sentences.append(" ".join(sentence))
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            sentence.append(splits[1])
            label.append(splits[3])
            flat_labels.append(splits[3])
        
        if len(label)>0 and len(sentence)>0:
            sentences.append(" ".join(sentence))
            labels.append(label)
            
        labels = [list(map(lambda x: x if x != 'NONE' else 'O', i)) for i in labels]
        self.ners_vals = list(map(lambda x: x if x != 'NONE' else 'O', set(flat_labels)))

        app.logger.info("Number of sentences: {0} ".format(len(sentences)))
        app.logger.info("Number of tags: {0} ".format(len(self.ners_vals)))

        return sentences, labels

    def get_tag2idx_idx2tag(self):
        """
        return : dict of tag2idx : {'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3,'B-CREAT': 4, ...}
        return : dict of idx2tag : inverted
        """
        tag2idx = {t: i for i, t in enumerate(sorted(self.ners_vals), 1)}
        idx2tag = {i: t for t, i in tag2idx.items()}
        return tag2idx, idx2tag

## For ISW data set

In [120]:
filename='data/test-full-isw-release.tsv'
isw_pre = IswPreprocessor(filename)
sentences = isw_pre.sentences
labels = isw_pre.labels
label_list = isw_pre.label_list



print("number of sentences:", len(sentences))
print('num of tags :', len(label_list))
i = 2
print(sentences[:i])
print(labels[:i])
print(label_list)

number of sentences: 35
num of tags : 18
['nach Palästina nachher zu kommen', 'Also Sie haben uns ja jetzt schon sehr viel Interessantes erzählt und von Ihrer Jugendzeit in Wien könnten Sie uns da noch einmal ihre Eindrücke vermitteln']
[['O', 'B-GPE', 'B-TIME', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-TIME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DUR', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'B-SORD', 'I-SORD', 'O', 'O', 'O']]
['O', 'I-SORD', 'I-GPE', 'B-FREQ', 'B-AGE', 'B-NRP', 'B-DATE', 'I-ADD', 'B-SORD', 'B-TIME', 'B-TITLE', 'I-PER', 'B-ADD', 'B-GPE', 'B-PER', 'B-DUR', 'B-CARDINAL', 'B-FAC']


In [121]:
tag2idx, idx2tag = isw_pre.get_tag2idx_idx2tag()
tag2idx

{'B-ADD': 0,
 'B-AGE': 1,
 'B-CARDINAL': 2,
 'B-DATE': 3,
 'B-DUR': 4,
 'B-FAC': 5,
 'B-FREQ': 6,
 'B-GPE': 7,
 'B-NRP': 8,
 'B-PER': 9,
 'B-SORD': 10,
 'B-TIME': 11,
 'B-TITLE': 12,
 'I-ADD': 13,
 'I-GPE': 14,
 'I-PER': 15,
 'I-SORD': 16,
 'O': 17}

In [124]:
from keras.preprocessing.sequence import pad_sequences

tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=10, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
tags

array([[17,  7, 11, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 11, 17, 17, 17, 17],
       [17, 17, 17,  7, 17, 11, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17,  6, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 11, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 11, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17, 17, 17, 17, 17, 17],
       [17, 17, 17, 11, 17, 17, 17, 17, 17,  7],
       [17, 17, 17, 17,  7, 17, 17, 17, 17, 17],
       [17, 17, 17, 

In [113]:
from transformers import BertTokenizer, BertForTokenClassification
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased", do_lower_case=False)

features = convert_examples_to_features(all_sentences=sentences, all_labels=labels, label_list=label_list, max_seq_length=10, tokenizer=tokenizer)



label_map {'O': 1, 'I-SORD': 2, 'I-GPE': 3, 'B-FREQ': 4, 'B-AGE': 5, 'B-NRP': 6, 'B-DATE': 7, 'I-ADD': 8, 'B-SORD': 9, 'B-TIME': 10, 'B-TITLE': 11, 'I-PER': 12, 'B-ADD': 13, 'B-GPE': 14, 'B-PER': 15, 'B-DUR': 16, 'B-CARDINAL': 17, 'B-FAC': 18}


textlist ['nach', 'Palästina', 'nachher', 'zu', 'kommen']
labellist ['O', 'B-GPE', 'B-TIME', 'O', 'O']
input_ids [188, 24999, 188, 81, 1561, 0, 0, 0, 0, 0]
input_mask [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
segment_ids [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
label_ids [1, 14, 10, 1, 1, 0, 0, 0, 0, 0]
valid [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
label_mask [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


textlist ['Also', 'Sie', 'haben', 'uns', 'ja', 'jetzt', 'schon', 'sehr', 'viel', 'Interessantes', 'erzählt', 'und', 'von', 'Ihrer', 'Jugendzeit', 'in', 'Wien', 'könnten', 'Sie', 'uns', 'da', 'noch', 'einmal', 'ihre', 'Eindrücke', 'vermitteln']
labellist ['O', 'O', 'O', 'O', 'O', 'B-TIME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DUR', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'B-SORD', 'I-SOR

In [84]:
tt = {label : i for i, label in enumerate(label_list,1)}
tt

{'O': 1,
 'I-SORD': 2,
 'I-GPE': 3,
 'B-FREQ': 4,
 'B-AGE': 5,
 'B-NRP': 6,
 'B-DATE': 7,
 'I-ADD': 8,
 'B-SORD': 9,
 'B-TIME': 10,
 'B-TITLE': 11,
 'I-PER': 12,
 'B-ADD': 13,
 'B-GPE': 14,
 'B-PER': 15,
 'B-DUR': 16,
 'B-CARDINAL': 17,
 'B-FAC': 18}

In [78]:
i = 0
label_map =  {'O': 1, 'I-SORD': 2, 'I-GPE': 3, 'B-FREQ': 4, 'B-AGE': 5, 'B-NRP': 6, 'B-DATE': 7, 'I-ADD': 8, 'B-SORD': 9, 'B-TIME': 10, 'B-TITLE': 11, 'I-PER': 12, 'B-ADD': 13, 'B-GPE': 14, 'B-PER': 15, 'B-DUR': 16, 'B-CARDINAL': 17, 'B-FAC': 18}
textlist = ['nach', 'Palästina', 'nachher', 'zu', 'kommen']
labellist = ['O', 'B-GPE', 'B-TIME', 'O', 'O']
print(features[i])
print('input_ids',features[i].input_ids)
print('input_mask',features[i].input_mask)
print('segment_ids',features[i].segment_ids)
print('label_id',features[i].label_id)
print('valid_ids',features[i].valid_ids)
print('label_mask',features[i].label_mask)

<__main__.InputFeatures object at 0x14099c828>
input_ids [188, 24999, 188, 81, 1561, 0, 0, 0, 0, 0]
input_mask [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
segment_ids [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
label_id [1, 14, 10, 1, 1, 0, 0, 0, 0, 0]
valid_ids [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
label_mask [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [74]:
len(features)

35

In [127]:
label_list.append("[CLS]")

In [128]:
label_list.append("[SEP]")

In [129]:
label_list

['O',
 'I-SORD',
 'I-GPE',
 'B-FREQ',
 'B-AGE',
 'B-NRP',
 'B-DATE',
 'I-ADD',
 'B-SORD',
 'B-TIME',
 'B-TITLE',
 'I-PER',
 'B-ADD',
 'B-GPE',
 'B-PER',
 'B-DUR',
 'B-CARDINAL',
 'B-FAC',
 '[CLS]',
 '[SEP]']

In [72]:
filename='data/merged_headlines_annos.compact.tsv'

tweet_pre = TweetPreprocessor(filename)
sentences, labels = tweet_pre.get_list_of_sentences_labels()
tag2idx, idx2tag = tweet_pre.get_tag2idx_idx2tag()

print("number of sentences:", len(sentences))
print('num of tags :', len(tweet_pre.ners_vals))
i = 2
print(sentences[i])
print(labels[i])
print(tag2idx)
print(idx2tag)
print(tweet_pre.ners_vals)

 ------ Preprocssing Tweets corpus ------
Total number of tweets 8957
Total number of ner tags in tweets 63
number of sentences: 8957
num of tags : 63
Wann auch immer der #Brexit kommen mag - die #IHK bereitet die Unternehmen in #Rheinhessen darauf vor und empfiehlt , vom Worst-Case-Szenario auszugehen . [ plus-Inhalt ]
['O', 'O', 'O', 'O', 'B-EVT', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
{'B-AGE': 0, 'B-ART': 1, 'B-CARDINAL': 2, 'B-CREAT': 3, 'B-DATE': 4, 'B-DUR': 5, 'B-EVT': 6, 'B-FAC': 7, 'B-FRAC': 8, 'B-FREQ': 9, 'B-GPE': 10, 'B-LAN': 11, 'B-LAW': 12, 'B-LOC': 13, 'B-MED': 14, 'B-MISC': 15, 'B-MON': 16, 'B-NRP': 17, 'B-ORDINAL': 18, 'B-ORG': 19, 'B-PER': 20, 'B-PERC': 21, 'B-PRODUCT': 22, 'B-PROJ': 23, 'B-QUANT': 24, 'B-RATE': 25, 'B-SCORE': 26, 'B-SORD': 27, 'B-TIME': 28, 'B-TITLE': 29, 'B-URL': 30, 'I-AGE': 31, 'I-ART': 32, 'I-CARDINAL': 33, 'I-CREAT': 34, 'I-DATE': 35, 'I-DUR': 36, 'I-EVT': 37, 'I-FAC'

In [75]:
set(tweet_pre.ners_vals)-set(isw_pre.ners_vals)

{'B-SCORE', 'B-URL', 'I-CREAT', 'I-SCORE', 'I-URL'}

In [76]:
set(isw_pre.ners_vals)-set(tweet_pre.ners_vals)

{'B-ADD', 'I-ADD'}

In [13]:
from transformers import BertTokenizer, BertForTokenClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased", do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
tokenized_texts = [[ll for ll in e if "##" not in ll] for e in tokenized_texts ]
tokenized_texts

[['I', 'Also', 'Sie', 'haben', 'uns', 'ja', 'jetzt'],
 ['Die', 'zwei'],
 ['Jetzt'],
 ['Ja', 'dann'],
 ['nach'],
 ['Nach'],
 ['Jetzt', 'Jetzt']]

In [14]:
labels

[['O', 'O', 'O', 'O', 'O', 'O', 'B-TIME'],
 ['O', 'B-CARDINAL'],
 ['B-TIME'],
 ['O', 'B-TIME'],
 ['B-TIME'],
 ['B-TIME'],
 ['B-TIME', 'B-TIME']]

In [13]:
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [21]:
max_len=9
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
#                                         maxlen=max_len, 
                          dtype="long", truncating="post", padding="post")
input_ids

array([[  103, 26938, 12482,   371,   474,  2099,  3278,  1868],
       [  125,   382,     0,     0,     0,     0,     0,     0],
       [ 5072,     0,     0,     0,     0,     0,     0,     0],
       [ 6802,   670,     0,     0,     0,     0,     0,     0],
       [  188,   320,     0,     0,     0,     0,     0,     0],
       [  326,   320,     0,     0,     0,     0,     0,     0],
       [ 5072,  5072,     0,     0,     0,     0,     0,     0]])

In [22]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
#                                         maxlen=max_len,  
                                         padding="post",
                                        dtype="long", truncating="post")
tags

array([[2, 2, 2, 2, 2, 2, 1],
       [2, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [2, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0]])

In [20]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
attention_masks

[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]

In [343]:
import os
import torch
import numpy as np
import pandas as pd
import datetime as dt
import json
import nltk
from nltk import word_tokenize
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from seqeval.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tqdm import tqdm, trange
import torch.nn.functional as F

from preprocessor.preprocessor import *
from transformers import BertTokenizer, BertForTokenClassification

In [421]:
class Ner:
    def __init__(self,model_dir: str):
        self.model , self.tokenizer, self.model_config = self.load_model(model_dir)
        self.label_map = self.model_config["label_map"]
        self.max_seq_length = self.model_config["max_seq_length"]
        self.max_seq_length = 10
        self.label_map = {int(k):v for k,v in self.label_map.items()}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.model.eval()

    def load_model(self, model_dir: str, model_config: str = "model_config.json"):
        model_config = os.path.join(model_dir,model_config)
        model_config = json.load(open(model_config))
#         model = BertForTokenClassification.from_pretrained(model_dir)
#         tokenizer = BertTokenizer.from_pretrained(model_dir)
        model = BertForTokenClassification.from_pretrained('bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        return model, tokenizer, model_config

    def tokenize(self, text: str):
        """ tokenize input"""
        words = word_tokenize(text)
        tokens = []
        valid_positions = []
        for i,word in enumerate(words):
            token = self.tokenizer.tokenize(word)
            token = [ t for t in token if "##" not in t ]
            tokens.extend(token)
            for i in range(len(token)):
                if i == 0:
                    valid_positions.append(1)
                else:
                    valid_positions.append(0)
        return tokens, valid_positions

    def preprocess(self, text: str):
        """ preprocess """
        tokens, valid_positions = self.tokenize(text)
        segment_ids = []
        for i in range(len(tokens)):
            segment_ids.append(0)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            valid_positions.append(0)
        return input_ids,input_mask,segment_ids,valid_positions

    def predict(self, text: str):
        input_ids,input_mask,segment_ids,valid_ids = self.preprocess(text)
        input_ids = torch.tensor([input_ids],dtype=torch.long,device=self.device)
        input_mask = torch.tensor([input_mask],dtype=torch.long,device=self.device)
        segment_ids = torch.tensor([segment_ids],dtype=torch.long,device=self.device)
        valid_ids = torch.tensor([valid_ids],dtype=torch.long,device=self.device)
#         print('input_ids', input_ids)
#         print('input_mask', input_mask)
#         print('segment_ids', segment_ids)
#         print('valid_ids', valid_ids)
#         print('valid_ids[0]', valid_ids[0])
        with torch.no_grad():
            outputs = self.model(input_ids, segment_ids, input_mask,valid_ids)
            logits = outputs[0]
#             print('logit type', type(logits))
#             print('logit ', logits)
        logits = F.softmax(logits,dim=2)
        logits_label = torch.argmax(logits,dim=2)
        logits_label = logits_label.detach().cpu().numpy().tolist()[0]
#         print('logits_label:', logits_label)
        logits_confidence = [values[label].item() for values,label in zip(logits[0],logits_label)]
#         print('logits_confidence', logits_confidence)

        logits = []
        for index,mask in enumerate(valid_ids[0]):
            if mask == 1:
#                 print('hi', mask)
                logits.append((logits_label[index], logits_confidence[index]))
#                 print('app logits', logits)
            else:
                pass
        print('label_map', self.label_map)
#         print('logit.pop', logits)
        labels = [(self.label_map[label],confidence) for label,confidence in logits]
        words = word_tokenize(text)
#         print('words:', words)
#         print('labels:', labels)
        assert len(labels) == len(words)
        output = [{"word":word,"tag":label,"confidence":confidence} for word,(label,confidence) in zip(words,labels)]
        return output


In [422]:
sentences ='jetzt bin ich zwölf'

In [423]:
sentences

'jetzt bin ich zwölf'

In [424]:
ner = Ner("/Users/steve.chen/thesis/thesis-ner-co-tri-training/models")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…

KeyboardInterrupt: 

In [411]:
tokens, valid_positions = ner.tokenize(sentences)
print(tokens)
print(valid_positions)

['jetzt', 'bin', 'ich', 'zwölf']
[1, 1, 1, 1]


In [412]:
input_ids,input_mask,segment_ids,valid_positions = ner.preprocess(sentences)

In [415]:
print(input_ids)
print(input_mask)
print(segment_ids)
print(valid_positions)

[1868, 4058, 1169, 4420, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]


In [416]:
ner.predict(sentences)

label_map {0: 'B-ADD', 1: 'B-AGE', 2: 'B-CARDINAL', 3: 'B-DATE', 4: 'B-DUR', 5: 'B-FAC', 6: 'B-FREQ', 7: 'B-GPE', 8: 'B-NRP', 9: 'B-PER', 10: 'B-SORD', 11: 'B-TIME', 12: 'B-TITLE', 13: 'I-ADD', 14: 'I-GPE', 15: 'I-PER', 16: 'I-SORD', 17: 'O'}


[{'word': 'jetzt', 'tag': 'B-ADD', 'confidence': 0.6784576177597046},
 {'word': 'bin', 'tag': 'B-ADD', 'confidence': 0.5905336737632751},
 {'word': 'ich', 'tag': 'B-ADD', 'confidence': 0.7070490717887878},
 {'word': 'zwölf', 'tag': 'B-ADD', 'confidence': 0.6636413931846619}]

In [359]:
sentences ='jetzt bin ich zwölf'
ner.predict(sentences)

input_ids tensor([[1169, 1631,  188,    0,    0,    0,    0,    0,    0,    0]])
input_mask tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])
segment_ids tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
valid_ids tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])
valid_ids[0] tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
logit type <class 'torch.Tensor'>
logit  tensor([[[-0.1963,  0.8812, -0.6778],
         [-0.1679,  0.6354, -0.4804],
         [-0.4352,  0.9377, -0.6370],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636]]])
logits_label: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
logits_confidence [0.6448203325271606, 0.5632150173187256, 0.684721052646637, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963]
0 tensor(1)
hi tensor(1)


[{'word': 'ich', 'tag': 'B-TIME', 'confidence': 0.6448203325271606},
 {'word': 'werde', 'tag': 'B-TIME', 'confidence': 0.5632150173187256},
 {'word': 'nacher', 'tag': 'B-TIME', 'confidence': 0.684721052646637}]

In [101]:
import os
import torch
import json
from nltk import word_tokenize
from transformers import BertTokenizer, BertForTokenClassification
import torch.nn.functional as F



class Ner:
    def __init__(self,model_dir: str):
        self.model , self.tokenizer, self.model_config = self.load_model(model_dir)
        self.label_map = self.model_config["label_map"]
        self.max_seq_length = self.model_config["max_seq_length"]
        self.label_map = {int(k):v for k,v in self.label_map.items()}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.model.eval()

    def load_model(self, model_dir: str, model_config: str = "model_config.json"):
        model_config = os.path.join(model_dir,model_config)
        model_config = json.load(open(model_config))
        model = BertForTokenClassification.from_pretrained(model_dir)
        tokenizer = BertTokenizer.from_pretrained(model_dir)
        return model, tokenizer, model_config

    def tokenize(self, text: str):
        """ tokenize input"""
        words = word_tokenize(text)
        tokens = []
        valid_positions = []
        for i,word in enumerate(words):
            token = self.tokenizer.tokenize(word)
            token = [ t for t in token if "##" not in t ]
            tokens.extend(token)
            for i in range(len(token)):
                if i == 0:
                    valid_positions.append(1)
                else:
                    valid_positions.append(0)
        return tokens, valid_positions

    def preprocess(self, text: str):
        """ preprocess """
        tokens, valid_positions = self.tokenize(text)
        segment_ids = []
        for i in range(len(tokens)):
            segment_ids.append(0)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            valid_positions.append(0)
        return input_ids,input_mask,segment_ids,valid_positions

    def predict(self, text: str):
        input_ids,input_mask,segment_ids,valid_ids = self.preprocess(text)
        input_ids = torch.tensor([input_ids],dtype=torch.long,device=self.device)
        input_mask = torch.tensor([input_mask],dtype=torch.long,device=self.device)
        segment_ids = torch.tensor([segment_ids],dtype=torch.long,device=self.device)
        valid_ids = torch.tensor([valid_ids],dtype=torch.long,device=self.device)

        with torch.no_grad():
            outputs = self.model(input_ids, segment_ids, input_mask,valid_ids)
            logits = outputs[0]

        logits = F.softmax(logits,dim=2)
        logits_label = torch.argmax(logits,dim=2)
        logits_label = logits_label.detach().cpu().numpy().tolist()[0]
        print('logits_label: ', logits_label)
        logits_confidence = [values[label].item() for values,label in zip(logits[0],logits_label)]
        print('logits_confidence: ', logits_confidence)
        print('valid_ids[0]: ', valid_ids[0])
        logits = []
        for index,mask in enumerate(valid_ids[0]):
            if mask == 1:
                logits.append((logits_label[index], logits_confidence[index]))
            else:
                pass
        print('logits: ', logits)
        print('self.label_map: ', self.label_map)
        labels = [(self.label_map[label],confidence) for label,confidence in logits]
        print('labels: ', labels)
        words = word_tokenize(text)
        assert len(labels) == len(words)
        pre_tags = [self.label_map[label] for label, _ in logits]
        print('!!!! pre_tags: ', pre_tags)
        output = [{"word:":word,"tag":label,"confidence":confidence} for word,(label,confidence) in zip(words,labels)]
        return output


class Preditor:
    """
    This class will make predictions for unlabeled data:
    :input : list of sentences
    :return : list of predicted tags
    """
    def __init__(self,model_dir: str):
        self.model , self.tokenizer, self.model_config = self.load_model(model_dir)
        self.label_map = self.model_config["label_map"]
        self.max_seq_length = self.model_config["max_seq_length"]
        self.label_map = {int(k):v for k,v in self.label_map.items()}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.model.eval()

    def load_model(self, model_dir: str, model_config: str = "model_config.json"):
        model_config = os.path.join(model_dir,model_config)
        model_config = json.load(open(model_config))
        model = BertForTokenClassification.from_pretrained(model_dir)
        tokenizer = BertTokenizer.from_pretrained(model_dir)
        return model, tokenizer, model_config
    
    def tokenize(self, sentences:list):
        """
        :input: list of sentences
        :return: list of tokens, list of valid_positions
        """
        all_tokens = []
        all_valid_positions = []
        for text in sentences:
            words = word_tokenize(text)
            tokens = []
            valid_positions = []
            for i, word in enumerate(words):
                token = self.tokenizer.tokenize(word)
                token = [t for t in token if "##" not in t]
                tokens.extend(token)
                for i in range(len(token)):
                    if i == 0:
                        valid_positions.append(1)
                    else:
                        valid_positions.append(0)
            all_tokens.append(tokens)
            all_valid_positions.append(valid_positions)
        
        return all_tokens, all_valid_positions

    def preprocess(self, sentences:list):
        all_tokens, all_valid_positions = self.tokenize(sentences)

        all_input_ids = []
        all_input_mask = []
        all_segment_ids = []
        all_pad_valid_positions = []
        for index, tokens in enumerate(all_tokens):
            segment_ids = []
            for i in range(len(tokens)):
                segment_ids.append(0)
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
                all_valid_positions[index].append(0)
            all_input_ids.append(input_ids)
            all_input_mask.append(input_mask)
            all_segment_ids.append(segment_ids)
            all_pad_valid_positions.append(all_valid_positions[index])
        
        return all_input_ids, all_input_mask, all_segment_ids, all_pad_valid_positions
    
    def predict(self, sentences:list):
        all_input_ids, all_input_mask, all_segment_ids, all_pad_valid_positions = self.preprocess(sentences)
        all_input_ids=torch.tensor(all_input_ids)
        all_input_mask=torch.tensor(all_input_mask)
        all_segment_ids=torch.tensor(all_segment_ids)
        all_pad_valid_positions=torch.tensor(all_pad_valid_positions)

        with torch.no_grad():
            outputs = self.model(all_input_ids, all_segment_ids, all_input_mask,all_pad_valid_positions)
            logits = outputs[0]
#             print('outputs', outputs)
#             print('outputs[0]', logits)

        logits = F.softmax(logits,dim=2)
        logits_label = [list(p) for p in torch.argmax(logits, axis=2)]
        print('logits_label before: ', logits_label)
        logits_label = [logits_label[i].detach().cpu().numpy().tolist()[0] for i in range(len(logits_label))]
        print('logits_label: ', logits_label)
        logits_confidence = [values[label].item() for values,label in zip(logits[0],logits_label)]
        print('logits_confidence: ', logits_confidence)
        print('valid_ids[0]: ', valid_ids[0])
        logits = []
        for index,mask in enumerate(valid_ids[0]):
            if mask == 1:
                logits.append((logits_label[index], logits_confidence[index]))
            else:
                pass
        print('logits: ', logits)
        print('self.label_map: ', self.label_map)
        labels = [(self.label_map[label],confidence) for label,confidence in logits]
        print('labels: ', labels)
        words = word_tokenize(text)
        assert len(labels) == len(words)
        pre_tags = [self.label_map[label] for label, _ in logits]
        print('!!!! pre_tags: ', pre_tags)
        output = [{"word:":word,"tag":label,"confidence":confidence} for word,(label,confidence) in zip(words,labels)]
        return output


In [106]:
model_dir = "./models"
model = Ner(model_dir)
sentences = 'Also Sie haben uns ja jetzt'
all_tokens, all_valid_positions = model.tokenize(sentences)
print(all_tokens)
print(all_valid_positions)

['Also', 'Sie', 'haben', 'uns', 'ja', 'jetzt']
[1, 1, 1, 1, 1, 1]


In [110]:
input_ids = model.tokenizer.convert_tokens_to_ids(all_tokens)
input_ids

[12482, 371, 474, 2099, 3278, 1868]

In [107]:
all_input_ids, all_input_mask, all_segment_ids, all_valid_positions = model.preprocess(sentences)
print('all_input_ids', all_input_ids)
print('\n all_input_mask', all_input_mask)
print('\n all_segment_ids', all_segment_ids)
print('\n all_valid_positions', all_valid_positions)

all_input_ids [12482, 371, 474, 2099, 3278, 1868, 0, 0, 0, 0]

 all_input_mask [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]

 all_segment_ids [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

 all_valid_positions [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]


In [46]:
all_input_ids, all_input_mask, all_segment_ids, all_valid_positions = model.preprocess(sentences)
print('all_input_ids', all_input_ids)
print('\n all_input_mask', all_input_mask)
print('\n all_segment_ids', all_segment_ids)
print('\n all_valid_positions', all_valid_positions)

all_input_ids [[12482, 371, 474, 2099, 3278, 1868, 764, 1120, 870, 13769], [14066, 198, 88, 2319, 88, 2421, 0, 0, 0, 0], [12482, 1169, 7404, 1169, 466, 356, 4493, 6515, 12059, 0], [3147, 783, 1169, 93, 705, 0, 0, 0, 0, 0]]

 all_input_mask [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]

 all_segment_ids [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

 all_valid_positions [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]


In [47]:
output = model.predict(sentences)

logits_label:  [[tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17)], [tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17)], [tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17)], [tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17), tensor(17)]]


AttributeError: 'list' object has no attribute 'detach'

In [None]:
logits_label before detach tensor([[17, 17, 17, 17, 17, 17, 17, 17, 17, 17]])
logits_label:  [17, 17, 17, 17, 17, 17, 17, 17, 17, 17]
logits_confidence:  [0.9269393086433411, 0.9271261692047119, 0.9179024696350098, 0.9214142560958862, 0.9159783124923706, 0.8886045217514038, 0.8886045217514038, 0.8886045217514038, 0.8886046409606934, 0.8886046409606934]
valid_ids[0]:  tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
logits:  [(17, 0.9269393086433411), (17, 0.9271261692047119), (17, 0.9179024696350098), (17, 0.9214142560958862), (17, 0.9159783124923706)]
self.label_map:  {0: 'B-ADD', 1: 'B-AGE', 2: 'B-CARDINAL', 3: 'B-DATE', 4: 'B-DUR', 5: 'B-FAC', 6: 'B-FREQ', 7: 'B-GPE', 8: 'B-NRP', 9: 'B-PER', 10: 'B-SORD', 11: 'B-TIME', 12: 'B-TITLE', 13: 'I-ADD', 14: 'I-GPE', 15: 'I-PER', 16: 'I-SORD', 17: 'O'}
labels:  [('O', 0.9269393086433411), ('O', 0.9271261692047119), ('O', 0.9179024696350098), ('O', 0.9214142560958862), ('O', 0.9159783124923706)]

In [92]:
x = [210, 1, 2, 20,1 ,20 ,210, 6, 210]

In [94]:
list(set(x))

[1, 2, 6, 210, 20]

In [95]:
from transformers import BertTokenizer, BertForTokenClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')
model = BertForTokenClassification.from_pretrained('bert-base-german-cased')

input_ids = torch.tensor(tokenizer.encode("ich bin jetzet zehn", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)

loss, scores = outputs[:2]



In [99]:
input_ids

tensor([[    3,  1169,  4058, 11991,    75,  1969,     4]])

In [100]:
labels

tensor([[1, 1, 1, 1, 1, 1, 1]])

In [98]:
outputs

(tensor(0.4811, grad_fn=<NllLossBackward>),
 tensor([[[-0.2903,  0.1528],
          [-0.2493,  0.2733],
          [-0.0438,  0.0960],
          [ 0.1222,  0.6178],
          [-0.3988, -0.0919],
          [-0.2724,  0.2836],
          [-0.7181,  0.3331]]], grad_fn=<AddBackward0>))

In [96]:
loss

tensor(0.4811, grad_fn=<NllLossBackward>)

In [97]:
scores

tensor([[[-0.2903,  0.1528],
         [-0.2493,  0.2733],
         [-0.0438,  0.0960],
         [ 0.1222,  0.6178],
         [-0.3988, -0.0919],
         [-0.2724,  0.2836],
         [-0.7181,  0.3331]]], grad_fn=<AddBackward0>)

In [None]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)