In [69]:
import pandas as pd


class IswPreprocessor:
    def __init__(self, filename):
        print(' ------ Preprocssing ISW German corpus ------')
        self.file = self.load_isw_tsv_file(filename)
        self.ners_vals=[]

    def load_isw_tsv_file(self, filename='data/test-full-isw-release.tsv'):
        file = open(filename, encoding='utf-8')
        return file

    def get_list_of_sentences_labels(self):
        """
        return : list of sentences : ['I have apple', 'I am here', 'hello ']
        return : list of labels : ['O', 'O', 'B-GPE', ...]
        """
        labels, label, sentences, sentence, flat_labels = [], [], [], [], []
        for line in self.file:
            if line.startswith("idx") or line.startswith("0") or line.startswith("NONE"):
                continue
            line = line.strip()
            splits = line.split("\t")
            if '?' in splits[2] or '.' in splits[2] :
                if len(label)>0 and len(sentence)>0:
                    sentences.append(" ".join(sentence))
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            if splits[3] != 'NONE':
                sentence.append(splits[3])
                label.append(splits[6])
                flat_labels.append(splits[6])

        if len(label)>0 and len(sentence)>0:
            sentences.append(" ".join(sentence))
            labels.append(label)

        labels = [list(map(lambda x: x if x != 'NONE' else 'O', i)) for i in labels]
        self.ners_vals = list(map(lambda x: x if x != 'NONE' else 'O', set(flat_labels)))
        
        print("number of sentences:", len(sentences))
        print('num of tags :', len(self.ners_vals))

        return sentences, labels

    def get_tag2idx_idx2tag(self):
        """
        return : dict of tag2idx : {'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3,'B-CREAT': 4, ...}
        return : dict of idx2tag : inverted
        """
        tag2idx = {t: i for i, t in enumerate(sorted(self.ners_vals))}
        idx2tag = {i: t for t, i in tag2idx.items()}
        return tag2idx, idx2tag


class TweetPreprocessor:
    def __init__(self, filename='data/merged_headlines_annos.compact.tsv'):
        print(' ------ Preprocssing Tweets corpus ------')
        self.file = open(filename, encoding='utf-8')
        self.ners_vals=[]

    def get_list_of_sentences_labels(self):
        """
        return : list of sentences : ['I have apple', 'I am here', 'hello ']
        return : list of labels : ['O', 'O', 'B-GPE', ...]
        """
        labels, label, sentences, sentence, flat_labels = [], [], [], [], []
        for line in self.file:
            if line.startswith("#"):
                continue
            line = line.strip()
            splits = line.split("\t")
            if line.startswith("NONE"):
                if len(label)>0 and len(sentence)>0:
                    sentences.append(" ".join(sentence))
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            sentence.append(splits[1])
            label.append(splits[3])
            flat_labels.append(splits[3])
        
        if len(label)>0 and len(sentence)>0:
            sentences.append(" ".join(sentence))
            labels.append(label)
            
        labels = [list(map(lambda x: x if x != 'NONE' else 'O', i)) for i in labels]
        self.ners_vals = list(map(lambda x: x if x != 'NONE' else 'O', set(flat_labels)))
        print("Total number of tweets", len(sentences))
        print("Total number of ner tags in tweets", len(self.ners_vals))

        return sentences, labels

    def get_tag2idx_idx2tag(self):
        """
        return : dict of tag2idx : {'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3,'B-CREAT': 4, ...}
        return : dict of idx2tag : inverted
        """
        tag2idx = {t: i for i, t in enumerate(sorted(self.ners_vals))}
        idx2tag = {i: t for t, i in tag2idx.items()}
        return tag2idx, idx2tag

## For ISW data set

In [70]:
filename='data/full-isw-release.tsv'
isw_pre = IswPreprocessor(filename)
sentences, labels = isw_pre.get_list_of_sentences_labels()
tag2idx, idx2tag = isw_pre.get_tag2idx_idx2tag()

print("number of sentences:", len(sentences))
print('num of tags :', len(isw_pre.ners_vals))
i = 1
print(sentences[i])
print(labels[i])
print(tag2idx)
print(idx2tag)
print(isw_pre.ners_vals)

 ------ Preprocssing ISW German corpus ------
number of sentences: 16084
num of tags : 60
number of sentences: 16084
num of tags : 60
Meine Eindrücke von Wien von damals
['O', 'O', 'O', 'B-GPE', 'O', 'B-TIME']
{'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3, 'B-CREAT': 4, 'B-DATE': 5, 'B-DUR': 6, 'B-EVT': 7, 'B-FAC': 8, 'B-FRAC': 9, 'B-FREQ': 10, 'B-GPE': 11, 'B-LAN': 12, 'B-LAW': 13, 'B-LOC': 14, 'B-MED': 15, 'B-MISC': 16, 'B-MON': 17, 'B-NRP': 18, 'B-ORDINAL': 19, 'B-ORG': 20, 'B-PER': 21, 'B-PERC': 22, 'B-PRODUCT': 23, 'B-PROJ': 24, 'B-QUANT': 25, 'B-RATE': 26, 'B-SORD': 27, 'B-TIME': 28, 'B-TITLE': 29, 'I-ADD': 30, 'I-AGE': 31, 'I-ART': 32, 'I-CARDINAL': 33, 'I-DATE': 34, 'I-DUR': 35, 'I-EVT': 36, 'I-FAC': 37, 'I-FRAC': 38, 'I-FREQ': 39, 'I-GPE': 40, 'I-LAN': 41, 'I-LAW': 42, 'I-LOC': 43, 'I-MED': 44, 'I-MISC': 45, 'I-MON': 46, 'I-NRP': 47, 'I-ORDINAL': 48, 'I-ORG': 49, 'I-PER': 50, 'I-PERC': 51, 'I-PRODUCT': 52, 'I-PROJ': 53, 'I-QUANT': 54, 'I-RATE': 55, 'I-SORD': 56, 'I-TIME

In [72]:
filename='data/merged_headlines_annos.compact.tsv'

tweet_pre = TweetPreprocessor(filename)
sentences, labels = tweet_pre.get_list_of_sentences_labels()
tag2idx, idx2tag = tweet_pre.get_tag2idx_idx2tag()

print("number of sentences:", len(sentences))
print('num of tags :', len(tweet_pre.ners_vals))
i = 2
print(sentences[i])
print(labels[i])
print(tag2idx)
print(idx2tag)
print(tweet_pre.ners_vals)

 ------ Preprocssing Tweets corpus ------
Total number of tweets 8957
Total number of ner tags in tweets 63
number of sentences: 8957
num of tags : 63
Wann auch immer der #Brexit kommen mag - die #IHK bereitet die Unternehmen in #Rheinhessen darauf vor und empfiehlt , vom Worst-Case-Szenario auszugehen . [ plus-Inhalt ]
['O', 'O', 'O', 'O', 'B-EVT', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
{'B-AGE': 0, 'B-ART': 1, 'B-CARDINAL': 2, 'B-CREAT': 3, 'B-DATE': 4, 'B-DUR': 5, 'B-EVT': 6, 'B-FAC': 7, 'B-FRAC': 8, 'B-FREQ': 9, 'B-GPE': 10, 'B-LAN': 11, 'B-LAW': 12, 'B-LOC': 13, 'B-MED': 14, 'B-MISC': 15, 'B-MON': 16, 'B-NRP': 17, 'B-ORDINAL': 18, 'B-ORG': 19, 'B-PER': 20, 'B-PERC': 21, 'B-PRODUCT': 22, 'B-PROJ': 23, 'B-QUANT': 24, 'B-RATE': 25, 'B-SCORE': 26, 'B-SORD': 27, 'B-TIME': 28, 'B-TITLE': 29, 'B-URL': 30, 'I-AGE': 31, 'I-ART': 32, 'I-CARDINAL': 33, 'I-CREAT': 34, 'I-DATE': 35, 'I-DUR': 36, 'I-EVT': 37, 'I-FAC'

In [75]:
set(tweet_pre.ners_vals)-set(isw_pre.ners_vals)

{'B-SCORE', 'B-URL', 'I-CREAT', 'I-SCORE', 'I-URL'}

In [76]:
set(isw_pre.ners_vals)-set(tweet_pre.ners_vals)

{'B-ADD', 'I-ADD'}

In [13]:
from transformers import BertTokenizer, BertForTokenClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased", do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
tokenized_texts = [[ll for ll in e if "##" not in ll] for e in tokenized_texts ]
tokenized_texts

[['I', 'Also', 'Sie', 'haben', 'uns', 'ja', 'jetzt'],
 ['Die', 'zwei'],
 ['Jetzt'],
 ['Ja', 'dann'],
 ['nach'],
 ['Nach'],
 ['Jetzt', 'Jetzt']]

In [14]:
labels

[['O', 'O', 'O', 'O', 'O', 'O', 'B-TIME'],
 ['O', 'B-CARDINAL'],
 ['B-TIME'],
 ['O', 'B-TIME'],
 ['B-TIME'],
 ['B-TIME'],
 ['B-TIME', 'B-TIME']]

In [13]:
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [21]:
max_len=9
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
#                                         maxlen=max_len, 
                          dtype="long", truncating="post", padding="post")
input_ids

array([[  103, 26938, 12482,   371,   474,  2099,  3278,  1868],
       [  125,   382,     0,     0,     0,     0,     0,     0],
       [ 5072,     0,     0,     0,     0,     0,     0,     0],
       [ 6802,   670,     0,     0,     0,     0,     0,     0],
       [  188,   320,     0,     0,     0,     0,     0,     0],
       [  326,   320,     0,     0,     0,     0,     0,     0],
       [ 5072,  5072,     0,     0,     0,     0,     0,     0]])

In [22]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
#                                         maxlen=max_len,  
                                         padding="post",
                                        dtype="long", truncating="post")
tags

array([[2, 2, 2, 2, 2, 2, 1],
       [2, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [2, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0]])

In [20]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
attention_masks

[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]

In [343]:
import os
import torch
import numpy as np
import pandas as pd
import datetime as dt
import json
import nltk
from nltk import word_tokenize
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from seqeval.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tqdm import tqdm, trange
import torch.nn.functional as F

from preprocessor.preprocessor import *
from transformers import BertTokenizer, BertForTokenClassification

In [421]:
class Ner:
    def __init__(self,model_dir: str):
        self.model , self.tokenizer, self.model_config = self.load_model(model_dir)
        self.label_map = self.model_config["label_map"]
        self.max_seq_length = self.model_config["max_seq_length"]
        self.max_seq_length = 10
        self.label_map = {int(k):v for k,v in self.label_map.items()}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.model.eval()

    def load_model(self, model_dir: str, model_config: str = "model_config.json"):
        model_config = os.path.join(model_dir,model_config)
        model_config = json.load(open(model_config))
#         model = BertForTokenClassification.from_pretrained(model_dir)
#         tokenizer = BertTokenizer.from_pretrained(model_dir)
        model = BertForTokenClassification.from_pretrained('bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        return model, tokenizer, model_config

    def tokenize(self, text: str):
        """ tokenize input"""
        words = word_tokenize(text)
        tokens = []
        valid_positions = []
        for i,word in enumerate(words):
            token = self.tokenizer.tokenize(word)
            token = [ t for t in token if "##" not in t ]
            tokens.extend(token)
            for i in range(len(token)):
                if i == 0:
                    valid_positions.append(1)
                else:
                    valid_positions.append(0)
        return tokens, valid_positions

    def preprocess(self, text: str):
        """ preprocess """
        tokens, valid_positions = self.tokenize(text)
        segment_ids = []
        for i in range(len(tokens)):
            segment_ids.append(0)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            valid_positions.append(0)
        return input_ids,input_mask,segment_ids,valid_positions

    def predict(self, text: str):
        input_ids,input_mask,segment_ids,valid_ids = self.preprocess(text)
        input_ids = torch.tensor([input_ids],dtype=torch.long,device=self.device)
        input_mask = torch.tensor([input_mask],dtype=torch.long,device=self.device)
        segment_ids = torch.tensor([segment_ids],dtype=torch.long,device=self.device)
        valid_ids = torch.tensor([valid_ids],dtype=torch.long,device=self.device)
#         print('input_ids', input_ids)
#         print('input_mask', input_mask)
#         print('segment_ids', segment_ids)
#         print('valid_ids', valid_ids)
#         print('valid_ids[0]', valid_ids[0])
        with torch.no_grad():
            outputs = self.model(input_ids, segment_ids, input_mask,valid_ids)
            logits = outputs[0]
#             print('logit type', type(logits))
#             print('logit ', logits)
        logits = F.softmax(logits,dim=2)
        logits_label = torch.argmax(logits,dim=2)
        logits_label = logits_label.detach().cpu().numpy().tolist()[0]
#         print('logits_label:', logits_label)
        logits_confidence = [values[label].item() for values,label in zip(logits[0],logits_label)]
#         print('logits_confidence', logits_confidence)

        logits = []
        for index,mask in enumerate(valid_ids[0]):
            if mask == 1:
#                 print('hi', mask)
                logits.append((logits_label[index], logits_confidence[index]))
#                 print('app logits', logits)
            else:
                pass
        print('label_map', self.label_map)
#         print('logit.pop', logits)
        labels = [(self.label_map[label],confidence) for label,confidence in logits]
        words = word_tokenize(text)
#         print('words:', words)
#         print('labels:', labels)
        assert len(labels) == len(words)
        output = [{"word":word,"tag":label,"confidence":confidence} for word,(label,confidence) in zip(words,labels)]
        return output


In [422]:
sentences ='jetzt bin ich zwölf'

In [423]:
sentences

'jetzt bin ich zwölf'

In [424]:
ner = Ner("/Users/steve.chen/thesis/thesis-ner-co-tri-training/models")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…

KeyboardInterrupt: 

In [411]:
tokens, valid_positions = ner.tokenize(sentences)
print(tokens)
print(valid_positions)

['jetzt', 'bin', 'ich', 'zwölf']
[1, 1, 1, 1]


In [412]:
input_ids,input_mask,segment_ids,valid_positions = ner.preprocess(sentences)

In [415]:
print(input_ids)
print(input_mask)
print(segment_ids)
print(valid_positions)

[1868, 4058, 1169, 4420, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]


In [416]:
ner.predict(sentences)

label_map {0: 'B-ADD', 1: 'B-AGE', 2: 'B-CARDINAL', 3: 'B-DATE', 4: 'B-DUR', 5: 'B-FAC', 6: 'B-FREQ', 7: 'B-GPE', 8: 'B-NRP', 9: 'B-PER', 10: 'B-SORD', 11: 'B-TIME', 12: 'B-TITLE', 13: 'I-ADD', 14: 'I-GPE', 15: 'I-PER', 16: 'I-SORD', 17: 'O'}


[{'word': 'jetzt', 'tag': 'B-ADD', 'confidence': 0.6784576177597046},
 {'word': 'bin', 'tag': 'B-ADD', 'confidence': 0.5905336737632751},
 {'word': 'ich', 'tag': 'B-ADD', 'confidence': 0.7070490717887878},
 {'word': 'zwölf', 'tag': 'B-ADD', 'confidence': 0.6636413931846619}]

In [359]:
sentences ='jetzt bin ich zwölf'
ner.predict(sentences)

input_ids tensor([[1169, 1631,  188,    0,    0,    0,    0,    0,    0,    0]])
input_mask tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])
segment_ids tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
valid_ids tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])
valid_ids[0] tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
logit type <class 'torch.Tensor'>
logit  tensor([[[-0.1963,  0.8812, -0.6778],
         [-0.1679,  0.6354, -0.4804],
         [-0.4352,  0.9377, -0.6370],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636],
         [-0.1695,  1.2770, -0.6636]]])
logits_label: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
logits_confidence [0.6448203325271606, 0.5632150173187256, 0.684721052646637, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963, 0.7251530289649963]
0 tensor(1)
hi tensor(1)


[{'word': 'ich', 'tag': 'B-TIME', 'confidence': 0.6448203325271606},
 {'word': 'werde', 'tag': 'B-TIME', 'confidence': 0.5632150173187256},
 {'word': 'nacher', 'tag': 'B-TIME', 'confidence': 0.684721052646637}]