In [2]:
import random
import string
from datetime import datetime
from gutenberg.textpreparation.methods import replace_regex, remove_regex
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
import keras
import numpy as np
from gutenberg.classification.keras import mlp
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
def mask_corpus(corpus, seed_list, mask_name):
    return [replace_regex(doc, regex=[f'\\b{e}\\b' for e in seed_list], 
                          replacement=f'\u0002{mask_name}\u0002') 
            for doc in corpus]

def get_vectorizer(masked_corpus, mask_name):
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.5, analyzer='char_wb', ngram_range=(3,5))
    vectorizer.fit([remove_regex(doc, regex=f'\u0002{mask_name}\u0002') for doc in masked_corpus])
    return vectorizer

def get_training_data(masked_corpus, mask_name, window, vectorizer):
    y_train = []
    X_train = None

    for document in masked_corpus:
        tokenized = document.split()
        tokenized_padded = ['']*window + tokenized + ['']*window
        for i in range(window,len(tokenized)+window):
            pre_vector = vectorizer.transform([' '.join(tokenized_padded[i-window-1:i-1])])
            post_vector = vectorizer.transform([' '.join(tokenized_padded[i+1:i+window+1])])
            final_vector = hstack((pre_vector, post_vector))
            if X_train == None:
                X_train = final_vector
            else:

                X_train = vstack((X_train, final_vector))

            if  f'\u0002{mask_name}\u0002' in tokenized_padded[i]:
                y_train.append(1)
                X_train = vstack((X_train, final_vector))
            else:
                y_train.append(0)
                
    y_train = np.array(y_train)
    y_train = to_categorical(y_train, num_classes=None)
    return X_train, y_train


def train_classifier(X, y):
    CLF_PARAMS_DEFINE_MODEL = {
        'in_dim': X.shape[1],
        'out_dim': y.shape[1],
        'use_embedding_layer': False,
        'num_hidden_layers': 1,
        'num_units': [100],
        'use_bias': False,
        'use_batch_norm': True,
        'activation': 'relu',
        'activation_last_layer': 'softmax',
        'dropout': 0.5,
    }

    CLF_PARAMS_COMPILE_MODEL = {
        'optimizer': keras.optimizers.Adam(amsgrad=False,
                                           beta_1=0.9,
                                           beta_2=0.999,
                                           decay=0.00,
                                           epsilon=1e-8,
                                           lr=0.001),
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
        'num_gpus': 1,
    }

    CLF_PARAMS_TRAIN = {
        'batch_size': 5,
        'epochs': 10,
        'shuffle': True,
        'verbose': 1
    }


    model = mlp.define_architecture(**CLF_PARAMS_DEFINE_MODEL)

    clf = KerasClassifier(build_fn=mlp.compile_model,
                               model=model,
                               **CLF_PARAMS_COMPILE_MODEL,
                               **CLF_PARAMS_TRAIN)
    history = clf.fit(x=X.todense(), y=y)
    
    return clf
    
def get_name_probabilities(corpus, vectorizer, classifier, window):
    name_probas = []
    for document in corpus:
        vecs = None
        tokenized = document.split()
        tokenized_padded = ['']*window + tokenized + ['']*window
        for i in range(window,len(tokenized)+window):
            pre_vector = vectorizer.transform(tokenized_padded[i-window:i-1])
            post_vector = vectorizer.transform(tokenized_padded[i+1:i+window])
            stacked_vec = hstack((pre_vector, post_vector))
            if vecs == None:
                vecs = stacked_vec
            else:
                vecs = vstack((vecs, stacked_vec))
        name_probas.append(list(zip(tokenized, classifier.predict_proba(vecs.todense())[:,1])))
    return name_probas

In [5]:
with open('deu_news_2015_1M-words.txt', 'r') as f:
    corpus = f.readlines()

In [97]:
# List of Strings
CORPUS = ['Hallo liebes Otto Team, ich heiße Olga Fischer. Ich bin erbost! VG O. Fischer',
          'Ich würde gerne meine Adresse ändern.  Aber wie nur? liebe Grüße, Olga Schulz',
          'Ich heiße Marius - warum genau weiß nicht nicht. Alles Gute Marius Fischer',
          'blablabla Ich heiße Anette. Wer hätte das gedacht? hahaha VG Anette Bukowski <html//...>']

ENTITY = 'NAME'
WINDOW_SIZE = 2
SEED_LIST = ['O.', 'Fischer', 'Anette', 'Bukowski']

mc = mask_corpus(CORPUS, SEED_LIST, ENTITY)
vec = get_vectorizer(mc, ENTITY)
X, y = get_training_data(mc, ENTITY, WINDOW_SIZE, vec)
clf = train_classifier(X, y)
name_probas = get_name_probabilities(CORPUS, vec, clf, WINDOW_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [98]:
print(sorted([x for sublist in name_probas for x in sublist], key=lambda x: x[1], reverse=True))

[('Bukowski', 0.93043846), ('O.', 0.9128145), ('Ich', 0.9099202), ('Fischer', 0.874008), ('Anette', 0.8027109), ('heiße', 0.79287505), ('Marius', 0.751707), ('VG', 0.6666137), ('Wer', 0.65651304), ('ich', 0.16999942), ('Fischer.', 0.15111297), ('Ich', 0.15111297), ('blablabla', 0.15111297), ('Ich', 0.15111297), ('Marius', 0.1268803), ('Ich', 0.10157568), ('Fischer', 0.09750612), ('genau', 0.086602814), ('Olga', 0.075252056), ('Alles', 0.062577605), ('erbost!', 0.05647908), ('Anette.', 0.038898602), ('würde', 0.031528313), ('Hallo', 0.028852971), ('Team,', 0.025913384), ('Schulz', 0.023723705), ('nur?', 0.023605812), ('nicht', 0.023324596), ('-', 0.02200066), ('liebes', 0.02177464), ('Grüße,', 0.01609526), ('bin', 0.015925078), ('<html//...>', 0.015608985), ('heiße', 0.013326765), ('hätte', 0.010833359), ('liebe', 0.010016993), ('Aber', 0.008710182), ('nicht.', 0.0086892415), ('wie', 0.008491383), ('hahaha', 0.008173939), ('warum', 0.0071243225), ('heiße', 0.006926505), ('Olga', 0.00541

In [99]:
SEED_LIST = ['O', 'Fischer', 'Anette', 'Bukowski', 'Marius']

mc = mask_corpus(CORPUS, SEED_LIST, ENTITY)
vec = get_vectorizer(mc, ENTITY)
X, y = get_training_data(mc, ENTITY, WINDOW_SIZE, vec)
clf = train_classifier(X, y)
name_probas = get_name_probabilities(CORPUS, vec, clf, WINDOW_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [100]:
print(sorted([x for sublist in name_probas for x in sublist], key=lambda x: x[1], reverse=True))

[('O.', 0.996414), ('Marius', 0.99117684), ('-', 0.98489064), ('Bukowski', 0.96321946), ('Ich', 0.90885305), ('heiße', 0.83889604), ('Fischer', 0.8017361), ('Gute', 0.77318066), ('Anette', 0.6641807), ('Wer', 0.3766854), ('Anette.', 0.21449465), ('ich', 0.16215149), ('Fischer.', 0.1426241), ('Ich', 0.1426241), ('heiße', 0.1426241), ('blablabla', 0.1426241), ('Ich', 0.1426241), ('<html//...>', 0.14229205), ('Schulz', 0.07342339), ('hätte', 0.06984317), ('VG', 0.04627905), ('ändern.', 0.044727873), ('Team,', 0.043830343), ('VG', 0.042113457), ('Adresse', 0.039288096), ('nicht', 0.0390156), ('Olga', 0.03743849), ('das', 0.03716049), ('Marius', 0.029196005), ('Ich', 0.022962805), ('liebes', 0.020000061), ('Aber', 0.01814909), ('Fischer', 0.017445108), ('Olga', 0.016784677), ('gerne', 0.014359055), ('weiß', 0.012942421), ('würde', 0.011884689), ('nur?', 0.011348455), ('gedacht?', 0.011232781), ('Grüße,', 0.0096255895), ('Hallo', 0.0076198652), ('erbost!', 0.0045330366), ('warum', 0.00449256