In [3]:
import os
import random
import re
import string
from collections import defaultdict
from datetime import datetime

import keras
import numpy as np
from keras import backend as K
from keras.utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
from scipy.sparse import hstack, vstack
from sklearn.feature_extraction.text import TfidfVectorizer

from gutenberg.classification.keras import mlp
from gutenberg.textpreparation.methods import replace_regex, remove_regex

In [132]:
def get_token_count(corpus):
    """Compute the relative frequency of every lower-cased token in ``corpus``.

    Documents are split on runs of non-word characters. The empty strings that
    ``re.split`` emits when a document starts or ends with a separator are
    skipped, so they neither show up as a token nor inflate the denominator.

    :param corpus: iterable of document strings
    :return: dict mapping lower-cased token -> count / total number of tokens
             (an empty dict when the corpus contains no tokens)
    """
    token_count = defaultdict(int)
    overall_tokens = 0
    for doc in corpus:
        # r'\W+' is the raw-string equivalent of the former '[^\w]+' pattern.
        for token in re.split(r'\W+', doc):
            if not token:
                continue
            overall_tokens += 1
            token_count[token.lower()] += 1

    # No division happens for an empty token_count, so overall_tokens == 0 is safe.
    return {k: v / overall_tokens for k, v in token_count.items()}

def mask_corpus(corpus, seed_list, mask_name):
    """Replace every occurrence of a seed term by a mask marker.

    The marker is ``mask_name`` wrapped in two ``\\u0002`` control characters.
    Seed terms are treated as literal text: they are regex-escaped so that
    characters such as ``.`` in ``'O.'`` do not act as wildcards. Term
    boundaries are expressed with look-arounds instead of ``\\b`` because
    ``\\b`` cannot match after a seed that ends in punctuation.

    :param corpus: iterable of document strings
    :param seed_list: iterable of literal terms to mask (matched case-insensitively)
    :param mask_name: label placed inside the mask marker
    :return: list with one masked string per document
    """
    marker = f'\u0002{mask_name}\u0002'
    # "not preceded / not followed by a word character" equals \b for seeds
    # that start and end with a word character.
    patterns = [f'(?<!\\w){re.escape(seed)}(?!\\w)' for seed in seed_list]
    # NOTE(review): replace_regex is a project helper; it is called with the
    # same keyword arguments as before, only the pattern strings changed.
    return [replace_regex(doc, regex=patterns, replacement=marker, ignore_case=True)
            for doc in corpus]

def get_vectorizer(masked_corpus, mask_name):
    """Return a character n-gram TF-IDF vectorizer fitted on ``masked_corpus``.

    :param masked_corpus: iterable of (masked) document strings to fit on
    :param mask_name: accepted but not used by this function
    :return: the fitted ``TfidfVectorizer``
    """
    tfidf_options = dict(analyzer='char_wb', ngram_range=(3, 4), max_features=10000)
    char_ngram_tfidf = TfidfVectorizer(**tfidf_options)
    char_ngram_tfidf.fit(masked_corpus)
    return char_ngram_tfidf

def get_training_data(masked_corpus, mask_name, window, vectorizer):
    """Build context features and binary labels for every token of the corpus.

    Each document is split on runs of characters that are neither word
    characters nor ``\\u0002``. For every token, the ``window`` tokens before
    it and the ``window`` tokens after it are each joined with spaces and
    vectorized; the two vectors are concatenated column-wise. The token itself
    is not part of its own features. A token is labelled positive when it
    contains the mask marker for ``mask_name``.

    All context strings are collected first and vectorized in two batched
    ``transform`` calls. This yields the same rows as vectorizing and stacking
    token by token, without re-copying the growing matrix on every token.

    :param masked_corpus: sequence of masked document strings
    :param mask_name: label inside the ``\\u0002...\\u0002`` mask marker
    :param window: number of context tokens taken on each side
    :param vectorizer: fitted object exposing ``transform(list_of_str)``
    :return: tuple ``(X_train, y_train)``; ``X_train`` is a sparse matrix with
             one row per token, ``y_train`` a one-hot array with two columns
    :raises ValueError: if ``masked_corpus`` contains no documents
    """
    mask_token = f'\u0002{mask_name}\u0002'
    labels = []
    pre_contexts = []
    post_contexts = []
    for doc_num, document in enumerate(masked_corpus):
        if (doc_num + 1) % 50 == 0:
            print(f'processed {doc_num+1}/{len(masked_corpus)}')
        tokenized = re.split(r'[^\w\u0002]+', document)
        # Padding keeps the context slices well-defined at document borders.
        tokenized_padded = [''] * window + tokenized + [''] * window
        for i, token in enumerate(tokenized):
            pre_contexts.append(' '.join(tokenized_padded[i:i + window]))
            post_contexts.append(' '.join(tokenized_padded[i + window + 1:i + 2 * window + 1]))
            labels.append(1 if mask_token in token else 0)

    if not labels:
        raise ValueError('masked_corpus contains no documents')

    X_train = hstack((vectorizer.transform(pre_contexts),
                      vectorizer.transform(post_contexts)))
    # Fix the class count so the positive column exists even when no token is
    # masked; inferring it from the data would produce a single column then.
    y_train = to_categorical(np.array(labels), num_classes=2)
    return X_train, y_train


def setup(GPU_ID=None, clear_session=False):
    """Classification setup: create a TensorFlow session and hand it to Keras.

    :param GPU_ID: value written to ``CUDA_VISIBLE_DEVICES`` (e.g. ``'0'`` or
                   ``0``); with ``None`` all GPUs are hidden and the session is
                   configured with zero GPU devices
    :param clear_session: flag whether Tensorflow session is cleaned
    """
    # NOTE(review): this function uses `tf` (ConfigProto / Session), which is
    # not imported in the visible cells; it must be in scope before calling.
    if clear_session:
        K.clear_session()

    if GPU_ID is not None:
        # os.environ only accepts strings, so an integer id is coerced.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_ID)
        runtime_classifier_config = tf.ConfigProto(allow_soft_placement=True)
        # Allocate GPU memory on demand instead of reserving it all up front.
        runtime_classifier_config.gpu_options.allow_growth = True
    else:
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        runtime_classifier_config = tf.ConfigProto(device_count={'GPU': 0})

    K.set_session(tf.Session(config=runtime_classifier_config))

    
def train_classifier(X, y):
    """Define, compile and fit the MLP classifier on ``X`` / ``y``.

    The network is configured with one hidden layer of 100 units and a softmax
    output, and is trained with Adam for 10 epochs at batch size 5.

    :param X: sparse feature matrix; it is densified before fitting
    :param y: 2-D label array; its second dimension sets the output size
    :return: the fitted ``KerasClassifier``
    """
    architecture = dict(
        in_dim=X.shape[1],
        out_dim=y.shape[1],
        use_embedding_layer=False,
        num_hidden_layers=1,
        num_units=[100],
        use_bias=False,
        use_batch_norm=True,
        activation='relu',
        activation_last_layer='softmax',
        dropout=0.5,
    )

    adam = keras.optimizers.Adam(lr=0.001,
                                 beta_1=0.9,
                                 beta_2=0.999,
                                 epsilon=1e-8,
                                 decay=0.00,
                                 amsgrad=False)
    compile_options = dict(
        optimizer=adam,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
        num_gpus=1,
    )

    fit_options = dict(batch_size=5, epochs=10, shuffle=True, verbose=1)

    network = mlp.define_architecture(**architecture)

    # Compile and fit options are both forwarded through the scikit-learn wrapper.
    classifier = KerasClassifier(build_fn=mlp.compile_model,
                                 model=network,
                                 **compile_options,
                                 **fit_options)
    classifier.fit(x=X.todense(), y=y)

    return classifier
    
def get_name_probabilities(corpus, vectorizer, classifier, window, token_count, thres):
    """Score every token of every document with the classifier's positive-class probability.

    Features are built as in training: for each token, the ``window`` tokens
    before and after it are each joined with spaces and vectorized, and the two
    vectors are concatenated column-wise. The contexts of one document are
    vectorized in two batched ``transform`` calls and scored with a single
    ``predict_proba`` call.

    :param corpus: iterable of document strings
    :param vectorizer: fitted object exposing ``transform(list_of_str)``
    :param classifier: fitted object exposing ``predict_proba``; column 1 of
                       its result is taken as the probability
    :param window: number of context tokens taken on each side
    :param token_count: not used; kept so existing callers keep working
    :param thres: not used; kept so existing callers keep working
    :return: list with one entry per document, each a list of
             ``(token, probability)`` tuples in token order
    """
    name_probas = []
    for document in corpus:
        tokenized = re.split(r'[^\w\u0002]+', document)
        # Padding keeps the context slices well-defined at document borders.
        tokenized_padded = [''] * window + tokenized + [''] * window
        positions = range(len(tokenized))
        pre_contexts = [' '.join(tokenized_padded[i:i + window]) for i in positions]
        post_contexts = [' '.join(tokenized_padded[i + window + 1:i + 2 * window + 1])
                         for i in positions]
        features = hstack((vectorizer.transform(pre_contexts),
                           vectorizer.transform(post_contexts)))
        positive_proba = classifier.predict_proba(features.toarray())[:, 1]
        name_probas.append(list(zip(tokenized, positive_proba)))
    return name_probas

def duplicate_positives(X, y, random_state=None):
    """Oversample positive rows by appending randomly drawn copies of them.

    The number of appended rows equals the number of rows in ``y`` minus the
    sum of its second column, i.e. the number of negatives for one-hot labels.
    Rows are drawn with replacement from those whose second label column is 1.

    :param X: sparse feature matrix with one row per sample
    :param y: 2-D label array with at least two columns; column 1 marks positives
    :param random_state: optional seed for reproducible sampling; ``None``
                         draws from NumPy's global random state
    :return: tuple ``(X, y)`` with the extra rows appended; the inputs are
             returned without extra rows when there are no positives to copy
             or no rows to add
    """
    positive_idx = np.flatnonzero(y[:, 1] == 1)
    n_new = int(y.shape[0] - np.sum(y[:, 1]))
    X = X.tocsr()  # CSR supports the row indexing used below

    # Without positives there is nothing to sample from.
    if positive_idx.size == 0 or n_new <= 0:
        return X, y

    sampler = np.random if random_state is None else np.random.RandomState(random_state)
    new_positive_idx = sampler.choice(positive_idx, n_new)

    X = vstack((X, X[new_positive_idx, :]))
    y = np.append(y, np.tile([0, 1], (n_new, 1)), axis=0)
    return X, y

In [133]:
# Show the shape of the label array `y`.
# NOTE(review): `y` is only assigned in the pipeline cell further down, so on a
# fresh kernel this cell fails unless that cell has been run first.
y.shape

(1902, 2)

In [144]:
# Toy corpus: a list of document strings (short German example messages).
CORPUS = ['Hallo liebes Otto Team, ich heiße Olga Fischer. Ich bin erbost! VG O. Fischer',
          'Ich würde gerne meine Adresse ändern.  Aber wie nur? liebe Grüße, Olga Schulz',
          'Ich heiße Marius - warum genau weiß nicht nicht. Alles Gute Marius Fischer',
          'blablabla Ich heiße Anette. Wer hätte das gedacht? hahaha VG Anette Bukowski <html//...>']

# Label placed inside the mask marker that replaces seed terms.
ENTITY = 'NAME'
# Number of context tokens used on each side of a token.
WINDOW_SIZE = 2
# Terms that get masked in the corpus; masked tokens become the positive labels.
SEED_LIST = ['O.', 'Fischer', 'Olga', 'Schulz']

# Relative token frequencies; passed to get_name_probabilities below, which
# does not read them.
token_rel = get_token_count(CORPUS)

# Each print shows the time elapsed since `t` at the moment a stage starts.
# NOTE(review): the trailing `t = ...` comments are hand-written timings that do
# not match the elapsed times recorded for this toy corpus — presumably they
# stem from a run on a larger corpus; TODO confirm or remove.
t = datetime.now()
print(f'{(datetime.now()-t)} -- Mask Corpus...') 
mc = mask_corpus(CORPUS, SEED_LIST, ENTITY)                         # t = 0:01 m
print(f'{(datetime.now()-t)} -- Create Vectorizer...')
vec = get_vectorizer(mc, ENTITY)                                    # t = 0:00,16 m
print(f'{(datetime.now()-t)} -- Generate Train & Test...')
X, y = get_training_data(mc, ENTITY, WINDOW_SIZE, vec)              # t = 07:00 m 
print(f'{(datetime.now()-t)} -- Duplicate Positives...')
X, y = duplicate_positives(X, y)
print(f'{(datetime.now()-t)} -- Train Classifier...')
clf = train_classifier(X, y)
print(f'{(datetime.now()-t)} -- Calculate Name Probabilities...')
# Scoring runs on the unmasked CORPUS; the last argument (1) is the `thres`
# parameter, which get_name_probabilities does not read.
name_probas = get_name_probabilities(CORPUS, vec, clf, WINDOW_SIZE,token_rel, 1)

0:00:00.000074 -- Mask Corpus...
0:00:00.000814 -- Create Vectorizer...
0:00:00.005557 -- Generate Train & Test...
0:00:00.173297 -- Duplicate Positives...
0:00:00.174954 -- Train Classifier...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0:00:08.507753 -- Calculate Name Probabilities...


In [145]:
# Flatten the per-document (token, probability) pairs, drop tokens that are
# literally contained in SEED_LIST, and print the rest by descending probability.
print(sorted(
    (pair
     for doc_pairs in name_probas
     for pair in doc_pairs
     if pair[0] not in SEED_LIST),
    key=lambda pair: pair[1],
    reverse=True,
))

[('Grüße', 0.037077684), ('warum', 0.015481879), ('Anette', 0.014906197), ('blablabla', 0.013013004), ('O', 0.011844105), ('Ich', 0.0116691515), ('heiße', 0.008858857), ('heiße', 0.0071677812), ('', 0.006684412), ('Alles', 0.0052263397), ('gerne', 0.004716766), ('Ich', 0.004690479), ('Marius', 0.004404278), ('Ich', 0.0037930894), ('Marius', 0.0037120483), ('würde', 0.003395978), ('Hallo', 0.003060839), ('Ich', 0.0024337696), ('heiße', 0.0022659018), ('erbost', 0.0020008103), ('html', 0.0016377432), ('meine', 0.0015781426), ('genau', 0.0015290531), ('Team', 0.0013206884), ('Adresse', 0.0011763335), ('Otto', 0.0010172601), ('Wer', 0.0007112263), ('hätte', 0.0007025649), ('VG', 0.0006440501), ('Anette', 0.00048776015), ('nicht', 0.00047979638), ('Gute', 0.0004489051), ('nur', 0.00044219548), ('Bukowski', 0.00039082346), ('bin', 0.000336014), ('das', 0.0003244043), ('liebe', 0.00030454333), ('wie', 0.0002861562), ('ändern', 0.00023489875), ('gedacht', 0.00022169587), ('liebes', 0.000179102

In [114]:
# Same ranking as the previous cell: non-seed (token, probability) pairs,
# highest probability first.
# NOTE(review): the output recorded for this cell lists 'Olga' and 'Schulz',
# which the filter removes under the SEED_LIST defined in the pipeline cell —
# presumably it stems from an earlier run with a different seed list.
ranked_candidates = sorted(
    (entry
     for per_document in name_probas
     for entry in per_document
     if entry[0] not in SEED_LIST),
    key=lambda entry: entry[1],
    reverse=True,
)
print(ranked_candidates)

[('Olga', 0.861947), ('VG', 0.82714397), ('Marius', 0.52457404), ('Wer', 0.43621957), ('html', 0.37051007), ('heiße', 0.32507634), ('O', 0.09527617), ('Ich', 0.065555796), ('hahaha', 0.024592336), ('', 0.021146769), ('blablabla', 0.019551756), ('Ich', 0.018869512), ('hätte', 0.01646993), ('gerne', 0.016128035), ('das', 0.014390469), ('Schulz', 0.013798401), ('Gute', 0.012183242), ('Ich', 0.010419562), ('gedacht', 0.008621378), ('ändern', 0.0068459725), ('Alles', 0.0062777954), ('Otto', 0.0062291925), ('bin', 0.004623598), ('nicht', 0.004613282), ('heiße', 0.003694078), ('VG', 0.0034506102), ('Grüße', 0.0034505085), ('Marius', 0.0032813202), ('Ich', 0.0030251488), ('genau', 0.0028159316), ('heiße', 0.0026944727), ('Hallo', 0.0026158604), ('weiß', 0.002189594), ('nicht', 0.001906073), ('Adresse', 0.0016501271), ('erbost', 0.0015771948), ('Team', 0.0015620489), ('liebes', 0.0015105457), ('meine', 0.0014741501), ('würde', 0.0014587694), ('Aber', 0.0012878149), ('warum', 0.00092750334), ('l