# Get your Data
Normally a large dataset will be loaded, here we write a small one out for demsontration purposes.
Say we have a set of Tweets. Surely they are very constructed but the principle works if the corpus is large enough.

In [25]:
corpus = [
    'OMG! I love driving my new Mercedes. It is so fast.',
    'Guys see how cool my friend looks driving my new VW. He\'s loving it!',
    'I have always dreamt of buying a campervan from my friend.'
    'Today I will finally get my new Computer.'
    'I have always dreamt of a campervan from VW.'
    'He doesn\'t seem to like driving my new Lamborghini! Maybe not fast enough :D'
         ]

# Define a seedlist
The goal is to find entities in your corpus without the need of using pretrained models to do so. This makes it more robust to spelling or grammer mistakes (especially in non-english contexts) and also lifts limitations of what an entity might be.

In this case we want to identify words IN CONTEXT that might be car brands:

In [26]:
seed_list = ['mercedes', 'lamborghini']

# Create Tagger
This is the main framework to do the iterative training described in README.md 
To apply less restrictions (since corpora may be very different in nature) the actual model definition and embedding used will be defined separately.


In [35]:
from src.NER import NERTagger

tagger = NERTagger(corpus,
                   entities = [{'name': 'CAR_BRAND', 'seed': seed_list}],
                   seed = 1234, # for reproducability
                   window = 3, # context window around desired word ( designed for not using advanced layers like LSTM)
                   n_jobs = 1, # for large datasets, multiple jobs will be faster
                   train_min_pos_rate = 0.5 # How confident does the model have to be in order to adjust the seed list
                   )



# Create Embedding and Model
Since this is a Proof of Concept, we will be using very simple word embeddings (basically counts) and neural networks since our corpus is too small for anything else.

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Dropout, Input
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


EMBEDDING_SIZE = 100
model_dims = tagger.get_required_dimensions()

mlp_model = Sequential()
mlp_model.add(Embedding(model_dims['num_labels'], EMBEDDING_SIZE, input_length=model_dims['in_dim']))
mlp_model.add(Flatten())
mlp_model.add(Dense(20, activation='relu'))
mlp_model.add(Dropout(0.5))
mlp_model.add(Dense(model_dims['out_dim'], activation='softmax'))

MODEL_PARAMS = {
    "epochs": 10,
    "batch_size": 5,
    "loss": "categorical_crossentropy",
    "metrics": ["accuracy"],
    "optimizer": Adam(amsgrad=False,
                      beta_1=0.9,
                      beta_2=0.999,
                      decay=0.00,
                      epsilon=1e-8,
                      lr=0.01),
}

def compile_model(model, loss, optimizer, metrics):
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    return model

mlp_model.compile(loss= "categorical_crossentropy",
    metrics= ["accuracy"],
    optimizer= Adam(amsgrad=False,
                      beta_1=0.9,
                      beta_2=0.999,
                      decay=0.00,
                      epsilon=1e-8,
                      lr=0.01))
model = KerasClassifier(build_fn=compile_model, model=mlp_model, **MODEL_PARAMS)

tagger.set_model(model)

# Train

In [37]:
import os

generate_config = {
    'max_iterations': 20,
    'min_probability': 0.3,
    'min_update_rate': 0.02
}

os.makedirs('example_run', exist_ok=True)
    
tagger.generate_predictive_rules(iteration_save_path='example_run',
                                  save_iterations=list(range(generate_config['max_iterations']+1)),
                                  **generate_config)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
tagger.predict_token_probabilities('Guys see how cool my friend looks driving my new VW. He\'s loving it!')

[('guys', 6.7550886e-06),
 ('see', 1.04563114e-07),
 ('how', 2.1449303e-08),
 ('cool', 2.4688078e-09),
 ('my', 1.4934152e-08),
 ('friend', 1.0560224e-05),
 ('looks', 5.091005e-06),
 ('driving', 2.5658253e-05),
 ('my', 3.4715442e-06),
 ('new', 0.0009779446),
 ('vw', 0.0065938407),
 ("he's", 3.4171895e-05),
 ('loving', 7.38642e-05),
 ('it', 1.0)]