In [1]:
import numpy as np
import pandas as pd
import random

from deap import base, creator, tools, algorithms
from scipy.stats import bernoulli
from bitstring import BitArray

import parameters
import ner

Using cuDNN version 6021 on context None
Mapped name None to device cuda: GeForce GTX 1070 (0000:01:00.0)


In [2]:
def replace(individual, indexes, value):
    """Overwrite a bit range of ``individual`` with the binary form of ``value``.

    Args:
        individual: mutable sequence of bits; it is modified in place.
        indexes: ``(start, stop)`` pair passed to ``range``; the bit field
            covers positions ``start`` up to (not including) ``stop``.
        value: integer to encode, zero-padded to ``stop - start`` digits.

    Returns:
        The same ``individual`` object, after modification.

    Raises:
        AssertionError: if the binary representation of ``value`` does not
            have exactly as many digits as there are positions in the range
            (for example when ``value`` is too large for the field).
    """
    field_width = indexes[1] - indexes[0]
    bit_string = format(value, '0' + str(field_width) + 'b')
    positions = list(range(*indexes))
    assert len(positions) == len(bit_string), 'Binary representation must be the same size as the indexes interval'
    for offset, position in enumerate(positions):
        # Each character of the bit string is '0' or '1'; store it as an int.
        individual[position] = int(bit_string[offset])
    return individual

In [3]:
def check_and_replace_if_needed(individual, indexes, min_value, max_value):
    """Clamp the integer stored in a bit range of ``individual`` to given bounds.

    The bits at ``individual[slice(*indexes)]`` are read as an unsigned
    integer through ``BitArray(...).uint``. When that integer is below
    ``min_value`` or above ``max_value`` a message is printed and the bit
    range is rewritten (via ``replace``) with the violated bound.

    Args:
        individual: mutable sequence of bits.
        indexes: ``(start, stop)`` pair delimiting the bit field.
        min_value: smallest allowed integer value for the field.
        max_value: largest allowed integer value for the field.

    Returns:
        ``individual`` itself when the value is already within bounds,
        otherwise whatever ``replace`` returns for the clamped value.
    """
    int_value = BitArray(individual[slice(*indexes)]).uint
    if int_value < min_value:
        message, bound = 'adjusting individual to lower bound, switching', min_value
    elif int_value > max_value:
        message, bound = 'adjusting individual to upper bound, switching', max_value
    else:
        # Already inside [min_value, max_value]: nothing to rewrite.
        return individual
    print(message, int_value, 'at position', indexes, 'for', bound)
    return replace(indexes=indexes, individual=individual, value=bound)

In [4]:
def adjust_individual(individual, min_char_lstm_dim=10, max_char_lstm_dim=50, min_word_lstm_dim=50, max_word_lstm_dim=200):
    """Force the two LSTM-dimension bit fields of ``individual`` into range.

    Bits ``2..7`` are checked against the char-LSTM bounds and bits ``9..16``
    against the word-LSTM bounds, in that order, each through
    ``check_and_replace_if_needed``.

    Args:
        individual: mutable sequence of bits.
        min_char_lstm_dim: lower bound for the field at ``(2, 8)``.
        max_char_lstm_dim: upper bound for the field at ``(2, 8)``.
        min_word_lstm_dim: lower bound for the field at ``(9, 17)``.
        max_word_lstm_dim: upper bound for the field at ``(9, 17)``.

    Returns:
        The individual returned by the last check.
    """
    # (bit range, lower bound, upper bound) for every constrained field.
    constraints = (
        ((2, 8), min_char_lstm_dim, max_char_lstm_dim),
        ((9, 17), min_word_lstm_dim, max_word_lstm_dim),
    )
    for bit_range, lower_bound, upper_bound in constraints:
        individual = check_and_replace_if_needed(individual=individual,
                                                 indexes=bit_range,
                                                 min_value=lower_bound,
                                                 max_value=upper_bound)
    return individual

In [5]:
def check_individual():
    """Build a decorator that range-adjusts the individual a function returns.

    Returns:
        A decorator; the decorated function is called with the original
        arguments and its return value is passed through
        ``adjust_individual`` before being handed back.
    """
    def decorate(generator):
        def checked(*args, **kargs):
            return adjust_individual(individual=generator(*args, **kargs))
        return checked
    return decorate

In [6]:
def check_mate():
    """Build a decorator that range-adjusts both children of a crossover.

    Returns:
        A decorator; the decorated function must return exactly two
        individuals, each of which is passed through ``adjust_individual``
        (first child, then second). The wrapper returns them as a 2-tuple.
    """
    def decorate(crossover):
        def checked(*args, **kargs):
            first, second = crossover(*args, **kargs)
            return (adjust_individual(first), adjust_individual(second))
        return checked
    return decorate

In [7]:
def check_mutation():
    """Build a decorator that range-adjusts the result of a mutation operator.

    DEAP mutation operators return a tuple containing the mutant, and
    ``algorithms.varAnd`` unpacks that tuple with
    ``offspring[i], = toolbox.mutate(offspring[i])``. The wrapper therefore
    has to hand back a tuple as well; returning the bare individual makes
    that unpacking fail with ``ValueError`` as soon as a mutation is applied.

    Returns:
        A decorator; the decorated function's returned individuals are each
        passed through ``adjust_individual`` and returned as a tuple of the
        same length.
    """
    def decorator(func):
        def wrapper(*args, **kargs):
            mutants = func(*args, **kargs)
            # Keep the tuple shape required by the DEAP algorithms.
            return tuple(adjust_individual(mutant) for mutant in mutants)
        return wrapper
    return decorator

In [8]:
def train_evaluate(ga_individual_solution):
    """Fitness function: train an NER model described by a GA individual.

    The individual is turned into a parameter dictionary by
    ``parameters.get_parameters_from_individual`` (together with fixed
    dataset paths and training options), the decoded settings are printed,
    and an ``ner.NER`` model is trained for 5 epochs.

    Args:
        ga_individual_solution: the individual to evaluate.

    Returns:
        A one-element tuple holding the value returned by ``model.train``
        (presumably the F1 score on the dev set -- confirm against ``ner``).
        The tuple shape matches the single-objective fitness used by DEAP.
    """
    hyperparameters = parameters.get_parameters_from_individual(
        ga_individual_solution=ga_individual_solution,
        train='dataset/train.txt',
        dev='dataset/dev.txt',
        test='dataset/test.txt',
        tag_scheme='iob',
        char_dim=25,
        word_dim=100,
        pre_emb=None,
        all_emb=False,
        crf=True,
        dropout=0.5,
        lr_method='sgd-lr_.005',
        reload=False)
    # Log the settings that were decoded from the individual.
    print('Lower text?', hyperparameters['lower'],
          'Replace digits with zero?', hyperparameters['zeros'],
          'Dimension of LSTM for chars:', hyperparameters['char_lstm_dim'],
          'Use bidirectional LSTM for chars?', hyperparameters['char_bidirect'],
          'Dimension of LSTM for words:', hyperparameters['word_lstm_dim'],
          'Use bidirectional LSTM for words?', hyperparameters['word_bidirect'],
          'Dimension of capitalization features:', hyperparameters['cap_dim'])
    tagger = ner.NER(parameters=hyperparameters)
    score = tagger.train(n_epochs=5, verbose=False)
    return (score,)

In [9]:
# Genetic-algorithm configuration.
population_size = 4   # individuals per generation
num_generations = 4   # generations run by the evolutionary loop
gene_length = 19      # bits per individual

# Seed both random sources so the evolutionary operators are reproducible:
# DEAP's operators draw from the stdlib ``random`` module, while the initial
# bits come from ``scipy.stats.bernoulli.rvs`` (NumPy's global generator).
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

In [None]:
# Single-objective maximisation: the fitness is the one-element tuple
# returned by train_evaluate.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# An individual is a list of gene_length random bits (p=0.5 each), passed
# through check_individual so the LSTM-dimension fields start within bounds.
toolbox.register('binary', bernoulli.rvs, 0.5)
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.binary, n=gene_length)
toolbox.decorate('individual', check_individual())
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

# Two-point crossover is used because the genome is a bit string.
# tools.cxOrdered is a permutation operator: it expects individuals that are
# sequences of indices and its result on any other encoding is documented as
# unpredictable.
toolbox.register('mate', tools.cxTwoPoint)
toolbox.decorate('mate', check_mate())
# NOTE(review): mutShuffleIndexes only moves existing bits around (it never
# flips one); tools.mutFlipBit may be a better fit for a binary genome.
toolbox.register('mutate', tools.mutShuffleIndexes, indpb=0.6)
toolbox.decorate('mutate', check_mutation())
toolbox.register('select', tools.selTournament, tournsize=2)
toolbox.register('evaluate', train_evaluate)

In [None]:
# Create the initial population, then run DEAP's simple evolutionary loop.
population = toolbox.population(n=population_size)
r = algorithms.eaSimple(
    population,
    toolbox,
    cxpb=0.4,    # crossover probability
    mutpb=0.1,   # mutation probability
    ngen=num_generations,
    verbose=False,
)

('adjusting individual to upper bound, switching', 63, 'at position', (2, 8), 'for', 50)
('adjusting individual to upper bound, switching', 57, 'at position', (2, 8), 'for', 50)
('adjusting individual to upper bound, switching', 229, 'at position', (9, 17), 'for', 200)
('Lower text?', True, 'Replace digits with zero?', False, 'Dimension of LSTM for chars:', 38, 'Use bidirectional LSTM for chars?', True, 'Dimension of LSTM for words:', 130, 'Use bidirectional LSTM for words?', False, 'Dimension of capitalization features:', 0)
Model location: ./models/tag_scheme=iob,lower=True,zeros=False,char_dim=25,char_lstm_dim=38,char_bidirect=True,word_dim=100,word_lstm_dim=130,word_bidirect=False,pre_emb=None,all_emb=False,cap_dim=0,crf=True,dropout=0.5,lr_method=sgd-lr_.005
Found 12769 unique words (62480 in total)
Found 123 unique characters
Found 21 unique named entity tags
2955 / 529 / 854 sentences in train / dev / test.
Saving the mappings to disk...
Compiling...
('Starting epoch 0 at...', '

Epoch 2 done. Average cost: 7.274146
('Starting epoch 3 at...', 'Thu Jan 25 16:39:44 2018')
processed 8831 tokens with 709 phrases; found: 579 phrases; correct: 206.
accuracy:  88.05%; precision:  35.58%; recall:  29.06%; FB1:  31.99
       ABSTRACCAO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
    ACONTECIMENTO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            COISA: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            LOCAL: precision:  31.60%; recall:  57.25%; FB1:  40.72  250
             OBRA: precision:   0.00%; recall:   0.00%; FB1:   0.00  20
      ORGANIZACAO: precision:  22.73%; recall:  12.50%; FB1:  16.13  22
            OUTRO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
           PESSOA: precision:  44.77%; recall:  51.44%; FB1:  47.87  239
            TEMPO: precision:  31.25%; recall:  12.30%; FB1:  17.65  48
            VALOR: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
7776/8831 (88.05345%)
Score on dev: 31.99000
New 

Epoch 0 done. Average cost: 16.271173
('Starting epoch 1 at...', 'Thu Jan 25 16:43:12 2018')
processed 8831 tokens with 709 phrases; found: 0 phrases; correct: 0.
accuracy:  84.58%; precision:   0.00%; recall:   0.00%; FB1:   0.00
       ABSTRACCAO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
    ACONTECIMENTO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            COISA: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            LOCAL: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
             OBRA: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
      ORGANIZACAO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            OUTRO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
           PESSOA: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            TEMPO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            VALOR: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
7469/8831 (84.57706%)
Score on dev: 0.00000
processed 8831 

Epoch 3 done. Average cost: 5.737442
('Starting epoch 4 at...', 'Thu Jan 25 16:46:51 2018')
processed 8831 tokens with 709 phrases; found: 614 phrases; correct: 269.
accuracy:  89.64%; precision:  43.81%; recall:  37.94%; FB1:  40.67
       ABSTRACCAO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
    ACONTECIMENTO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            COISA: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            LOCAL: precision:  48.60%; recall:  37.68%; FB1:  42.45  107
             OBRA: precision:   0.00%; recall:   0.00%; FB1:   0.00  2
      ORGANIZACAO: precision:  24.56%; recall:  35.00%; FB1:  28.87  57
            OUTRO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
           PESSOA: precision:  45.21%; recall:  72.60%; FB1:  55.72  334
            TEMPO: precision:  45.13%; recall:  41.80%; FB1:  43.40  113
            VALOR: precision: 100.00%; recall:   2.44%; FB1:   4.76  1
7916/8831 (89.63877%)
Score on dev: 40.67000
New 

Epoch 1 done. Average cost: 9.633136
('Starting epoch 2 at...', 'Thu Jan 25 16:50:29 2018')
processed 8831 tokens with 709 phrases; found: 341 phrases; correct: 153.
accuracy:  87.49%; precision:  44.87%; recall:  21.58%; FB1:  29.14
       ABSTRACCAO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
    ACONTECIMENTO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            COISA: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            LOCAL: precision:  38.59%; recall:  51.45%; FB1:  44.10  184
             OBRA: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
      ORGANIZACAO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            OUTRO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
           PESSOA: precision:  61.64%; recall:  21.63%; FB1:  32.03  73
            TEMPO: precision:  44.05%; recall:  30.33%; FB1:  35.92  84
            VALOR: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
7726/8831 (87.48726%)
Score on dev: 29.14000
New bes

Epoch 4 done. Average cost: 4.773287
('Lower text?', True, 'Replace digits with zero?', False, 'Dimension of LSTM for chars:', 50, 'Use bidirectional LSTM for chars?', True, 'Dimension of LSTM for words:', 200, 'Use bidirectional LSTM for words?', False, 'Dimension of capitalization features:', 1)
Model location: ./models/tag_scheme=iob,lower=True,zeros=False,char_dim=25,char_lstm_dim=50,char_bidirect=True,word_dim=100,word_lstm_dim=200,word_bidirect=False,pre_emb=None,all_emb=False,cap_dim=1,crf=True,dropout=0.5,lr_method=sgd-lr_.005
Found 12769 unique words (62480 in total)
Found 123 unique characters
Found 21 unique named entity tags
2955 / 529 / 854 sentences in train / dev / test.
Saving the mappings to disk...
Compiling...
('Starting epoch 0 at...', 'Thu Jan 25 16:53:54 2018')
processed 8831 tokens with 709 phrases; found: 0 phrases; correct: 0.
accuracy:  84.58%; precision:   0.00%; recall:   0.00%; FB1:   0.00
       ABSTRACCAO: precision:   0.00%; recall:   0.00%; FB1:   0.00 

Epoch 2 done. Average cost: 7.126061
('Starting epoch 3 at...', 'Thu Jan 25 16:56:40 2018')
processed 8831 tokens with 709 phrases; found: 625 phrases; correct: 229.
accuracy:  88.02%; precision:  36.64%; recall:  32.30%; FB1:  34.33
       ABSTRACCAO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
    ACONTECIMENTO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            COISA: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            LOCAL: precision:  43.45%; recall:  45.65%; FB1:  44.52  145
             OBRA: precision:   7.14%; recall:   3.85%; FB1:   5.00  28
      ORGANIZACAO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
            OUTRO: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
           PESSOA: precision:  38.40%; recall:  66.83%; FB1:  48.77  362
            TEMPO: precision:  27.78%; recall:  20.49%; FB1:  23.58  90
            VALOR: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
7773/8831 (88.01948%)
Score on dev: 34.33000
New b

In [None]:
# Pick the single fittest individual of the population and decode its bits.
best_individuals = tools.selBest(population, k=1)

# Decoded settings; they stay None if no individual was selected.
lower = zeros = None
char_lstm_dim = char_bidirect = None
word_lstm_dim = word_bidirect = None
cap_dim = None

for bi in best_individuals:
    # Bit layout: 0 lower | 1 zeros | 2-7 char LSTM dim | 8 char bidirect |
    #             9-16 word LSTM dim | 17 word bidirect | 18 cap dim
    lower_bit, zeros_bit = bi[0], bi[1]
    lower, zeros = (lower_bit == 1), (zeros_bit == 1)

    char_lstm_dim_bits = BitArray(bi[2:8])
    char_lstm_dim = char_lstm_dim_bits.uint
    char_bidirect_bit = bi[8]
    char_bidirect = (char_bidirect_bit == 1)

    word_lstm_dim_bits = BitArray(bi[9:17])
    word_lstm_dim = word_lstm_dim_bits.uint
    word_bidirect_bit = bi[17]
    word_bidirect = (word_bidirect_bit == 1)

    cap_dim_bit = bi[18]
    cap_dim = cap_dim_bit

    print('Lower text?', lower,
          'Replace digits with zero?', zeros,
          'Dimension of LSTM for chars:', char_lstm_dim,
          'Use bidirectional LSTM for chars?', char_bidirect,
          'Dimension of LSTM for words:', word_lstm_dim,
          'Use bidirectional LSTM for words?', word_bidirect,
          'Dimension of capitalization features:', cap_dim,
          'F1 score:', bi.fitness.values)

In [None]:
# Scratch: run the registered selection operator, drawing as many individuals
# as the population holds; the result is displayed as the cell output.
toolbox.select(population, len(population))

In [None]:
# Scratch: build a fresh 5-individual population (this rebinds `population`)
# and inspect its type and contents.
population = toolbox.population(n=5)
print(type(population), population)

In [None]:
# Scratch: apply the registered crossover (wrapped by check_mate) to the
# first two individuals.
children = toolbox.mate(population[0], population[1])

In [None]:
# Scratch: the crossover wrapper returns a pair of children.
child1, child2 = children

In [None]:
# Scratch: apply the registered mutation (wrapped by check_mutation) to child1.
mutant = toolbox.mutate(child1)

In [None]:
print(mutant)
# DEAP mutation operators return a one-element tuple holding the mutant, while
# the evaluation function expects the individual itself; unwrap if needed so
# this cell works with either return shape.
mutant_individual = mutant[0] if isinstance(mutant, tuple) else mutant
toolbox.evaluate(mutant_individual)

In [None]:
# Extract the individual from the mutation result. Indexing is only correct
# when `mutant` is the tuple returned by a DEAP mutation operator; on a bare
# individual, mutant[0] would be its first bit.
mutant1 = mutant[0] if isinstance(mutant, tuple) else mutant

In [None]:
# Scratch: compare the type of the mutation result with the extracted element.
print(type(mutant), type(mutant1))
print(mutant1)

In [None]:
# Scratch: show child1 after it was passed to the mutation operator.
print(child1)

In [None]:
# Scratch: show the mutation result again for comparison with child1.
print(mutant)

In [None]:
# tools.selTournament requires the number of individuals to select (k);
# only tournsize was bound at registration, so k must be passed here.
toolbox.select(population, len(population))