## Proof of Concept
- generating a regular expression
- an individual has a dynamic length: it can grow or shrink

In [1]:
%config IPCompleter.greedy=True

In [2]:
import warnings

# Suppress every warning for the rest of the session (presumably to keep
# the cell outputs below uncluttered).
warnings.filterwarnings('ignore')

In [3]:
import re
import numpy as np

# Print the versions of the regex module and numpy this notebook runs with.
print('re:', re.__version__)
print('numpy:', np.__version__)

re: 2.2.1
numpy: 1.16.3


In [4]:
import sys

# Add the parent directory to the import path so that the project-local
# `package` module imported below can be resolved.
sys.path.append('..')

from package.ga import BinaryGeneFactory, AbstractFitness, SimpleHillClimber
from package.transformers import IntegerToBinaryString, StringToMapping, KeyArrayToRegex

In [5]:
%%html
<h4>1. text, expected text</h4>

In [6]:
# 1. Build the sample text; `expected_text` is the substring the generated
#    regex is supposed to find inside it.
expected_text = 'cost: 1500'
expected_text_length = len(expected_text)

# Three lines joined by newlines (no trailing newline); only the second
# line contains `expected_text`.
text = '\n'.join([
    'a bird in hand is worth two in the bush?',
    'these watches ' + expected_text + '!',
    'the ball is in court 1500.',
])

print(text)

a bird in hand is worth two in the bush?
these watches cost: 1500!
the ball is in court 1500.


In [7]:
%%html
<h4>2. setup</h4>

In [8]:
# Building blocks an individual can be made of: every lowercase letter as a
# literal, followed by a handful of regex character classes.
consts = 'abcdefghijklmnopqrstuvwxyz'
regexes = [r'\s', r'\d', r'[a-z]', r'[:]', r'[!?.]', r'[0-9]']

complete_set = list(consts) + regexes

binary_start = 0
binary_end = len(complete_set) - 1  # hard end, values < binary_end

# Genes are handled as binary strings; 5 is passed to both the transformer
# and the factory (presumably the bit width of one gene - TODO confirm).
integer_to_binary_transformer = IntegerToBinaryString(5)
gene_factory = BinaryGeneFactory(binary_start, binary_end, 5)

# Map each binary key to its building block.
# NOTE(review): only indices 0 .. binary_end - 1 receive a key, so the last
# entry of `complete_set` (r'[0-9]') is never mapped - confirm this is intended.
binary_to_regex = {
    integer_to_binary_transformer.transform(index): token
    for index, token in enumerate(complete_set[:binary_end])
}

string_mapper = StringToMapping(binary_to_regex)
to_regex = KeyArrayToRegex(string_mapper)

In [9]:
class Fitness(AbstractFitness):
    """Scores how closely an individual's regex reproduces an expected match.

    The score is the mean of three rules, each contributing at most 1.0:

    1. how close the number of regex tokens is to the length of the
       expected match;
    2. the fraction of expected characters that are matched by the token
       sitting at the same position;
    3. the fraction of expected characters reproduced, position by
       position, by the first match of the assembled regex inside ``text``
       (contributes nothing when the regex does not match at all).
    """

    # Class-level defaults, kept for backward compatibility; every instance
    # overrides them in __init__.
    to_regex = None
    expected_match = ''

    def __init__(self, to_regex, expected_match, text):
        """
        :param to_regex: transformer providing ``transform_to_array`` and
            ``transform`` for an individual
        :param expected_match: the string the generated regex should find
        :param text: the text the assembled regex is searched in
        :raises ValueError: if ``expected_match`` is empty, because rules 2
            and 3 divide by its length
        """
        if len(expected_match) == 0:
            raise ValueError('expected_match must not be empty')

        # A bare `super()` only builds a proxy object; the parent initialiser
        # has to be invoked explicitly.
        super().__init__()

        self.to_regex = to_regex
        self.expected_match = expected_match
        self.text = text

    def evaluate(self, individual, display_logging = False):
        """Returns the fitness of ``individual``.

        :param individual: the individual to score; it is handed to
            ``to_regex`` to obtain both the token list and the full regex
        :param display_logging: when true, prints the running total after
            each rule (rule 3 only when the regex matched something)
        :return: the sum of the three rule scores divided by 3
        """
        regexes = self.to_regex.transform_to_array(individual)

        regexes_length = len(regexes)
        expected_text_length = len(self.expected_match)

        ## 1. regex is the same length; otherwise score the relative
        ##    difference against the longer of the two lengths
        if regexes_length == expected_text_length:
            fitness = 1.0
        elif expected_text_length > regexes_length:
            fitness = 1 - ((expected_text_length - regexes_length) / expected_text_length)
        else:
            fitness = 1 - ((regexes_length - expected_text_length) / regexes_length)

        if display_logging:
            print('rule 1:', fitness)

        ## 2. each token matches the expected character at its position;
        ##    zip stops at the shorter sequence, so surplus tokens are ignored
        token_hits = np.sum(
            [
                re.match(token, character) is not None
                for token, character
                in zip(regexes, self.expected_match)
            ],
            dtype=int
        )

        fitness += (token_hits / expected_text_length)

        if display_logging:
            print('rule 2:', fitness)

        ## 3. the first match of the full regex in the text reproduces the
        ##    expected string character by character
        pattern = re.compile(self.to_regex.transform(individual))
        matches = pattern.findall(self.text)
        if matches:
            against_first_match = matches[0]

            character_hits = np.sum(
                [
                    expected == actual
                    for expected, actual
                    in zip(self.expected_match, against_first_match)
                ],
                dtype=int
            )

            fitness += (character_hits / expected_text_length)

            if display_logging:
                print('rule 3:', fitness)

        return fitness / 3
    
# Evaluator instance that scores individuals against `expected_text`,
# searching for it in `text`.
fitness_evaluator = Fitness(to_regex, expected_text, text)

In [10]:
def gene_mutator(gene, display_logging = False):
    """Replaces a gene with a freshly created one on a random draw below .08.

    :param gene: the gene to (possibly) replace
    :param display_logging: not used by this mutator
    :return: a new gene from ``gene_factory`` when the draw is below .08,
        otherwise ``gene`` itself
    """
    roll = np.random.rand()
    return gene_factory.create() if roll < .08 else gene

def individual_height_mutator(individual, display_logging = False):
    """Randomly grows or shrinks an individual by one gene.

    A single random draw decides the outcome: below .10 a new gene from
    ``gene_factory`` is appended, above .90 the last gene is dropped (when
    there is one), otherwise the individual is returned unchanged.

    The list passed in is never modified; growing and shrinking both return
    a new list.

    :param individual: list of genes
    :param display_logging: not used by this mutator
    :return: the grown or shrunk copy, or ``individual`` itself when the
        draw triggers neither change
    """
    percentage = np.random.rand()

    if percentage < .10:
        # Concatenate instead of `+=`: in-place extension would also change
        # the caller's list, unlike the shrink path below.
        return individual + [gene_factory.create()]

    if percentage > .90 and len(individual) > 0:
        return individual[:-1]

    return individual

# Hill climber driven by the fitness evaluator; the gene-level mutator and
# the individual-level mutator are passed as two separate lists.
hill_climber = SimpleHillClimber(fitness_evaluator, [ gene_mutator ], [ individual_height_mutator ])

In [11]:
%%html
<h4>3. create individual</h4>

In [12]:
# Starting point for the search: four genes from the factory.
individual = gene_factory.create_many(4)

initial_binary = '|'.join(individual)
print('binary:', initial_binary)

# Shown in /pattern/flags notation for readability only.
initial_regex = ''.join(to_regex.transform_and_compress(individual))
print('regex: ', '/' + initial_regex + '/gimu')

binary: 01011|00011|10011|01010
regex:  /ldtk/gimu


In [13]:
%%html
<h4>4. run</h4>

In [14]:
number_of_iterations = 50000
result = hill_climber.run(individual, number_of_iterations)

# The result is read by position; what each slot holds is taken from the
# names used here (individual, fitness, iteration).
final_individual, final_fitness, final_iteration = result[0], result[1], result[2]

# Report the evolved regex next to the target string, its fitness and the
# iteration value returned by the run.
final_regex = '/' + to_regex.transform_and_compress(final_individual) + '/gimu'
print('compressed:', final_regex, '~', '"' + expected_text + '"', '~', final_fitness, '~', final_iteration)

# For comparison, the regex of the individual the run started from.
original_regex = '/' + to_regex.transform_and_compress(individual) + '/gimu'
print('original:', original_regex)

compressed: /[a-z]+t[:]\s\d+/gimu ~ "cost: 1500" ~ 1.0 ~ 1045
original: /ldtk/gimu
