## Proof of Concept
- generate multiple distinct regular expressions

In [1]:
# Interactive-session convenience only: switches on the completer's `greedy` option.
%config IPCompleter.greedy=True

In [2]:
import warnings

# Blanket filter: no category or module is given, so every warning raised after
# this point is suppressed for the rest of the session.
# NOTE(review): consider narrowing this to the specific warning being silenced.
warnings.filterwarnings('ignore')

In [3]:
import re
import numpy as np

# Record the library versions this notebook was executed with.
print('re:', re.__version__)
print('numpy:', np.__version__)

import sys

# Make the parent directory importable so the local `package` module resolves.
# Must run before the `package` imports below.
sys.path.append('..')

# Project-local building blocks: gene factory / hill climber from `package.ga`,
# and the integer -> bit-string -> regex transformers from `package.transformers`.
from package.ga import BinaryGeneFactory, AbstractFitness, SimpleHillClimber
from package.transformers import IntegerToBinaryString, StringToMapping, KeyArrayToRegex

re: 2.2.1
numpy: 1.16.3


In [4]:
%%html
<h4>1. document setup</h4>

In [5]:
# Sample document: one record per line, each ending in the number to extract.
# Parentheses give implicit string concatenation, so no line-continuation
# backslashes are needed (a trailing one would swallow the following line).
text = (
    'ab mn gd: 0.33\n'
    'cd mn gd: 0.44\n'
    'de mn gd: 0.55\n'
    'fg mn gd: 0.66\n'
)

# Fixed tail appended to every generated pattern: one non-word character followed
# by a captured run of digits/dots that contains at least one digit.
static_ending = r'\W([\d.]*\d[.\d]*)\b'

# Every number the static tail captures in the document, as floats; each one
# becomes the target of a separate search later in the notebook.
expected_numbers = [float(match) for match in re.findall(static_ending, text)]
print(expected_numbers)

[0.33, 0.44, 0.55, 0.66]


In [6]:
%%html
<h4>2. setup</h4>

In [7]:
# Literal characters a gene may decode to.
consts = 'abcdefghijklmnopqrstuvwxyz'
# Regex fragments a gene may decode to.
regexes = [
    r'\s',
    r'\d',
    r'[a-z]',
    r'[:]',
    r'[!?.]',
    r'\W'
]

# Full gene alphabet: every literal letter first, then the regex fragments.
complete_set = list(consts) + regexes

# Width of the binary key used for each gene; shared by the transformer and the factory.
gene_bits = 5
assert len(complete_set) <= 2 ** gene_bits, 'alphabet does not fit in gene_bits'

binary_start = 0
# Exclusive upper bound (generated values are < binary_end). It has to be
# len(complete_set), not len(complete_set) - 1, otherwise the last symbol
# (r'\W') can never be mapped or generated.
# NOTE(review): relies on BinaryGeneFactory treating the end as exclusive, as the
# original comment on this line stated — confirm against the factory.
binary_end = len(complete_set)

integer_to_binary_transformer = IntegerToBinaryString(gene_bits)
gene_factory = BinaryGeneFactory(binary_start, binary_end, gene_bits)

# Lookup from the transformer's key for index i to the i-th symbol of the alphabet.
binary_to_regex = {
    integer_to_binary_transformer.transform(index): symbol
    for index, symbol in enumerate(complete_set)
}

string_mapper = StringToMapping(binary_to_regex)
to_regex = KeyArrayToRegex(string_mapper)

In [8]:
from package.ga.setups import DynamicWithStaticEnding

# Mutator built on the shared gene factory; its `gene_mutator` and
# `individual_height_mutator` members are handed to the hill climber below.
mutator = DynamicWithStaticEnding.Mutator(gene_factory)

In [9]:
%%html
<h4>3. generate</h4>

In [10]:
# Run one independent hill-climbing search per expected number and collect the
# resulting (pattern, fitness) pairs in `output`.
# NOTE(review): no random seed is set anywhere in the visible cells, so the
# patterns found will presumably differ between runs.
output = []
number_of_iterations = 10000
initial_gene_count = 10  # number of genes requested for each starting individual

for expected_number in expected_numbers:
    individual = gene_factory.create_many(initial_gene_count)

    # The fitness evaluator is given this iteration's target number.
    fitness_evaluator = DynamicWithStaticEnding.Fitness(to_regex, static_ending, expected_number, text)
    hill_climber = SimpleHillClimber(
        fitness_evaluator,
        [mutator.gene_mutator],
        [mutator.individual_height_mutator],
    )

    # The meaning of the trailing False is defined by SimpleHillClimber.run — TODO confirm.
    result = hill_climber.run(individual, number_of_iterations, False)
    final_individual = result[0]
    final_fitness = result[1]
    final_iteration = result[2]

    # Build the full pattern once so the printed and the stored value are the same object.
    pattern = to_regex.transform_and_compress(final_individual) + static_ending

    # Printed in /pattern/flags notation for easy copy-paste.
    print(
        'compressed:',
        '/' + pattern + '/gimu',
        '~',
        final_fitness,
        '~',
        final_iteration,
        '==',
        expected_number,
    )

    output.append((pattern, final_fitness))

compressed: /b\s[a-z]n\s[a-z]+[:]\W([\d.]*\d[.\d]*)\b/gimu ~ 1.0 ~ 2939 == 0.33
compressed: /c[a-z]\smn\s[a-z]d[:]\W([\d.]*\d[.\d]*)\b/gimu ~ 1.0 ~ 7428 == 0.44
compressed: /e\s[a-z]n\s[a-z]+[:]\W([\d.]*\d[.\d]*)\b/gimu ~ 1.0 ~ 3218 == 0.55
compressed: /g\s[a-z]+\sg[a-z][:]\W([\d.]*\d[.\d]*)\b/gimu ~ 1.0 ~ 2829 == 0.66


In [11]:
# Show the source document again for comparison with the patterns printed above.
print(text)

ab mn gd: 0.33
cd mn gd: 0.44
de mn gd: 0.55
fg mn gd: 0.66

