## Proof of Concept

In [1]:
# Turn on IPython's greedy tab-completion for this session.
%config IPCompleter.greedy=True

In [2]:
import sys
import warnings

# Silence every warning for the rest of the session so the cell outputs stay
# readable. NOTE(review): this also hides warnings that could matter while
# debugging; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Put the parent of the working directory on the import path so the
# `package.*` imports used by this notebook resolve. The membership check
# keeps the cell idempotent: re-running it no longer appends duplicates.
if '..' not in sys.path:
    sys.path.append('..')

In [3]:
import re
import numpy as np

In [4]:
# Synthetic sample text used throughout this notebook: a block of
# "key:  sentence" lines (two of them embed the assignments `xyz = 50` and
# `mm = 51`), followed by two grids of "key: number" pairs.
document = """

  mother:  Tree either call task until red push toward produce property if green eight set.
  service:  Everybody market argue center crime approach actually they would artist system and.
  everyone:  Because do guy xyz = 50 claim sit reveal walk.
  grow:  Security improve situation we later pull save thank not edge in.
  state:  Probably money long six religious manage hear pass best work.
  treatment:  Situation of water though it war.
  report:  Final factor last care would I eight four.
  American:  Magazine will mm = 51 third itself direction well party local concern site law word response.
  instead:  After front speech hope turn decade both worry.
  type:  Stop small job gas way memory south environment hundred lawyer staff bring decision option third full.

     aabc: 0     aefg: 10     ahij: 20    
     babc: 1     befg: 11     bhij: 21    
     cabc: 2     cefg: 12     chij: 22    
     dabc: 3     defg: 13     dhij: 23    

     eabc: 4     eefg: 14     ehij: 24    
     fabc: 5     fefg: 15     fhij: 25    
     gabc: 6     gefg: 16     ghij: 26    
     habc: 7     hefg: 17     hhij: 27    

"""

In [5]:
%%html

<h5>1. Split the document into sections</h5>

In [6]:
from package.transformers import DocumentToMatrix
from package.estimators import CorrelationEstimator
from package.splitters import SingleThresholdSplitter

In [7]:
# Wrap the raw text in the project's document-to-matrix transformer. The same
# transformer instance is later handed to the estimator and its
# `lines_in_document` attribute is fed to the splitter.
document_transformer = DocumentToMatrix.BasicDocumentToMatrix(document)
# NOTE(review): `document_as_matrix` is not referenced again in the visible
# cells; presumably transform() must still run to populate state on the
# transformer — confirm before removing.
document_as_matrix = document_transformer.transform()

In [8]:
estimator = CorrelationEstimator(document_transformer)

# Collapse whatever array shape evaluate() yields into one flat Python list.
evaluation = estimator.evaluate()
y = np.array(evaluation).ravel().tolist()

In [9]:
# Threshold passed to the single-threshold splitter together with `y`.
split_threshold = 0.35

splitter = SingleThresholdSplitter(y, split_threshold)
sections = splitter.split_document(document_transformer.lines_in_document)

section_count = len(sections)
print('# of sections:', section_count)

# of sections: 2


In [10]:
%%html

<h5>2. Build regular expressions per section</h5>

In [11]:
from package.ga import BinaryGeneFactory, AbstractFitness, SimpleHillClimber
from package.transformers import IntegerToBinaryString, StringToMapping, KeyArrayToRegex
from package.ga.setups import DynamicWithStaticEnding

In [12]:
# Fixed tail appended to every generated expression: a non-word character,
# a captured run of digits/dots containing at least one digit, and another
# non-word character.
static_ending = r'\W([\d.]*\d[.\d]*)\W'

# Tokens a gene can decode to: each lowercase letter as a literal, followed by
# a handful of character classes.
consts = 'abcdefghijklmnopqrstuvwxyz'
regexes = [
    r'\s',
    r'\d',
    r'[a-z]',
    r'[:=]',
    r'[!?.]',
    r'\W'
]

complete_set = list(consts) + regexes

# Passed to both the integer-to-binary transformer and the gene factory;
# presumably the width of one binary gene string (2**5 == 32 == number of
# tokens) — TODO confirm against the package.
bits_per_gene = 5

binary_start = 0
# Upper bound handed to the gene factory; per the original author's note it is
# a hard end, i.e. generated values are < binary_end.
# NOTE(review): with an exclusive bound of len - 1 the factory never produces
# the index of the last token (r'\W'); confirm whether that is intended.
binary_end = len(complete_set) - 1

integer_to_binary_transformer = IntegerToBinaryString(bits_per_gene)
gene_factory = BinaryGeneFactory(binary_start, binary_end, bits_per_gene)

# Map the binary key of every index to its token. The previous
# `range(binary_end)` loop stopped one short and silently left the last token
# of `complete_set` without a key.
binary_to_regex = {
    integer_to_binary_transformer.transform(index): token
    for index, token in enumerate(complete_set)
}

string_mapper = StringToMapping(binary_to_regex)
to_regex = KeyArrayToRegex(string_mapper)

In [13]:
# Mutation operators for the hill climber, built on the same gene factory that
# creates the starting individuals.
mutator = DynamicWithStaticEnding.Mutator(gene_factory)

In [14]:
# For every section, learn one regular expression per number found in it.
# `output` ends up as a list (one entry per section) of lists of expressions.
# NOTE(review): no random seed is set in the visible cells; if the gene
# factory or mutator draw random numbers, results differ between runs.
output = []
number_of_iterations = 10000

# Loop-invariant: the same tail pattern locates the target numbers everywhere.
number_pattern = re.compile(static_ending)

for section_index, text in enumerate(sections):
    regex_section_expressions = []

    # Every number the static ending captures in this section is a target.
    expected_numbers = [float(n) for n in number_pattern.findall(text)]

    for expected_number in expected_numbers:
        # Starting point for the search: an individual of 10 genes.
        individual = gene_factory.create_many(10)

        fitness_evaluator = DynamicWithStaticEnding.Fitness(to_regex, static_ending, expected_number, text)
        hill_climber = SimpleHillClimber(fitness_evaluator, [ mutator.gene_mutator ], [ mutator.individual_height_mutator ])

        # NOTE(review): the meaning of the trailing `False` flag is not
        # visible from this notebook — confirm against SimpleHillClimber.run.
        result = hill_climber.run(individual, number_of_iterations, False)
        final_individual, final_fitness, final_iteration = result[0], result[1], result[2]

        # Build the full expression once and reuse it for both the report
        # line and the stored result (it used to be computed twice).
        section_regex = to_regex.transform_and_compress(final_individual) + static_ending

        print(
            'compressed:',
            '/' + section_regex + '/gimu',
            '~',
            final_fitness,
            '~',
            final_iteration,
            '==',
            expected_number,
        )

        regex_section_expressions.append(section_regex)

    print()
    print('- section', section_index+1, 'complete -')
    print()

    output.append(regex_section_expressions)

compressed: /guy\s[a-z]{3}\s[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 1404 == 50.0
compressed: /n[a-z]\s[a-z]{4}\s[a-z]m\s[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 2287 == 51.0

- section 1 complete -

compressed: /a[a-z]{2}c[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 3502 == 0.0
compressed: /\sa[a-z]f[a-z][:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 1825 == 10.0
compressed: /\d{2}\s{5}a[a-z]ij[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 4646 == 20.0
compressed: /\s{2}ba[a-z]c[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 1565 == 1.0
compressed: /b[a-z]f[a-z][:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 1959 == 11.0
compressed: /\s{5}bh[a-z]j[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 6001 == 21.0
compressed: /\s{2}ca[a-z]{2}[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 2786 == 2.0
compressed: /ce[a-z]g[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 988 == 12.0
compressed: /c[a-z]ij[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 1434 == 22.0
compressed: /dabc[:=]+\W([\d.]*\d[.\d]*)\W/gimu ~ 1.0 ~ 572 == 3.0
compressed: /\s{5}de[a-z]{2}[:=]+\W([\d.]*\d[

In [15]:
# Replay every learned expression against the complete document and show what
# it captures. The outer loop variable is deliberately not called `regexes`:
# notebook loop variables are global, and that name would overwrite the token
# list defined earlier for building the regex alphabet.
for section_regexes in output:
    for regex in section_regexes:
        match = re.findall(regex, document)
        print(regex, ' - ', match)

    print()
    print('- section complete -')
    print()

guy\s[a-z]{3}\s[:=]+\W([\d.]*\d[.\d]*)\W  -  ['50']
n[a-z]\s[a-z]{4}\s[a-z]m\s[:=]+\W([\d.]*\d[.\d]*)\W  -  ['51']

- section complete -

a[a-z]{2}c[:=]+\W([\d.]*\d[.\d]*)\W  -  ['0']
\sa[a-z]f[a-z][:=]+\W([\d.]*\d[.\d]*)\W  -  ['10']
\d{2}\s{5}a[a-z]ij[:=]+\W([\d.]*\d[.\d]*)\W  -  ['20']
\s{2}ba[a-z]c[:=]+\W([\d.]*\d[.\d]*)\W  -  ['1']
b[a-z]f[a-z][:=]+\W([\d.]*\d[.\d]*)\W  -  ['11']
\s{5}bh[a-z]j[:=]+\W([\d.]*\d[.\d]*)\W  -  ['21']
\s{2}ca[a-z]{2}[:=]+\W([\d.]*\d[.\d]*)\W  -  ['2']
ce[a-z]g[:=]+\W([\d.]*\d[.\d]*)\W  -  ['12']
c[a-z]ij[:=]+\W([\d.]*\d[.\d]*)\W  -  ['22']
dabc[:=]+\W([\d.]*\d[.\d]*)\W  -  ['3']
\s{5}de[a-z]{2}[:=]+\W([\d.]*\d[.\d]*)\W  -  ['13']
\s{2}dh[a-z]j[:=]+\W([\d.]*\d[.\d]*)\W  -  ['23']
\se[a-z]b[a-z][:=]+\W([\d.]*\d[.\d]*)\W  -  ['4']
e{2}f[a-z][:=]+\W([\d.]*\d[.\d]*)\W  -  ['14']
\seh[a-z]j[:=]+\W([\d.]*\d[.\d]*)\W  -  ['24']
f[a-z]b[a-z][:=]+\W([\d.]*\d[.\d]*)\W  -  ['5']
fe[a-z]g[:=]+\W([\d.]*\d[.\d]*)\W  -  ['15']
\s{4}fhij[:=]+\W([\d.]*\d[.\d]*)\W  -  ['2