In [1]:
from metasynth.distribution.regex2 import RegexNewDistribution
from metasynth.distribution.string import RegexDistribution, DigitRegex, SingleRegex, AnyRegex, UppercaseRegex, find_best_gradient
from metasynth.distribution.util import RegexOptimizer, get_energy
import pandas as pd
from metasynth import MetaDataset
import wget
from pathlib import Path
import json
import string
import re
import numpy as np

In [2]:
# Explicit column dtypes handed to read_csv; columns not listed here are left
# to pandas' own inference.
dtypes = {
    "Survived": "category",
    "Pclass": "category",
    "Name": "string",
    "Sex": "category",
    "SibSp": "category",
    "Parch": "category",
    "Ticket": "string",
    "Cabin": "string",
    "Embarked": "category"
}
titanic_fp = Path("titanic.csv")
if not titanic_fp.is_file():
    # Download only when there is no local copy. The target is passed
    # explicitly so the file is written to the same path that read_csv opens,
    # instead of relying on the name wget derives from the URL.
    wget.download(
        "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv",
        out=str(titanic_fp),
    )
df = pd.read_csv(titanic_fp, dtype=dtypes)

In [3]:
#regex = RegexDistribution.fit(df["Cabin"])

In [4]:
# Cabin entries with missing rows dropped, taken as the column's underlying array (`.values`).
values = df["Cabin"].dropna().values

In [5]:
def create_spans(values):
    """Return whatever ``DigitRegex.all_spans`` produces for ``values``.

    Thin convenience wrapper: the argument is forwarded as-is and the result
    is handed back unchanged.
    """
    digit_spans = DigitRegex.all_spans(values)
    return digit_spans

In [6]:
# Same call that create_spans() wraps, applied to the current `values`
# (the Cabin array when the cells are run top to bottom).
spans = DigitRegex.all_spans(values)

In [7]:
# Build an optimizer from the values and their digit spans, then display its
# energy before any optimization step has been run.
optimizer = RegexOptimizer(values, spans)
optimizer.energy

50.23065693700087

In [8]:
# NOTE(review): the meaning of the two arguments is not visible in this notebook —
# presumably (value index, span index); confirm against RegexOptimizer.set_span.
# In the recorded run the energy shown before and after this call is identical.
optimizer.set_span(0, 0)

In [9]:
# Re-display the energy after the set_span(0, 0) call.
optimizer.energy

50.23065693700087

In [10]:
# Run the optimizer; the energy displayed in the following cell is lower than
# the starting energy in the recorded run.
optimizer.optimize()

In [11]:
# Energy after optimize() has been run.
optimizer.energy

43.45227405390325

In [12]:
# Energy that get_energy reports for the values alone (no spans are passed),
# shown as a reference point next to the optimizer energies.
get_energy(values)

99.38816110875968

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
def create_spans_char(values, *characters):
    """Locate every occurrence of the given characters in each value.

    Parameters
    ----------
    values:
        Iterable of strings to search.
    *characters:
        Strings whose characters should be matched. They are regex-escaped and
        concatenated into a single character class, so a match is any one
        character that appears in any of the arguments.

    Returns
    -------
    list of list of tuple
        One list per value, holding the ``(start, end)`` span of each match in
        order of occurrence. When no characters are supplied (or only empty
        strings), every value gets an empty list.
    """
    char_list = "".join(re.escape(c) for c in characters)
    if not char_list:
        # "[]" is not a valid pattern (unterminated character set); with
        # nothing to look for there are simply no spans.
        return [[] for _ in values]
    regex = re.compile("[" + char_list + "]")
    return [
        [match.span() for match in regex.finditer(val)]
        for val in values
    ]


In [15]:
def information_budget(n_char):
    """Return ``2 * (n_char + 1) + 2 * log(n_char)``.

    The parameter count is taken to be ``n_char + 1``; the natural logarithm
    of ``n_char`` is added on top. Elsewhere in this notebook the result is
    used as the divisor that normalizes an energy improvement.
    """
    parameter_term = 2 * (n_char + 1)
    log_term = 2 * np.log(n_char)
    return parameter_term + log_term

In [16]:
# Synthetic input: 100 x "A", 100 x "B" and a single "C".
# The very next cell rebinds `values` to the Ticket column, so this only
# matters if that cell is skipped.
values = np.append(np.append(["A"]*100, ["B"]*100), ["C"])

In [17]:
# Switch the working data to the Ticket column (missing rows dropped).
values = df["Ticket"].dropna().values

In [18]:
# Fit a RegexDistribution on the full Ticket column (not the dropna'd `values`)
# and print its dictionary form.
print(RegexDistribution.fit(df["Ticket"]).to_dict())

{'name': 'RegexDistribution', 'parameters': {'re_list': [('[S]', 0.06509539842873176), ('.[]{0,4}', 0.25813692480359146), ('[/]', 0.10998877665544332), ('.[]{0,6}', 0.6868686868686869), ('[ ]', 0.2536475869809203), ('[2]', 0.013468013468013467), ('[.]', 0.013468013468013467), ('[B]', 0.001122334455667789), ('[a]', 0.001122334455667789), ('[s]', 0.001122334455667789), ('[l]', 0.001122334455667789), ('[e]', 0.001122334455667789), ('[ ]', 0.014590347923681257), ('\\d{1,5}', 0.35802469135802467), ('[0]', 0.1537598204264871), ('\\d{1,4}', 0.18518518518518517)]}}


In [19]:
# DigitRegex.fit returns three items, unpacked here as (new_values, gradient, regex).
# `new_values` is kept because later cells index into it.
new_values, gradient, regex = DigitRegex.fit(values)
print(gradient, regex, regex.frac_used)

7.308196071082557 \d{1,7} 0.9955106621773289


In [20]:
# Same fit with UppercaseRegex; the first returned item is discarded, so
# `new_values` from the DigitRegex cell is left untouched.
_, gradient, regex = UppercaseRegex.fit(values)
print(gradient, regex, regex.frac_used)

4.887198678166282 [A-Z]{1,5} 0.25813692480359146


In [26]:
# Same fit with SingleRegex; this rebinds `new_values`, replacing the result
# of the DigitRegex cell.
new_values, gradient, regex = SingleRegex.fit(values)
print(gradient, regex, regex.frac_used)

41.80575189220164 [ ] 0.2536475869809203


In [33]:
# Scratch check of negative-step slicing: 3:2:-1 starts at index 3 and stops
# before index 2, so it selects only index 3 and exactly one element is incremented.
x = np.arange(10)
x[3:2:-1] +=  10
x

array([ 0,  1,  2, 13,  4,  5,  6,  7,  8,  9])

In [23]:
# Chain the fits: every regex class is fitted on element [1] of the first item
# returned by the previous fit.
# The loop depends on `new_values` left behind by an earlier cell and rebinds it
# on every iteration, so re-running this cell does not start from the same state.
# NOTE(review): the recorded run printed two classes and then raised
# IndexError (index 0 out of bounds, size 0) — presumably a fit on input that
# had become empty; confirm the cause, then guard the loop or drop the cell.
for regex_class in RegexDistribution.all_regex_classes():
    new_values, gradient, regex = regex_class.fit(new_values[1])
    print(regex_class)

<class 'metasynth.distribution.string.DigitRegex'>
<class 'metasynth.distribution.string.AlphaNumericRegex'>


IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
# Show each original value next to the entries at the same position in
# new_values[0] and new_values[1], separated by "x".
for position, original in enumerate(values):
    print(original, "x", new_values[0][position], "x", new_values[1][position])

In [None]:
# Hard-coded reproduction input: mostly empty strings plus a handful of
# cabin-like codes, together with (character, count) pairs in descending count order.
# This rebinds the notebook-level `values` and `sorted_counts`.
values = ['', '', '', '', '', '', '', 'C25', '', '', '', '', '', '', '', 'G73', 'C25', '', '', 'D12', '', '', 'B60', '', '', 'E69', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'C26', '', 'B60', '', 'C26', '', '', '', 'B59', '', '', '', '', '', '', '', '', '', '', '', 'C25', '', '', '', '', '', '', '', 'B98', '', '', '', '', '', '', 'B98', 'C25', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'C26', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'B53', '', '', '', '', 'G63', 'C64', '', '', '', '', '', 'G73', '', '', '', '', '', '', '', 'B59', '', '', '', '', 'B98', '', '', '', '', '', '', 'B84', '', 'B98', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'B53', '', '', '']
sorted_counts = [('B', 11), ('C', 8), ('2', 8), ('5', 8), ('6', 8), ('9', 7), ('3', 5), ('8', 5), ('G', 3), ('7', 2), ('0', 2), ('4', 2), ('D', 1), ('1', 1), ('E', 1)]

# Called with (values, sorted_counts): when the cells run top to bottom this is
# the find_best_gradient imported from metasynth.distribution.string, because the
# notebook's own definition (which takes only start_char) appears in a later cell.
print(find_best_gradient(values, sorted_counts))

In [None]:
# Character-level, case-sensitive vectorizer over the current values.
vec = CountVectorizer(analyzer="char", lowercase=False)
# getnnz along axis 0 counts the non-zero rows per column, i.e. in how many
# values each character appears at least once (not its total occurrences).
counts = vec.fit_transform(values).getnnz(axis=0)
# vocabulary_ maps each character to its column index in the matrix.
count_dict = {char: counts[column] for char, column in vec.vocabulary_.items()}
# (character, count) pairs, most widespread character first.
sorted_counts = sorted(count_dict.items(), key=lambda item: item[1], reverse=True)

In [None]:
def find_best_gradient(start_char=None):
    """Pick the character whose addition gives the largest normalized energy drop.

    Works on the notebook-level ``values`` and ``sorted_counts``. For every
    candidate character, spans are built for the candidate together with the
    characters in ``start_char``, a ``RegexOptimizer`` is optimized on them,
    and the drop relative to ``get_energy(values)`` is divided by
    ``information_budget(1 + len(start_char))``.

    Note: defining this function rebinds the name ``find_best_gradient`` that
    the import cell brought in from ``metasynth.distribution.string``.

    Parameters
    ----------
    start_char:
        Characters that are already selected (a string or a sequence of
        strings). ``None`` means no characters have been selected yet.

    Returns
    -------
    tuple
        ``(character, energy_grad)`` for the best candidate, or ``(None, 0)``
        when no candidate yields a positive normalized drop.
    """
    if start_char is None:
        start_char = []
    already_selected = set("".join(start_char))
    start_energy = get_energy(values)
    # The budget depends only on how many characters are in the class, so it
    # is the same for every candidate.
    budget = information_budget(1 + len(start_char))
    best_solution = (None, 0)
    for character, _char_count in sorted_counts:
        if character in already_selected:
            # Re-adding a selected character would leave the character class
            # unchanged and could return a duplicate as the "next" character.
            continue
        spans = create_spans_char(values, character, *start_char)
        optimizer = RegexOptimizer(values, spans)
        optimizer.optimize()
        energy_grad = (start_energy - optimizer.energy) / budget
        if energy_grad > best_solution[1]:
            best_solution = (character, energy_grad)
    return best_solution

In [None]:
# Two greedy steps: find the best single character, then the best character to
# add to it. If the first step returns (None, 0), the second call receives None
# and therefore repeats the first search.
first_solution = find_best_gradient()
second_solution = find_best_gradient(first_solution[0])
print(first_solution)
print(second_solution)

In [None]:
# Second greedy step written out by hand, printing every candidate's result.
# Start from the first-round winner held in `first_solution`: `best_solution`
# is only bound further down in this cell, so reading it here fails on a
# fresh kernel.
last_char = first_solution[0]
start_energy = get_energy(values)
best_solution = (None, 0)
for character, char_count in sorted_counts:
    if character == last_char:
        # The first-round character is already part of the class.
        continue
    spans = create_spans_char(values, last_char, character)
    optimizer = RegexOptimizer(values, spans)
    optimizer.optimize()
    print(optimizer.energy)
    # Normalize the energy drop by the budget for a two-character class.
    energy_grad = (start_energy - optimizer.energy)/information_budget(2)
    if energy_grad > best_solution[1]:
        best_solution = (character, energy_grad)
    print(character, char_count, energy_grad)
print(best_solution)

In [None]:
# Baseline energy assigned from get_energy(values) in the manual second-step cell.
start_energy

In [None]:
# Divisor used when a second character is added (n_char = 2).
information_budget(2)

In [None]:
# Divisor used for a single character (n_char = 1).
information_budget(1)

In [None]:
# Energy of whichever optimizer was bound last; when the cells run in order,
# that is the one from the final iteration of the manual second-step loop.
optimizer.energy