In [1]:
import pandas as pd
from metasynth import MetaDataset, MetaVar
from metasynth.distribution.util import _get_all_distributions
import wget
from pathlib import Path
import json

In [2]:
# Explicit per-column dtypes used when the CSV is loaded: every column named
# here is read either as a pandas "category" or as a pandas "string".
dtypes = dict(
    Survived="category",
    Pclass="category",
    Name="string",
    Sex="category",
    SibSp="category",
    Parch="category",
    Ticket="string",
    Cabin="string",
    Embarked="category",
)

In [3]:
# Local copy of the Titanic CSV; it is downloaded only when the file is not
# already present, so later runs reuse the existing copy.
titanic_fp = Path("titanic.csv")
if not titanic_fp.is_file():
    # Name the output file explicitly so the file that gets written is the
    # same one the existence check above looks for, instead of relying on
    # wget deriving "titanic.csv" from the last component of the URL.
    wget.download(
        "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv",
        out=str(titanic_fp),
    )

In [4]:
# Read the CSV at titanic_fp, applying the explicit column dtypes from `dtypes`.
df = pd.read_csv(filepath_or_buffer=titanic_fp, dtype=dtypes)

In [5]:
from metasynth.distribution.regex.optimizer import RegexOptimizer
from metasynth.distribution.regex.element import AlphaNumericRegex, AnyRegex, DigitRegex
import numpy as np

In [6]:
# Fit the alphanumeric element and then the any-character element on the
# non-missing Cabin values. Each fit returns a three-item result, which is
# unpacked into its own set of names.
(new_values, gradient, regex), (any_values, any_gradient, any_regex) = (
    element_cls.fit(df["Cabin"].dropna().values)
    for element_cls in (AlphaNumericRegex, AnyRegex)
)

16.126702318724174 5489.152286310665
42.1855530822143 7850.8536343362675


In [7]:
# Hand-written statistics dictionary passed to information_budget() below.
stats = dict(
    n_values=5,
    frac_used=0.02,
    digits_used=[0, 1, 1, 1, 1, 1],
)

# Print the budget reported by a digit element next to that of an
# any-character element, both constructed with the same (1, 5) arguments.
print("digit", DigitRegex(1, 5).information_budget(stats))
print("any", AnyRegex(1, 5).information_budget(stats))

digit 128.29216196844385
any 197.36971475826522


In [8]:
# Energy of each of the two parts of new_values, displayed together as a tuple.
tuple(RegexOptimizer.energy_from_values(new_values[part]) for part in (0, 1))

(0.0, 26.058850763490124)

In [9]:
# Energy of the complete set of non-missing Cabin values, i.e. before any
# element has been fitted to them.
RegexOptimizer.energy_from_values(df["Cabin"].dropna())

42.1855530822143

In [10]:
# Number of non-empty strings in new_values[1]: the array is built once and
# the boolean mask is counted directly rather than used to index a second copy.
int(np.count_nonzero(np.array(new_values[1]) != ""))

24

In [11]:
# Total number of entries in new_values[1], empty strings included.
len(new_values[1])

204

In [12]:
# Detect a meta-variable for the Cabin column and fit it.
# NOTE(review): in the saved run this cell is followed by a long trace of
# candidate patterns and scores — presumably debug prints from the library;
# consider silencing it (e.g. %%capture) before sharing the notebook.
var = MetaVar.detect(df["Cabin"])
var.fit()

8.282230063296673 2324.888645842747
\d{1,3} 0.003562420109069118
----------------
16.126702318724174 5489.152286310665
\w{1,4} 0.0029379221922740924
----------------
-13.730843181406101 4
[a-z]{1,1} -3.4327107953515252
----------------
5.323009979138412 1333.3033875127646
[A-Z]{1,1} 0.003992347149937359
----------------
5.323009979138412 1616.1074371812224
[a-zA-Z]{1,1} 0.003293722840866746
----------------
2.5925373139803582 151.78190731126318
2.700528736852 232.3222360246993
[ ] 0.017080674237831082
----------------
42.1855530822143 7850.8536343362675
.[]{1,15} 0.005373371488892976
----------------
best [ ]
values_left [24] ['C23', 'F', 'C23', 'D10', 'B58', 'F', 'C22', 'B58', 'C22', 'B57']
values_right[204] ['C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'C25 C27', 'B78', 'D33']
+++++++++++++++++++++++++


6.089044875446845 319.07411518569756
\d{2,2} 0.01908348119026543
----------------
9.307920700315046 732.7884984531042
\w{1,3} 0.012702056213987806
----------------
0.0 4
[a-z]{1,

1.1755733298042381 71.50094640320015
2.197224577336219 84.59130129215927
2.197224577336219 93.07874302188989
[52] 0.025974592467226637
----------------
6.591673732008658 292.5491153306286
.[]{3,3} 0.022531853239614087
----------------
best \d{2,2}
values_left [8] ['C', 'C', 'B', 'C', 'C', 'B', 'B', 'B']
values_right[0] []
+++++++++++++++++++++++++


0.0 4
\d{1,1} 0.0
----------------
2.1972245773362196 137.5350965639216
\w{1,1} 0.01597573733708781
----------------
0.0 4
[a-z]{1,1} 0.0
----------------
2.1972245773362196 123.63049101154385
[A-Z]{1,1} 0.01777251355517998
----------------
2.1972245773362196 134.72084590050298
[a-zA-Z]{1,1} 0.01630946244918149
----------------
0.5877866649021193 43.37565598026651
2.1972245773362196 84.59130129215927
0 0
[CB] 0.02597459246722664
----------------
2.1972245773362196 145.18366937900961
.[]{1,1} 0.015134102800503334
----------------
best [CB]
values_left [0] []
values_right[0] []
+++++++++++++++++++++++++


12.676679217969127 2324.888645842747


In [13]:
# Show the fitted variable as a plain dictionary; the saved output contains
# the keys name, type, dtype, prop_missing and distribution.
print(var.to_dict())

{'name': 'Cabin', 'type': 'string', 'dtype': 'string', 'prop_missing': 0.7710437710437711, 'distribution': {'name': 'RegexDistribution', 'parameters': {'re_list': [('[BCFD]', 0.11764705882352941), ('\\d{2,2}', 0.09803921568627451), ('[ ]', 0.11764705882352941), ('[B]', 0.00980392156862745), ('\\d{2,2}', 0.00980392156862745), ('[ ]', 0.00980392156862745), ('[CB]', 0.5196078431372549), ('\\d{2,2}', 0.0392156862745098), ('[D E]', 0.3627450980392157), ('[AFGCBT]', 0.19607843137254902), ('\\d{1,3}', 0.9803921568627451)]}}}


In [14]:
# List every non-missing Cabin value for visual comparison with the fitted
# pattern.
# NOTE(review): this prints the whole column; a short slice or a sample would
# keep the notebook output much smaller.
df["Cabin"].dropna().values.tolist()

['C85',
 'C123',
 'E46',
 'G6',
 'C103',
 'D56',
 'A6',
 'C23 C25 C27',
 'B78',
 'D33',
 'B30',
 'C52',
 'B28',
 'C83',
 'F33',
 'F G73',
 'C23 C25 C27',
 'E31',
 'A5',
 'D10 D12',
 'D26',
 'C110',
 'B58 B60',
 'E101',
 'D26',
 'F E69',
 'D47',
 'C123',
 'B86',
 'F2',
 'C2',
 'E33',
 'B19',
 'A7',
 'C49',
 'F4',
 'A32',
 'F2',
 'B4',
 'B80',
 'G6',
 'A31',
 'D36',
 'D15',
 'C93',
 'C83',
 'C78',
 'D35',
 'G6',
 'C87',
 'B77',
 'E67',
 'B94',
 'C125',
 'C99',
 'C118',
 'D7',
 'A19',
 'B49',
 'D',
 'C22 C26',
 'C106',
 'B58 B60',
 'E101',
 'C22 C26',
 'C65',
 'E36',
 'C54',
 'B57 B59 B63 B66',
 'C7',
 'E34',
 'C32',
 'D',
 'B18',
 'C124',
 'C91',
 'C2',
 'E40',
 'T',
 'F2',
 'C23 C25 C27',
 'F33',
 'C128',
 'E33',
 'D37',
 'B35',
 'E50',
 'C82',
 'B96 B98',
 'D36',
 'G6',
 'C78',
 'E10',
 'C52',
 'E44',
 'B96 B98',
 'C23 C25 C27',
 'A34',
 'C104',
 'C111',
 'C92',
 'E38',
 'D21',
 'E12',
 'E63',
 'D',
 'A14',
 'B49',
 'C93',
 'B37',
 'C30',
 'D20',
 'C22 C26',
 'B79',
 'C65',
 'E25',
 'D

In [15]:
# Print the fitted distribution; in the saved run it renders as a single
# concatenated regex-like pattern.
print(var.distribution)

[BCFD]\d{2,2}[ ][B]\d{2,2}[ ][CB]\d{2,2}[D E][AFGCBT]\d{1,3}


In [19]:
# Draw 1000 values from the fitted variable and keep only the non-missing ones.
# NOTE(review): no random seed is set in any visible cell, so the drawn values
# presumably change between runs — confirm and seed if reproducibility matters.
# NOTE(review): this cell's saved execution count (19) is out of sequence with
# the cells around it; re-check with Restart Kernel -> Run All.
var.draw_series(1000).dropna()

1           D40
4          C G4
5           BC2
6             6
8           C42
         ...   
979          C6
980        B859
981    52 CE436
988          G9
993       C 985
Length: 255, dtype: string

In [16]:
# Build the dataset-level metadata object from the whole DataFrame.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so in
# that run `dataset` was never assigned; every later cell that uses `dataset`
# depends on this call completing.
dataset = MetaDataset.from_dataframe(df)

{} None
{} None
{} None
{} None
-152.13905733679155 4
\d{1,1} -38.03476433419789
----------------
35.581439770811926 51095.79854322473
\w{1,13} 0.0006963672314605602
----------------
26.6543812008872 33833.16680902808
[a-z]{1,12} 0.0007878181002487392
----------------
3.2331581106716385 5809.928030754281
[A-Z]{1,1} 0.000556488495822536
----------------
35.581439770811926 49113.15895084631
[a-zA-Z]{1,13} 0.0007244787452263604
----------------
4.081851311012997 788.949350448831
4.271694582714588 1135.5711750042226
[)] 0.005173781192279129
----------------
328.57258726656175 228887.7333179874
.[]{12,82} 0.0014355185509661408
----------------
best [)]
values_left [891] ['Braund, Mr. Owen Harris', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer', 'Heikkinen, Miss. Laina', 'Futrelle, Mrs. Jacques Heath (Lily May Peel', 'Allen, Mr. William Henry', 'Moran, Mr. James', 'McCarthy, Mr. Timothy J', 'Palsson, Master. Gosta Leonard', 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg', 'Nasser, M

KeyboardInterrupt: 

In [None]:
# Print the meta-variable stored at position 10 of the dataset.
# NOTE(review): 10 is a magic index — presumably the Cabin column, but the
# column order is not visible here; verify, or look the variable up by name.
print(dataset.meta_vars[10])

In [None]:
# Preview only the first rows of the source data; displaying the bare frame
# renders the whole table into the notebook output.
df.head()

In [None]:
# Ask the dataset metadata to synthesize 1000 rows and display the result.
dataset.synthesize(1000)

In [None]:
# Write the dataset metadata to test.json in the working directory, with the
# validate flag switched on.
dataset.to_json("test.json", validate=True)

In [None]:
# Read the metadata back from test.json and print it (round-trip check of the
# file written by the to_json cell).
print(MetaDataset.from_json("test.json"))

In [None]:
# Display the dataset metadata as a plain dictionary.
dataset.to_dict()

In [None]:
# Load metadata from test.xml and print it.
# NOTE(review): no visible cell writes test.xml (only test.json is written),
# so on a fresh checkout this presumably fails with a missing file — confirm
# where the XML file is supposed to come from.
print(MetaDataset.from_xml("test.xml"))

In [None]:
# Display the dataset metadata as a plain dictionary.
# NOTE(review): the same expression already appears in an earlier cell and
# `dataset` is not reassigned in between — this cell looks redundant.
dataset.to_dict()