# Tutorial on regexmodel

## Setup and installation

If you haven't installed `regexmodel` yet, including the optional dependencies, do so now:

In [1]:
# %pip install "regexmodel[tutorial] @ git+https://github.com/sodascience/regexmodel.git"

Normally we would already have data that we want to model and synthesize, but for this tutorial we will use the faker package to generate that data for us. We will use fake email addresses.

In [2]:
from faker import Faker
from regexmodel.util import Dir
from regexmodel.datastructure import BaseRegex
from regexmodel.model import fit_best_regex_class
from regexmodel.data2 import RegexNode, OrNode, Edge
import polars as pl

# Create a Faker instance for the "en" locale.
fake = Faker("en")
# Fixed seed so that the generated addresses are reproducible between runs.
Faker.seed(12345)
# 1000 fake email addresses, stored as a polars Series.
email_addresses = pl.Series([fake.email() for _ in range(1000)])

In [3]:
def _preview(series, size=3):
    """Return the first `size` non-null values of `series` via `to_numpy()`."""
    non_null = series.drop_nulls()
    head = non_null[:size]
    return head.to_numpy()

def _simplify_edge(edge):
    node = edge.destination
    if node is None:
        return edge
    if len(node.edges) == 1:
        return node.edges[0]
    return edge

def fit_main_branch(series: pl.Series,
                    count_thres: float,
                    direction=Dir.RIGHT,
                    optionals=True) -> Edge:
    """Recursively fit a chain of regex elements on `series`.

    One regex element is fitted with `fit_best_regex_class`, after which the
    function calls itself on the remainder (`result["new_series"]`) to fit
    the rest of the chain. Values that are not covered by the fitted element
    (`result["alt_series"]`) can be fitted as an alternative branch that
    joins the main chain.

    Parameters
    ----------
    series:
        Strings that still have to be modeled. Empty strings are counted as
        values that end here.
    count_thres:
        Threshold on the number of values; an END edge or an alternative
        branch is only added when more than `count_thres` values support it.
        It is also converted to a fraction of `len(series)` and used as the
        minimum score for the fitted regex element.
    direction:
        Passed on to `fit_best_regex_class` and to the recursive calls.
    optionals:
        If False, no alternative branches are searched for.

    Returns
    -------
    Edge:
        Edge that leads into the fitted structure, simplified with
        `_simplify_edge`. When nothing could be fitted, the edge leads to an
        OrNode that contains at most the END edge.
    """
    # Use the return node/edge for returning
    return_node = OrNode([], Edge(None, 0))
    return_edge = Edge(return_node)

    # An empty series cannot be fitted and would make the threshold fraction
    # below divide by zero.
    if len(series) == 0:
        return _simplify_edge(return_edge)

    # Add an END edge
    n_end_links = (series == "").sum()
    if n_end_links > count_thres:
        return_node.add_edge(Edge(None, n_end_links))

    # The empty strings are accounted for by the END edge above, so they are
    # nulled out and must not take part in fitting the next regex element.
    cur_series = series.set(series == "", None)  # type: ignore
    if cur_series.drop_nulls().len() == 0:
        return _simplify_edge(return_edge)

    min_score = count_thres/len(series)
    result = fit_best_regex_class(cur_series, min_score, direction=direction)

    # If it fails the threshold, stop the search.
    if result["score"] < min_score:
        return _simplify_edge(return_edge)

    # Fit the remainder of the chain on what is left after this element.
    new_edge = fit_main_branch(
        result["new_series"], count_thres=count_thres,
        direction=direction, optionals=optionals)

    if new_edge.count == 0:
        return _simplify_edge(return_edge)

    new_node = RegexNode(result["regex"], new_edge)
    main_edge = Edge(new_node, new_edge.count)
    cur_or_node = OrNode([Edge(None, new_edge.count)], main_edge)

    alt_series = result["alt_series"]
    if optionals and alt_series.drop_nulls().len() > count_thres:
        # Keep only the part of each alternative value that precedes a match
        # of the main chain, and fit that part as a branch of its own.
        opt_series = alt_series.str.extract(r"(^[\S\s]*?)" + main_edge.regex + r"$")
        alt_edge = fit_main_branch(opt_series, count_thres, direction, optionals)
        if alt_edge.count > 0:
            cur_or_node.add_edge(alt_edge)

    # Without an alternative branch the OrNode is superfluous.
    if len(cur_or_node.edges) == 1:
        return_node.add_edge(main_edge)
    else:
        return_node.add_edge(Edge(cur_or_node))

    return _simplify_edge(return_edge)



# Fit the main branch on the fake addresses with a count threshold of 10.
main_edge = fit_main_branch(email_addresses, 10)
fitted_regex = main_edge.regex
print("XXx", fitted_regex)

# Number of addresses for which extracting the fitted regex yields a match.
n_matched = email_addresses.str.extract(fitted_regex, group_index=0).drop_nulls().len()
print(n_matched)

# Deliberate stop: this always raises, so nothing after it in this cell runs.
assert False
# full_regex = create_regex_from_list(regex_list)


XXx [a-z]{4,17}(|[0-9]{2,2})[@][e][x][a][m][p][l][e][\.][c][o][m]
335


AssertionError: 

In [None]:

def create_regex_from_list(regex_data):
    """Build a regex string from a nested structure of regex elements.

    A `BaseRegex` contributes its own `regex` string. A tuple is treated as a
    set of alternatives and becomes a group `(a|b|...)`. Any other iterable
    is treated as a sequence whose parts are concatenated.
    """
    if isinstance(regex_data, BaseRegex):
        return regex_data.regex

    parts = [create_regex_from_list(item) for item in regex_data]
    if isinstance(regex_data, tuple):
        return r"(" + r"|".join(parts) + ")"
    return "".join(parts)



## Modeling the structured strings

Now we will use the regexmodel package to model the data:

In [None]:
    # while cur_series.drop_nulls().len() > count_thres:
        
    # if result["score"] < count_thres/len(series):
    #     return [], []
    # regex_list, count_list = fit_main_branch(result["new_series"], count_thres=count_thres,
    #                                          direction=direction)
    # n_main_line = result["new_series"].drop_nulls().len()
    # n_alt_line = result["alt_series"].drop_nulls().len()
    # regex_list = [result["regex"]] + regex_list
    # count_list = [n_main_line] + count_list

    # if len(result["alt_series"].drop_nulls()) > count_thres and optionals:

    #     regex_main = "".join(rx.regex for rx in regex_list)
    #     res = result["alt_series"].str.extract(r"(^[\S\s]*?)" + regex_main + r"$")
    #     alt_regex_list, alt_count_list = fit_main_branch(res, count_thres, direction)
    #     if len(alt_regex_list) > 0:
    #         regex_list = [(alt_regex_list, [])] + regex_list
    #         count_list = [(alt_count_list[-1], n_main_line)] + count_list

    # return return_links
    # # return regex_list, count_list


In [None]:
from regexmodel import RegexModel

# Fit a regex model on the generated email addresses.
model = RegexModel.fit(email_addresses)

Let's first see how good the model is by synthesizing new email addresses:

In [None]:
# Draw ten values from the fitted model.
[model.draw() for _ in range(10)]

['dgud@example.org',
 'natjwmurwitgbkjq@example.org',
 'sraysnxjcahfcbcfb@example.net',
 'vyziuulecdno@example.com',
 'ecmdnsbqgftfkiiuzk33@example.net',
 'epii@example.org',
 'xsmgl@example.com',
 'femhyfnxeisv@example.com',
 'vtimvqownq@example.org',
 'twvfbssgqun@example.org']

While not perfect, the result isn't bad either, considering that the model has only been given positive examples!

Now let's look at the serialization of the model:

In [None]:
# Display the serialized form of the fitted model.
model.serialize()

[{'regex': '[a-z]{3,18}[@][e][x][a][m][p][l][e][\\\\.][o][r][g]',
  'weights': [1000,
   728,
   819,
   819,
   819,
   819,
   819,
   819,
   819,
   819,
   341,
   341,
   341],
  'side_branches_before': [{'i_branch': 0, 'data': {'weight': 909}},
   {'i_branch': 1,
    'data': {'regex': '[a-z]{3,9}[0-9]{2,2}',
     'weights': [91, 0],
     'side_branches_before': [{'i_branch': 0, 'data': {'weight': 91}}],
     'side_branches_after': []}}],
  'side_branches_after': [{'i_branch': 0,
    'data': {'regex': '[0-9]{2,2}[@][e][x][a][m][p][l][e][\\\\.][n][e][t]',
     'weights': [180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 93, 93, 93],
     'side_branches_before': [],
     'side_branches_after': [{'i_branch': 9,
       'data': {'regex': '[c][o][m]',
        'weights': [87, 87, 87],
        'side_branches_before': [],
        'side_branches_after': [{'i_branch': 2, 'data': {'weight': 87}}]}},
      {'i_branch': 12, 'data': {'weight': 93}}]}},
   {'i_branch': 0, 'data': {'weight': 1}}

The serialization might seem overwhelming at first, but the first regex (`[a-z]{3,18}[@][e][x][a][m][p][l][e][\\\\.][o][r][g]`) is usually the most important one. We call this the main branch. On this main branch, there will be side branches, for example for ".net" and ".com" email addresses.

## Modeling performance

There are also some modeling statistics that can be computed. Note that computing these can take a while depending on your computer.

In [None]:
# Compute the fit statistics of the model on the original email addresses.
model.fit_statistics(email_addresses)

{'failed': 2,
 'success': 998,
 'n_tot_char': 21621,
 'n_char_success': 21574,
 'n_parameters': 51,
 'avg_log_like_per_char': -1.6394319514351288,
 'avg_log_like_pc_success': -1.627954654856158}

What the `fit_statistics` method does is check whether each email address that is given to it (e.g. johndoe@example.com) has a non-zero probability of being generated by the regex model. As we can see above, there were 2 email addresses in the list that have a probability of 0 of being generated by the model, while the overwhelming majority (998) can be generated with the fitted model.

The value `n_parameters` gives the number of nodes in the model, and is thus an indicator of the complexity of the model. This is also correlated with the fit taking longer. We can influence this parameter during fitting by setting the `count_thres` parameter. If we set that threshold higher, we generally have a lower number of parameters and better performance.

The statistic `avg_log_like_per_char` (average log-likelihood per character) shows how probable a value is on average per character. To understand this better, let's take a simpler example, where the regex is simply `\d{2,2}`. For this regex, the log-likelihood is simply log(1/10\*1/10) = -2\*log(10). Since all values have 2 characters, the average log-likelihood per character is -log(10) ~= -2.30. For failed values (values that cannot be generated by the model), we use a penalty score of -log(1000) per character.

Ideally we want to have the lowest `n_parameters` (simplest model) with the highest `success` and the highest log-likelihood.

## Visualization

To understand more clearly what the graph looks like, we can plot the regex model using the `regex_model_to_pyvis` function. To retrace the paths that can be taken, first find the start node and then follow the main branch.

Note: PyVis doesn't work interactively in VSCode/Code OSS.

In [None]:
from regexmodel.visualization import regex_model_to_pyvis

# Convert the fitted model to a PyVis network and render it to regex.html.
net = regex_model_to_pyvis(model)
net.show("regex.html", notebook=True)

regex.html
