# Implementation of edit-distance (Levenshtein)
- May be useful for inexactly comparing names

In [65]:
from os import environ
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

## Edit Distance Algorithm
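- Function is wrapped with `np.vectorize` so it can be applied element-wise to whole columns (a convenience wrapper; it still calls the Python function once per pair)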

In [66]:
def edit_distance(s1: str, s2: str) -> int:
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    The comparison is case-sensitive and the result is symmetric in its
    arguments.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The edit distance; ``0`` when the strings are equal and the length
        of the other string when one of them is empty.
    """
    # The distance is symmetric, so iterate the longer string on the outer
    # loop and keep the rolling rows as short as the shorter string.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    # previous[j] holds the distance between the prefix of `longer` handled
    # so far and shorter[:j]. Only the previous row is ever read, so the
    # full (len(s1) + 1) x (len(s2) + 1) table is not needed.
    previous = list(range(len(shorter) + 1))

    for row, long_char in enumerate(longer, start=1):
        # Column 0: turning a non-empty prefix into '' costs `row` deletions.
        current = [row]
        for col, short_char in enumerate(shorter, start=1):
            if long_char == short_char:
                # Characters match: carry the diagonal cost over unchanged.
                current.append(previous[col - 1])
            else:
                current.append(1 + min(
                    previous[col - 1],  # substitute one character
                    previous[col],      # drop a character from `longer`
                    current[col - 1],   # drop a character from `shorter`
                ))
        previous = current

    return previous[-1]

# np.vectorize is a convenience wrapper: it broadcasts its arguments like a
# ufunc but still calls edit_distance once per element pair.
# The output type is declared up front because, without otypes, np.vectorize
# infers it from the first call and therefore raises on empty input.
edit_distance_vec = np.vectorize(edit_distance, otypes=[int])

# print(edit_distance('calcasieu', 'patel'))
# print(edit_distance('patel', 'calcasieu'))

In [67]:
# Connection settings are read from the environment; the second argument of
# each lookup is the fallback used when the variable is unset.
# NOTE(review): the fallback credentials look like local-development
# placeholders - confirm they are never relied on outside a dev setup.
host = environ.get('PG_SERVER', 'sql-fabulous')
db = environ.get('PG_DATABASE', 'psycodb')
user = environ.get('PG_UID', 'sa')
pw = environ.get('POSTGRES_PASSWORD', 'pwd')

# Percent-encode the credentials so reserved URL characters in them (such as
# '@', ':' or '/') cannot be mistaken for URL delimiters when the connection
# string is parsed.
engine = create_engine(
    "postgresql+psycopg2://"
    f"{quote(user, safe='')}:{quote(pw, safe='')}@{host}/{db}",
    echo=False,
)

## Read Source Data
- Additional transformations could improve match results (remove '-', 'inc.', 'llc', etc.)

In [68]:
mpmi_sql = """
    select  *
    from    public.mc_practice_master_info
"""

feed_sql = """
    select  *
    from    public.practice_feed
"""

# Practice master list: keep the key and the name, lower-cased so the later
# comparison ignores case. The explicit copy makes the column assignment act
# on an independent frame rather than on a selection of mpmi_df.
mpmi_df = pd.read_sql(mpmi_sql, engine)
mpmi = mpmi_df[['practice_id', 'practice_name']].copy()
mpmi['practice_name'] = mpmi['practice_name'].str.lower()

# Feed rows: derive the practice label from the S3 path. The leading '.+' is
# greedy, so for paths without empty segments the named group normally ends
# up being the third segment counting back from the final '/'. Paths that do
# not match the pattern yield a missing value.
feed_df = pd.read_sql(feed_sql, engine)
feed = feed_df[['id', 's3_path']].copy()
feed['practice'] = (
    feed['s3_path']
    .str.extract(pat=r'.+/.+/(?P<prac>.+)/.+/.+/', expand=False)
    .str.lower()
)

# Pair every feed row with every practice (Cartesian product) so that each
# combination can be scored.
feed_x = feed.merge(mpmi, how='cross')

## Run Algorithm
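- Compute the edit distance for every feed/practice pair and write the result back to SQL (`public.feed_x`)
- The following query returns the 5 closest practice-name matches for each feed practice: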
```
with fuzzy
as
(
    select  id,
            edit_distance,
            row_number() over (partition by practice
                                order by edit_distance ASC) as fuzzy_rank,
            practice,
            practice_id,
            practice_name,
            s3_path
    from    public.feed_x
)
select  *
from    fuzzy
where   fuzzy_rank <= 5
order by practice, fuzzy_rank ASC
```

In [69]:
# Score each feed/practice pair: compare the practice label taken from the
# S3 path with the lower-cased master practice name.
# NOTE(review): edit_distance calls len() on both values, so a null in either
# column would presumably raise here - confirm the inputs contain no nulls.
feed_x['edit_distance'] = edit_distance_vec(
    feed_x['practice'],
    feed_x['practice_name'],
)

In [70]:
# Persist the scored pairs as public.feed_x. An existing table of that name
# is replaced, and the DataFrame index is not written as a column.
feed_x.to_sql(
    name='feed_x',
    con=engine,
    schema='public',
    if_exists='replace',
    index=False,
)