## Imports

In [1]:
import pandas as pd
import numpy as np

import re
from tqdm import tqdm
from pprint import pprint

import mlflow
from mlflow import MlflowClient

## Custom Notebook Settings

In [2]:
pd.set_option("display.max_columns", 30)
pd.set_option("display.max_rows", 100)


## MLFlow Configuration (run only once)

In [None]:
client = MlflowClient(tracking_uri="http://127.0.0.1:8080")

In [None]:
pprint(client.search_experiments())

In [None]:
all_experiments = client.search_experiments()

In [None]:
default_experiment = [{"name": experiments.name, "lifecycle_stage": experiments.lifecycle_stage} for experiments in all_experiments if experiments.name == "Default"]

In [None]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Invoice Processing experiments - biller ref and payer ref"
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "invoice-processing",
    "store_dept": "",
    "team": "",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}


In [None]:
invoice_experiments = client.create_experiment(
    name="invoice-processing",
    tags=experiment_tags
)

## Global Variables

In [3]:
DATA_FOLDER_PATH = "data"
DATA_FILE_PATH = "data/invoice.csv"
COLUMNS_TO_FILTER = ["billerDomain", "billerName", "billerReference", "billerCurrency", "billerAmount", 
                    "payerDomain", "payerName", "payerReference", "payerCurrency", "invoiceNumber", "invoiceAmount",
                    "invoiceCurrency", "invoiceOwner"]
BILLER_REF_NAME_COLS = ["billerReference", "billerName"]
PAYER_REF_NAME_COLS = ["payerReference", "payerName"]
INVOICE_NUM_COLS = ["billerName", "payerName", "invoiceNumber"]


## Load data

In [4]:
df = pd.read_csv(DATA_FILE_PATH)

  df = pd.read_csv(DATA_FILE_PATH)


In [5]:
df.shape

(177550, 93)

In [6]:
df.head()

Unnamed: 0,id,billerDomain,billerName,billerReference,billerCurrency,billerAmount,billerAmountNow,billerAmountLater,forwardExchangeAdjustment,payerDomain,payerName,payerReference,payerCurrency,payerAmount,payerAmountNow,...,awxTransferFeeCurrency,awxSourceCurrency,awxTransferFees,awxSourceAmount,discountedInvoiceAmount,applicant,employeeCode,clientCode,timeKeeper,matterCode,costCode,narrative,barCode,adminReference,lockedAmount
0,1,accuprotm.com,Accupro Trademark Services Ltd,REF,CAD,371.4,371.4,371.4,0.005,griffithhack.com,Griffith Hack,REF,EUR,265.8036,265.8036,...,,,0.0,0.0,0.0,,,,,,,,,,0.0
1,2,accuprotm.com,Accupro Trademark Services Ltd,REF,CAD,456.5,456.5,456.5,0.005,griffithhack.com,Griffith Hack,REF,EUR,326.7079,326.7079,...,,,,,,,,,,,,,,,0.0
2,3,airdmcburney.com,Aird & McBurney LP,REF,CAD,475.0,475.0,475.0,0.005,griffithhack.com,Griffith Hack,REF,EUR,339.948,339.948,...,,,,,,,,,,,,,,,0.0
3,4,airdmcburney.com,Aird & McBurney LP,REF,CAD,425.0,425.0,425.0,0.005,griffithhack.com,Griffith Hack,REF,EUR,304.164,304.164,...,,,,,,,,,,,,,,,0.0
4,5,airdmcburney.com,Aird & McBurney LP,REF,CAD,425.0,425.0,425.0,0.005,griffithhack.com,Griffith Hack,REF,EUR,304.164,304.164,...,,,,,,,,,,,,,,,0.0


In [7]:
df.columns

Index(['id', 'billerDomain', 'billerName', 'billerReference', 'billerCurrency',
       'billerAmount', 'billerAmountNow', 'billerAmountLater',
       'forwardExchangeAdjustment', 'payerDomain', 'payerName',
       'payerReference', 'payerCurrency', 'payerAmount', 'payerAmountNow',
       'payerAmountLater', 'markup', 'invoiceNumber', 'invoiceAmount',
       'invoiceCurrency', 'invoiceDate', 'invoiceDueDate', 'tradeStatus',
       'tradeReference', 'tradeDate', 'tradedBy', 'tradeAuthorisedBy',
       'nowPaymentDueDate', 'laterPaymentDueDate', 'payNow', 'payIn',
       'paymentDueDate', 'paymentDate', 'settlementDate', 'settleBy',
       'paymentStatus', 'paymentReference', 'paidBy', 'paymentAuthorisedBy',
       'paymentFile', 'invoiceFile', 'receiptBankLoadStatus', 'receiptBankId',
       'receiptBankLoadDate', 'indicativePaymentAmount',
       'indicativeClientAmount', 'disbursements', 'integrationStatus',
       'integrationDate', 'integrationText', 'archived', 'billeramountcurr1',


In [8]:
df = df[COLUMNS_TO_FILTER]

In [9]:
df.head()

Unnamed: 0,billerDomain,billerName,billerReference,billerCurrency,billerAmount,payerDomain,payerName,payerReference,payerCurrency,invoiceNumber,invoiceAmount,invoiceCurrency,invoiceOwner
0,accuprotm.com,Accupro Trademark Services Ltd,REF,CAD,371.4,griffithhack.com,Griffith Hack,REF,EUR,407404,371.4,CAD,griffithhack.com
1,accuprotm.com,Accupro Trademark Services Ltd,REF,CAD,456.5,griffithhack.com,Griffith Hack,REF,EUR,408441,456.5,CAD,griffithhack.com
2,airdmcburney.com,Aird & McBurney LP,REF,CAD,475.0,griffithhack.com,Griffith Hack,REF,EUR,539580,475.0,CAD,griffithhack.com
3,airdmcburney.com,Aird & McBurney LP,REF,CAD,425.0,griffithhack.com,Griffith Hack,REF,EUR,540906,425.0,CAD,griffithhack.com
4,airdmcburney.com,Aird & McBurney LP,REF,CAD,425.0,griffithhack.com,Griffith Hack,REF,EUR,541037,425.0,CAD,griffithhack.com


In [10]:
df["billerName"].value_counts()

billerName
Abu Ghazaleh Intellectual Property                                           65248
Moeller IP Advisors - USD                                                     8729
Saba & Co- Head Office                                                        7997
Gorodissky & Partners, Ltd - Russia                                           6167
Shinjyu Global IP Group                                                       2073
                                                                             ...  
Ruttensperger Lachnit Trossin Gomoll  Patent- und Rechtsanwalte PartG mbB        1
FG Y ASOCIADOS S.A.                                                              1
Dennemeyer & Co., LLC                                                            1
NORENS PATENTBYRA AB - StockholmSE - EUR                                         1
Andrea Moretti                                                                   1
Name: count, Length: 2876, dtype: int64

In [11]:
df[df["payerName"].str.contains("Adams Pluck")]["billerName"].value_counts()

billerName
Sunip                                     313
MORRISS OBRYANT COM COMPAGNI P.C.         193
LIU SHEN & ASSOCIATES- Beijing Office     188
APPLEYARD LEES - Halifax                  148
Fitch Even Tobin & Flannery               114
                                         ... 
Edwin A Sisson Attorney At Law LLC          1
ZEUNER & SUMMERER EUR                       1
Verrill - PortlandME - USD                  1
SMART & BIGGAR IP - Ottawa - BoM - CAD      1
F.R Kelly & Co (Belfast)                    1
Name: count, Length: 342, dtype: int64

In [12]:
df[(df["billerName"].str.contains("LIU SHEN & ASSOCIATES")) & (df["payerName"].str.contains("Adams Pluck"))]


Unnamed: 0,billerDomain,billerName,billerReference,billerCurrency,billerAmount,payerDomain,payerName,payerReference,payerCurrency,invoiceNumber,invoiceAmount,invoiceCurrency,invoiceOwner
1715,liu-shen.com.usd,LIU SHEN & ASSOCIATES (USD),PZJ15400AJ,USD,,adamspluck.com.au,Adams Pluck,23051CNP00,AUD,P17AU23807,208.14,USD,adamspluck.com.au
1807,liu-shen.com,LIU SHEN & ASSOCIATES- Beijing Office,PXH04459AJ,CNY,,adamspluck.com.au,Adams Pluck,21261CNP00,AUD,P17AU25734,1350.00,CNY,adamspluck.com.au
1822,liu-shen.com,LIU SHEN & ASSOCIATES- Beijing Office,P17J28024AJ,CNY,,adamspluck.com.au,Adams Pluck,23111CNP00,AUD,P17AU26108,3850.00,CNY,adamspluck.com.au
1879,liu-shen.com,LIU SHEN & ASSOCIATES- Beijing Office,PXJ02206AJ,CNY,,adamspluck.com.au,Adams Pluck,21533CNP00,AUD,P17AU27243,2130.00,CNY,adamspluck.com.au
1899,liu-shen.com.usd,LIU SHEN & ASSOCIATES (USD),PYF01890,USD,,adamspluck.com.au,Adams Pluck,22761CNP00,AUD,A17AU09183,1697.46,USD,adamspluck.com.au
...,...,...,...,...,...,...,...,...,...,...,...,...,...
144222,liu-shen.com,LIU SHEN & ASSOCIATES- Beijing Office,P19J41764D1J,CNY,,adamspluck.com.au,Adams Pluck,23383CNP01,AUD,P24AU21053,11810.00,CNY,adamspluck.com.au
145995,liu-shen.com.usd,LIU SHEN & ASSOCIATES (USD),P23J80581A,USD,,adamspluck.com.au,Adams Pluck,24810CNP00,AUD,P24AU28803,1733.80,USD,adamspluck.com.au
150838,liu-shen.com.usd,LIU SHEN & ASSOCIATES (USD),P20J51885AJ,USD,,adamspluck.com.au,Adams Pluck,23921CNP00,AUD,A24AU22794,381.17,USD,adamspluck.com.au
154963,liu-shen.com.usd,LIU SHEN & ASSOCIATES (USD),P21E62707A,USD,,adamspluck.com.au,Adams Pluck,24476CNP00,AUD,A24AU24349,251.51,USD,adamspluck.com.au


In [13]:
df.isnull().sum()

billerDomain           1
billerName             0
billerReference       76
billerCurrency         6
billerAmount       58265
payerDomain            1
payerName              0
payerReference      2243
payerCurrency          1
invoiceNumber          0
invoiceAmount          0
invoiceCurrency        0
invoiceOwner           1
dtype: int64

## Biller Reference Patterns

In [14]:
biller_df = df[BILLER_REF_NAME_COLS]

In [15]:
biller_df.fillna("-1", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  biller_df.fillna("-1", inplace=True)


In [16]:
billname_df = biller_df["billerName"].value_counts().reset_index()

In [17]:
billname_df[:35]

Unnamed: 0,billerName,count
0,Abu Ghazaleh Intellectual Property,65248
1,Moeller IP Advisors - USD,8729
2,Saba & Co- Head Office,7997
3,"Gorodissky & Partners, Ltd - Russia",6167
4,Shinjyu Global IP Group,2073
5,AFD China Intellectual Property Law Office,1686
6,Cruz Marcelo & Tenefrancia - USD,1510
7,Sojuzpatent,1433
8,AVA Firm - USD,1174
9,AMICA LAW LLC,1172


In [None]:
biller_df = biller_df[biller_df["billerName"].isin(billname_df[:35]["billerName"])]

In [None]:
biller_df["billerReference"].value_counts()

In [None]:
df["billerReference"].fillna("-1", inplace=True)
df["payerReference"].fillna("-1", inplace=True)

In [18]:
def refTypeCompute(ref):
    if ref == "REF":
        return "REF"
    elif ref in [-1, "-1"]:
        return "-1"
    else:
        ref = re.sub("[A-Z]", "X", ref)
        ref = re.sub("\d", "N", ref)
        return ref

In [None]:
biller_df["billerType"] = biller_df["billerReference"].apply(lambda x: refTypeCompute(x))

In [None]:
biller_df["billerType"].value_counts()

In [None]:
biller_df = biller_df[~biller_df["billerType"].isin(["REF", "-1"])]

In [None]:
biller_ct = pd.crosstab(index=biller_df["billerType"], columns=biller_df["billerName"], margins=True)

In [None]:
biller_ct

In [None]:
biller_ct = biller_ct.sort_values(by='All', ascending=False)

In [None]:
# Sort columns by their totals (except the 'All' column)
cols = biller_ct.loc['All'].drop('All')  # exclude 'All'
sorted_cols = cols.sort_values(ascending=False).index.tolist()

# Reorder columns + add 'All' at the end
biller_ct = biller_ct[sorted_cols + ['All']]

In [None]:
biller_ct

In [None]:
biller_ct = biller_ct.reset_index()

In [None]:
biller_ct.to_csv(DATA_FOLDER_PATH + "/biller_ct.csv", index=False)

## Invoice Number Patterns for Billers and Payers

In [None]:
invoice_df = df[INVOICE_NUM_COLS]

In [None]:
invoice_df.isnull().sum()

In [None]:
invoice_df["invoiceNumberType"] = invoice_df["invoiceNumber"].apply(lambda x: refTypeCompute(x))

In [None]:
invoice_df["invoiceNumberType"].value_counts()

In [None]:
inv_payer_ct = pd.crosstab(index=invoice_df["invoiceNumberType"], columns=invoice_df["payerName"], margins=True)
inv_payer_ct = inv_payer_ct.sort_values(by='All', ascending=False)

# Sort columns by their totals (except the 'All' column)
cols = inv_payer_ct.loc['All'].drop('All')  # exclude 'All'
sorted_cols = cols.sort_values(ascending=False).index.tolist()

# Reorder columns + add 'All' at the end
inv_payer_ct = inv_payer_ct[sorted_cols + ['All']]

inv_payer_ct = inv_payer_ct.reset_index()


In [None]:
inv_payer_ct

In [None]:
inv_payer_ct.to_csv(DATA_FOLDER_PATH + "/inv_payer_ct_full.csv", index=False)


In [None]:
inv_biller_ct = pd.crosstab(index=invoice_df["invoiceNumberType"], columns=invoice_df["billerName"], margins=True)
inv_biller_ct = inv_biller_ct.sort_values(by='All', ascending=False)

# Sort columns by their totals (except the 'All' column)
cols = inv_biller_ct.loc['All'].drop('All')  # exclude 'All'
sorted_cols = cols.sort_values(ascending=False).index.tolist()

# Reorder columns + add 'All' at the end
inv_biller_ct = inv_biller_ct[sorted_cols + ['All']]

inv_biller_ct = inv_biller_ct.reset_index()


In [None]:
inv_biller_ct

In [None]:
inv_biller_ct.to_csv(DATA_FOLDER_PATH + "/inv_biller_ct_full.csv", index=False)


## Payer Reference Patterns

In [None]:
payer_df = df[PAYER_REF_NAME_COLS]

In [None]:
payer_df

In [None]:
payer_df.fillna("-1", inplace=True)

In [None]:
payername_df = payer_df["payerName"].value_counts().reset_index()

In [None]:
payer_df = payer_df[payer_df["payerName"].isin(payername_df[:54]["payerName"])]

In [None]:
payer_df.shape

In [None]:
payer_df["payerName"].value_counts()

In [None]:
payer_df["payerType"] = payer_df["payerReference"].apply(lambda x: refTypeCompute(x))

In [None]:
payer_df["payerType"].value_counts()

In [None]:
payer_df = payer_df[~payer_df["payerReference"].isin(["REF", "-1"])]

In [None]:
payer_df.shape

In [None]:
payer_ct = pd.crosstab(index=payer_df["payerType"], columns=payer_df["payerName"], margins=True)

In [None]:
payer_ct = payer_ct.sort_values(by='All', ascending=False)

In [None]:
payer_ct

In [None]:
# Sort columns by their totals (except the 'All' column)
cols = payer_ct.loc['All'].drop('All')  # exclude 'All'
sorted_cols = cols.sort_values(ascending=False).index.tolist()

# Reorder columns + add 'All' at the end
payer_ct = payer_ct[sorted_cols + ['All']]

In [None]:
payer_ct = payer_ct.reset_index()

In [None]:
payer_ct.to_csv(DATA_FOLDER_PATH + "/payer_ct_full.csv", index=False)

In [None]:
# biller_ct = pd.read_csv(DATA_FOLDER_PATH + "/biller_ct_full.csv")
# payer_ct = pd.read_csv(DATA_FOLDER_PATH + "/payer_ct_full.csv")

## Implementation

In [None]:
billers_name = biller_ct.columns
remove_cols = ["All", "billerName", "billerType"]
billers_name = list(set(billers_name) - set(remove_cols))
# billers_name

In [None]:
biller_ct

In [None]:
for idx in tqdm(range(1, len(biller_ct))):
    
    total_count_per_pattern = biller_ct["All"].iloc[idx]
    for each_col in billers_name:
        # total_count_of_each_biller = biller_ct[each_col].iloc[0]
        biller_ct[each_col].iloc[idx] = biller_ct[each_col].iloc[idx] / total_count_per_pattern

In [None]:
biller_ct

In [None]:
payer_name = payer_ct.columns
remove_cols = ["All", "payerName", "payerType"]
payer_name = list(set(payer_name) - set(remove_cols))

In [None]:
for idx in tqdm(range(len(payer_ct))):
    
    total_count_per_pattern = payer_ct["All"].iloc[idx]
    for each_col in payer_name:
        payer_ct[each_col].iloc[idx] = payer_ct[each_col].iloc[idx] / total_count_per_pattern

In [None]:
payer_ct

In [None]:
biller_ct.to_csv(DATA_FOLDER_PATH + "/biller_ct_prob.csv", index=False)
payer_ct.to_csv(DATA_FOLDER_PATH + "/payer_ct_prob.csv", index=False)

In [19]:
biller_ct = pd.read_csv(DATA_FOLDER_PATH + "/biller_ct_full_prob.csv")
payer_ct = pd.read_csv(DATA_FOLDER_PATH + "/payer_ct_full_prob.csv")

In [20]:
df["billerReference"].fillna("-1", inplace=True)
df["payerReference"].fillna("-1", inplace=True)

df["billerType"] = df["billerReference"].apply(lambda x: refTypeCompute(x))
df["payerType"] = df["payerReference"].apply(lambda x: refTypeCompute(x))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["billerReference"].fillna("-1", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["payerReference"].fillna("-1", inplace=True)


In [158]:
# best case example
sample_biller = "TM13017AR38-JD4RG (111493)"
sample_payer = "21CS-249442"

# # average case example
# sample_biller = "WT JAN ksh F.20221021 (6814 SG)"
# sample_payer = "200979SG"
# sample_biller = "4130973568"
# sample_payer = "231697"
# sample_biller = "P23J80581A"
# sample_payer = "24810CNP00"
# sample_biller = "6632CON"
# sample_payer = "23921USP01"
# sample_biller = "CPME2240463P"
# sample_payer = "ONPV01-00082"
sample_biller = "4050963213"
sample_payer = "62554W JO"
     

# # invariant example
# sample_biller = "WT JAN ksh F.20221021 (6814 SG)"
# sample_payer = "21CS-249442"

def get_top_3_combinations(billerref, payerref):
    
    # pre-process biller and payer
    biller_pattern = refTypeCompute(billerref)
    payer_pattern = refTypeCompute(payerref)
    if biller_pattern not in biller_ct["billerType"].values and payer_pattern not in payer_ct["payerType"].values:
        return "biller and payer pattern is empty"
    
    # print(f"{biller_pattern} - {payer_pattern}")
    biller_pattern_fil = biller_ct[biller_ct["billerType"] == biller_pattern]
    payer_pattern_fil = payer_ct[payer_ct["payerType"] == payer_pattern]
    
    if biller_pattern_fil.shape[0] == 0 and payer_pattern_fil.shape[0] == 0:
        return "missing_biller_pattern", "missing_payer_pattern", biller_pattern, payer_pattern
    
    if not biller_pattern_fil.shape[0]:
        return "missing_biller_pattern", payer_pattern_fil, biller_pattern, payer_pattern

    if not payer_pattern_fil.shape[0]:
        return biller_pattern_fil, "missing_payer_pattern", biller_pattern, payer_pattern

    # top 3 biller per pattern
    biller_pattern_support = biller_pattern_fil["All"].iloc[0]
    biller_pattern_fil_ = biller_pattern_fil.drop(["All", "billerType"], axis=1)
    biller_pattern_fil_ = biller_pattern_fil_[biller_pattern_fil_.iloc[0].sort_values(ascending=False).index[:3]]
    biller_pattern_fil_["Support"] = biller_pattern_support

    # top 3 payer per pattern
    payer_pattern_support = payer_pattern_fil["All"].iloc[0]
    payer_pattern_fil_ = payer_pattern_fil.drop(["All", "payerType"], axis=1)
    payer_pattern_fil_ = payer_pattern_fil_[payer_pattern_fil_.iloc[0].sort_values(ascending=False).index[:3]]
    payer_pattern_fil_["Support"] = payer_pattern_support
    
    # get the combinations confidence
    return biller_pattern_fil_, payer_pattern_fil_, biller_pattern, payer_pattern

In [159]:
results = get_top_3_combinations(sample_biller, sample_payer)

In [160]:
results[0]

Unnamed: 0,Abu Ghazaleh Intellectual Property,Abu Ghazaleh Intellectual Property - EUR,Norton Rose Fulbright Canada LLP - Montreal,Support
1,0.974342,0.013677,0.004493,40066


In [161]:
results[1]

Unnamed: 0,Ruttensperger Lachnit Trossin Gomoll,Ruttensperger Lachnit Trossin Gomoll (RLTG ) Patent- und Rechtsanwalte PartG mbB,Ruttensperger Lachnit Trossin Gomoll (RLTG ),Support
744,0.636364,0.181818,0.090909,22


In [126]:
def get_combination_counts(biller_possibilities, payer_possibilities, biller_pattern, payer_pattern):
    result_pair = {}
    biller_payer_pairs = [(x, y) for x in biller_possibilities.columns[:-1] for y in payer_possibilities.columns[:-1] if biller_possibilities[x].iloc[0] != 0 and payer_possibilities[y].iloc[0]]
    # print(biller_payer_pairs)
    
    df_ = df[(df["billerType"] == biller_pattern) & (df["payerType"] == payer_pattern)]
    df__ = df_[(df_["billerName"].isin(biller_possibilities.columns[:-1])) & (df_["payerName"].isin(payer_possibilities.columns[:-1]))]
    if not df__.shape[0]:
        return result_pair, "no_combinations_available", df_.shape[0]

    for each_pair in biller_payer_pairs:
        each_pair_possibility = df[(df["billerName"] == each_pair[0]) & (df["payerName"] == each_pair[1])]
        if each_pair_possibility.shape[0] > 0:
            # each_pair_prob = biller_possibilities[each_pair[0]].iloc[0] * payer_possibilities[each_pair[1]].iloc[0]
            # print(each_pair, df__.shape[0])
            
            each_pair_support = df__[(df__["billerName"] == each_pair[0]) & (df__["payerName"] == each_pair[1])]
            each_pair_prob =  each_pair_support.shape[0] / df__.shape[0]
            prob_agg = (biller_possibilities[each_pair[0]].iloc[0] + payer_possibilities[each_pair[1]].iloc[0] + each_pair_prob) / 3
            result_pair[each_pair] = (each_pair_support.shape[0], each_pair_prob, prob_agg.item())
    return result_pair, "success", df_.shape[0]

In [None]:
# (1 + 0.61)/2, (1 + 0.36)/2, (1 + 0.016)/2, (1 * 0.61), (1 * 0.36), (1 * 0.016), (0.8 * 0.61),(0.8 * 0.36), (0.8 * 0.016), (0.2 * 0.61), (0.2 * 0.36), (0.2 * 0.016), (0.8 + 0.61)/2, (0.8 + 0.36)/2, (0.8 + 0.016)/2

In [26]:
result_pair, output_type, pattern_pair_support = get_combination_counts(results[0], results[1], results[2], results[3])

[('China Patent Agent (HK) Ltd', 'Munck Wilson Mandala LLP'), ('China Patent Agent (HK) Ltd', 'Munck Wilson Mandala - DallasTX - USD'), ('Tsingyihua Intellectual Property LLC', 'Munck Wilson Mandala LLP'), ('Tsingyihua Intellectual Property LLC', 'Munck Wilson Mandala - DallasTX - USD'), ('Patentia Oy', 'Munck Wilson Mandala LLP'), ('Patentia Oy', 'Munck Wilson Mandala - DallasTX - USD')]


In [27]:
from pprint import pprint
pprint(result_pair)
print(output_type)
print(pattern_pair_support)

{}
no_combinations_available
2


In [28]:
df[(df["billerType"] == "XXXXNNNNNNNX") & (df["payerType"] == "XXXXNN-NNNNN")]

Unnamed: 0,billerDomain,billerName,billerReference,billerCurrency,billerAmount,payerDomain,payerName,payerReference,payerCurrency,invoiceNumber,invoiceAmount,invoiceCurrency,invoiceOwner,billerType,payerType
150211,cpahkltd.com.hkd,China Patent Agent (HK) - HKD,CPEL1651139P,HKD,,munckwilson.com,Munck Wilson Mandala LLP,SOLE01-00099,USD,24X31696,180.6,HKD,munckwilson.com,XXXXNNNNNNNX,XXXXNN-NNNNN
171379,cpahkltd.com.hkd,China Patent Agent (HK) - HKD,CPEL2153912D,HKD,,munckwilson.com,Munck Wilson Mandala LLP,GOLD11-00561,USD,25X05693,486.7,HKD,munckwilson.com,XXXXNNNNNNNX,XXXXNN-NNNNN


In [29]:
sorted_dict = dict(sorted(result_pair.items(), key=lambda item: item[1][1], reverse=True))
sorted_dict

{}

In [None]:
# validate these questions
# 1. How many patterns seems to be having the interactions(with other domains) in top_3?
#   - (smaller_set) payer patterns seem to be having intersections(with other domains) within top_3
# 2. How support based guardrail should be implemented?
#   - 80% threshold seems reasonable to start with
# 3. How many patterns seems to be having higher variance -> significant spread in 4th or 5th or so on
#   - minimal, only observed in payer segments
# 4. How offen the pattern library increases?
#   - seems like patterns are growing heavily for available billers and payers
# 5. What will happen if for a specific pattern two biller-payer pairs are possible and because of frequency, we tend to incline towards the pair with more support?
#   - mainly because we are operating on minimal to no interactions between heterogeneous domains
#   - What is the contribution to error rate for such cases?

## Impl - using Invoice Number

In [None]:
billers_name = inv_biller_ct.columns
remove_cols = ["All", "billerName", "invoiceNumberType"]
billers_name = list(set(billers_name) - set(remove_cols))
# billers_name

In [49]:
for idx in tqdm(range(1, len(inv_biller_ct))):
    
    total_count_per_pattern = inv_biller_ct["All"].iloc[idx]
    for each_col in billers_name:
        # total_count_of_each_biller = biller_ct[each_col].iloc[0]
        inv_biller_ct[each_col].iloc[idx] = inv_biller_ct[each_col].iloc[idx] / total_count_per_pattern

In [None]:
payer_name = inv_payer_ct.columns
remove_cols = ["All", "payerName", "invoiceNumberType"]
payer_name = list(set(payer_name) - set(remove_cols))

NameError: name 'inv_payer_ct' is not defined

In [None]:
for idx in tqdm(range(len(inv_payer_ct))):
    
    total_count_per_pattern = inv_payer_ct["All"].iloc[idx]
    for each_col in payer_name:
        inv_payer_ct[each_col].iloc[idx] = inv_payer_ct[each_col].iloc[idx] / total_count_per_pattern

In [8]:
inv_biller_ct = pd.read_csv(DATA_FOLDER_PATH + "/inv_biller_ct_prob.csv")
inv_payer_ct = pd.read_csv(DATA_FOLDER_PATH + "/inv_payer_ct_prob.csv")

In [None]:
def invoice_pattern_filter():
    # for incoming document, get the invoice pattern and find possible biller and payers 
    # filter out unmatching biller and payer
    # 1. if the diff is empty -> take the decision only using biller
    # 2. if the diff is non-empty and reduced the possible combinations of the biller-payer combinations 
    # 3. if the diff is non-intersecting at all.
    pass

## Validations

In [None]:
for idx in range(len(biller_ct)):
    pattern = biller_ct["billerType"].iloc[idx]
    biller_ct_pattern_support = biller_ct["All"].iloc[idx]
    biller_ct_pattern_fil_ = biller_ct.drop(["All", "billerType"], axis=1)
    biller_ct_pattern_fil_ = biller_ct_pattern_fil_[biller_ct_pattern_fil_.iloc[idx].sort_values(ascending=False).index[:3]].iloc[idx]
    # print(biller_ct_pattern_fil_)
    biller_ct_pattern_fil_["Support"] = biller_ct_pattern_support
    print(pattern, biller_ct_pattern_support, biller_ct_pattern_fil_.to_dict())

In [None]:
for idx in range(len(payer_ct)):
    pattern = payer_ct["payerType"].iloc[idx]
    payer_ct_pattern_support = payer_ct["All"].iloc[idx]
    payer_ct_pattern_fil_ = payer_ct.drop(["All", "payerType"], axis=1)
    payer_ct_pattern_fil_ = payer_ct_pattern_fil_[payer_ct_pattern_fil_.iloc[idx].sort_values(ascending=False).index[:3]].iloc[idx]
    # print(biller_ct_pattern_fil_)
    payer_ct_pattern_fil_["Support"] = payer_ct_pattern_support
    print(pattern, payer_ct_pattern_support, payer_ct_pattern_fil_.to_dict())

In [None]:
biller_patterns_with_interactions = {}
columns = biller_ct.columns[1:-1]
for idx in tqdm(range(len(biller_ct))):
    pattern = biller_ct["billerType"].iloc[idx]
    # print(pattern)
    non_zero_interactions = np.where(biller_ct[biller_ct["billerType"] == pattern][columns].iloc[0] > 0)[0]
    if len(non_zero_interactions) > 1:
        biller_patterns_with_interactions[pattern] = len(non_zero_interactions)


In [None]:
payer_patterns_with_interactions = {}
columns = payer_ct.columns[1:-1]
for idx in tqdm(range(len(payer_ct))):
    pattern = payer_ct["payerType"].iloc[idx]
    # print(pattern)
    non_zero_interactions = np.where(payer_ct[payer_ct["payerType"] == pattern][columns].iloc[0] > 0)[0]
    if len(non_zero_interactions) > 1:
        payer_patterns_with_interactions[pattern] = len(non_zero_interactions)

In [None]:
len(biller_patterns_with_interactions), len(payer_patterns_with_interactions)

In [None]:
929/6661, 1258/9229, 80/1264, 124/2400

In [None]:
len(set(biller_patterns_with_interactions.values())), len(set(payer_patterns_with_interactions.values())), len(list(biller_patterns_with_interactions.values()))

In [None]:
len([v for v in payer_patterns_with_interactions.values() if v > 5]), len([v for v in biller_patterns_with_interactions.values() if v > 5])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# sns.histplot(list(biller_patterns_with_interactions.values()))
# plt.show()

counts = [v for v in payer_patterns_with_interactions.values() if v < 20]

sns.histplot(counts, kde=False)
plt.title("Distribution of Interaction Counts per Pattern")
plt.xlabel("Number of Interactions")
plt.ylabel("Frequency")
plt.show()


## MLFlow tracking Setup

In [43]:
mlflow.set_tracking_uri("http://127.0.0.1:8080")
# Sets the current active experiment to the "invoice-processing" experiment and
# returns the Experiment metadata
invoice_experiment = mlflow.set_experiment("invoice-processing")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "invoice_processing_test"

# Define an artifact path that the model will be saved to.
artifact_path = "invoice_processing"

## Batch Run

In [220]:
from math import log

p1 = [0.4, 0.5, 0.6]
p2 = [0.9, 0.1, 0] 
p3 = [1, 0, 0]
p4 = [0.9, 0.05, 0.05]

def compute_entropy_based_confidence(prob_dist):
    # print(prob_dist)
    H = -(sum([p*log(p, 10) for p in prob_dist if p != 0]))
    C = 1 - H/log(3, 10)
    return H, C

for i in [p1, p2, p3, p4]:
    H, C = compute_entropy_based_confidence(i)
    print(f"Input {i} = Entropy: {H} and Confidence: {C}")

Input [0.4, 0.5, 0.6] = Entropy: 0.44280025107061943 and Confidence: 0.07193350392492714
Input [0.9, 0.1, 0] = Entropy: 0.14118174150460758 and Confidence: 0.7040967257106154
Input [1, 0, 0] = Entropy: -0.0 and Confidence: 1.0
Input [0.9, 0.05, 0.05] = Entropy: 0.17128474107100572 and Confidence: 0.6410037503534697


In [None]:
def jaccard_similarity(a, b):
    tokens_a = set(a.lower().split())
    tokens_b = set(b.lower().split())
    if not tokens_a or not tokens_b:
        return 0.0
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

def compute_avg_jaccard_for_nonzero(df):
    # Extract non-zero party names
    row = df.iloc[0]
    nonzero_parties = [col for col in df.columns[:-1] if row[col] > 0]

    if len(nonzero_parties) < 2:
        return 1  # Not enough for pairwise comparison

    # print(nonzero_parties)
    similarities = []; count = 0
    for i in range(1):
        for j in range(1, len(nonzero_parties)):
            sim = jaccard_similarity(nonzero_parties[i], nonzero_parties[j])
            if sim < 0.6:
                count = count+1
            similarities.append(sim)
    # print(similarities)
    return count

def get_confusion_type(count_hetero):
    if count_hetero == 0:
        return "Homogeneous"
    elif count_hetero == 1:
        return "Near Homogeneous"
    else:
        return "Heterogeneous"


In [226]:
# Load data
batch = pd.read_csv(DATA_FOLDER_PATH + "/Prod_Invoice_14_June_2025_to_08_July_2025.csv")
batch = batch[BILLER_REF_NAME_COLS + PAYER_REF_NAME_COLS + ["invoiceNumber"]]
batch["output_type"] = ""
batch["identified_biller"] = ""
batch["identified_biller_entropy"] = -1
batch["identified_biller_confidence"] = -1
batch["identified_biller_type"] = ""
batch["identified_payer"] = ""
batch["identified_payer_entropy"] = -1
batch["identified_payer_confidence"] = -1
batch["identified_payer_type"] = ""
batch["prob"] = -1
batch["billerReference"].fillna("-1", inplace=True)
batch["payerReference"].fillna("-1", inplace=True)
batch

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  batch["billerReference"].fillna("-1", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  batch["payerReference"].fillna("-1", inplace=True)


Unnamed: 0,billerReference,billerName,payerReference,payerName,invoiceNumber,output_type,identified_biller,identified_biller_entropy,identified_biller_confidence,identified_biller_type,identified_payer,identified_payer_entropy,identified_payer_confidence,identified_payer_type,prob
0,4050963213,Abu Ghazaleh Intellectual Property,62554W JO,Ruttensperger Lachnit Trossin Gomoll,20250081617,,,-1,-1,,,-1,-1,,-1
1,4080907444,Abu Ghazaleh Intellectual Property,KI/kf/FT-27288,FUJIMarks Japan P.C.,2025036139,,,-1,-1,,,-1,-1,,-1
2,4140900122,Abu Ghazaleh Intellectual Property,TRP/RCG/144447,Stevens Hewlett & Perkins,20250171469,,,-1,-1,,,-1,-1,,-1
3,4050917242,Abu Ghazaleh Intellectual Property,294409,DENNEMEYER & COMPANY S.a.r.l,20250081624,,,-1,-1,,,-1,-1,,-1
4,4060978398,Abu Ghazaleh Intellectual Property,62508W QA-2,Ruttensperger Lachnit Trossin Gomoll,20250093086,,,-1,-1,,,-1,-1,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3192,REF,Joyce A. Tan & Partners,P94816.SG,Griffith Hack,17002624,,,-1,-1,,,-1,-1,,-1
3193,REF,Bereskin & Parr,P105651.CA,Griffith Hack,900218243,,,-1,-1,,,-1,-1,,-1
3194,REF,Hiraki & Associates,P86103.JP,Griffith Hack,172372,,,-1,-1,,,-1,-1,,-1
3195,REF,Joyce A. Tan & Partners,P90385.SG,Griffith Hack,17002660,,,-1,-1,,,-1,-1,,-1


In [227]:
# run for the batch

for idx in tqdm(range(len(batch))):
    # try:
        input_biller = batch["billerReference"].iloc[idx]
        input_payer = batch["payerReference"].iloc[idx]
        
        results = get_top_3_combinations(input_biller, input_payer)

        # no need to process for records with output_type == "success"
        if isinstance(results[0], str) or isinstance(results[1], str):
            if isinstance(results[0], str):
                batch.loc[idx, "output_type"] = "biller_pattern_missing"
            elif isinstance(results[1], str):
                batch.loc[idx, "output_type"] = "payer_pattern_missing"
        else:
            result_pair, output_type, pattern_pair_support = get_combination_counts(results[0], results[1], results[2], results[3])
            if output_type == "success":
                top_biller = results[0]
                top_payer = results[1]
                # Getting independent billers and payers prob dist confidence
                H_biller, C_biller = compute_entropy_based_confidence(top_biller.iloc[0].values[:-1])
                H_payer, C_payer = compute_entropy_based_confidence(top_payer.iloc[0].values[:-1])
                batch.loc[idx, "identified_biller_entropy"] = H_biller
                batch.loc[idx, "identified_biller_confidence"] = C_biller
                batch.loc[idx, "identified_payer_entropy"] = H_payer 
                batch.loc[idx, "identified_payer_confidence"] = C_payer

                # Getting jaccard scores 
                J_score_biller = compute_avg_jaccard_for_nonzero(top_biller)
                J_score_payer = compute_avg_jaccard_for_nonzero(top_payer)
                batch.loc[idx, "identified_biller_type"] = get_confusion_type(J_score_biller)
                batch.loc[idx, "identifies_payer_type"] = get_confusion_type(J_score_payer)
                
                # Getting the pair with highest probability 
                max_key = max(result_pair, key=lambda k: result_pair[k][1])
                batch.loc[idx, "output_type"] = output_type
                batch.loc[idx, "identified_biller"] = max_key[0]
                batch.loc[idx, "identified_payer"] = max_key[1]
                batch.loc[idx, "prob"] = result_pair[max_key][1]
            else:
                batch.loc[idx, "output_type"] = output_type
    # except:
        # print(idx)   
        # break

  0%|          | 0/3197 [00:00<?, ?it/s]

  batch.loc[idx, "identified_biller_entropy"] = H_biller
  batch.loc[idx, "identified_biller_confidence"] = C_biller
  batch.loc[idx, "identified_payer_entropy"] = H_payer
  batch.loc[idx, "identified_payer_confidence"] = C_payer
  batch.loc[idx, "prob"] = result_pair[max_key][1]
100%|██████████| 3197/3197 [03:49<00:00, 13.92it/s]


## Batch Run - Evaluation

In [228]:
# compare true vs pred and compute error rate
batch["output_type"].value_counts()

output_type
success                      1871
no_combinations_available     642
biller_pattern_missing        357
payer_pattern_missing         327
Name: count, dtype: int64

In [229]:
batch[batch["output_type"] == "success"]["prob"].value_counts()

prob
1.000000    1222
0.988417      65
0.938650      41
0.890244      34
0.666667      28
            ... 
0.664671       1
0.625000       1
0.250000       1
0.906433       1
0.881279       1
Name: count, Length: 118, dtype: int64

In [230]:
batch[(batch["output_type"] == "success") & (batch["prob"] > 0.8)]

Unnamed: 0,billerReference,billerName,payerReference,payerName,invoiceNumber,output_type,identified_biller,identified_biller_entropy,identified_biller_confidence,identified_biller_type,identified_payer,identified_payer_entropy,identified_payer_confidence,identified_payer_type,prob,identifies_payer_type
1,4080907444,Abu Ghazaleh Intellectual Property,KI/kf/FT-27288,FUJIMarks Japan P.C.,2025036139,success,Abu Ghazaleh Intellectual Property,0.047040,0.901409,Near Homogeneous,Fujimarks Japan,-0.000000,1.000000,,1.000000,Near Homogeneous
2,4140900122,Abu Ghazaleh Intellectual Property,TRP/RCG/144447,Stevens Hewlett & Perkins,20250171469,success,Abu Ghazaleh Intellectual Property,0.047040,0.901409,Near Homogeneous,Stevens Hewlett & Perkins,0.046281,0.902999,,1.000000,Homogeneous
3,4050917242,Abu Ghazaleh Intellectual Property,294409,DENNEMEYER & COMPANY S.a.r.l,20250081624,success,Abu Ghazaleh Intellectual Property,0.047040,0.901409,Near Homogeneous,DENNEMEYER & COMPANY S.a.r.l,0.298743,0.373864,,0.919540,Heterogeneous
7,4050986492,Abu Ghazaleh Intellectual Property,ENE3P3JOW,PGA S.P.A. (Milan),20250081634,success,Abu Ghazaleh Intellectual Property,0.047040,0.901409,Near Homogeneous,PGA S.P.A. (Milan),0.079152,0.834105,,0.940678,Near Homogeneous
11,40120961346,Abu Ghazaleh Intellectual Property,Magdalena,Instra Corporation Pty. Ltd,202504110401,success,Abu Ghazaleh Intellectual Property,0.029466,0.938241,Near Homogeneous,Instra Corporation Pty. Ltd,-0.000000,1.000000,,1.000000,Near Homogeneous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3173,6.11070/PCTN,Cruz Marcelo & Tenefrancia - USD,2420-301935PH,"Gorodissky & Partners, Ltd - Russia",05353,success,Cruz Marcelo & Tenefrancia - USD,-0.000000,1.000000,Near Homogeneous,"Gorodissky & Partners, Ltd - Russia",0.080184,0.831943,,1.000000,Homogeneous
3174,P41530EP,Schlich Ltd,25873EPP00,Adams Pluck - GBP,20456,success,Schlich Ltd,0.425178,0.108868,Heterogeneous,Adams Pluck,0.012891,0.972983,,0.882353,Heterogeneous
3175,P3151CA00,BENOIT & COTE - CAD,P1318CAPC,Franke Hyland - CAD,9873331,success,BENOIT & COTE - CAD,0.309666,0.350969,Heterogeneous,Franke Hyland,0.275482,0.422617,,1.000000,Heterogeneous
3176,402721505,Abu Ghazaleh Intellectual Property,WALK70-01429660,Munck Wilson Mandala LLP,20240034778,success,Abu Ghazaleh Intellectual Property,0.315261,0.339243,Heterogeneous,Munck Wilson Mandala LLP,-0.000000,1.000000,,1.000000,Near Homogeneous


In [231]:
batch["biller_eval"] = ""
batch["payer_eval"] = ""
batch["biller_payer_comb_eval"] = ""
for idx in tqdm(range(len(batch))):
    biller_true = batch["billerName"].loc[idx].lower().strip()
    payer_true = batch["payerName"].iloc[idx].lower().strip()
    biller_pred = batch["identified_biller"].iloc[idx].lower().strip()
    payer_pred = batch["identified_payer"].iloc[idx].lower().strip()

    batch.loc[idx, "biller_eval"] = biller_true == biller_pred
    batch.loc[idx, "payer_eval"] = payer_true == payer_pred

    batch.loc[idx, "biller_payer_comb_eval"] = batch["biller_eval"].iloc[idx] * batch["payer_eval"].iloc[idx]

  0%|          | 0/3197 [00:00<?, ?it/s]

100%|██████████| 3197/3197 [00:01<00:00, 2012.88it/s]


In [232]:
batch[(batch["output_type"] == "success") & (batch["prob"] > 0.8)]["biller_payer_comb_eval"].value_counts()

biller_payer_comb_eval
1    1253
0     312
Name: count, dtype: int64

In [234]:
batch.to_csv(DATA_FOLDER_PATH + "/batch_w2_full_check.csv", index=False)

In [None]:
check_new_payer_pattern = ["Bae, Kim & Lee IP Group", "Mondial Marchi S.p.a.", "Dennemeyer & Company S. A. R. L.", "DENNEMEYER & COMPANY S.a.r.l", "TRADAMARCA SA", "Rosenthal GmbH", "SocietÃ  Italiana Brevetti - Billtrader", "HINDLES Patent and Trade Mark Attorneys", "Mann + Hummel GmbH", "Daub Patent & Law", "Daub Patent & Law", "Avek IP", "Chofn Intellectual Property - BeijingCN - CNY", "TRADAMARCA SA", "Nordemann Czychowski & Partner", "Foley & Lardner LLP - Washington", "Cabinet Germain & Maureau - LYON - R - USD", "Cabinet Germain & Maureau - LYON - R - USD", "Cabinet Germain & Maureau - LYON - R - USD", "Cabinet Germain & Maureau - LYON - R - USD", "Cabinet Germain & Maureau - LYON - R - USD", "A J Park - Auckland", "A J Park - Auckland", "Taylor Wessing", "Elite Gold Limited - USD", "Cabinet Germain & Maureau - LYON - R - USD", "Casalonga - Fleurus - EUR", "Gearhart Law LLC", "The Concept Law Group PA", "The Concept Law Group PA", "Munck Wilson Mandala LLP", "dompatent von Kreisler Selting Werner - Partnerschaft von Patentanwalten und Rechtsanwalten mbB", "Y.P. Lee, Mock & Partners", "MARKS & CLERK LLP", "Bell & Manning, LLC", "Welsh Flaxman & Gitler LLC", "Welsh Flaxman & Gitler LLC", "Welsh Flaxman & Gitler LLC", "Welsh Flaxman & Gitler LLC", "Welsh Flaxman & Gitler LLC", "Welsh Flaxman & Gitler LLC", "Sierra IP Law PC", "AJ PARK", "Tranfan Law Office Co., Ltd.", "HANA IP LAW FIRM", "Cabinet Germain & Maureau - LYON - R - USD", "Cabinet Germain & Maureau - LYON - R - USD", "Cabinet Germain & Maureau - LYON - R - USD", "Cabinet Germain & Maureau - LYON - R - USD", "Cabinet Germain & Maureau - LYON - R - USD", "Casalonga - Fleurus - EUR", "FRKelly-Waterways-Saba - USD", "A J Park - Wellington", "A J Park - Auckland", "A J Park - Wellington", "A J Park - Wellington", "A J Park - Wellington", "A J Park - Wellington", "Arnold & Siedsma BV - Brussels - USD", "Casalonga - Fleurus - EUR", "Spruson & Ferguson - USD", "Spruson & Ferguson - USD", "Spruson & Ferguson - USD", "Spruson & Ferguson - USD", "Spruson & Ferguson - USD", "Fairbairn Catley Low & Kong", "Fairbairn Catley Low & Kong", "BAE KIM & LEE IP GROUP", "Herrero & Asociados (EUR)", "Papula-Nevinpat", "Casalonga - Fleurus - EUR", "Brooks Kushman P.C.", "Allens Patent & Trade Mark Attorneys- Melbourne - R", "Allens Patent & Trade Mark Attorneys- Melbourne - R", "Allens Patent & Trade Mark Attorneys- Melbourne - R", "Allens Patent & Trade Mark Attorneys- Melbourne - R"]
# check_new_payer_pattern_ = [check_new_payer_pattern[i] for i in range(1, len(check_new_payer_pattern)) if check_new_payer_pattern[i-1] != check_new_payer_pattern[i]]
# check_new_payer_pattern_.insert(0, check_new_payer_pattern[0])

In [None]:
avail_payer_domainname = list(payer_ct.columns)
avail_payer_domainname.remove("payerType")
avail_payer_domainname.remove("All")

In [None]:
len(avail_payer_domainname)

In [None]:
found_in_payer_ct = []
for x in check_new_payer_pattern:
    if x in avail_payer_domainname:
        found_in_payer_ct.append(x)

In [None]:
len(check_new_payer_pattern), len(found_in_payer_ct)

In [None]:
check = "Cabinet Germain & Maureau - LYON - R - USD"
payer_idx = check_new_payer_pattern.index(check)
non_zero_indices = np.where(payer_ct[found_in_payer_ct[payer_idx]] > 0)[0]
payer_ct[payer_ct.index.isin(non_zero_indices)][["payerType", found_in_payer_ct[payer_idx]]]

In [None]:
check_new_biller_pattern = ["Atan Consult - BishkekKG - USD", "Reinhold Cohn & Partners - RC&P", "Reinhold Cohn & Partners - RC&P", "Bae, Kim & Lee IP", "Bae, Kim & Lee IP", "Keltie", "Cermak a spol - PrahaCZ - USD", "Khaitan And Co - GandhiMargMumbai - USD", "Eproint", "Eproint", "Eproint", "Eproint", "Eproint", "Zhong Lun Law Firm - Shanghai - P - USD", "FORRESTERS - EURO", "Litmus Law PLLC", "Troncoso Leroux", "Nony PARIS - EUR", "Troncoso Leroux", "Synergy Patent Group LLC - BoulderCo - USD", "Repertorio Design LTDA - PuertoRicoBR - USD", "Mirandah Asia (Singapore) Pte Ltd - USD", "A J Park - USD", "Marval O'Farrell Mairal", "DRZEWIECKI TOMASZEK - WarsawPL - USD", "Eproint", "Eproint", "Eproint", "Berkemeyer", "Berkemeyer", "Berkemeyer", "Spoor and Fisher - Sth Africa", "Gorodissky & Partners, Ltd - Russia", "Kuzuwa & Partners", "East IP - WanchaiHK - USD", "East IP - WanchaiHK - USD", "ROBIC IP AGENCY LP - CAD", "PT BIRO OKTROI ROOSSENO", "Pinheiro Palmer Advogados", "Fasken Martineau DuMoulin - Montreal", "BOOK CHON - SeoulKR - USD", "Tsai, Lee & Chen", "Gluck and Kritzenberger", "Dias Teixeira - Sao PauloBR - USD", "Henry Goh & Co Sdn Bhd", "Gluck and Kritzenberger", "Nascimento Advogados - SaoPauloBR - USD", "CAP IP Consulting", "Ulises Cabrera - Santo DomingoDO - USD", "Ulises Cabrera - Santo DomingoDO - USD", "Sai & Mehta - DelhiIN - USD", "Sai & Mehta - DelhiIN - USD", "Sai & Mehta - DelhiIN - USD", "Sai & Mehta - DelhiIN - USD", "Mirandah Asia International (Philippines)", "Jimenez Molino Y Moreno", "Jimenez Molino Y Moreno", "GOODRICH RIQUELME Y ASOCIDOS", "GOWLING WLG (CANADA) LLP", "Albright IP Limited - EUR", "Troncoso Leroux", "Shengxun Group - BeijingCN - USD", "Shengxun Group - BeijingCN - USD", "Shengxun Group - BeijingCN - USD", "A J Park - USD", "Lorenz Seidler Gossel", "VON WOBESER - Mexico - USD", "Archer & Angel - DelhiIN - USD", "Troncoso Leroux", "AMICA LAW LLC - SGD", "Orban Miklos Ugyvedi Iroda - EUR", "Mendez Cortes S.C. - Mexico - P - USD", "2SPL", "Dannemann Siemsen Bigler & Ipanema Morei", "Gowling WLG - Ottawa - CAD", "Borden Ladner Gervais LLP", "LEASON ELLIS IPA", "Thomas Melvin Patent - TaylorsvilleNC - USD", "Lee & Li - Leaven - BeijingCN - USD", "Bryn Aarflot AS - OsloNO - NOK", "Bryn Aarflot AS - OsloNO - NOK", "Forresters (Birmingham) - GBP", "Kaminski Harmann Patentanwalte - VaduzLI - CHF", "Eproint", "Patsnap UK", "Phoenix Translations - ElginTX - USD", "Ravindran Associates - Singapore", "BARLAW - Barrera and Asociados - USD", "FRTB S.C. - USD", "Chambers and Partners - LondonGB - GBP", "Finlayson & Singlehurst - OttawaON - USD", "Page White & Farrer Ltd - LondonUK - GBP", "Seratos Conseil Inc - BrossardQC - USD", "Von Seidels Intellectual Property Attorn"]

In [None]:
avail_biller_domainname = list(biller_ct.columns)
avail_biller_domainname.remove("billerType")
avail_biller_domainname.remove("All")

In [None]:
len(avail_biller_domainname)

In [None]:
found_in_biller_ct = []
for x in check_new_biller_pattern:
    if x in avail_biller_domainname:
        found_in_biller_ct.append(x)

In [None]:
len(check_new_biller_pattern), len(found_in_biller_ct)

In [None]:
check = "East IP - WanchaiHK - USD"
try:
    biller_idx = found_in_biller_ct.index(check)
except:
    print(f"{check} not in list")
non_zero_indices = np.where(biller_ct[found_in_biller_ct[biller_idx]] > 0)[0]
biller_ct[biller_ct.index.isin(non_zero_indices)][["billerType", found_in_biller_ct[biller_idx]]]

In [42]:
identified_counts = batch[(batch["output_type"] == "success") & (batch["prob"] > 0.8)].shape[0]
error_rate = batch[(batch["output_type"] == "success") & (batch["prob"] > 0.8) & (batch["biller_payer_comb_eval"] == 0)].shape[0]
batch_size = batch.shape[0]
metrics = {"batch_size": batch_size, "identified_counts": identified_counts, "error_rate": error_rate}

In [44]:
with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_metrics(metrics)

🏃 View run invoice_processing_test at: http://127.0.0.1:8080/#/experiments/598500546395051383/runs/53d1248f580c4c5b90abb85f28942507
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/598500546395051383


In [None]:
# provide all probs for output == success
# if combined prob < 0.8, look into independent probabilities for support
 

In [50]:
checkb="2420-581411"; checkp="ALA-001RU"
get_top_3_combinations(checkb, checkp)

NNNN-NNNNNN - XXX-NNNXX


(   Gorodissky & Partners, Ltd - Russia  Fitch Even Tobin & Flannery  \
 8                             0.901235                     0.042088   
 
    Gorodissky & Partners, Ltd  Support  
 8                     0.02862     1782  ,
      Serio Patent & Trademark Attorneys  Rauschenbach Patent Law Group PLLC  \
 230                            0.272727                            0.261364   
 
      Shobayashi International Patent & Trademark Office  \
 230                                              0.125    
 
      ALNYLAM PHARMACEUTICALS INC.  Anderson Gorecki LLP  Support  
 230                      0.102273              0.079545       88  ,
 'NNNN-NNNNNN',
 'XXX-NNNXX')

In [None]:
1 - H/log(3, 10)

0.07193350392492714