In [3]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)
import my_parser
import my_classifier

In [4]:
data = my_parser.get_all()
labels = [my_classifier.label_map[item['label']['label']] for item in data]
predictions, reasons, explanations = my_classifier.model(data, explain=True)

In [5]:
from print_dict import pd
pd(data[9900])

{
    'passport': {
        'first_name': 'Lilli',
        'middle_name': 'Emilia',
        'last_name': 'Bauer',
        'gender': 'F',
        'country': 'Germany',
        'country_code': 'DEU',
        'nationality': 'German',
        'birth_date': '1983-08-11',
        'passport_number': 'MP9698113',
        'passport_mrz': [
            'P<DEUBAUER<<EMILIA<LILLI<<<<<<<<<<<<<<<<<<<<<',
            'MP9698113DEU830811<<<<<<<<<<<<<<<<<<<<<<<<<<<'
        ],
        'passport_issue_date': '2023-11-07',
        'passport_expiry_date': '2033-11-06'
    },
    'client_profile': {
        'name': 'Emilia Lilli Bauer',
        'address': {
            'city': 'Berlin',
            'street name': 'Kurfürstendamm',
            'street number': 22,
            'postal code': '16821'
        },
        'country_of_domicile': 'Germany',
        'birth_date': '1983-08-11',
        'nationality': 'German',
        'passport_number': 'MP9698113',
        'passport_issue_date': '2023-11-07',
     

In [6]:
def check_predicate(data, true_labels, predicted_labels, predicate):
    """Tells you if the predicate adds value"""
    result = [predicate(d) for d in data]
    violated_indices = [i for i, d in enumerate(data) if not predicate(d)]
    num_violated = len(violated_indices)
    total = len(data)

    if num_violated == 0:
        print("Predicate violated 0 times — nothing to evaluate.")
        return

    violated_accepted = [i for i in range(len(data)) if result[i] == False and true_labels[i] == 1]
    true_rejected = [i for i in violated_indices if true_labels[i] == 0]
    pred_rejected = [i for i in violated_indices if predicted_labels[i] == 0]

    print(f"Predicate violated {num_violated} times ({num_violated / total:.2%} of data)")
    print(f"Profiles with violation that are rejected: {true_rejected[:10]}")
    print(f"Profiles with violation that are accepted: {violated_accepted[:10]}")
    print(f"Profiles with violations are rejected {len(true_rejected) / num_violated:.2%} of the time (ground truth)")
    print(f"Profiles with violations are rejected {len(pred_rejected) / num_violated:.2%} of the time by our model") 
    return violated_accepted

In [10]:
# Is the email adress of interest?
# -> Field is always present
# -> Should field match? YES
# -> Check formatting of email address? -> Never decisive, good rule nonetheless
# -> Didn't find inherently suspicious emails
emails1 = [profile['account_form']['email_address'] for profile in data]
emails2 = [profile['client_profile']['email_address'] for profile in data]
print(len(emails1), len(emails2), [email for email in emails2 if email == ''], [email for email in emails1 if email == ''])
check_predicate(data, labels, predictions, lambda x: x['account_form']['email_address'] == x['client_profile']['email_address'])

import re
EMAIL_REGEX = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
def is_valid_email(email: str) -> bool:
    return re.match(EMAIL_REGEX, email) is not None
check_predicate(data, labels, predictions, lambda x: is_valid_email(x['account_form']['email_address']))



10000 10000 [] []
Predicate violated 174 times (1.74% of data)
Profiles with violation that are rejected: [132, 299, 308, 367, 453, 577, 618, 663, 699, 701]
Profiles with violation that are accepted: []
Profiles with violations are rejected 100.00% of the time (ground truth)
Profiles with violations are rejected 100.00% of the time by our model
Predicate violated 74 times (0.74% of data)
Profiles with violation that are rejected: [132, 299, 884, 961, 973, 993, 1057, 1234, 1691, 1831]
Profiles with violation that are accepted: []
Profiles with violations are rejected 100.00% of the time (ground truth)
Profiles with violations are rejected 100.00% of the time by our model


[]

In [None]:
# Phone number formatting
def is_valid_phone_number(phone) -> bool:
    # Remove common formatting characters
    normalized = re.sub(r"[ \-()]", "", phone)
    international_pattern = r"^(\+|00)[1-9]\d{7,14}$"  # +41791234567, 0041791234567
    local = r"^\d{8,10}$"
    return bool(re.match(international_pattern, normalized) 
                or re.match(local, normalized))

violated_accepted = check_predicate(data, labels, predictions, lambda x: is_valid_phone_number(x['account_form']['phone_number']))

Predicate violated 344 times (3.44% of data)
Profiles with violation that are rejected: [2, 34, 94, 240, 241, 355, 368, 419, 423, 472]
Profiles with violation that are accepted: []
Profiles with violations are rejected 100.00% of the time (ground truth)
Profiles with violations are rejected 99.71% of the time by our model


In [37]:
# Check field emptiness
def get_compound_keys(d, parent_key=""):
    compound_keys = []
    
    for key, value in d.items():
        # Create a new key with the parent key concatenated to the current key
        new_key = f"{parent_key}.{key}" if parent_key else key
        
        if isinstance(value, dict):  # If the value is a nested dictionary
            # Recursively get compound keys from the nested dictionary
            compound_keys.extend(get_compound_keys(value, new_key))
        else:
            # If it's not a dictionary, add the current compound key to the list
            compound_keys.append(new_key)
    
    return compound_keys
keys = get_compound_keys(data[0])
for key in keys:
    print(key)
    check_predicate(data, labels, predictions, lambda x: key in get_compound_keys(x) and my_classifier.get_nested(x, key) != '')


passport.first_name
Predicate violated 0 times — nothing to evaluate.
passport.middle_name
Predicate violated 996 times (9.96% of data)
Profiles with violation that are rejected: [34, 38, 44, 71, 129, 134, 135, 163, 197, 200]
Profiles with violation that are accepted: [11, 30, 148, 165, 170, 229, 232, 253, 276, 278]
Profiles with violations are rejected 49.30% of the time (ground truth)
Profiles with violations are rejected 25.20% of the time by our model
passport.last_name
Predicate violated 0 times — nothing to evaluate.
passport.gender
Predicate violated 76 times (0.76% of data)
Profiles with violation that are rejected: [64, 74, 219, 333, 492, 579, 686, 687, 703, 711]
Profiles with violation that are accepted: []
Profiles with violations are rejected 100.00% of the time (ground truth)
Profiles with violations are rejected 100.00% of the time by our model
passport.country
Predicate violated 0 times — nothing to evaluate.
passport.country_code
Predicate violated 0 times — nothing to 

In [44]:
# Country / Adress
# ->  Address / Country not empty
# ->  Real adress that makes sense
# ->  Adress is in country
check_predicate(data, labels, predictions, lambda x: is_valid_phone_number(x['account_form']['country_of_domicile']) != '')
check_predicate(data, labels, predictions, lambda x: is_valid_phone_number(x['account_form']['country_of_domicile']) != '')
check_predicate(data, labels, predictions, lambda x: bool(re.match(r"^\d+$", str(x['account_form']['address']['street number']))))
check_predicate(data, labels, predictions, lambda x: bool(re.match(r"^\d+$", x['account_form']['address']['postal code'])))[0]

Predicate violated 0 times — nothing to evaluate.
Predicate violated 0 times — nothing to evaluate.
Predicate violated 37 times (0.37% of data)
Profiles with violation that are rejected: [66, 361, 367, 435, 570, 1837, 2206, 2753, 2763, 2805]
Profiles with violation that are accepted: []
Profiles with violations are rejected 100.00% of the time (ground truth)
Profiles with violations are rejected 100.00% of the time by our model
Predicate violated 1062 times (10.62% of data)
Profiles with violation that are rejected: [17, 19, 35, 47, 74, 79, 89, 90, 122, 146]
Profiles with violation that are accepted: [7, 46, 59, 61, 65, 82, 88, 91, 136, 141]
Profiles with violations are rejected 53.30% of the time (ground truth)
Profiles with violations are rejected 28.63% of the time by our model


7

In [50]:
# Currency
currencies = [profile['account_form']['currency'] for profile in data]
check_predicate(data, labels, predictions, lambda x: x['account_form']['currency'] != 'DKK')[0]


Predicate violated 942 times (9.42% of data)
Profiles with violation that are rejected: [21, 60, 69, 75, 87, 111, 171, 177, 221, 246]
Profiles with violation that are accepted: [1, 29, 30, 37, 40, 52, 73, 77, 157, 188]
Profiles with violations are rejected 50.42% of the time (ground truth)
Profiles with violations are rejected 25.80% of the time by our model


1

In [81]:
# Check Names
def name_match(x):
    names = [x['account_form']['first_name'].strip(), x['account_form']['middle_name'].strip(), x['account_form']['last_name'].strip()]
    merged = ' '.join(filter(lambda x: x != '', names))
    return merged == re.sub(r'\s+', ' ', x['client_profile']['name']).strip()
violations_accepted = check_predicate(data, labels, predictions, name_match)

Predicate violated 312 times (3.12% of data)
Profiles with violation that are rejected: [2, 3, 19, 44, 114, 134, 174, 177, 200, 213]
Profiles with violation that are accepted: []
Profiles with violations are rejected 100.00% of the time (ground truth)
Profiles with violations are rejected 100.00% of the time by our model


In [82]:
# Investment Mandate
print(list(set([profile['client_profile']['type_of_mandate'] for profile in data])))
#violations_accepted = check_predicate(data, labels, predictions, name_match)

['', 'Execution-Only', 'Discretionary', 'Hybrid', 'Advisory']


In [73]:
name_match(data[1221])

(False, 'Hermans  Jacobs Janssens')

In [78]:
data[6505]['account_form']

{'name': 'Martina  Marino',
 'first_name': 'Martina',
 'middle_name': '',
 'last_name': ' Marino',
 'passport_number': 'GS7275403',
 'currency': 'DKK',
 'address': {'city': 'Fredericia',
  'street name': 'Krystalgade',
  'street number': 30,
  'postal code': '2619'},
 'country_of_domicile': 'Denmark',
 'phone_number': '+45 08 53 71 35',
 'email_address': 'martina.marino@yousee.dk'}

In [None]:
numbers = [profile['client_profile']['type_of_mandate'] for profile in data]
[numbers[i] for i in violated_accepted]

[]

In [30]:
data[1]['labelx']

KeyError: 'labelx'

In [15]:
numbers[::100]

['+34 625 487 017',
 '+49 7434 498222',
 '+39 397 8113350',
 '+41 017 055 93 17',
 '+31 06 22035701',
 '046 682 24 54',
 '06 88672765',
 '+49 6528 992409',
 '+49 0042 412850',
 '30 7792547',
 '++4106 59972973',
 '06 4428157',
 '+43 557 685 5510',
 '00 34 29 88',
 '0459 102 183',
 '+33 08 39 63 97 89',
 '+41 097 435 09 02',
 '+358 046 373 13 40',
 '+31 06 14966097',
 '+358 042 661 51 32',
 '+45 59 48 83 50',
 '+43 138 160 3333',
 '022 437 40 7',
 '+33 07 05 01 60 86',
 '+49 4152 767351',
 '+34 600 981 806',
 '+33 05 46 08 54 26',
 '+33 09 05 02 58 40',
 '+32 0455 732 453',
 '+49 9606 130537',
 '07 68 69 14',
 '+32 0486 184 1806',
 '+41 023 981 26 52',
 '698 930 2940',
 '+358 049 720 68 47',
 '+39 345 3766834',
 '635 275 13',
 '+41 087 317 96 20',
 '+34 697 095 821',
 '+34 664 886 370',
 '+49 0662 623147',
 '+34 663 762 157',
 '+34 620 472 9033',
 '++34 490 599 7667',
 '+39 321 1752942',
 '',
 '+41 001 242 96 39',
 '+43 544 845 2925',
 '+45 12 85 76 50',
 '+41 041 045 51 36',
 '+32 0455 