In [68]:
import pandas as pd
from tqdm import tqdm
import networkx as nx
df = pd.read_csv('ubs-lauzhack-2024/external_parties_train.csv')

In [74]:
import pandas as pd
import networkx as nx
from tqdm import tqdm

def assign_group_ids(df):
    # Create a graph
    G = nx.Graph()
    G.add_nodes_from(df.index)  # Add all indices as nodes

    # Add edges for parsed_name
    for name, group in tqdm(df.groupby('parsed_name'), desc='Processing parsed names'):
        nodes = group.index.tolist()
        if len(nodes) > 1:  # Only create edges if there are matches
            for i in range(len(nodes) - 1):
                G.add_edge(nodes[i], nodes[i + 1])

    # Add edges for party_iban
    # for iban, group in tqdm(df.groupby('party_iban'), desc='Processing IBANs'):
    #     nodes = group.index.tolist()
    #     if len(nodes) > 1:  # Only create edges if there are matches
    #         for i in range(len(nodes) - 1):
    #             G.add_edge(nodes[i], nodes[i + 1])

    # Find connected components
    connected_components = list(nx.connected_components(G))
    
    # Create group_id mapping
    group_id_map = {}
    current_group_id = 0
    
    # First, assign group IDs to connected components (matching elements)
    for component in tqdm(connected_components, desc='Assigning group IDs to components'):
        if len(component) > 1:  # If component has more than one node
            for node in component:
                group_id_map[node] = current_group_id
            current_group_id += 1
    
    # Then, assign unique group IDs to single nodes (non-matching elements)
    for node in tqdm(G.nodes(), desc='Assigning IDs to single nodes'):
        if node not in group_id_map:
            group_id_map[node] = current_group_id
            current_group_id += 1

    # Assign group IDs to the DataFrame
    df['group_id'] = df.index.map(group_id_map)
    
    return df

# Example usage:
result = assign_group_ids(df)
result

Processing parsed names:   0%|          | 0/7031 [00:00<?, ?it/s]

Processing parsed names: 100%|██████████| 7031/7031 [00:00<00:00, 135503.42it/s]
Assigning group IDs to components: 100%|██████████| 7031/7031 [00:00<00:00, 3553030.29it/s]
Assigning IDs to single nodes: 100%|██████████| 11064/11064 [00:00<00:00, 5035895.76it/s]


Unnamed: 0,transaction_reference_id,party_role,party_info_unstructured,parsed_name,parsed_address_street_name,parsed_address_street_number,parsed_address_unit,parsed_address_postal_code,parsed_address_city,parsed_address_state,parsed_address_country,party_iban,party_phone,external_id,group_id
0,04ff0d1c680189e3a80c92d86407f0f5,BENE,mary mith 107 107 angela brooks n. thomasfurt ...,mary mith,angela brooks,107 107,,,n. thomasfurt,,bulgaria,GB49MYOB82127728573340,+1.815660-6791x8486,50039037,1574
1,439ab0ad7380e6135ab2ff3fddd4a727,ORG,yesneia kim north michael 93971 koribati,yesneia kim,north michael,,,93971,koribati,,,,0 (269)620-8734x2349,60044692,1575
2,00cac12d41191a84f9e31aa731a83512,ORG,w. roberson jr. 41010 rachel crossingapt. 923 ...,w. roberson jr.,rachel crossingapt.,41010 923,,p2235417,thompsonshire amyport,,,GB08OTHR53515837682953,,30008244,0
3,e4fba5f878dd3453e35973605a783a16,BENE,azquez-nelson co. suarez ports suite & 024 bri...,azquez-nelson co.,ports suite &,,,,brittanyberg,,bulgaria bulgaria,GB17VVGW66321494633280,,40017944,1576
4,d03d7e4c31878b0255d39e8c3f0ab625,ORG,m.j. bytd iii 856 john lake s. glenn cocos (ke...,m.j. bytd iii,john lake s. glenn,856,,125838276,cocos (keeling),islands,,,(260)3371534,40012658,1577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11059,7183d9c3700148c9527869948b685085,BENE,james alvarado jr. port james zambia,james alvarado jr.,james,,,,port,,zambia,GB72FLST90715739139871,(+41 ) 9 49.220.3 3879822,35031814,339
11060,b47b9ed0a8cc9fcc4c21ac368fe79757,BENE,marcnguyen 234 price meadows robertmouth,marcnguyen,price meadows,234,,,robertmouth,,,GB64SFWQ57121822291812,,35003780,346
11061,d61ff2b0184f32ad0021a313c6112b2e,ORG,joseph davis 14131 taylor villages apt. 764 s....,joseph davis,taylor villages apt. 764 kimberlymouth,14131,,99190 p5676812,s. gibraltar,,,GB36XGTI64012968550973,"(00,41)+18784377081",30014262,123
11062,bec335b1b1bad8c55b7dce549cfd8de0,ORG,gonzalezltd 94129lozano cape,gonzalezltd,,,,94129lozano,cape,,,,,10359417,7029


In [60]:
result[['transaction_reference_id', 'group_id']].rename(columns={'group_id': 'external_id'}).reset_index(drop=True).to_csv('output.csv', index=False)

In [66]:
result['group_id'].value_counts()



group_id
50        26054
171        4589
288        1150
464         986
970         845
          ...  
397902        1
397903        1
397904        1
397905        1
779332        1
Name: count, Length: 779333, dtype: int64

In [51]:
result[result.group_id == 50]


Unnamed: 0,transaction_reference_id,party_role,party_info_unstructured,parsed_name,parsed_address_street_name,parsed_address_street_number,parsed_address_unit,parsed_address_postal_code,parsed_address_city,parsed_address_state,parsed_address_country,party_iban,party_phone,group_id
81,e3e9889aa30b9d2b4d4b3430b236ea15,ORG,d. williams 4726 collins bypass apt. 322 & nor...,d. williams,collins bypass apt. 322 & north lisaside east,4726 25229,,975820188,rachel liechtenstein,,,,0349-2587787x5701,50
85,cdd1d76016c1d5e187f6d2a813c9a903,BENE,thompson group 897 jacobson route eizabethshire,thompson group,jacobson route,897,,,eizabethshire,,,,+41435.357-.18 _82x16212,50
158,454e5bc0ca0cede3e164052ad71017da,ORG,allen plc tateburgh haiti,allen plc,,,,,tateburgh,,haiti,GB94LETZ90393407376569,,50
192,512c19230ecd2f63b61d519d9fd3b440,BENE,williams group alexis s. william port p.o. box...,williams group,alexis s.,,,box,william port,,,GB85HYYL01207202089732,,50
352,208227c750efea5008bfbff4ccb33a51,BENE,smith limited 321 kevin shoal port carlosberg ...,smith limited,kevin shoal carlosberg,321,,16212,port trinidad,and tobago,,GB82QPFA09711396974121,(00 41) 378632-984 945965,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1481364,5ffdd65e4bc7e731d1ed6c2a06c2683e,ORG,jones plc 2665 stout mill lake jamie south joh...,jones plc,stout mill lake,2665,,,jamie south,john cote d'ivoire,,GB89RCQY63670663536937,,50
1481442,f8fa5a3bab3e73a64af4d1ddb51fb3e3,BENE,smith plc 361 alexis parkways port cynthiaberg...,smith plc,alexis parkways cynthiaberg,361,,50124,port new marcport,,russian federation,GB20FWKY88438441447653,,50
1481486,1f51f03109501484fc2896dd737c73ad,BENE,group smith 454 edwards mews 31442 akgeria,group smith,edwards mews,454,,31442,akgeria,,,GB03QWKF37978745108798,(0_041) (397)-3185806 x049,50
1481543,a17ac46a719e05c132c203e5dc047387,ORG,ramos group chambersside south caseystad walli...,ramos group,chambersside south caseystad,,,,wallis and futuna,,,GB36HRXA29164403218340,(+41)+1-686-5003935,50


In [75]:
from tqdm import tqdm
import numpy as np

# Convert group_ids and external_ids to numpy arrays for faster access
group_ids = df['group_id'].values
external_ids = df['external_id'].values
indices = df.index.values

# Initialize arrays to store results
n = len(indices)
n_pairs = (n * (n-1)) // 2  # Total number of unique pairs

# Create boolean masks for matching ids
group_matches = np.zeros(n_pairs, dtype=bool)
external_matches = np.zeros(n_pairs, dtype=bool)

# Use a single loop with efficient array operations
pair_idx = 0
for i in tqdm(range(n-1)):
    batch_size = n - (i+1)
    group_matches[pair_idx:pair_idx+batch_size] = (
        group_ids[i] == group_ids[i+1:n]
    )
    external_matches[pair_idx:pair_idx+batch_size] = (
        external_ids[i] == external_ids[i+1:n]
    )
    pair_idx += batch_size

# Calculate metrics
true_positives = np.sum(group_matches & external_matches)
false_positives = np.sum(group_matches & ~external_matches)
false_negatives = np.sum(~group_matches & external_matches)

precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
accuracy = np.sum(group_matches == external_matches) / n_pairs if n_pairs > 0 else 0

print(f"Total number of pairs evaluated: {n_pairs}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


  0%|          | 0/11063 [00:00<?, ?it/s]

100%|██████████| 11063/11063 [00:00<00:00, 226613.39it/s]

Total number of pairs evaluated: 61200516
Accuracy: 0.9999
Precision: 0.9626
Recall: 0.5988
F1 Score: 0.7383



