In [1]:
import pandas as pd

In [2]:
# Load the two parquet extracts (paths are relative to the working directory):
#   df      <- 'all_2023q3_2024q1_FULL.parquet'
#   df_auth <- 'authenticated_2023q3_2024q1_FULL.parquet'
df, df_auth = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'all_2023q3_2024q1_FULL.parquet',
        'authenticated_2023q3_2024q1_FULL.parquet',
    )
)

In [11]:
# Print the (rows, columns) size of df, then display its column labels
# as the cell's output value.
print((len(df.index), len(df.columns)))
df.columns

(391604, 72)


Index(['id', 'acquiring_bank_name', 'api_version', 'approval_code',
       'assessment_id', 'authorization_additional_data', 'authorization_id',
       'authorization_receipt_number', 'authorization_request_token',
       'authorization_response', 'authorization_transaction_date',
       'authorized_amount', 'avs_code', 'bank_merchant_id',
       'bank_reconciliation_id', 'business_id', 'capture_amount',
       'capture_request_id', 'capture_response', 'card_brand', 'card_data_id',
       'card_expiration_month', 'card_expiration_year', 'card_holder_name',
       'card_type', 'cavv', 'cc_version', 'charge_type', 'client_id',
       'client_type', 'country', 'country_code', 'created',
       'credit_card_payment_channel', 'credit_card_processor',
       'credit_card_token_id', 'currency', 'cvn_code', 'eci', 'external_id',
       'failure_reason', 'fee_amount', 'fee_label', 'original_id',
       'installment', 'is_blocked_by_fraud', 'is_switcher', 'is_t4',
       'issuing_bank_name', 'ma

In [13]:
# Print the (rows, columns) size of df_auth, then display its column labels
# as the cell's output value.
print((len(df_auth.index), len(df_auth.columns)))
df_auth.columns

(475066, 57)


Index(['id', 'acs_url', 'amount', 'authentication_transaction_id',
       'authentication_type', 'authentication_verification_response',
       'authorization_id', 'billing_details', 'business_id', 'card_bank',
       'card_brand', 'card_data_id', 'card_expiration_month',
       'card_expiration_year', 'cavv', 'cc_version', 'client_id',
       'client_type', 'commerce_indicator', 'country', 'country_code',
       'created', 'credit_card_enrollment_info', 'credit_card_payment_channel',
       'credit_card_token_id', 'currency', 'cybersource_merchant_id',
       'directory_server_transaction_id', 'eci', 'eci_raw', 'external_id',
       'failure_reason', 'original_id', 'initial_client_type',
       'initiate_three_ds_url', 'ip_address', 'is_enrolled',
       'masked_card_number', 'md', 'pa_req', 'pares_status', 'processor_type',
       'proof_xml', 'redirect_html', 'referer', 'request_id', 'status',
       'term_url', 'three_ds_result', 'three_ds_version', 'updated',
       'user_agent', 

In [16]:
# Row-weighted match rate between df.id and df_auth.authentication_transaction_id.
#
# Both key columns are normalised to strings so the comparison does not depend
# on the dtype each parquet file stored. Missing values are deliberately kept
# as missing: a plain astype(str) would turn them into the literal strings
# 'nan' / 'None', and those placeholders would then "match" each other across
# the two frames and inflate the rates.
def _as_str_key(series):
    """Return `series` cast to str, with originally-missing entries left as NaN.

    Safe to re-run on its own output: entries that are already NaN are
    stringified by astype(str) and then masked back to NaN.
    """
    return series.astype(str).where(series.notna())


# The normalised keys are written back to the frames (as the original cell did)
# so later cells compare like with like.
df['id'] = _as_str_key(df['id'])
df_auth['authentication_transaction_id'] = _as_str_key(df_auth['authentication_transaction_id'])

# Rows per distinct key; value_counts() leaves missing values out by default.
df_counts = df['id'].value_counts()
df_auth_counts = df_auth['authentication_transaction_id'].value_counts()

# Keep only the keys that also occur on the other side. The remaining counts
# are the number of rows carrying a shared key.
matches_df_to_auth = df_counts[df_counts.index.isin(df_auth_counts.index)]
matches_auth_to_df = df_auth_counts[df_auth_counts.index.isin(df_counts.index)]

# Denominators are ALL rows of each frame, so rows with a missing key count
# as "not matched".
total_df = len(df)
total_df_auth = len(df_auth)

# Percentage of rows whose key is found in the other frame (0.0 for an empty frame).
match_rate_df_to_auth = (matches_df_to_auth.sum() / total_df) * 100 if total_df else 0.0
match_rate_auth_to_df = (matches_auth_to_df.sum() / total_df_auth) * 100 if total_df_auth else 0.0

print(f"Match rate from 'df' to 'df_auth': {match_rate_df_to_auth:.2f}%")
print(f"Match rate from 'df_auth' to 'df': {match_rate_auth_to_df:.2f}%")

Match rate from 'df' to 'df_auth': 0.00%
Match rate from 'df_auth' to 'df': 49.51%


In [19]:
# Same id <-> authentication_transaction_id comparison, this time as the
# fraction (0-1) of rows on each side whose key value occurs anywhere in the
# other frame's key column.
# NOTE(review): nothing here filters missing values or de-duplicates keys;
# the rates are computed over every row as-is.
df_col, df_auth_col = 'id', 'authentication_transaction_id'

left_keys = df[df_col]
right_keys = df_auth[df_auth_col]

# isin() yields one boolean per row; its mean is the share of True rows.
match_rate_df_in_auth = left_keys.isin(right_keys).mean()
match_rate_auth_in_df = right_keys.isin(left_keys).mean()

for report_line in (
    f"Match rate of df.{df_col} in df_auth.{df_auth_col}: {match_rate_df_in_auth:.4f}",
    f"Match rate of df_auth.{df_auth_col} in df.{df_col}: {match_rate_auth_in_df:.4f}",
):
    print(report_line)

Match rate of df.id in df_auth.authentication_transaction_id: 0.0000
Match rate of df_auth.authentication_transaction_id in df.id: 0.4951


In [24]:
import itertools
import numpy as np
from operator import itemgetter

# Brute-force search for candidate join keys: every column whose name ends in
# 'id' in df is compared with every such column in df_auth, using the overlap
# of their DISTINCT non-missing values.

# Candidate key columns on each side (name ends with 'id')
df_id_columns = [col for col in df.columns if col.endswith('id')]
df_auth_id_columns = [col for col in df_auth.columns if col.endswith('id')]

# Every (df column, df_auth column) pairing
all_combinations = list(itertools.product(df_id_columns, df_auth_id_columns))


def _profile_columns(frame, columns):
    """Map each column name to (set of distinct non-missing values, % missing).

    Computed once per column so the pairwise loop below does not re-scan the
    same column for every pairing. The trade-off is that all sets are held in
    memory at the same time.
    """
    profiles = {}
    for col in columns:
        series = frame[col]
        profiles[col] = (set(series.dropna()), series.isna().mean() * 100)
    return profiles


df_profiles = _profile_columns(df, df_id_columns)
df_auth_profiles = _profile_columns(df_auth, df_auth_id_columns)

all_results = []

# Score every pairing from the precomputed per-column profiles
for df_col, df_auth_col in all_combinations:
    df_col_set, nan_percent_df = df_profiles[df_col]
    df_auth_col_set, nan_percent_df_auth = df_auth_profiles[df_auth_col]

    # Distinct values present in both columns (one intersection serves both rates)
    shared_count = len(df_col_set & df_auth_col_set)

    # Share of each side's distinct values found on the other side.
    # A column with no non-missing values scores 0 instead of dividing by zero.
    match_rate_df_in_auth = shared_count / len(df_col_set) if df_col_set else 0
    match_rate_auth_in_df = shared_count / len(df_auth_col_set) if df_auth_col_set else 0

    # Tuple layout: (df column, df_auth column, rate df->auth, rate auth->df,
    #                max of the two rates, % NaN in df column, % NaN in df_auth column)
    all_results.append((
        df_col,
        df_auth_col,
        match_rate_df_in_auth,
        match_rate_auth_in_df,
        max(match_rate_df_in_auth, match_rate_auth_in_df),
        nan_percent_df,
        nan_percent_df_auth
    ))

# Rank pairings by the larger of their two match rates (field 4), best first
sorted_results = sorted(all_results, key=itemgetter(4), reverse=True)

# Report every pairing in ranked order (no cut-off is applied)
print("Top matches (in descending order of maximum match rate):")
for df_col, df_auth_col, rate_df_in_auth, rate_auth_in_df, max_rate, nan_df, nan_df_auth in sorted_results:
    print(f"\nMatch between df.{df_col} and df_auth.{df_auth_col}:")
    print(f"  Match rate of df.{df_col} in df_auth.{df_auth_col}: {rate_df_in_auth:.4f}")
    print(f"  Match rate of df_auth.{df_auth_col} in df.{df_col}: {rate_auth_in_df:.4f}")
    print(f"  Maximum match rate: {max_rate:.4f}")
    print(f"  NaN percentage in df.{df_col}: {nan_df:.2f}%")
    print(f"  NaN percentage in df_auth.{df_auth_col}: {nan_df_auth:.2f}%")

Top matches (in descending order of maximum match rate):

Match between df.merchant_id and df_auth.cybersource_merchant_id:
  Match rate of df.merchant_id in df_auth.cybersource_merchant_id: 0.5402
  Match rate of df_auth.cybersource_merchant_id in df.merchant_id: 0.8832
  Maximum match rate: 0.8832
  NaN percentage in df.merchant_id: 0.00%
  NaN percentage in df_auth.cybersource_merchant_id: 0.00%

Match between df.business_id and df_auth.business_id:
  Match rate of df.business_id in df_auth.business_id: 0.6616
  Match rate of df_auth.business_id in df.business_id: 0.6667
  Maximum match rate: 0.6667
  NaN percentage in df.business_id: 0.00%
  NaN percentage in df_auth.business_id: 0.00%

Match between df.bank_merchant_id and df_auth.cybersource_merchant_id:
  Match rate of df.bank_merchant_id in df_auth.cybersource_merchant_id: 0.3214
  Match rate of df_auth.cybersource_merchant_id in df.bank_merchant_id: 0.5255
  Maximum match rate: 0.5255
  NaN percentage in df.bank_merchant_id: 1