In [None]:
# Install the third-party packages for this notebook into the kernel's environment.
# NOTE(review): numpy and rapidfuzz are not imported by any cell visible here — confirm they are still needed.
# NOTE(review): no versions are pinned, so a fresh install may resolve to different releases over time.
%pip install pandas
%pip install fuzzywuzzy
%pip install numpy
%pip install rapidfuzz
%pip install tqdm

In [None]:
import pandas as pd
from fuzzywuzzy import process
from tqdm import tqdm

In [None]:
# Read the two raw inputs: transactions are semicolon-separated, contacts comma-separated
transactions_df = pd.read_csv('../data/raw/transactions_baseline.csv', sep=';')
contacts_df = pd.read_csv('../data/raw/contacts_export.csv', sep=',')

# Preview the first rows of each frame as a quick sanity check
for frame in (transactions_df, contacts_df):
    print(frame.head())


In [None]:
# Text normalisation used before matching, so comparisons ignore case and edge whitespace
def preprocess_name(name):
    """Return ``name`` lower-cased with surrounding whitespace removed.

    Any value that is not a ``str`` (for example a missing value) is mapped
    to the empty string.
    """
    return name.lower().strip() if isinstance(name, str) else ''

# The two text columns that take part in the match, paired with their frames
name_columns = ((transactions_df, 'Data'), (contacts_df, 'Name'))

# Normalise both columns in place
for frame, column in name_columns:
    frame[column] = frame[column].apply(preprocess_name)

# Preview the normalised values
for frame, column in name_columns:
    print(frame[column].head())

In [None]:
# Find the best-matching contacts (top 2 by default) for one transaction detail
def find_top_matches(detail, contacts_df, n=2):
    """Return up to ``n`` fuzzy matches for ``detail`` among the contact names.

    Parameters
    ----------
    detail : str
        Text to look up (the preprocessed transaction detail).
    contacts_df : pandas.DataFrame
        Contacts table; its 'Name' column supplies the choices and its 'Id'
        column supplies the identifiers that are returned.
    n : int, default 2
        Maximum number of matches requested from ``process.extract``.

    Returns
    -------
    tuple[list, list]
        ``(matches, match_ids)``. ``matches`` holds ``(name, score,
        index_label)`` tuples in the order produced by ``process.extract``;
        ``match_ids[i]`` is the 'Id' of the row that produced ``matches[i]``.
        Both lists are empty when there are no contacts.
    """
    # Key the choices by row position: for a dictionary of choices,
    # process.extract returns (choice, score, key), so every hit tells us
    # exactly which row it came from. Looking the Id up by name instead would
    # hand back the first contact's Id for every contact sharing that name.
    choices = dict(enumerate(contacts_df['Name']))
    hits = process.extract(detail, choices, limit=n)

    row_labels = contacts_df.index
    id_values = contacts_df['Id'].values
    # Report the frame's own index label as the third tuple element.
    matches = [(name, score, row_labels[pos]) for name, score, pos in hits]
    match_ids = [id_values[pos] for _, _, pos in hits]
    return matches, match_ids

# Run the matcher once per transaction detail, showing a progress bar while it works
matches = [
    find_top_matches(detail, contacts_df)
    for detail in tqdm(transactions_df['Data'], desc="Matching details")
]
# Keep the raw (matches, match_ids) pair alongside each transaction
transactions_df['Matches'] = matches

# Unpack each (matches, match_ids) pair into flat columns, one triple per rank.
# Each entry: (position in the match list, name column, score column, id column).
match_columns = (
    (0, 'Match_1', 'Score_1', 'Match1_ID'),
    (1, 'Match_2', 'Score_2', 'Match2_ID'),
)
match_pairs = transactions_df['Matches']
for pos, name_col, score_col, id_col in match_columns:
    # A row with too few matches at this rank gets None in all three columns
    transactions_df[name_col] = match_pairs.apply(
        lambda pair: pair[0][pos][0] if len(pair[0]) > pos else None
    )
    transactions_df[score_col] = match_pairs.apply(
        lambda pair: pair[0][pos][1] if len(pair[0]) > pos else None
    )
    transactions_df[id_col] = match_pairs.apply(
        lambda pair: pair[1][pos] if len(pair[1]) > pos else None
    )

# Columns to report: the transaction fields followed by the two match triples
output_columns = [
    'TransactionID',
    'TransactionDate',
    'Detail',
    'Amount',
    'TransactionType',
    'TransactionCategory',
    'DetailAditional',
    'Invoice',
    'Reference',
    'Data',
    'Match_1',
    'Score_1',
    'Match1_ID',
    'Match_2',
    'Score_2',
    'Match2_ID',
]
result_df = transactions_df[output_columns]
print(result_df)

# Write the matched transactions to CSV, leaving out the DataFrame index
result_df.to_csv('../data/raw/matched_transactions.csv', index=False)
