In [2]:
import sys
import subprocess
import pandas as pd
import numpy as np
import warnings
import os
from datetime import datetime

warnings.filterwarnings('ignore')
np.random.seed(42)



In [3]:
# Best-effort dependency bootstrap: try to import each requirement and fall
# back to `pip install` (into the running interpreter) when the import fails.
print("Installing required packages...")
required_packages = [
    'torch>=1.12.0',
    'torch-geometric>=2.3.0', 
    'pandas>=1.4.0',
    'numpy>=1.21.0',
    'scikit-learn>=1.1.0',
    'networkx>=2.8.0',
    'matplotlib>=3.5.0',
    'seaborn>=0.11.0',
    'plotly>=5.10.0',
    'tqdm>=4.64.0'
]

# Distribution names whose importable module is NOT simply the pip name with
# '-' replaced by '_'. Without this, 'scikit-learn' was probed as
# 'scikit_learn', which never imports, so pip ran on every execution.
import_name_overrides = {
    'scikit-learn': 'sklearn',
}

for package in required_packages:
    distribution = package.split('>=')[0]
    module_name = import_name_overrides.get(distribution, distribution.replace('-', '_'))
    try:
        __import__(module_name)
    except ImportError:
        # Install with the full requirement string so the version floor applies.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

Installing required packages...


In [4]:
# Input files required by the notebook, keyed by dataset role.
data_files = dict(
    transactions='../data/Transactions.csv',
    accounts='../data/Accounts.csv',
    patterns='../data/Patterns.txt',
)

print("Checking for required files...")
# Every configured path that is not present on disk.
missing_files = [path for path in data_files.values() if not os.path.exists(path)]
if not missing_files:
    print("All required files found")
else:
    print(f"Missing files: {missing_files}")
    print("Please place the data files in the data/ directory")

Checking for required files...
All required files found


In [5]:
# Read the raw transactions CSV and normalise its headers to snake_case.
print("Loading Transactions data...")
transactions_df = pd.read_csv(data_files['transactions'])

print(f"Original columns: {list(transactions_df.columns)}")

# Raw header -> column name used by the rest of the notebook.
# NOTE(review): 'Account.1' is presumably pandas' de-duplicated name for the
# second 'Account' header in the CSV -- confirm against the file.
column_mapping = dict([
    ('Timestamp', 'timestamp'),
    ('From Bank', 'from_bank'),
    ('Account', 'account_origin'),
    ('To Bank', 'to_bank'),
    ('Account.1', 'account_destination'),
    ('Amount Received', 'amount_received'),
    ('Receiving Currency', 'receiving_currency'),
    ('Amount Paid', 'amount_paid'),
    ('Payment Currency', 'payment_currency'),
    ('Payment Format', 'payment_format'),
    ('Is Laundering', 'is_laundering'),
])

transactions_df = transactions_df.rename(columns=column_mapping)
print(f"Renamed columns: {list(transactions_df.columns)}")

Loading Transactions data...
Original columns: ['Timestamp', 'From Bank', 'Account', 'To Bank', 'Account.1', 'Amount Received', 'Receiving Currency', 'Amount Paid', 'Payment Currency', 'Payment Format', 'Is Laundering']
Renamed columns: ['timestamp', 'from_bank', 'account_origin', 'to_bank', 'account_destination', 'amount_received', 'receiving_currency', 'amount_paid', 'payment_currency', 'payment_format', 'is_laundering']


In [6]:
# Cast the timestamp, amount and label columns to proper dtypes, then report
# headline statistics. Unparseable amounts become NaN (errors='coerce').
print("Processing timestamp and numeric columns...")
transactions_df = transactions_df.assign(
    timestamp=pd.to_datetime(transactions_df['timestamp']),
    amount_received=pd.to_numeric(transactions_df['amount_received'], errors='coerce'),
    amount_paid=pd.to_numeric(transactions_df['amount_paid'], errors='coerce'),
    is_laundering=transactions_df['is_laundering'].astype(int),
)

first_timestamp = transactions_df['timestamp'].min()
last_timestamp = transactions_df['timestamp'].max()
laundering_flags = transactions_df['is_laundering']
laundering_rate = laundering_flags.mean()

print("Basic statistics for transactions:")
print(f"Total transactions: {len(transactions_df):,}")
print(f"Date range: {first_timestamp} to {last_timestamp}")
print(f"Date span: {(last_timestamp - first_timestamp).days} days")
print(f"Money laundering transactions: {laundering_flags.sum():,}")
print(f"Money laundering rate: {laundering_rate:.6f} ({laundering_rate*100:.4f}%)")

Processing timestamp and numeric columns...
Basic statistics for transactions:
Total transactions: 5,078,345
Date range: 2022-09-01 00:00:00 to 2022-09-18 16:18:00
Date span: 17 days
Money laundering transactions: 5,177
Money laundering rate: 0.001019 (0.1019%)


In [7]:
# Frequency tables for payment format (all values) and receiving currency
# (ten most frequent), each shown with its share of all transactions.
print("Analyzing payment formats and currencies...")
n_transactions = len(transactions_df)

print("\nPayment Format Distribution:")
payment_format_dist = transactions_df['payment_format'].value_counts()
for label, n_rows in payment_format_dist.items():
    share = (n_rows / n_transactions) * 100
    print(f"  {label}: {n_rows:,} ({share:.2f}%)")

print("\nReceiving Currency Distribution:")
currency_dist = transactions_df['receiving_currency'].value_counts()
for label, n_rows in currency_dist.iloc[:10].items():
    share = (n_rows / n_transactions) * 100
    print(f"  {label}: {n_rows:,} ({share:.2f}%)")

Analyzing payment formats and currencies...

Payment Format Distribution:
  Cheque: 1,864,331 (36.71%)
  Credit Card: 1,323,324 (26.06%)
  ACH: 600,797 (11.83%)
  Cash: 490,891 (9.67%)
  Reinvestment: 481,056 (9.47%)
  Wire: 171,855 (3.38%)
  Bitcoin: 146,091 (2.88%)

Receiving Currency Distribution:
  US Dollar: 1,879,341 (37.01%)
  Euro: 1,172,017 (23.08%)
  Swiss Franc: 237,884 (4.68%)
  Yuan: 206,551 (4.07%)
  Shekel: 194,988 (3.84%)
  Rupee: 192,065 (3.78%)
  UK Pound: 181,255 (3.57%)
  Ruble: 157,361 (3.10%)
  Yen: 156,319 (3.08%)
  Bitcoin: 148,151 (2.92%)


In [8]:
# Summary statistics and simple sanity counts for the amount_paid column.
print("Analyzing transaction amounts...")
paid_amounts = transactions_df['amount_paid']

# Each statistic is the Series method of the same name, called with defaults.
amount_stats = {
    stat_name: getattr(paid_amounts, stat_name)()
    for stat_name in ('min', 'max', 'mean', 'median', 'std')
}

print("Amount Statistics:")
for stat_name, stat_value in amount_stats.items():
    print(f"  {stat_name}: ${stat_value:,.2f}")

print(f"Zero amounts: {paid_amounts.eq(0).sum()}")
print(f"Negative amounts: {paid_amounts.lt(0).sum()}")
print(f"Very large amounts (>$1M): {paid_amounts.gt(1000000).sum()}")

Analyzing transaction amounts...
Amount Statistics:
  min: $0.00
  max: $1,046,302,363,293.48
  mean: $4,509,273.37
  median: $1,414.54
  std: $869,772,830.92
Zero amounts: 0
Negative amounts: 0
Very large amounts (>$1M): 200403


In [9]:
# Read the accounts CSV, normalise its headers, and report cardinalities.
print("Loading Accounts data...")
accounts_df = pd.read_csv(data_files['accounts'])

# Raw header -> snake_case column name.
accounts_column_mapping = dict(zip(
    ('Bank Name', 'Bank ID', 'Account Number', 'Entity ID', 'Entity Name'),
    ('bank_name', 'bank_id', 'account_number', 'entity_id', 'entity_name'),
))

accounts_df = accounts_df.rename(columns=accounts_column_mapping)

print("Accounts data statistics:")
print(f"Total account records: {len(accounts_df):,}")
for label, column in (
    ('Unique banks', 'bank_id'),
    ('Unique entities', 'entity_id'),
    ('Unique account numbers', 'account_number'),
):
    print(f"{label}: {accounts_df[column].nunique()}")

Loading Accounts data...
Accounts data statistics:
Total account records: 518,581
Unique banks: 30470
Unique entities: 166207
Unique account numbers: 518573


In [10]:
# Derive an entity type from entity_name by matching one of three keywords;
# names matching none of them yield NaN and are counted as missing.
print("Analyzing entity types...")
entity_type_analysis = accounts_df['entity_name'].str.extract('(Corporation|Partnership|Sole Proprietorship)')
matched_type = entity_type_analysis[0]  # the single capture group
entity_types = matched_type.value_counts()
n_accounts = len(accounts_df)

print("Entity Types:")
for label, n_rows in entity_types.items():
    share = (n_rows / n_accounts) * 100
    print(f"  {label}: {n_rows:,} ({share:.2f}%)")

print(f"Entities with missing type info: {matched_type.isna().sum()}")

Analyzing entity types...
Entity Types:
  Partnership: 189,683 (36.58%)
  Corporation: 172,351 (33.24%)
  Sole Proprietorship: 149,048 (28.74%)
Entities with missing type info: 7499


In [11]:
# Account counts per bank name: top ten, total distinct names, and how many
# bank names appear on exactly one account record.
print("Analyzing bank distribution...")
bank_distribution = accounts_df['bank_name'].value_counts()
n_accounts = len(accounts_df)

print("Top 10 Banks by Account Count:")
for rank, (bank_name, n_rows) in enumerate(bank_distribution.iloc[:10].items(), start=1):
    share = (n_rows / n_accounts) * 100
    print(f"  {rank:2d}. {bank_name}: {n_rows:,} accounts ({share:.2f}%)")

print(f"Total banks: {len(bank_distribution)}")
print(f"Banks with single account: {bank_distribution.eq(1).sum()}")

Analyzing bank distribution...
Top 10 Banks by Account Count:
   1. National Bank of Laramie: 3,797 accounts (0.73%)
   2. National Bank of the East: 3,663 accounts (0.71%)
   3. Japan Bank #0: 3,051 accounts (0.59%)
   4. Arbor Savings Bank: 2,894 accounts (0.56%)
   5. National Bank of Pittsburgh: 2,683 accounts (0.52%)
   6. Savings Bank of Fairfield: 2,642 accounts (0.51%)
   7. First Bank of Tampa: 2,464 accounts (0.48%)
   8. Savings Bank of Huron: 2,379 accounts (0.46%)
   9. Savings Bank of Los Angeles: 2,052 accounts (0.40%)
  10. Golden Bancorp: 1,967 accounts (0.38%)
Total banks: 20053
Banks with single account: 4079


In [12]:
print("Loading and parsing Patterns data...")

def parse_patterns_file(file_path):
    """Parse a laundering-patterns text file into a list of pattern records.

    The file is read as blocks delimited by lines starting with
    'BEGIN LAUNDERING ATTEMPT' and 'END LAUNDERING ATTEMPT'. Inside a block,
    every comma-separated line with at least 10 fields becomes one transaction
    dict (all values kept as strings). An 11th field, when present, is stored
    as 'is_laundering'; otherwise 'is_laundering' defaults to '1'.

    Args:
        file_path: Path of the patterns file to read.

    Returns:
        A list of dicts with keys 'pattern_type', 'transaction_count',
        'transactions' and 'raw_transactions' (two separate lists holding the
        same transaction dicts). Blocks without any parsed transaction are
        omitted.
    """
    # Checked in order; the first marker contained in the BEGIN line wins.
    marker_labels = (
        ('FAN-OUT', 'FAN_OUT'),
        ('CYCLE', 'CYCLE'),
        ('GATHER-SCATTER', 'GATHER_SCATTER'),
        ('STACK', 'STACK'),
        ('RANDOM', 'RANDOM'),
        ('BIPARTITE', 'BIPARTITE'),
        ('FAN-IN', 'FAN_IN'),
        ('SCATTER-GATHER', 'SCATTER_GATHER'),
    )
    # Positional names of the first ten comma-separated fields.
    field_names = (
        'timestamp',
        'from_bank',
        'account_origin',
        'to_bank',
        'account_destination',
        'amount_received',
        'receiving_currency',
        'amount_paid',
        'payment_currency',
        'payment_format',
    )

    with open(file_path, 'r') as handle:
        raw_text = handle.read()

    parsed_patterns = []
    active_label = None
    collected = []

    for raw_line in raw_text.strip().split('\n'):
        row = raw_line.strip()

        if row.startswith('BEGIN LAUNDERING ATTEMPT'):
            active_label = next(
                (label for marker, label in marker_labels if marker in row),
                'UNKNOWN',
            )
            collected = []
            continue

        if row.startswith('END LAUNDERING ATTEMPT'):
            # Only blocks that produced at least one transaction are kept.
            if active_label and collected:
                parsed_patterns.append({
                    'pattern_type': active_label,
                    'transaction_count': len(collected),
                    'transactions': collected.copy(),
                    'raw_transactions': collected.copy()
                })
            active_label = None
            collected = []
            continue

        # Anything else counts only inside a block and only if it looks like CSV.
        if not active_label or ',' not in row:
            continue
        if row.startswith(('BEGIN', 'END')):
            continue

        fields = row.split(',')
        if len(fields) < 10:
            continue

        record = dict(zip(field_names, fields))  # zip stops after the 10 names
        record['is_laundering'] = fields[10] if len(fields) > 10 else '1'
        collected.append(record)

    return parsed_patterns

all_patterns = parse_patterns_file(data_files['patterns'])
patterns_df = pd.DataFrame(all_patterns)

print(f"Successfully parsed {len(patterns_df)} money laundering patterns")

Loading and parsing Patterns data...
Successfully parsed 370 money laundering patterns


In [13]:
# Per-pattern-type instance counts and transaction-count statistics.
print("Analyzing pattern distribution...")

pattern_stats = {}
total_pattern_transactions = 0

# sort=False keeps the groups in order of first appearance in patterns_df.
for pattern_type, txn_counts in patterns_df.groupby('pattern_type', sort=False)['transaction_count']:
    type_summary = {
        'pattern_count': len(txn_counts),
        'total_transactions': txn_counts.sum(),
        'avg_transactions_per_pattern': txn_counts.mean(),
        'min_transactions': txn_counts.min(),
        'max_transactions': txn_counts.max()
    }
    pattern_stats[pattern_type] = type_summary
    total_pattern_transactions += type_summary['total_transactions']

print("Money Laundering Pattern Analysis:")
print(f"Total patterns identified: {len(patterns_df)}")
print(f"Total transactions in patterns: {total_pattern_transactions}")

for pattern_type, stats in pattern_stats.items():
    print(f"\n{pattern_type}:")
    print(f"  Instances: {stats['pattern_count']}")
    print(f"  Total transactions: {stats['total_transactions']}")
    print(f"  Avg transactions per instance: {stats['avg_transactions_per_pattern']:.1f}")
    print(f"  Transaction range: {stats['min_transactions']} - {stats['max_transactions']}")
    

Analyzing pattern distribution...
Money Laundering Pattern Analysis:
Total patterns identified: 370
Total transactions in patterns: 3209

FAN_OUT:
  Instances: 48
  Total transactions: 342
  Avg transactions per instance: 7.1
  Transaction range: 1 - 16

CYCLE:
  Instances: 54
  Total transactions: 287
  Avg transactions per instance: 5.3
  Transaction range: 2 - 12

GATHER_SCATTER:
  Instances: 51
  Total transactions: 716
  Avg transactions per instance: 14.0
  Transaction range: 2 - 28

STACK:
  Instances: 43
  Total transactions: 466
  Avg transactions per instance: 10.8
  Transaction range: 2 - 30

RANDOM:
  Instances: 41
  Total transactions: 191
  Avg transactions per instance: 4.7
  Transaction range: 1 - 11

BIPARTITE:
  Instances: 49
  Total transactions: 263
  Avg transactions per instance: 5.4
  Transaction range: 1 - 15

FAN_IN:
  Instances: 40
  Total transactions: 318
  Avg transactions per instance: 8.0
  Transaction range: 1 - 16

SCATTER_GATHER:
  Instances: 44
  Tota

In [14]:
print("Extracting pattern transaction details...")

def extract_pattern_transactions(patterns_df):
    """Flatten per-pattern transaction lists into one row per transaction.

    Args:
        patterns_df: DataFrame with a 'pattern_type' column and a
            'transactions' column holding a list of transaction dicts per row.

    Returns:
        DataFrame containing every transaction dict's own keys plus
        'pattern_id' (the index label of the source row), 'pattern_type',
        'txn_sequence' (1-based position within the pattern) and
        'total_txns_in_pattern'. The input dicts are not modified.
    """
    flattened = [
        {
            **txn,
            'pattern_id': pattern_id,
            'pattern_type': pattern_row['pattern_type'],
            'txn_sequence': position,
            'total_txns_in_pattern': len(pattern_row['transactions']),
        }
        for pattern_id, pattern_row in patterns_df.iterrows()
        for position, txn in enumerate(pattern_row['transactions'], start=1)
    ]
    return pd.DataFrame(flattened)

pattern_transactions_df = extract_pattern_transactions(patterns_df)

print(f"Extracted {len(pattern_transactions_df)} individual pattern transactions")

pattern_transactions_df['timestamp'] = pd.to_datetime(pattern_transactions_df['timestamp'])
pattern_transactions_df['amount_paid'] = pd.to_numeric(pattern_transactions_df['amount_paid'], errors='coerce')

print("Pattern transaction temporal analysis:")
print(f"Date range: {pattern_transactions_df['timestamp'].min()} to {pattern_transactions_df['timestamp'].max()}")
print(f"Total amount in patterns: ${pattern_transactions_df['amount_paid'].sum():,.2f}")

Extracting pattern transaction details...
Extracted 3209 individual pattern transactions
Pattern transaction temporal analysis:
Date range: 2022-09-01 00:03:00 to 2022-09-18 16:18:00
Total amount in patterns: $8,728,863,427.19


In [15]:
# Per-pattern-type footprint: accounts, banks and currencies touched, amounts
# moved, and the number of days between the earliest and latest transaction.
print("Analyzing pattern characteristics...")

pattern_characteristics = {}

# sort=False keeps the pattern types in order of first appearance.
for pattern_type, pattern_txns in pattern_transactions_df.groupby('pattern_type', sort=False):
    # An account or bank counts once whether it appears as sender or receiver.
    accounts_seen = set(pattern_txns['account_origin'].unique()).union(
        pattern_txns['account_destination'].unique()
    )
    banks_seen = set(pattern_txns['from_bank'].unique()).union(
        pattern_txns['to_bank'].unique()
    )
    paid = pattern_txns['amount_paid']
    timestamps = pattern_txns['timestamp']

    pattern_characteristics[pattern_type] = {
        'unique_accounts': len(accounts_seen),
        'unique_banks': len(banks_seen),
        'unique_currencies': pattern_txns['payment_currency'].nunique(),
        'total_amount': paid.sum(),
        'avg_amount_per_txn': paid.mean(),
        'time_span_days': (timestamps.max() - timestamps.min()).days,
        'currencies_used': list(pattern_txns['payment_currency'].unique())
    }

print("Pattern Characteristics Analysis:")
for pattern_type, chars in pattern_characteristics.items():
    currencies = chars['currencies_used']
    print(f"\n{pattern_type}:")
    print(f"  Unique accounts involved: {chars['unique_accounts']}")
    print(f"  Unique banks involved: {chars['unique_banks']}")
    print(f"  Currencies used: {chars['unique_currencies']}")
    print(f"  Total amount: ${chars['total_amount']:,.2f}")
    print(f"  Avg amount per transaction: ${chars['avg_amount_per_txn']:,.2f}")
    print(f"  Time span: {chars['time_span_days']} days")
    print(f"  Currencies: {currencies[:5]}{'...' if len(currencies) > 5 else ''}")

Analyzing pattern characteristics...
Pattern Characteristics Analysis:

FAN_OUT:
  Unique accounts involved: 359
  Unique banks involved: 264
  Currencies used: 14
  Total amount: $2,855,555,186.71
  Avg amount per transaction: $8,349,576.57
  Time span: 13 days
  Currencies: ['Euro', 'Yuan', 'US Dollar', 'Australian Dollar', 'Yen']...

CYCLE:
  Unique accounts involved: 271
  Unique banks involved: 208
  Currencies used: 14
  Total amount: $1,006,404,841.38
  Avg amount per transaction: $3,506,637.08
  Time span: 13 days
  Currencies: ['Yuan', 'Swiss Franc', 'Shekel', 'Canadian Dollar', 'Rupee']...

GATHER_SCATTER:
  Unique accounts involved: 685
  Unique banks involved: 427
  Currencies used: 14
  Total amount: $1,210,493,226.40
  Avg amount per transaction: $1,690,633.00
  Time span: 17 days
  Currencies: ['Saudi Riyal', 'Swiss Franc', 'Euro', 'US Dollar', 'Mexican Peso']...

STACK:
  Unique accounts involved: 663
  Unique banks involved: 411
  Currencies used: 15
  Total amount: $1

In [16]:
# Cross-check: do the accounts named in the pattern file also appear in the
# main transactions table and in the accounts table?
print("Validating pattern data against main transactions...")

# Every account that appears on either side of any pattern transaction.
pattern_account_set = {
    txn[side]
    for txn_list in patterns_df['transactions']
    for txn in txn_list
    for side in ('account_origin', 'account_destination')
}

transaction_account_set = set(transactions_df['account_origin'].unique()).union(
    transactions_df['account_destination'].unique()
)
accounts_in_file = set(accounts_df['account_number'].unique())

n_pattern_accounts = len(pattern_account_set)
pattern_accounts_in_main = len(pattern_account_set.intersection(transaction_account_set))
pattern_accounts_in_accounts_file = len(pattern_account_set.intersection(accounts_in_file))

print("Pattern Data Validation:")
print(f"Unique accounts in patterns: {n_pattern_accounts}")
print(f"Pattern accounts found in main transactions: {pattern_accounts_in_main}")
print(f"Pattern accounts found in accounts file: {pattern_accounts_in_accounts_file}")
print(f"Pattern accounts missing from main data: {n_pattern_accounts - pattern_accounts_in_main}")

overlap_percentage = (pattern_accounts_in_main / n_pattern_accounts) * 100
print(f"Pattern-Transaction overlap: {overlap_percentage:.2f}%")

Validating pattern data against main transactions...
Pattern Data Validation:
Unique accounts in patterns: 3170
Pattern accounts found in main transactions: 3170
Pattern accounts found in accounts file: 3170
Pattern accounts missing from main data: 0
Pattern-Transaction overlap: 100.00%


In [17]:
# Payment format and payment currency frequencies within the laundering
# pattern transactions, each shown with its share of all pattern transactions.
print("Analyzing payment formats and currencies in patterns...")

print("Payment Format Distribution in Patterns:")
pattern_payment_formats = pattern_transactions_df['payment_format'].value_counts()
for format_type, count in pattern_payment_formats.items():
    percentage = (count / len(pattern_transactions_df)) * 100
    print(f"  {format_type}: {count:,} ({percentage:.2f}%)")

print("\nCurrency Distribution in Patterns (Top 10):")
pattern_currencies = pattern_transactions_df['payment_currency'].value_counts()
for currency, count in pattern_currencies.head(10).items():
    percentage = (count / len(pattern_transactions_df)) * 100
    print(f"  {currency}: {count:,} ({percentage:.2f}%)")

# Count distinct currencies on the source column. `pattern_currencies` holds
# frequencies, so calling .nunique() on it would count distinct *frequencies*
# and under-report whenever two currencies occur equally often.
print(f"\nTotal unique currencies in patterns: {pattern_transactions_df['payment_currency'].nunique()}")

Analyzing payment formats and currencies in patterns...
Payment Format Distribution in Patterns:
  ACH: 3,208 (99.97%)
  Bitcoin: 1 (0.03%)

Currency Distribution in Patterns (Top 10):
  US Dollar: 1,178 (36.71%)
  Euro: 886 (27.61%)
  Saudi Riyal: 336 (10.47%)
  Swiss Franc: 114 (3.55%)
  Rupee: 111 (3.46%)
  Yuan: 107 (3.33%)
  Yen: 89 (2.77%)
  Canadian Dollar: 76 (2.37%)
  Ruble: 72 (2.24%)
  UK Pound: 71 (2.21%)

Total unique currencies in patterns: 15


In [18]:
# Collect row counts, null/duplicate counts and label statistics in one dict
# and print them; floats are shown with 6 decimals, everything else with
# thousands separators.
print("Performing data quality assessment...")

laundering_flags = transactions_df['is_laundering']

quality_metrics = dict(
    transactions_loaded=len(transactions_df),
    accounts_loaded=len(accounts_df),
    patterns_loaded=len(patterns_df),
    pattern_transactions_extracted=len(pattern_transactions_df),
    transaction_null_values=transactions_df.isnull().sum().sum(),
    accounts_null_values=accounts_df.isnull().sum().sum(),
    duplicate_transactions=transactions_df.duplicated().sum(),
    duplicate_accounts=accounts_df.duplicated().sum(),
    ml_transactions_in_main=laundering_flags.sum(),
    ml_rate_in_main=laundering_flags.mean(),
    pattern_transaction_count=total_pattern_transactions,
    unique_pattern_types=patterns_df['pattern_type'].nunique(),
)

print("Data Quality Assessment:")
for metric, value in quality_metrics.items():
    if not isinstance(value, float):
        print(f"  {metric}: {value:,}")
        continue
    print(f"  {metric}: {value:.6f}")

Performing data quality assessment...
Data Quality Assessment:
  transactions_loaded: 5,078,345
  accounts_loaded: 518,581
  patterns_loaded: 370
  pattern_transactions_extracted: 3,209
  transaction_null_values: 0
  accounts_null_values: 0
  duplicate_transactions: 9
  duplicate_accounts: 0
  ml_transactions_in_main: 5,177
  ml_rate_in_main: 0.001019
  pattern_transaction_count: 3,209
  unique_pattern_types: 8


In [19]:
# Assemble the headline numbers from the earlier cells into one nested dict
# and pretty-print it, grouped by category.
print("Creating comprehensive data summary...")

final_summary = {
    'dataset_overview': {
        'total_transactions': len(transactions_df),
        'total_accounts': len(accounts_df),
        'total_ml_patterns': len(patterns_df),
        'ml_transaction_rate': transactions_df['is_laundering'].mean(),
        'date_range_days': (transactions_df['timestamp'].max() - transactions_df['timestamp'].min()).days
    },
    'pattern_analysis': {
        'pattern_types_identified': list(patterns_df['pattern_type'].unique()),
        'total_pattern_instances': len(patterns_df),
        'total_pattern_transactions': total_pattern_transactions,
        'pattern_distribution': patterns_df['pattern_type'].value_counts().to_dict()
    },
    'account_analysis': {
        'unique_banks': accounts_df['bank_id'].nunique(),
        'unique_entities': accounts_df['entity_id'].nunique(),
        'accounts_referenced_in_transactions': len(transaction_account_set),
        'accounts_in_patterns': len(pattern_account_set)
    },
    'transaction_analysis': {
        'payment_formats': list(transactions_df['payment_format'].unique()),
        'currencies': transactions_df['payment_currency'].nunique(),
        # Raw sum/mean of amount_paid across all payment currencies; no
        # currency conversion is applied.
        'total_transaction_value': transactions_df['amount_paid'].sum(),
        'avg_transaction_amount': transactions_df['amount_paid'].mean()
    }
}

# One rule width for both the opening and closing separators (they were
# previously 60 and 80 characters wide respectively).
separator = "=" * 60

print("\nFINAL DATA SUMMARY:")
print(separator)
for category, metrics in final_summary.items():
    print(f"\n{category.upper().replace('_', ' ')}:")
    for key, value in metrics.items():
        key_lower = key.lower()
        if isinstance(value, float):
            # Monetary floats: match 'value' as well as 'amount' so that
            # total_transaction_value gets the same formatting as
            # avg_transaction_amount instead of an unseparated raw number.
            if 'amount' in key_lower or 'value' in key_lower:
                print(f"  {key}: ${value:,.2f}")
            elif 'rate' in key_lower:
                print(f"  {key}: {value:.6f}")
            else:
                print(f"  {key}: {value:.2f}")
        elif isinstance(value, dict):
            print(f"  {key}:")
            for sub_key, sub_value in value.items():
                print(f"    {sub_key}: {sub_value}")
        elif isinstance(value, list):
            # Long lists are abbreviated to their first five entries.
            if len(value) <= 5:
                print(f"  {key}: {value}")
            else:
                print(f"  {key}: {value[:5]}... ({len(value)} total)")
        else:
            print(f"  {key}: {value:,}")
print("\n" + separator)

Creating comprehensive data summary...

FINAL DATA SUMMARY:

DATASET OVERVIEW:
  total_transactions: 5,078,345
  total_accounts: 518,581
  total_ml_patterns: 370
  ml_transaction_rate: 0.001019
  date_range_days: 17

PATTERN ANALYSIS:
  pattern_types_identified: ['FAN_OUT', 'CYCLE', 'GATHER_SCATTER', 'STACK', 'RANDOM']... (8 total)
  total_pattern_instances: 370
  total_pattern_transactions: 3,209
  pattern_distribution:
    CYCLE: 54
    GATHER_SCATTER: 51
    BIPARTITE: 49
    FAN_OUT: 48
    SCATTER_GATHER: 44
    STACK: 43
    RANDOM: 41
    FAN_IN: 40

ACCOUNT ANALYSIS:
  unique_banks: 30,470
  unique_entities: 166,207
  accounts_referenced_in_transactions: 515,080
  accounts_in_patterns: 3,170

TRANSACTION ANALYSIS:
  payment_formats: ['Reinvestment', 'Cheque', 'Credit Card', 'ACH', 'Cash']... (7 total)
  currencies: 15
  total_transaction_value: 22899645860702.70
  avg_transaction_amount: $4,509,273.37



In [21]:
# Persist the cleaned tables and the flattened pattern data as CSV files.
print("Saving analysis results and creating files for EDA...")

# Single source of truth for the output locations, so the report printed at
# the end always matches where the files were actually written (the previous
# messages said 'data/...' while the files went to '../data/...').
processed_files = {
    'transactions': '../data/processed_transactions.csv',
    'accounts': '../data/processed_accounts.csv',
    'patterns': '../data/processed_patterns.csv',
    'pattern_transactions': '../data/processed_pattern_transactions.csv',
}

transactions_df.to_csv(processed_files['transactions'], index=False)
accounts_df.to_csv(processed_files['accounts'], index=False)

# One summary row per pattern; pattern_id is the row's index label in patterns_df.
patterns_summary = []
for pattern_idx, pattern in patterns_df.iterrows():
    patterns_summary.append({
        'pattern_id': pattern_idx,
        'pattern_type': pattern['pattern_type'],
        'transaction_count': pattern['transaction_count']
    })

patterns_summary_df = pd.DataFrame(patterns_summary)
patterns_summary_df.to_csv(processed_files['patterns'], index=False)

# Transaction fields copied verbatim from each parsed pattern transaction,
# in the column order used for the output file.
transaction_fields = [
    'timestamp',
    'from_bank',
    'account_origin',
    'to_bank',
    'account_destination',
    'amount_received',
    'receiving_currency',
    'amount_paid',
    'payment_currency',
    'payment_format',
    'is_laundering',
]

# One row per pattern transaction: pattern metadata first, then the fields.
pattern_transactions_list = []
for pattern_idx, pattern in patterns_df.iterrows():
    pattern_txns = pattern['transactions']
    for txn_idx, txn in enumerate(pattern_txns):
        txn_record = {
            'pattern_id': pattern_idx,
            'pattern_type': pattern['pattern_type'],
            'txn_sequence': txn_idx + 1,
            'total_txns_in_pattern': len(pattern_txns),
        }
        txn_record.update((field, txn[field]) for field in transaction_fields)
        pattern_transactions_list.append(txn_record)

pattern_transactions_df = pd.DataFrame(pattern_transactions_list)
pattern_transactions_df['timestamp'] = pd.to_datetime(pattern_transactions_df['timestamp'])
pattern_transactions_df['amount_paid'] = pd.to_numeric(pattern_transactions_df['amount_paid'], errors='coerce')
pattern_transactions_df.to_csv(processed_files['pattern_transactions'], index=False)

print("Created processed files:")
for output_path in processed_files.values():
    print(f"  - {output_path}")
print(f"Pattern transactions extracted: {len(pattern_transactions_df):,}")

Saving analysis results and creating files for EDA...
Created processed files:
  - data/processed_transactions.csv
  - data/processed_accounts.csv
  - data/processed_patterns.csv
  - data/processed_pattern_transactions.csv
Pattern transactions extracted: 3,209
