In [17]:
import pickle
import warnings
from collections import Counter, defaultdict
from datetime import timedelta

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
# NOTE(review): this suppresses every warning category for the rest of the
# session (deprecation and runtime warnings included); consider narrowing
# the filter to the specific category/module that is actually noisy.
warnings.filterwarnings('ignore')

# Reset matplotlib to its built-in default style and make "husl" the seaborn palette.
plt.style.use('default')
sns.set_palette("husl")

In [18]:
# Read the four processed CSV files from ../data in one pass.
transactions_df, accounts_df, patterns_df, pattern_transactions_df = map(
    pd.read_csv,
    (
        '../data/processed_transactions.csv',
        '../data/processed_accounts.csv',
        '../data/processed_patterns.csv',
        '../data/processed_pattern_transactions.csv',
    ),
)

# Both transaction-level frames carry a 'timestamp' column that is converted
# to datetime so the .dt accessors used in later cells work.
for timestamped_df in (transactions_df, pattern_transactions_df):
    timestamped_df['timestamp'] = pd.to_datetime(timestamped_df['timestamp'])

print(f"Loaded {len(transactions_df):,} transactions")
print(f"Loaded {len(accounts_df):,} accounts")
print(f"Loaded {len(patterns_df)} patterns")
print(f"Loaded {len(pattern_transactions_df)} pattern transactions")

Loaded 5,078,345 transactions
Loaded 518,581 accounts
Loaded 370 patterns
Loaded 3209 pattern transactions


In [20]:
print("Analyzing overall dataset characteristics...")

# Headline counts: an account/bank is counted once if it appears on either
# side (origin or destination) of any transaction.
total_transactions = transactions_df.shape[0]
total_accounts = len(
    set(transactions_df['account_origin'].unique()).union(
        transactions_df['account_destination'].unique()
    )
)
total_banks = len(
    set(transactions_df['from_bank'].unique()).union(
        transactions_df['to_bank'].unique()
    )
)

# Split by the laundering label; both subsets are reused by later cells.
laundering_label = transactions_df['is_laundering']
ml_transactions = transactions_df[laundering_label == 1]
normal_transactions = transactions_df[laundering_label == 0]

first_timestamp = transactions_df['timestamp'].min()
last_timestamp = transactions_df['timestamp'].max()

print(f"Dataset Overview:")
print(f"  Total transactions: {total_transactions:,}")
print(f"  Money laundering transactions: {len(ml_transactions):,}")
print(f"  Normal transactions: {len(normal_transactions):,}")
print(f"  ML rate: {len(ml_transactions)/total_transactions:.6f}")
print(f"  Unique accounts: {total_accounts:,}")
print(f"  Unique banks: {total_banks:,}")
print(f"  Date range: {first_timestamp} to {last_timestamp}")

# Timedelta.days truncates, so a span of 17 days 16 hours is reported as 17.
time_span = last_timestamp - first_timestamp
print(f"  Analysis period: {time_span.days} days")

Analyzing overall dataset characteristics...
Dataset Overview:
  Total transactions: 5,078,345
  Money laundering transactions: 5,177
  Normal transactions: 5,073,168
  ML rate: 0.001019
  Unique accounts: 515,080
  Unique banks: 30,470
  Date range: 2022-09-01 00:00:00 to 2022-09-18 16:18:00
  Analysis period: 17 days


In [21]:
print("Analyzing temporal patterns...")

# Calendar features derived from the timestamp; added to transactions_df in place.
transactions_df['hour'] = transactions_df['timestamp'].dt.hour
transactions_df['day_of_week'] = transactions_df['timestamp'].dt.dayofweek
transactions_df['date'] = transactions_df['timestamp'].dt.date

# Per-hour volume, laundering count/rate and paid-amount totals.
# NOTE(review): ml_rate is rounded to 4 decimals while the overall rate is
# ~0.001, so only one or two significant digits survive and nlargest() can
# see ties (hours 11 and 13 both print 0.0015 in the recorded output).
hourly_activity = transactions_df.groupby('hour').agg({
    'is_laundering': ['count', 'sum', 'mean'],
    'amount_paid': ['sum', 'mean']
}).round(4)

hourly_activity.columns = ['total_txns', 'ml_txns', 'ml_rate', 'total_amount', 'avg_amount']

print("Hourly Activity Analysis:")
print("Top 5 hours by transaction volume:")
top_hours = hourly_activity.nlargest(5, 'total_txns')
for hour, row in top_hours.iterrows():
    # iterrows() upcasts the whole row to float, so counts are cast back to
    # int; otherwise they print as "634,726.0".
    print(f"  Hour {hour:2d}: {int(row['total_txns']):,} txns, ML rate: {row['ml_rate']:.4f}")

print("\nTop 5 hours by ML rate:")
top_ml_hours = hourly_activity.nlargest(5, 'ml_rate')
for hour, row in top_ml_hours.iterrows():
    print(f"  Hour {hour:2d}: ML rate: {row['ml_rate']:.4f}, {int(row['ml_txns'])} ML txns")

# Per-calendar-day volume, laundering count/rate and total paid amount.
daily_activity = transactions_df.groupby('date').agg({
    'is_laundering': ['count', 'sum', 'mean'],
    'amount_paid': 'sum'
}).round(4)

daily_activity.columns = ['daily_txns', 'daily_ml_txns', 'daily_ml_rate', 'daily_amount']

print(f"\nDaily Activity Summary:")
print(f"  Avg transactions per day: {daily_activity['daily_txns'].mean():.1f}")
print(f"  Avg ML transactions per day: {daily_activity['daily_ml_txns'].mean():.1f}")
print(f"  Days with ML activity: {(daily_activity['daily_ml_txns'] > 0).sum()}")
print(f"  Peak day transactions: {daily_activity['daily_txns'].max():,}")
print(f"  Peak day ML transactions: {daily_activity['daily_ml_txns'].max()}")

Analyzing temporal patterns...
Hourly Activity Analysis:
Top 5 hours by transaction volume:
  Hour  0: 634,726.0 txns, ML rate: 0.0003
  Hour 15: 194,871.0 txns, ML rate: 0.0013
  Hour  6: 194,456.0 txns, ML rate: 0.0011
  Hour  5: 193,900.0 txns, ML rate: 0.0010
  Hour  1: 193,728.0 txns, ML rate: 0.0008

Top 5 hours by ML rate:
  Hour 12: ML rate: 0.0017, 336.0 ML txns
  Hour 16: ML rate: 0.0016, 311.0 ML txns
  Hour 11: ML rate: 0.0015, 295.0 ML txns
  Hour 13: ML rate: 0.0015, 292.0 ML txns
  Hour 14: ML rate: 0.0014, 279.0 ML txns

Daily Activity Summary:
  Avg transactions per day: 282130.3
  Avg ML transactions per day: 287.6
  Days with ML activity: 18
  Peak day transactions: 1,114,921
  Peak day ML transactions: 539


In [22]:
print("Analyzing payment formats and currencies...")

# Volume, laundering count/rate and paid-amount totals per payment format.
payment_format_analysis = transactions_df.groupby('payment_format').agg({
    'is_laundering': ['count', 'sum', 'mean'],
    'amount_paid': ['sum', 'mean']
}).round(4)

payment_format_analysis.columns = ['total_txns', 'ml_txns', 'ml_rate', 'total_amount', 'avg_amount']

print("Payment Format Analysis:")
for format_type, row in payment_format_analysis.iterrows():
    # iterrows() upcasts each row to float; cast the counts back to int so
    # they print as "600,797" instead of "600,797.0".
    total_txns = int(row['total_txns'])
    print(f"  {format_type}:")
    print(f"    Transactions: {total_txns:,} ({total_txns/total_transactions*100:.2f}%)")
    print(f"    ML transactions: {int(row['ml_txns'])} (rate: {row['ml_rate']:.4f})")
    print(f"    Avg amount: ${row['avg_amount']:,.2f}")

# Same aggregation per payment currency. The unrounded frame is kept so the
# threshold test below is not distorted by 4-decimal rounding.
currency_stats_raw = transactions_df.groupby('payment_currency').agg({
    'is_laundering': ['count', 'sum', 'mean'],
    'amount_paid': ['sum', 'mean']
})
currency_stats_raw.columns = ['total_txns', 'ml_txns', 'ml_rate', 'total_amount', 'avg_amount']
currency_stats_raw = currency_stats_raw.sort_values('total_txns', ascending=False)

currency_analysis = currency_stats_raw.round(4)

print("\nTop 10 Currencies by Transaction Volume:")
for currency, row in currency_analysis.head(10).iterrows():
    print(f"  {currency}: {int(row['total_txns']):,} txns, ML rate: {row['ml_rate']:.4f}")

# Apply the >0.1% threshold to the unrounded rate: after round(4) a true
# rate such as 0.00104 becomes 0.0010 and would wrongly fail `> 0.001`.
raw_ml_rate = currency_stats_raw['ml_rate']
high_ml_index = raw_ml_rate[raw_ml_rate > 0.001].sort_values(ascending=False).index
high_ml_currencies = currency_analysis.loc[high_ml_index]

print(f"\nCurrencies with high ML rates (>0.1%):")
for currency, row in high_ml_currencies.head(10).iterrows():
    print(f"  {currency}: ML rate {row['ml_rate']:.4f}, {int(row['ml_txns'])} ML txns")

Analyzing payment formats and currencies...
Payment Format Analysis:
  ACH:
    Transactions: 600,797.0 (11.83%)
    ML transactions: 4483.0 (rate: 0.0075)
    Avg amount: $9,497,986.77
  Bitcoin:
    Transactions: 146,091.0 (2.88%)
    ML transactions: 56.0 (rate: 0.0004)
    Avg amount: $30.83
  Cash:
    Transactions: 490,891.0 (9.67%)
    ML transactions: 108.0 (rate: 0.0002)
    Avg amount: $7,359,934.25
  Cheque:
    Transactions: 1,864,331.0 (36.71%)
    ML transactions: 324.0 (rate: 0.0002)
    Avg amount: $6,103,601.28
  Credit Card:
    Transactions: 1,323,324.0 (26.06%)
    ML transactions: 206.0 (rate: 0.0002)
    Avg amount: $86,783.37
  Reinvestment:
    Transactions: 481,056.0 (9.47%)
    ML transactions: 0.0 (rate: 0.0000)
    Avg amount: $2,595,009.06
  Wire:
    Transactions: 171,855.0 (3.38%)
    ML transactions: 0.0 (rate: 0.0000)
    Avg amount: $4,876,399.53

Top 10 Currencies by Transaction Volume:
  US Dollar: 1,895,172.0 txns, ML rate: 0.0010
  Euro: 1,168,297.

In [23]:
print("Analyzing transaction amounts and distributions...")

# Summary statistics of amount_paid for all / laundering / normal transactions.
# NOTE(review): amount_paid is summarised across all payment currencies with
# no conversion step in this cell, so the "$" prefix below is presumably
# nominal rather than a true USD value - confirm against the data prep.
amount_stats = {
    category: {
        stat: getattr(frame['amount_paid'], stat)()
        for stat in ('min', 'max', 'mean', 'median', 'std')
    }
    for category, frame in (
        ('overall', transactions_df),
        ('ml_transactions', ml_transactions),
        ('normal_transactions', normal_transactions),
    )
}

print("Amount Distribution Analysis:")
for category, stats in amount_stats.items():
    print(f"\n{category.replace('_', ' ').title()}:")
    for stat, value in stats.items():
        print(f"  {stat}: ${value:,.2f}")

# Half-open buckets [min, max); the last bucket is open-ended.
amount_ranges = [(0, 100), (100, 1000), (1000, 10000), (10000, 100000), (100000, float('inf'))]
range_labels = ['$0-100', '$100-1K', '$1K-10K', '$10K-100K', '$100K+']

print("\nAmount Range Analysis:")
for range_label, (min_amt, max_amt) in zip(range_labels, amount_ranges):
    if max_amt == float('inf'):
        mask = transactions_df['amount_paid'] >= min_amt
    else:
        mask = (transactions_df['amount_paid'] >= min_amt) & (transactions_df['amount_paid'] < max_amt)

    range_txns = transactions_df[mask]
    range_ml = range_txns[range_txns['is_laundering'] == 1]
    # An empty bucket would otherwise raise ZeroDivisionError.
    range_ml_rate = len(range_ml) / len(range_txns) if len(range_txns) else 0.0

    print(f"  {range_label}:")
    print(f"    Transactions: {len(range_txns):,} ({len(range_txns)/total_transactions*100:.2f}%)")
    print(f"    ML transactions: {len(range_ml)} (rate: {range_ml_rate:.4f})")

Analyzing transaction amounts and distributions...
Amount Distribution Analysis:

Overall:
  min: $0.00
  max: $1,046,302,363,293.48
  mean: $4,509,273.37
  median: $1,414.54
  std: $869,772,830.92

Ml Transactions:
  min: $0.00
  max: $84,853,144,179.58
  mean: $36,135,310.41
  median: $8,667.21
  std: $1,527,918,669.80

Normal Transactions:
  min: $0.00
  max: $1,046,302,363,293.48
  mean: $4,477,000.04
  median: $1,410.99
  std: $868,846,296.80

Amount Range Analysis:
  $0-100:
    Transactions: 934,406 (18.40%)
    ML transactions: 152 (rate: 0.0002)
  $100-1K:
    Transactions: 1,377,027 (27.12%)
    ML transactions: 493 (rate: 0.0004)
  $1K-10K:
    Transactions: 1,396,292 (27.50%)
    ML transactions: 2125 (rate: 0.0015)
  $10K-100K:
    Transactions: 793,993 (15.63%)
    ML transactions: 1803 (rate: 0.0023)
  $100K+:
    Transactions: 576,627 (11.35%)
    ML transactions: 604 (rate: 0.0010)


In [24]:
print("Analyzing account behavior patterns...")

def analyze_account_behavior(account_col, label, transactions=None):
    """Aggregate per-account activity and print a short summary.

    Parameters
    ----------
    account_col : str
        Column to group by (e.g. 'account_origin' or 'account_destination').
    label : str
        Human-readable name used in the printed heading.
    transactions : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``transactions_df``.
        It must contain ``account_col`` plus 'is_laundering', 'amount_paid',
        'payment_format', 'payment_currency' and a datetime 'timestamp'.

    Returns
    -------
    pandas.DataFrame
        One row per account with columns txn_count, ml_count, ml_rate,
        total_amount, avg_amount, amount_std, payment_formats, currencies
        and active_days (numeric values rounded to 4 decimals).
    """
    if transactions is None:
        transactions = transactions_df

    grouped = transactions.groupby(account_col)

    # Built-in aggregations only: per-group Python lambdas are very slow with
    # hundreds of thousands of accounts.
    account_behavior = grouped.agg(
        txn_count=('is_laundering', 'count'),
        ml_count=('is_laundering', 'sum'),
        ml_rate=('is_laundering', 'mean'),
        total_amount=('amount_paid', 'sum'),
        avg_amount=('amount_paid', 'mean'),
        amount_std=('amount_paid', 'std'),
        payment_formats=('payment_format', 'nunique'),
        currencies=('payment_currency', 'nunique'),
    )

    # Whole days between an account's first and last transaction.
    activity_window = grouped['timestamp'].agg(['min', 'max'])
    account_behavior['active_days'] = (activity_window['max'] - activity_window['min']).dt.days

    account_behavior = account_behavior.round(4)
    # std is NaN for accounts with a single transaction; treat that as no spread.
    account_behavior['amount_std'] = account_behavior['amount_std'].fillna(0)

    print(f"\n{label} Account Behavior Analysis:")
    print(f"  Total accounts: {len(account_behavior):,}")
    print(f"  Accounts with ML activity: {(account_behavior['ml_count'] > 0).sum()}")
    print(f"  Avg transactions per account: {account_behavior['txn_count'].mean():.1f}")
    print(f"  Max transactions per account: {account_behavior['txn_count'].max()}")

    # "High activity" = at or above the 95th percentile of transaction count.
    # The reported rate is the unweighted mean of per-account ML rates.
    high_activity_accounts = account_behavior[account_behavior['txn_count'] >= account_behavior['txn_count'].quantile(0.95)]
    print(f"  High activity accounts (top 5%): {len(high_activity_accounts)}")
    print(f"  High activity ML rate: {high_activity_accounts['ml_rate'].mean():.4f}")

    ml_accounts = account_behavior[account_behavior['ml_count'] > 0]
    if len(ml_accounts) > 0:
        print(f"  ML accounts avg transaction count: {ml_accounts['txn_count'].mean():.1f}")
        print(f"  ML accounts avg currencies used: {ml_accounts['currencies'].mean():.1f}")
        print(f"  ML accounts avg payment formats: {ml_accounts['payment_formats'].mean():.1f}")

    return account_behavior

# Same aggregation from both sides of a transfer: sender view first, then receiver view.
origin_behavior, destination_behavior = (
    analyze_account_behavior(account_col, label)
    for account_col, label in (
        ('account_origin', 'Origin'),
        ('account_destination', 'Destination'),
    )
)

Analyzing account behavior patterns...

Origin Account Behavior Analysis:
  Total accounts: 496,995
  Accounts with ML activity: 3376
  Avg transactions per account: 10.2
  Max transactions per account: 168672
  High activity accounts (top 5%): 25381
  High activity ML rate: 0.0003
  ML accounts avg transaction count: 150.8
  ML accounts avg currencies used: 1.5
  ML accounts avg payment formats: 2.9

Destination Account Behavior Analysis:
  Total accounts: 420,636
  Accounts with ML activity: 3984
  Avg transactions per account: 12.1
  Max transactions per account: 1084
  High activity accounts (top 5%): 22684
  High activity ML rate: 0.0008
  ML accounts avg transaction count: 17.6
  ML accounts avg currencies used: 1.0
  ML accounts avg payment formats: 3.3


In [25]:
print("Analyzing bank-level patterns...")

# One row per (from_bank, to_bank) pair with volume, laundering and amount stats.
bank_analysis = transactions_df.groupby(['from_bank', 'to_bank']).agg({
    'is_laundering': ['count', 'sum', 'mean'],
    'amount_paid': ['sum', 'mean']
}).round(4)

bank_analysis.columns = ['txn_count', 'ml_count', 'ml_rate', 'total_amount', 'avg_amount']

# A pair is "same-bank" when both levels of the index hold the same bank.
is_same_bank = bank_analysis.index.get_level_values(0) == bank_analysis.index.get_level_values(1)
same_bank_txns = bank_analysis[is_same_bank]
cross_bank_txns = bank_analysis[~is_same_bank]

print("Bank Transaction Patterns:")
print(f"  Same-bank transactions: {same_bank_txns['txn_count'].sum():,}")
print(f"  Cross-bank transactions: {cross_bank_txns['txn_count'].sum():,}")
print(f"  Same-bank ML rate: {same_bank_txns['ml_count'].sum() / same_bank_txns['txn_count'].sum():.4f}")
print(f"  Cross-bank ML rate: {cross_bank_txns['ml_count'].sum() / cross_bank_txns['txn_count'].sum():.4f}")

# Laundering-only view, grouped by the sending bank.
bank_ml_activity = transactions_df[transactions_df['is_laundering'] == 1].groupby('from_bank').agg({
    'account_origin': 'nunique',
    'account_destination': 'nunique',
    'amount_paid': ['count', 'sum'],
    'payment_currency': 'nunique'
}).round(2)

bank_ml_activity.columns = ['unique_origins', 'unique_destinations', 'ml_txn_count', 'ml_total_amount', 'currencies_used']

print(f"\nBanks involved in ML transactions: {len(bank_ml_activity)}")
print("Top 5 banks by ML transaction volume:")
top_ml_banks = bank_ml_activity.nlargest(5, 'ml_txn_count')
for bank, row in top_ml_banks.iterrows():
    # iterrows() upcasts the row to float; cast the count back to int so it
    # prints as "633" rather than "633.0".
    print(f"  Bank {bank}: {int(row['ml_txn_count'])} ML txns, ${row['ml_total_amount']:,.2f}")

Analyzing bank-level patterns...
Bank Transaction Patterns:
  Same-bank transactions: 691,332
  Cross-bank transactions: 4,387,013
  Same-bank ML rate: 0.0001
  Cross-bank ML rate: 0.0012

Banks involved in ML transactions: 1021
Top 5 banks by ML transaction volume:
  Bank 70: 633.0 ML txns, $395,491,904.00
  Bank 12: 76.0 ML txns, $44,830,572.51
  Bank 20: 67.0 ML txns, $1,934,260.92
  Bank 119: 59.0 ML txns, $41,232,105.68
  Bank 10: 51.0 ML txns, $1,074,345.13


In [26]:
print("Analyzing individual money laundering patterns...")

# Per-pattern-type summary, keyed by pattern_type; pickled by the save cell
# and read again by the insights-summary cell.
pattern_detailed_analysis = {}

for pattern_type in patterns_df['pattern_type'].unique():
    # Instances (rows of patterns_df) and member transactions of this type.
    pattern_instances = patterns_df[patterns_df['pattern_type'] == pattern_type]
    pattern_txns = pattern_transactions_df[pattern_transactions_df['pattern_type'] == pattern_type]
    
    # Accounts/banks counted once if they appear on either side of a transfer.
    unique_accounts = set(pattern_txns['account_origin'].unique()) | set(pattern_txns['account_destination'].unique())
    unique_banks = set(pattern_txns['from_bank'].unique()) | set(pattern_txns['to_bank'].unique())
    
    # NOTE(review): `'transactions' in instance` tests for a 'transactions'
    # column in patterns_df. The recorded output shows "Avg time span: 0.0
    # hours" for every pattern, so this branch appears never to run and
    # time_spans stays empty. Consequently 'avg_time_span_hours' is always 0
    # and any ranking on it downstream only reflects dict order. The spans
    # presumably need to be derived from pattern_transactions_df grouped by a
    # pattern-instance key - TODO confirm which column identifies an instance.
    time_spans = []
    for _, instance in pattern_instances.iterrows():
        if 'transactions' in instance:
            instance_txns = pd.DataFrame(instance['transactions'])
            instance_txns['timestamp'] = pd.to_datetime(instance_txns['timestamp'])
            time_span = (instance_txns['timestamp'].max() - instance_txns['timestamp'].min()).total_seconds() / 3600
            time_spans.append(time_span)
    
    amounts = pattern_txns['amount_paid'].astype(float)
    
    analysis = {
        'instance_count': len(pattern_instances),
        'total_transactions': len(pattern_txns),
        'avg_txns_per_instance': len(pattern_txns) / len(pattern_instances) if len(pattern_instances) > 0 else 0,
        'unique_accounts': len(unique_accounts),
        'unique_banks': len(unique_banks),
        # Falls back to 0 when no per-instance span could be computed.
        'avg_time_span_hours': np.mean(time_spans) if time_spans else 0,
        'currencies_used': pattern_txns['payment_currency'].nunique(),
        'total_amount': amounts.sum(),
        'avg_amount_per_txn': amounts.mean(),
        'amount_std': amounts.std(),
        # mode() returns a Series; [0] takes its first entry.
        'dominant_currency': pattern_txns['payment_currency'].mode()[0] if len(pattern_txns) > 0 else 'N/A'
    }
    
    pattern_detailed_analysis[pattern_type] = analysis

print("Detailed Pattern Analysis:")
for pattern_type, analysis in pattern_detailed_analysis.items():
    print(f"\n{pattern_type}:")
    print(f"  Instances: {analysis['instance_count']}")
    print(f"  Total transactions: {analysis['total_transactions']}")
    print(f"  Avg transactions per instance: {analysis['avg_txns_per_instance']:.1f}")
    print(f"  Unique accounts involved: {analysis['unique_accounts']}")
    print(f"  Unique banks involved: {analysis['unique_banks']}")
    print(f"  Avg time span: {analysis['avg_time_span_hours']:.1f} hours")
    print(f"  Currencies used: {analysis['currencies_used']}")
    print(f"  Total amount: ${analysis['total_amount']:,.2f}")
    print(f"  Avg amount per transaction: ${analysis['avg_amount_per_txn']:,.2f}")
    print(f"  Dominant currency: {analysis['dominant_currency']}")

Analyzing individual money laundering patterns...
Detailed Pattern Analysis:

FAN_OUT:
  Instances: 48
  Total transactions: 342
  Avg transactions per instance: 7.1
  Unique accounts involved: 359
  Unique banks involved: 260
  Avg time span: 0.0 hours
  Currencies used: 14
  Total amount: $2,855,555,186.71
  Avg amount per transaction: $8,349,576.57
  Dominant currency: US Dollar

CYCLE:
  Instances: 54
  Total transactions: 287
  Avg transactions per instance: 5.3
  Unique accounts involved: 271
  Unique banks involved: 204
  Avg time span: 0.0 hours
  Currencies used: 14
  Total amount: $1,006,404,841.38
  Avg amount per transaction: $3,506,637.08
  Dominant currency: US Dollar

GATHER_SCATTER:
  Instances: 51
  Total transactions: 716
  Avg transactions per instance: 14.0
  Unique accounts involved: 685
  Unique banks involved: 422
  Avg time span: 0.0 hours
  Currencies used: 14
  Total amount: $1,210,493,226.40
  Avg amount per transaction: $1,690,633.00
  Dominant currency: US 

In [27]:
print("Analyzing network characteristics for graph construction...")

def calculate_network_metrics(transactions=None):
    """Build account/bank adjacency sets and account degree counts.

    Parameters
    ----------
    transactions : pandas.DataFrame, optional
        Frame with 'account_origin', 'account_destination', 'from_bank' and
        'to_bank' columns. Defaults to the notebook-level ``transactions_df``.

    Returns
    -------
    tuple
        ``(account_out_degrees, account_in_degrees, account_connections,
        bank_connections)`` where the degrees are plain dicts counting
        distinct counterparties and the connections are
        ``defaultdict(set)`` mappings from source to its set of targets.
    """
    if transactions is None:
        transactions = transactions_df

    # De-duplicate edges first so the Python-level loops below run once per
    # distinct (source, target) pair rather than once per transaction;
    # iterrows() over millions of rows is prohibitively slow.
    # Assumes identifiers are non-null (NaN pairs collapse to one edge here).
    account_edges = transactions[['account_origin', 'account_destination']].drop_duplicates()
    bank_edges = transactions[['from_bank', 'to_bank']].drop_duplicates()

    account_connections = defaultdict(set)
    for origin_acc, dest_acc in zip(account_edges['account_origin'], account_edges['account_destination']):
        account_connections[origin_acc].add(dest_acc)

    bank_connections = defaultdict(set)
    for from_bank, to_bank in zip(bank_edges['from_bank'], bank_edges['to_bank']):
        bank_connections[from_bank].add(to_bank)

    # Out-degree: number of distinct destinations per origin account.
    account_out_degrees = {acc: len(connections) for acc, connections in account_connections.items()}
    # In-degree: each distinct edge contributes one distinct origin to its destination.
    account_in_degrees = dict(Counter(account_edges['account_destination']))

    return account_out_degrees, account_in_degrees, account_connections, bank_connections

account_out_degrees, account_in_degrees, account_connections, bank_connections = calculate_network_metrics()

print("Network Structure Analysis:")
print(f"  Accounts with outgoing connections: {len(account_out_degrees):,}")
print(f"  Accounts with incoming connections: {len(account_in_degrees):,}")

# describe() supplies the mean / max / quartiles of each degree distribution.
out_degree_stats = pd.Series(list(account_out_degrees.values())).describe()
in_degree_stats = pd.Series(list(account_in_degrees.values())).describe()

for heading, degree_stats in (
    ("\nOut-degree statistics:", out_degree_stats),
    ("\nIn-degree statistics:", in_degree_stats),
):
    print(heading)
    print(f"  Mean: {degree_stats['mean']:.2f}")
    print(f"  Max: {degree_stats['max']:.0f}")
    print(f"  75th percentile: {degree_stats['75%']:.0f}")

# "High connectivity" means at least 10 distinct counterparties in that direction.
high_out_degree_accounts = [acc for acc in account_out_degrees if account_out_degrees[acc] >= 10]
high_in_degree_accounts = [acc for acc in account_in_degrees if account_in_degrees[acc] >= 10]

print(f"\nHigh connectivity accounts:")
print(f"  Accounts with 10+ outgoing connections: {len(high_out_degree_accounts)}")
print(f"  Accounts with 10+ incoming connections: {len(high_in_degree_accounts)}")

# Accounts that appear on either side of at least one laundering transaction.
ml_involvement = transactions_df.loc[transactions_df['is_laundering'] == 1]
ml_accounts = set(ml_involvement['account_origin'].unique()).union(
    ml_involvement['account_destination'].unique()
)

high_degree_accounts = set(high_out_degree_accounts) | set(high_in_degree_accounts)
high_degree_ml_overlap = high_degree_accounts & ml_accounts
print(f"  High-degree accounts involved in ML: {len(high_degree_ml_overlap)}")

Analyzing network characteristics for graph construction...
Network Structure Analysis:
  Accounts with outgoing connections: 496,995
  Accounts with incoming connections: 420,636

Out-degree statistics:
  Mean: 2.04
  Max: 14230
  75th percentile: 2

In-degree statistics:
  Mean: 2.41
  Max: 545
  75th percentile: 3

High connectivity accounts:
  Accounts with 10+ outgoing connections: 6078
  Accounts with 10+ incoming connections: 3326
  High-degree accounts involved in ML: 498


In [28]:
print("Analyzing temporal clustering and velocity patterns...")

def analyze_temporal_clustering(transactions=None):
    """Measure how tightly laundering transactions cluster in time.

    Parameters
    ----------
    transactions : pandas.DataFrame, optional
        Frame with 'is_laundering', a datetime 'timestamp' and
        'account_origin'. Defaults to the notebook-level ``transactions_df``.

    Returns
    -------
    tuple
        ``(time_gaps, velocity_analysis)``. ``time_gaps`` is a list of gaps in
        minutes between consecutive laundering transactions across the whole
        frame (not per account). ``velocity_analysis`` maps each of the (up
        to) 20 origin accounts with the most laundering transactions - those
        with more than one - to its txn_count, avg_gap_minutes,
        min_gap_minutes and total_span_hours.
    """
    if transactions is None:
        transactions = transactions_df

    ml_txns = transactions[transactions['is_laundering'] == 1].sort_values('timestamp')

    def _gaps_in_minutes(timestamps):
        # diff() yields NaT for the first element; drop it and convert to minutes.
        return timestamps.diff().iloc[1:].dt.total_seconds().div(60).tolist()

    # Vectorised replacement for indexing .iloc[i] row by row.
    time_gaps = _gaps_in_minutes(ml_txns['timestamp'])

    velocity_analysis = {}

    for account in ml_txns['account_origin'].value_counts().head(20).index:
        # ml_txns is already time-sorted, so this subset is sorted as well.
        account_times = ml_txns.loc[ml_txns['account_origin'] == account, 'timestamp']
        if len(account_times) > 1:
            account_gaps = _gaps_in_minutes(account_times)

            velocity_analysis[account] = {
                'txn_count': len(account_times),
                'avg_gap_minutes': np.mean(account_gaps),
                'min_gap_minutes': min(account_gaps),
                'total_span_hours': (account_times.max() - account_times.min()).total_seconds() / 3600
            }

    return time_gaps, velocity_analysis

time_gaps, velocity_analysis = analyze_temporal_clustering()

print("Temporal Clustering Analysis:")
if time_gaps:
    gap_stats = pd.Series(time_gaps).describe()
    print(f"  ML transaction time gaps (minutes):")
    for stat_label, stat_key in (("Mean", "mean"), ("Median", "50%"), ("Min", "min")):
        print(f"    {stat_label}: {gap_stats[stat_key]:.1f}")

    # Gaps shorter than 60 minutes. These are gaps between consecutive
    # laundering transactions overall, not within one account.
    rapid_sequences = list(filter(lambda gap: gap < 60, time_gaps))
    rapid_share = len(rapid_sequences)/len(time_gaps)*100
    print(f"  Rapid sequences (<1 hour apart): {len(rapid_sequences)} ({rapid_share:.1f}%)")

print(f"\nHigh-velocity ML accounts (top 10):")


def _average_gap(entry):
    """Sort key: the account's mean gap in minutes (smaller = faster)."""
    return entry[1]['avg_gap_minutes']


sorted_velocity = sorted(velocity_analysis.items(), key=_average_gap)
for account, stats in sorted_velocity[:10]:
    print(f"  {account}: {stats['txn_count']} txns, avg gap {stats['avg_gap_minutes']:.1f} min")

Analyzing temporal clustering and velocity patterns...
Temporal Clustering Analysis:
  ML transaction time gaps (minutes):
    Mean: 4.9
    Median: 2.0
    Min: 0.0
  Rapid sequences (<1 hour apart): 5138 (99.3%)

High-velocity ML accounts (top 10):
  100428660: 243 txns, avg gap 59.2 min
  1004286A8: 158 txns, avg gap 89.9 min
  8040AE4F0: 16 txns, avg gap 195.9 min
  80452D470: 16 txns, avg gap 293.7 min
  100428810: 26 txns, avg gap 510.3 min
  100428978: 29 txns, avg gap 511.2 min
  1004286F0: 21 txns, avg gap 571.7 min
  100428780: 21 txns, avg gap 628.8 min
  100428738: 23 txns, avg gap 643.4 min
  80266F880: 29 txns, avg gap 653.3 min


In [29]:
print("Creating insights summary for graph construction...")

# Boolean masks shared by the amount insights; counting True values is the
# same as taking len() of the filtered frame.
paid_amounts = transactions_df['amount_paid']
laundering_mask = transactions_df['is_laundering'] == 1
large_amount_mask = paid_amounts > 100000
small_amount_mask = paid_amounts < 1000


def _top_pattern_by(metric):
    """Pattern type with the largest `metric` (the first one wins ties)."""
    return max(pattern_detailed_analysis.items(), key=lambda item: item[1][metric])[0]


# Nested summary of the earlier cells' results, grouped by theme.
# `rapid_sequences` only exists when `time_gaps` is non-empty, hence the guard.
insights_summary = {
    'temporal_insights': {
        'peak_hours': list(hourly_activity.nlargest(3, 'total_txns').index),
        'ml_concentrated_hours': list(hourly_activity.nlargest(3, 'ml_rate').index),
        'rapid_ml_sequences_percentage': len(rapid_sequences)/len(time_gaps)*100 if time_gaps else 0,
        'avg_ml_time_gap_minutes': np.mean(time_gaps) if time_gaps else 0
    },
    'network_insights': {
        'high_connectivity_threshold': 10,
        'high_out_degree_accounts': len(high_out_degree_accounts),
        'high_in_degree_accounts': len(high_in_degree_accounts),
        'ml_high_degree_overlap': len(high_degree_ml_overlap),
        'same_bank_transaction_percentage': same_bank_txns['txn_count'].sum() / total_transactions * 100,
        'cross_bank_ml_rate': cross_bank_txns['ml_count'].sum() / cross_bank_txns['txn_count'].sum()
    },
    'pattern_insights': {
        'most_common_pattern': patterns_df['pattern_type'].value_counts().index[0],
        'most_complex_pattern': _top_pattern_by('avg_txns_per_instance'),
        'longest_timespan_pattern': _top_pattern_by('avg_time_span_hours'),
        'highest_value_pattern': _top_pattern_by('total_amount')
    },
    'currency_insights': {
        'total_currencies': transactions_df['payment_currency'].nunique(),
        'ml_currencies': ml_transactions['payment_currency'].nunique(),
        'dominant_ml_currency': ml_transactions['payment_currency'].mode()[0],
        'high_ml_rate_currencies': len(high_ml_currencies)
    },
    'amount_insights': {
        'ml_vs_normal_avg_ratio': amount_stats['ml_transactions']['mean'] / amount_stats['normal_transactions']['mean'],
        'large_amount_ml_rate': int((large_amount_mask & laundering_mask).sum()) / int(large_amount_mask.sum()),
        'small_amount_ml_rate': int((small_amount_mask & laundering_mask).sum()) / int(small_amount_mask.sum())
    }
}

# Short aliases keep the report lines below readable.
temporal_insights = insights_summary['temporal_insights']
network_insights = insights_summary['network_insights']
pattern_insights = insights_summary['pattern_insights']
currency_insights = insights_summary['currency_insights']
amount_insights = insights_summary['amount_insights']

print("\nKEY INSIGHTS FOR GRAPH CONSTRUCTION:")
print("="*50)

print("\nTemporal Insights:")
print(f"  Peak activity hours: {temporal_insights['peak_hours']}")
print(f"  ML concentrated hours: {temporal_insights['ml_concentrated_hours']}")
print(f"  Rapid ML sequences: {temporal_insights['rapid_ml_sequences_percentage']:.1f}%")

print("\nNetwork Structure Insights:")
print(f"  High-degree accounts: {network_insights['high_out_degree_accounts']} out, {network_insights['high_in_degree_accounts']} in")
print(f"  ML-involved high-degree accounts: {network_insights['ml_high_degree_overlap']}")
print(f"  Same-bank transactions: {network_insights['same_bank_transaction_percentage']:.1f}%")

print("\nPattern Insights:")
print(f"  Most common pattern: {pattern_insights['most_common_pattern']}")
print(f"  Most complex pattern: {pattern_insights['most_complex_pattern']}")
print(f"  Longest timespan pattern: {pattern_insights['longest_timespan_pattern']}")

print("\nCurrency & Amount Insights:")
print(f"  Total currencies: {currency_insights['total_currencies']}")
print(f"  ML avg/normal avg ratio: {amount_insights['ml_vs_normal_avg_ratio']:.2f}")
print(f"  Large amount ML rate: {amount_insights['large_amount_ml_rate']:.4f}")

Creating insights summary for graph construction...

KEY INSIGHTS FOR GRAPH CONSTRUCTION:

Temporal Insights:
  Peak activity hours: [0, 15, 6]
  ML concentrated hours: [12, 16, 11]
  Rapid ML sequences: 99.3%

Network Structure Insights:
  High-degree accounts: 6078 out, 3326 in
  ML-involved high-degree accounts: 498
  Same-bank transactions: 13.6%

Pattern Insights:
  Most common pattern: CYCLE
  Most complex pattern: SCATTER_GATHER
  Longest timespan pattern: FAN_OUT

Currency & Amount Insights:
  Total currencies: 15
  ML avg/normal avg ratio: 8.07
  Large amount ML rate: 0.0010


In [31]:
print("Saving analysis results...")

# Tabular results -> CSV. to_csv() writes the index by default, which here
# holds the group keys (account, bank pair, hour, date).
csv_outputs = {
    '../data/origin_account_behavior.csv': origin_behavior,
    '../data/destination_account_behavior.csv': destination_behavior,
    '../data/bank_transaction_analysis.csv': bank_analysis,
    '../data/hourly_activity_analysis.csv': hourly_activity,
    '../data/daily_activity_analysis.csv': daily_activity,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path)

# Nested dict / set structures -> pickle (`pickle` is imported with the other
# modules in the notebook's import cell). The connection mappings are
# converted to plain dicts before pickling.
pickle_outputs = {
    '../data/pattern_detailed_analysis.pkl': pattern_detailed_analysis,
    '../data/insights_summary.pkl': insights_summary,
    '../data/network_metrics.pkl': {
        'account_out_degrees': account_out_degrees,
        'account_in_degrees': account_in_degrees,
        'account_connections': dict(account_connections),
        'bank_connections': dict(bank_connections)
    },
}
for pickle_path, payload in pickle_outputs.items():
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

Saving analysis results...
