In [3]:
import polars as pl
from collections import defaultdict

# === 1. 讀取 CSV 檔案 ===
file_path = r"C:\Users\Leon\Desktop\程式語言資料\python\TD-UF\Anti Money Laundering Transaction Data (SAML-D)\SAML-D.csv"
df = pl.read_csv(file_path, infer_schema_length=0)

# === 2. 取出邊列表（Sender, Receiver）===
edges = df.select(["Sender_account", "Receiver_account"]).rows()

# === 3. Union-Find 分群 ===
parent = {}

def find(x):
    if parent[x] != x:
        parent[x] = find(parent[x])
    return parent[x]

def union(x, y):
    root_x = find(x)
    root_y = find(y)
    if root_x != root_y:
        parent[root_y] = root_x

# 初始化 parent
accounts = set()
for u, v in edges:
    accounts.update([u, v])
for acc in accounts:
    parent[acc] = acc
for u, v in edges:
    union(u, v)

# 建立群組
groups = defaultdict(list)
for acc in accounts:
    root = find(acc)
    groups[root].append(acc)

print(f"✅ 總共分成 {len(groups)} 個子圖群組")

# === 4. 統計出入度 ===
in_df = (
    df.select(["Receiver_account"])
      .group_by("Receiver_account")
      .agg(pl.len().alias("InDegree"))
      .rename({"Receiver_account": "Account"})
)

out_df = (
    df.select(["Sender_account"])
      .group_by("Sender_account")
      .agg(pl.len().alias("OutDegree"))
      .rename({"Sender_account": "Account"})
)

degree_df = in_df.join(out_df, on="Account", how="outer").fill_null(0)

# 轉為查詢用字典
in_degree = dict(zip(degree_df["Account"], degree_df["InDegree"]))
out_degree = dict(zip(degree_df["Account"], degree_df["OutDegree"]))

# === 5. 每個群組的 Sender / Receiver 統計與列印 ===
for idx, (gid, nodes) in enumerate(groups.items(), 1):
    sender_set = [n for n in nodes if in_degree.get(n, 0) == 0]
    receiver_set = [n for n in nodes if out_degree.get(n, 0) == 0]
    print(f"群組 {idx}: 節點數 = {len(nodes)}，Sender = {len(sender_set)}，Receiver = {len(receiver_set)}")


✅ 總共分成 15592 個子圖群組


(Deprecated in version 0.20.29)
  degree_df = in_df.join(out_df, on="Account", how="outer").fill_null(0)


群組 1: 節點數 = 37，Sender = 1，Receiver = 33
群組 2: 節點數 = 60，Sender = 18，Receiver = 53
群組 3: 節點數 = 62，Sender = 18，Receiver = 56
群組 4: 節點數 = 67，Sender = 20，Receiver = 61
群組 5: 節點數 = 46，Sender = 9，Receiver = 42
群組 6: 節點數 = 56，Sender = 16，Receiver = 52
群組 7: 節點數 = 67，Sender = 16，Receiver = 60
群組 8: 節點數 = 70，Sender = 24，Receiver = 66
群組 9: 節點數 = 53，Sender = 20，Receiver = 49
群組 10: 節點數 = 46，Sender = 1，Receiver = 43
群組 11: 節點數 = 60，Sender = 18，Receiver = 56
群組 12: 節點數 = 215，Sender = 43，Receiver = 183
群組 13: 節點數 = 136，Sender = 26，Receiver = 111
群組 14: 節點數 = 59，Sender = 17，Receiver = 57
群組 15: 節點數 = 48，Sender = 20，Receiver = 40
群組 16: 節點數 = 47，Sender = 14，Receiver = 43
群組 17: 節點數 = 64，Sender = 18，Receiver = 63
群組 18: 節點數 = 35，Sender = 3，Receiver = 31
群組 19: 節點數 = 45，Sender = 14，Receiver = 42
群組 20: 節點數 = 62，Sender = 15，Receiver = 56
群組 21: 節點數 = 24，Sender = 20，Receiver = 23
群組 22: 節點數 = 49，Sender = 10，Receiver = 44
群組 23: 節點數 = 44，Sender = 2，Receiver = 41
群組 24: 節點數 = 67，Sender = 19，Receiver = 64
群組

In [11]:
# === 7. 篩選高風險群組 ===
high_risk_groups = []  # 儲存符合條件的群組資訊

for idx, (gid, nodes) in enumerate(groups.items(), 1):
    sender_count = sum(1 for n in nodes if in_degree.get(n, 0) == 0)
    receiver_count = sum(1 for n in nodes if out_degree.get(n, 0) == 0)
    
    # 加入風險判斷條件：Sender ≦ 3 且 Receiver ≥ 10
    if sender_count <= 3 and receiver_count >= 10:
        high_risk_groups.append((idx, gid, len(nodes), sender_count, receiver_count))

# === 8. 顯示結果 ===
print("\n🚨 高風險群組（Sender ≦ 3 且 Receiver ≥ 10）：")
print("群組編號\t節點數\tSender\tReceiver")
for idx, gid, total_nodes, senders, receivers in high_risk_groups:
    print(f"{idx}\t\t{total_nodes}\t{senders}\t{receivers}")

# 額外列出總數
print(f"\n📊 總共高風險群組數量：{len(high_risk_groups)}")

# === 9. 統計這些高風險群組中，總共出現過幾筆交易（不論是否為洗錢） ===

# 建立所有高風險群組的帳戶集合
high_risk_accounts = set(
    acc for _, gid, _, _, _ in high_risk_groups
    for acc in groups[gid]
)

# 篩選出交易中 sender 或 receiver 屬於這些帳戶的資料
tx_in_high_risk = df.filter(
    df["Sender_account"].is_in(high_risk_accounts) |
    df["Receiver_account"].is_in(high_risk_accounts)
)

# 顯示結果
print(f"📦 高風險群組所涵蓋的交易總筆數：{tx_in_high_risk.shape[0]}")




🚨 高風險群組（Sender ≦ 3 且 Receiver ≥ 10）：
群組編號	節點數	Sender	Receiver
1		37	1	33
10		46	1	43
18		35	3	31
23		44	2	41
30		41	2	35
41		44	2	41
43		39	3	36
64		51	0	43
65		35	0	28
66		83	3	76
82		43	1	37
88		35	1	31
96		33	2	28
101		33	0	25
105		35	3	31
106		49	0	45
111		46	0	39
119		14	1	11
129		26	2	20
143		29	0	26
157		31	1	26
163		48	1	42
172		55	1	49
176		48	0	44
183		41	2	37
185		41	1	35
192		42	1	35
202		57	0	51
208		37	3	33
226		37	0	32
236		17	1	15
241		46	2	40
243		53	0	46
245		29	1	24
251		52	0	48
269		40	2	34
276		43	2	40
288		41	0	33
298		34	2	33
301		46	2	43
302		43	2	39
328		18	1	16
341		40	2	36
342		38	2	32
354		49	2	42
358		50	1	47
359		18	0	15
369		44	0	38
373		108	3	96
378		33	1	31
396		37	1	34
397		33	1	28
398		54	1	49
399		41	3	39
417		30	2	25
418		41	2	36
426		32	2	28
427		46	1	43
444		26	1	24
446		29	3	27
447		43	2	37
450		48	0	43
454		45	3	41
457		35	2	32
460		32	1	27
473		24	0	19
484		41	2	39
497		16	1	15
499		21	1	19
503		29	1	24
508		51	1	47
511		94	2	82
512		28	0	26
5

In [12]:
# === 篩選「Sender 或 Receiver」在高風險帳戶內，且是洗錢交易 ===
laundering_in_high_risk = df.filter(
    (
        df["Sender_account"].is_in(high_risk_accounts) |
        df["Receiver_account"].is_in(high_risk_accounts)
    ) & (df["Is_laundering"] == "1")
)

# === 顯示洗錢交易筆數 ===
count = laundering_in_high_risk.shape[0]
print(f"🚨 高風險群組所涉及的洗錢交易筆數：{count}")


🚨 高風險群組所涉及的洗錢交易筆數：1079


In [13]:
# === 抓出「Sender = 0 或 Receiver = 0」的群組 ===
isolated_groups = []

for idx, (gid, nodes) in enumerate(groups.items(), 1):
    sender_count = sum(1 for n in nodes if in_degree.get(n, 0) == 0)
    receiver_count = sum(1 for n in nodes if out_degree.get(n, 0) == 0)
    
    if sender_count == 0 or receiver_count == 0:
        isolated_groups.append((idx, gid, len(nodes), sender_count, receiver_count))

# === 顯示結果 ===
print("\n🔴 完全出金/收金的遮斷型高風險群組：")
print("群組編號\t節點數\tSender\tReceiver")
for idx, gid, total_nodes, senders, receivers in isolated_groups:
    print(f"{idx}\t\t{total_nodes}\t{senders}\t{receivers}")

print(f"\n📊 遮斷型高風險群組數量：{len(isolated_groups)}")
# === 統計遮斷型高風險群組涵蓋的交易筆數 ===

# 擷取所有遮斷型群組的帳戶
isolated_accounts = set(
    acc for _, gid, _, _, _ in isolated_groups
    for acc in groups[gid]
)

# 篩選 sender 或 receiver 屬於這些帳戶的交易
tx_in_isolated_groups = df.filter(
    df["Sender_account"].is_in(isolated_accounts) |
    df["Receiver_account"].is_in(isolated_accounts)
)

# 顯示結果
print(f"📦 遮斷型高風險群組所涵蓋的交易總筆數：{tx_in_isolated_groups.shape[0]}")




🔴 完全出金/收金的遮斷型高風險群組：
群組編號	節點數	Sender	Receiver
60		13	0	0
64		51	0	43
65		35	0	28
101		33	0	25
106		49	0	45
111		46	0	39
143		29	0	26
176		48	0	44
202		57	0	51
206		10	0	0
226		37	0	32
243		53	0	46
251		52	0	48
253		10	0	0
288		41	0	33
327		11	0	0
359		18	0	15
369		44	0	38
450		48	0	43
473		24	0	19
512		28	0	26
556		22	0	6
582		51	0	46
609		60	0	54
624		22	0	9
627		14	0	0
636		47	0	41
651		12	0	0
656		57	0	49
709		29	0	26
723		45	0	39
736		49	0	46
764		10	0	0
852		47	0	39
868		14	0	0
924		21	0	18
926		56	0	51
940		49	0	43
1020		44	0	38
1029		49	0	42
1040		42	0	38
1134		13	0	0
1169		30	0	27
1244		38	0	31
1245		12	0	0
1276		26	0	23
1277		31	0	26
1286		12	0	0
1315		47	0	38
1324		56	0	51
1335		50	0	45
1351		48	0	40
1378		13	0	0
1400		61	0	55
1405		42	0	36
1438		53	0	48
1452		26	0	8
1491		26	0	19
1518		55	0	48
1529		34	0	26
1569		41	0	33
1575		27	0	24
1585		33	0	29
1624		33	0	28
1639		22	0	6
1699		26	0	22
1700		56	0	52
1707		49	0	42
1714		19	0	6
1720		53	0	46
1726		20	0	18
1729		20	0	7
1754	

In [14]:
# === 抓出遮斷型群組中所有帳戶 ===
isolated_accounts = set(
    acc for _, gid, _, _, _ in isolated_groups
    for acc in groups[gid]
)

# === 篩選出 洗錢交易 且發生在這些遮斷群組帳戶之中 ===
laundering_in_isolated = df.filter(
    (
        df["Sender_account"].is_in(isolated_accounts) |
        df["Receiver_account"].is_in(isolated_accounts)
    ) & (df["Is_laundering"] == "1")
)

# === 顯示結果 ===
count = laundering_in_isolated.shape[0]
print(f"🚨 遮斷型高風險群組所涉及的洗錢交易筆數：{count}")


🚨 遮斷型高風險群組所涉及的洗錢交易筆數：856


In [17]:
# === 抓出中風險群組（小圈層反覆轉帳） ===
medium_risk_groups = []

for idx, (gid, nodes) in enumerate(groups.items(), 1):
    sender_count = sum(1 for n in nodes if in_degree.get(n, 0) == 0)
    receiver_count = sum(1 for n in nodes if out_degree.get(n, 0) == 0)
    total_nodes = len(nodes)

    if sender_count <= 5 and receiver_count <= 5 and total_nodes >= 15:
        medium_risk_groups.append((idx, gid, total_nodes, sender_count, receiver_count))

# === 顯示結果 ===
print("\n🟠 中風險群組（Sender ≤ 5 且 Receiver ≤ 5 且 節點 ≥ 15）：")
print("群組編號\t節點數\tSender\tReceiver")
for idx, gid, total_nodes, senders, receivers in medium_risk_groups:
    print(f"{idx}\t\t{total_nodes}\t{senders}\t{receivers}")

print(f"\n📊 中風險群組數量：{len(medium_risk_groups)}")

# === 統計中風險群組中涉及的交易總筆數 ===
medium_risk_accounts = set(
    acc for _, gid, _, _, _ in medium_risk_groups
    for acc in groups[gid]
)

tx_in_medium_groups = df.filter(
    df["Sender_account"].is_in(medium_risk_accounts) |
    df["Receiver_account"].is_in(medium_risk_accounts)
)

print(f"📦 中風險群組所涵蓋的交易總筆數：{tx_in_medium_groups.shape[0]}")




🟠 中風險群組（Sender ≤ 5 且 Receiver ≤ 5 且 節點 ≥ 15）：
群組編號	節點數	Sender	Receiver
2536		23	0	5
2923		19	0	5
4384		18	0	0
5602		21	0	5
6271		18	0	0
7231		19	0	4
9078		15	0	0
9925		20	0	5
10157		20	0	4
10503		22	0	5
11171		19	0	5
11318		17	0	0
11433		21	0	5
12129		22	0	5
12587		21	0	5
12886		20	0	4
13571		20	0	5
13705		18	0	5
13885		16	0	0
13914		18	0	5
14429		21	0	5
14481		16	0	0
14489		18	0	1
14604		16	0	0
15055		17	0	1
15080		20	0	5
15099		22	0	5
15276		19	0	5
15322		22	0	5
15445		16	0	0
15545		17	0	4

📊 中風險群組數量：31
📦 中風險群組所涵蓋的交易總筆數：42235


In [18]:
# === 篩選中風險群組內的洗錢交易 ===

laundering_in_medium = df.filter(
    (
        df["Sender_account"].is_in(medium_risk_accounts) |
        df["Receiver_account"].is_in(medium_risk_accounts)
    ) & (df["Is_laundering"] == "1")
)

# 顯示結果
print(f"🚨 中風險群組所涉及的洗錢交易筆數：{laundering_in_medium.shape[0]}")


🚨 中風險群組所涉及的洗錢交易筆數：103


In [20]:
# === A. 建立群組風險分類 ===
high_risk_groups = []      # Sender ≤ 3 且 Receiver ≥ 10
severed_risk_groups = []   # Sender == 0 且 Receiver ≥ 10
medium_risk_groups = []    # Sender ≤ 3 且 Receiver ≥ 3（自行定義）

for idx, (gid, nodes) in enumerate(groups.items(), 1):
    sender_count = sum(1 for n in nodes if in_degree.get(n, 0) == 0)
    receiver_count = sum(1 for n in nodes if out_degree.get(n, 0) == 0)
    
    if sender_count == 0 and receiver_count >= 10:
        severed_risk_groups.append((idx, gid, len(nodes), sender_count, receiver_count))
    elif sender_count <= 3 and receiver_count >= 10:
        high_risk_groups.append((idx, gid, len(nodes), sender_count, receiver_count))
    elif sender_count <= 3 and receiver_count >= 3:
        medium_risk_groups.append((idx, gid, len(nodes), sender_count, receiver_count))

# === B. 驗證群組是否有重複 ===
high_risk_gids = set(gid for idx, gid, _, _, _ in high_risk_groups)
severed_risk_gids = set(gid for idx, gid, _, _, _ in severed_risk_groups)
medium_risk_gids = set(gid for idx, gid, _, _, _ in medium_risk_groups)

all_ids = high_risk_gids | severed_risk_gids | medium_risk_gids
assert len(all_ids) == len(high_risk_gids) + len(severed_risk_gids) + len(medium_risk_gids), "❌ 群組有重複分類"

print("✅ 驗證通過：群組分類無重複")


✅ 驗證通過：群組分類無重複
