In [1]:
# 3_Reconstruction_Error.ipynb
# 목적: 검증 노드의 재구성 오차 기반 이상치 탐지 수행

import os
import pickle
import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn


In [2]:

# PyTorch Geometric is a hard requirement for this notebook.
# USE_PYG is kept as a module-level flag so other cells can still read it.
USE_PYG = True
try:
    from torch_geometric.data import Data
    from torch_geometric.nn import GCNConv, SAGEConv, GATConv
except Exception as e:
    # The catch stays as broad as before; the difference is that the original
    # failure is now chained onto the RuntimeError instead of only being printed,
    # so the traceback shows why the import actually failed.
    USE_PYG = False
    print('torch_geometric not available:', e)
    raise RuntimeError('torch_geometric 필요: pip install torch-geometric') from e


In [4]:

# File paths
GRAPH_PATH = 'G_graph.gpickle'
MODEL_DIRS = ['models', '.']  # directories searched for model checkpoint files
MODEL_NAMES = ['GCN_AE', 'SAGE_AE', 'GAT_AE', 'PaperGAT_AE']

# 1) Load the graph
try:
    try:
        G = nx.read_gpickle(GRAPH_PATH)
    except AttributeError:
        # Raised when the installed networkx no longer provides read_gpickle;
        # fall back to unpickling the file directly.
        # SECURITY: pickle.load can execute arbitrary code -- only open graph
        # files that come from a trusted source.
        with open(GRAPH_PATH, 'rb') as f:
            G = pickle.load(f)
    print(f'Loaded G: nodes={G.number_of_nodes()}, edges={G.number_of_edges()}')
except Exception as e:
    # Chain the original exception so the root cause stays in the traceback.
    raise RuntimeError(f'Failed to load {GRAPH_PATH}: {e}') from e

# 2) Build the PyG Data object
#    Node features, in column order: centralitypagerank, uniqedgesIn, cycles, dayvaloutlier
nodes = list(G.nodes())
idx_map = {n: i for i, n in enumerate(nodes)}

X_list = []
for n in nodes:
    d = G.nodes[n]
    X_list.append([
        float(d.get('centralitypagerank', 0.0)),
        float(d.get('uniqedgesIn', 0)),
        float(d.get('cycles', 0)),
        1.0 if bool(d.get('dayvaloutlier', False)) else 0.0,  # boolean flag -> 0/1
    ])

# reshape(-1, 4) keeps X two-dimensional even when the graph has no nodes, so
# later code that reads X.shape[1] does not fail on an empty graph.
# NOTE(review): features are used on their raw scale (no standardisation here);
# confirm this matches the preprocessing applied when the models were trained.
X = torch.tensor(np.array(X_list, dtype=np.float64).reshape(-1, 4), dtype=torch.float)

edge_src, edge_dst, edge_attr = [], [], []
for u, v, edata in G.edges(data=True):
    if u in idx_map and v in idx_map:
        edge_src.append(idx_map[u])
        edge_dst.append(idx_map[v])
        edge_attr.append(float(edata.get('netValue', 0.0)))

edge_index = torch.tensor([edge_src, edge_dst], dtype=torch.long)
# One scalar attribute per edge as a column vector; None when there are no edges.
edge_attr = torch.tensor(edge_attr, dtype=torch.float).unsqueeze(1) if len(edge_attr) > 0 else None

data = Data(x=X, edge_index=edge_index, edge_attr=edge_attr)

# Node split: reuse train/val masks when `data` carries them, otherwise draw a
# random 80/20 split with a fixed seed.
# NOTE(review): `data` is built just above without any masks, so the first branch
# is not expected to fire as written; the validation set is therefore only
# meaningful if the training notebook used the same seed and shuffle.
if hasattr(data, 'train_mask') and hasattr(data, 'val_mask'):
    train_mask = data.train_mask
    val_mask = data.val_mask
else:
    num_nodes = X.shape[0]
    idxs = np.arange(num_nodes)
    np.random.seed(42)
    np.random.shuffle(idxs)
    train_n = int(num_nodes * 0.8)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[idxs[:train_n]] = True
    val_mask[idxs[train_n:]] = True

# 3) 모델 클래스 정의 (2_GNN과 동일한 구조)
class GCN_AE(nn.Module):
    """Graph autoencoder: two GCNConv encoder layers and a linear decoder.

    The decoder maps the latent vector back to the input feature width
    (``in_ch``), so the second output can be compared directly with the input.
    """

    def __init__(self, in_ch, hid_ch, lat_ch):
        super().__init__()
        # Attribute names are load-bearing: they form the state_dict keys that
        # saved checkpoints are matched against.
        self.conv1 = GCNConv(in_ch, hid_ch)
        self.conv2 = GCNConv(hid_ch, lat_ch)
        self.decoder = nn.Linear(lat_ch, in_ch)

    def forward(self, x, edge_index):
        """Return ``(z, recon)``: the latent embedding and its decoded reconstruction."""
        hidden = self.conv1(x, edge_index).relu()
        z = self.conv2(hidden, edge_index)
        return z, self.decoder(z)

class SAGE_AE(nn.Module):
    """Graph autoencoder: two SAGEConv encoder layers and a linear decoder.

    The decoder maps the latent vector back to the input feature width
    (``in_ch``).
    """

    def __init__(self, in_ch, hid_ch, lat_ch):
        super().__init__()
        # Attribute names are load-bearing: they form the state_dict keys that
        # saved checkpoints are matched against.
        self.conv1 = SAGEConv(in_ch, hid_ch)
        self.conv2 = SAGEConv(hid_ch, lat_ch)
        self.decoder = nn.Linear(lat_ch, in_ch)

    def forward(self, x, edge_index):
        """Return ``(z, recon)``: the latent embedding and its decoded reconstruction."""
        hidden = self.conv1(x, edge_index).relu()
        z = self.conv2(hidden, edge_index)
        return z, self.decoder(z)

class GAT_AE(nn.Module):
    """Graph autoencoder built from a linear projection, two GATConv layers and a linear decoder.

    The first attention layer uses ``heads`` heads of width ``hid_ch // heads``;
    the second uses a single head producing the ``lat_ch``-wide latent vector.
    """

    def __init__(self, in_ch, hid_ch, lat_ch, heads=4, dropout=0.3):
        super().__init__()
        per_head = hid_ch // heads  # output width of each attention head in gat1
        # Attribute names are load-bearing: they form the state_dict keys that
        # saved checkpoints are matched against.
        self.proj = nn.Linear(in_ch, hid_ch)
        self.gat1 = GATConv(hid_ch, per_head, heads=heads, dropout=dropout)
        # gat2's input width is written as per_head * heads, which can be
        # smaller than hid_ch when hid_ch is not divisible by heads.
        self.gat2 = GATConv(per_head * heads, lat_ch, heads=1, dropout=dropout)
        self.decoder = nn.Linear(lat_ch, in_ch)
        self.act = nn.ELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x, edge_index):
        """Return ``(z, recon)``: the latent embedding and its decoded reconstruction."""
        hidden = self.drop(self.act(self.proj(x)))
        hidden = self.act(self.gat1(hidden, edge_index))
        z = self.gat2(hidden, edge_index)
        return z, self.decoder(z)

class PaperGAT_AE(nn.Module):
    """GAT autoencoder with a GAT-based decoder stage.

    Encoder: linear projection to ``embed_dim`` -> 8-head GATConv (8 x 32 = 256)
    -> single-head GATConv to a 32-wide latent vector.
    Decoder: single-head GATConv (32 -> 64) -> linear layer back to ``in_ch``.
    """

    def __init__(self, in_ch, embed_dim=64, dropout=0.3):
        super().__init__()
        # Attribute names are load-bearing: they form the state_dict keys that
        # saved checkpoints are matched against.
        self.input_proj = nn.Linear(in_ch, embed_dim)
        self.gat_e1 = GATConv(embed_dim, 256 // 8, heads=8, dropout=dropout)
        self.gat_e2 = GATConv(256, 32, heads=1, dropout=dropout)
        self.gat_d1 = GATConv(32, 64, heads=1, dropout=dropout)
        self.output_proj = nn.Linear(64, in_ch)
        self.act = nn.ELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x, edge_index):
        """Return ``(z, recon)``; ``z`` is the raw gat_e2 output, taken before activation/dropout."""
        # Encoder
        hidden = self.drop(self.act(self.input_proj(x)))
        hidden = self.drop(self.act(self.gat_e1(hidden, edge_index)))
        z = self.gat_e2(hidden, edge_index)
        # Decoder
        decoded = self.drop(self.act(z))
        decoded = self.act(self.gat_d1(decoded, edge_index))
        return z, self.output_proj(decoded)

# 4) Instantiate the models and load checkpoints (when available)
# Layer widths used when building the autoencoders below.
in_dim = data.x.size(1)  # number of input features per node
hidden_dim, latent_dim = 128, 64

def try_load_model(name, model, model_dirs=None):
    """Load saved weights for ``model`` from the first usable checkpoint file.

    For every directory the files ``<name>.pt`` and ``<name>.pth`` are tried, in
    that order. A checkpoint may be a bare state_dict, a dictionary wrapping the
    state_dict under ``'state_dict'``, ``'model_state_dict'`` or ``'model'``, or
    a pickled ``nn.Module`` whose state_dict is then used.

    Args:
        name: Base file name of the checkpoint (without extension).
        model: ``nn.Module`` that receives the weights via ``load_state_dict``.
        model_dirs: Directories to search; defaults to the module-level
            ``MODEL_DIRS`` when omitted.

    Returns:
        True when weights were loaded, False otherwise. Failures are reported
        with ``print`` and never raised.
    """
    search_dirs = MODEL_DIRS if model_dirs is None else model_dirs
    candidates = [
        os.path.join(d, f'{name}{ext}')
        for d in search_dirs
        for ext in ('.pt', '.pth')
    ]

    found_any = False
    for path in candidates:
        if not os.path.exists(path):
            continue
        found_any = True
        try:
            # SECURITY: torch.load is pickle-based; only load trusted checkpoints.
            ckpt = torch.load(path, map_location='cpu')
            state = ckpt
            if isinstance(ckpt, torch.nn.Module):
                # The whole model object was saved rather than its state_dict.
                state = ckpt.state_dict()
            elif isinstance(ckpt, dict):
                # Training-style checkpoint dict: unwrap the nested state_dict.
                for key in ('state_dict', 'model_state_dict', 'model'):
                    if isinstance(ckpt.get(key), dict):
                        state = ckpt[key]
                        break
            model.load_state_dict(state)
            print(f'Loaded weights for {name} from {path}')
            return True
        except Exception as e:
            print(f'Failed to load {path}: {e}')

    if found_any:
        # Distinguish "file exists but is unusable" from "nothing on disk".
        print(f'Checkpoint file(s) for {name} exist but none could be loaded (searched: {candidates})')
    else:
        print(f'No checkpoint found for {name} (searched: {candidates})')
    return False

# One instance per architecture. The hidden width comes from `hidden_dim`
# (declared above) instead of a repeated literal, so the constant has a single
# source of truth. PaperGAT_AE fixes its own internal widths and only takes the
# embedding size.
model_instances = {
    'GCN_AE': GCN_AE(in_dim, hidden_dim, latent_dim),
    'SAGE_AE': SAGE_AE(in_dim, hidden_dim, latent_dim),
    'GAT_AE': GAT_AE(in_dim, hidden_dim, latent_dim, heads=4, dropout=0.3),
    'PaperGAT_AE': PaperGAT_AE(in_dim, embed_dim=64, dropout=0.3)
}

# Try to restore weights for every model and remember whether that worked;
# 'loaded' is False when no usable checkpoint was found.
loaded_models = {}
for name, mdl in model_instances.items():
    ok = try_load_model(name, mdl)
    loaded_models[name] = {'model': mdl, 'loaded': ok}


Loaded G: nodes=7958, edges=14128
Loaded weights for GCN_AE from models\GCN_AE.pth
Loaded weights for SAGE_AE from models\SAGE_AE.pth
Loaded weights for GAT_AE from models\GAT_AE.pth
Loaded weights for PaperGAT_AE from models\PaperGAT_AE.pth


In [5]:

# 5) 재구성 오차 계산 및 z-score 기반 이상치 탐지
from math import isfinite

def compute_recon_errors(mdl, data, device='cpu'):
    """Compute the per-node reconstruction error of ``mdl`` on ``data``.

    The model is moved to ``device`` and switched to eval mode, then run once
    without gradient tracking. ``mdl(x, edge_index)`` must return a pair whose
    second element is the reconstruction of ``x``.

    Returns:
        NumPy array with one value per row of ``data.x``: the mean squared
        difference between reconstruction and input across the feature columns.
    """
    mdl.to(device)
    mdl.eval()
    features = data.x.to(device)
    edges = data.edge_index.to(device)
    with torch.no_grad():
        _, reconstructed = mdl(features, edges)
    squared_diff = (reconstructed - features) ** 2
    return squared_diff.mean(dim=1).cpu().numpy()

results_list = []
model_frames = {}  # per-model result frames, reused for the report in step 7

# Loop-invariant pieces, computed once instead of once per model.
val_idxs = np.where(val_mask.numpy())[0]
val_addresses = [nodes[i] for i in val_idxs]

# Label-like node attributes that exist on at least one node of the graph.
possible_label_keys = ['label','is_exchange','exchange','tag','type']
present_label_keys = [k for k in possible_label_keys if any(k in G.nodes[n] for n in nodes)]

for name, info in loaded_models.items():
    mdl = info['model']
    print(f'Processing model: {name} (loaded={info["loaded"]})')
    if not info['loaded']:
        # A model without restored weights is randomly initialised; its
        # reconstruction errors would be written to a CSV that looks exactly like
        # a trained model's result, so it is skipped instead.
        print(f'  skipped {name}: no checkpoint was loaded')
        continue
    try:
        errors = compute_recon_errors(mdl, data, device='cpu')
    except Exception as e:
        print(f'Failed to compute recon for {name}: {e}')
        continue

    # Only validation nodes are scored.
    val_errs = errors[val_idxs]
    if val_errs.size == 0:
        # mean/std of an empty array are NaN; nothing useful can be reported.
        print(f'  skipped {name}: validation set is empty')
        continue

    mean = val_errs.mean()
    std = val_errs.std(ddof=0)
    # Guard against division by zero when every error is identical.
    if std == 0:
        zscores = np.zeros_like(val_errs)
    else:
        zscores = (val_errs - mean) / std

    # Per-model result table, flagged at the z > 2 and z > 3 thresholds.
    dfm = pd.DataFrame({
        'node_index': val_idxs,
        'address': val_addresses,
        'recon_error': val_errs,
        'zscore': zscores,
        'anomaly_z2': zscores > 2,
        'anomaly_z3': zscores > 3
    })
    # Attach any known labels stored as node attributes.
    for k in present_label_keys:
        dfm[k] = [G.nodes[n].get(k) for n in val_addresses]

    # Summary counts
    cnt_z2 = int(dfm['anomaly_z2'].sum())
    cnt_z3 = int(dfm['anomaly_z3'].sum())
    print(f'  val_count={len(val_idxs)} mean={mean:.6f} std={std:.6f} anomalies(z>2)={cnt_z2} (z>3)={cnt_z3}')

    # Save the individual model results
    out_csv = f'recon_errors_{name}.csv'
    dfm.to_csv(out_csv, index=False)
    print(f'  saved -> {out_csv}')
    model_frames[name] = dfm
    results_list.append({'model': name, 'val_count': int(len(val_idxs)), 'mean': float(mean), 'std': float(std), 'anomaly_z2': cnt_z2, 'anomaly_z3': cnt_z3, 'csv': out_csv})

# 6) Save the combined summary
df_summary = pd.DataFrame(results_list)
df_summary.to_csv('recon_errors_summary.csv', index=False)
print('\nSummary:')
print(df_summary)
print('\nSaved summary -> recon_errors_summary.csv')

# 7) Print the strongest anomalies (z > 3) per model.
#    The in-memory frames are used instead of re-reading the CSVs just written,
#    which avoids a disk round-trip and any dtype re-inference by read_csv.
for r in results_list:
    dfm = model_frames[r['model']]
    high = dfm.loc[dfm['anomaly_z3']].sort_values('zscore', ascending=False).head(20)
    print(f"\nTop anomalies (z>3) for {r['model']} (showing up to 20):\n", high[['node_index','address','recon_error','zscore']])

# 끝: 생성된 파일들 -> recon_errors_*.csv, recon_errors_summary.csv



Processing model: GCN_AE (loaded=True)
  val_count=1592 mean=1778398.375000 std=16291265.000000 anomalies(z>2)=1 (z>3)=1
  saved -> recon_errors_GCN_AE.csv
Processing model: SAGE_AE (loaded=True)
  val_count=1592 mean=59957.285156 std=40967.734375 anomalies(z>2)=156 (z>3)=4
  saved -> recon_errors_SAGE_AE.csv
Processing model: GAT_AE (loaded=True)
  val_count=1592 mean=3018049.250000 std=1117918.625000 anomalies(z>2)=0 (z>3)=0
  saved -> recon_errors_GAT_AE.csv
Processing model: PaperGAT_AE (loaded=True)
  val_count=1592 mean=4197445.500000 std=2067450.250000 anomalies(z>2)=0 (z>3)=0
  saved -> recon_errors_PaperGAT_AE.csv

Summary:
         model  val_count          mean           std  anomaly_z2  anomaly_z3  \
0       GCN_AE       1592  1.778398e+06  1.629126e+07           1           1   
1      SAGE_AE       1592  5.995729e+04  4.096773e+04         156           4   
2       GAT_AE       1592  3.018049e+06  1.117919e+06           0           0   
3  PaperGAT_AE       1592  4.197446