In [46]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, silhouette_score,auc
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler  # 导入 StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import networkx as nx
import seaborn as sns
import os
# 设置 Matplotlib 的字体为支持中文的字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei'] 
# 解决负号显示问题
matplotlib.rcParams['axes.unicode_minus'] = False

数据采样

In [47]:
import pandas as pd
from sklearn.utils import shuffle

# 读取原始数据
data = pd.read_csv('.\\data\\FinancialDatasets.csv')

# 提取欺诈和非欺诈数据
fraud_data = data[data['isFraud'] == 1]
non_fraud_data = data[data['isFraud'] == 0]

# 欺诈样本数量
n_fraud = len(fraud_data)

# 设置目标欺诈比例，例如5%
fraud_ratio = 0.05

# 计算所需的非欺诈样本数量
n_non_fraud = int(n_fraud * (1 - fraud_ratio) / fraud_ratio)

# 从非欺诈数据中随机抽取所需数量的样本
non_fraud_sampled = non_fraud_data.sample(n=n_non_fraud, random_state=42)

# 合并欺诈和非欺诈样本
combined_data = pd.concat([fraud_data, non_fraud_sampled])

# 打乱数据顺序
shuffled_data = shuffle(combined_data, random_state=42).reset_index(drop=True)

# 保存为新的CSV文件
shuffled_data.to_csv('.\\data\\balanced_data.csv', index=False)


In [48]:
# 读取文件
data = pd.read_csv('.\\data\\balanced_data.csv')
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,256,TRANSFER,421119.33,C768507203,43407.56,0.00,C1187433637,528027.95,949147.28,0,0
1,514,CASH_IN,28947.82,C769629950,1949.00,30896.82,C294017772,0.00,0.00,0,0
2,38,CASH_OUT,355480.10,C992047135,0.00,0.00,C785812458,2090492.10,2445972.20,0,0
3,179,TRANSFER,272855.03,C838467987,39857.00,0.00,C312728651,121505.38,394360.42,0,0
4,228,CASH_IN,308951.31,C658714397,2050251.69,2359203.00,C266963012,1469231.63,1160280.32,0,0
...,...,...,...,...,...,...,...,...,...,...,...
164254,399,TRANSFER,426121.60,C204304883,0.00,0.00,C1192512131,5643714.16,6069835.77,0,0
164255,181,CASH_OUT,129035.89,C1034098653,12546.00,0.00,C154996261,0.00,129035.89,0,0
164256,280,PAYMENT,18265.45,C1150494225,40099.00,21833.55,M1121808623,0.00,0.00,0,0
164257,596,CASH_OUT,145505.30,C1625230665,23748.26,0.00,C338448829,473192.38,618697.68,0,0


In [49]:
data.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [50]:
data.duplicated().sum()

0

In [51]:
data.shape

(164259, 11)

## 特征工程

In [53]:
X = data.drop(['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest'], axis=1, errors='ignore')
y = data['isFraud']

# 处理分类特征
if 'type' in X.columns:
    X = pd.get_dummies(X, columns=['type'], drop_first=True)
# 创建新特征
if all(col in X.columns for col in ['oldbalanceOrg', 'newbalanceOrig']):
    X['balanceDiffOrig'] = X['oldbalanceOrg'] - X['newbalanceOrig']
                
if all(col in X.columns for col in ['oldbalanceDest', 'newbalanceDest']):
    X['balanceDiffDest'] = X['oldbalanceDest'] - X['newbalanceDest']
            
# 标准化数值特征
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

X = X.fillna(X.mean())  

In [54]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"训练集形状: {x_train.shape}, 测试集形状: {x_test.shape}")
print(f"训练集欺诈比例: {y_train.mean():.4f}, 测试集欺诈比例: {y_test.mean():.4f}")

训练集形状: (131407, 12), 测试集形状: (32852, 12)
训练集欺诈比例: 0.0500, 测试集欺诈比例: 0.0500


In [None]:
sample = SMOTE(random_state=42)
X_train,Y_train = sample.fit_resample(x_train,y_train)





In [56]:
len(Y_train[Y_train==1])

124837

## 建模

#### 1、传统机器学习

In [57]:
import joblib


rf_model = RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                min_samples_split=  2,
                min_samples_leaf= 1,
                random_state= 42,
                class_weight= 'balanced',
                n_jobs= -1)
rf_model.fit(X_train, Y_train)

# 保存模型到文件
joblib.dump(rf_model, r'E:\DevelopmentProject\BigDataCompetition\IntelligentAntiFraud\model\rf_model.pkl')

['E:\\DevelopmentProject\\BigDataCompetition\\IntelligentAntiFraud\\model\\rf_model.pkl']

In [58]:
gbc_model = GradientBoostingClassifier(
                n_estimators= 100,
                learning_rate= 0.1,
                max_depth=3,
                min_samples_split= 2,
                min_samples_leaf= 1,
                subsample=0.8,
                random_state= 42)
gbc_model.fit(X_train, Y_train)

joblib.dump(gbc_model, r'E:\DevelopmentProject\BigDataCompetition\IntelligentAntiFraud\model\gbc_model.pkl')

['E:\\DevelopmentProject\\BigDataCompetition\\IntelligentAntiFraud\\model\\gbc_model.pkl']

In [59]:
def predict(model, X):
    return model.predict(X)

def predict_proba(model, X):
    return model.predict_proba(X)

def evaluate(model, model_type, X_test, y_test):
    y_pred = predict(model, X_test)
    y_prob = predict_proba(model, X_test)[:, 1]

    # 计算评估指标
    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_prob)

    print(f"\n{model_type.upper()} 模型评估结果:")
    print(f"分类报告:\n{report}")
    print(f"混淆矩阵:\n{conf_matrix}")
    print(f"AUC分数: {auc_score:.4f}")

    return {'classification_report': report,
            'confusion_matrix': conf_matrix,
            'auc_score': auc_score}

# 评估随机森林模型
rf_results = evaluate(rf_model, 'Random Forest', x_test, y_test)

# 评估梯度提升模型
gbc_results = evaluate(gbc_model, 'Gradient Boosting', x_test, y_test)


RANDOM FOREST 模型评估结果:
分类报告:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     31209
           1       0.82      1.00      0.90      1643

    accuracy                           0.99     32852
   macro avg       0.91      0.99      0.95     32852
weighted avg       0.99      0.99      0.99     32852

混淆矩阵:
[[30855   354]
 [    8  1635]]
AUC分数: 0.9992

GRADIENT BOOSTING 模型评估结果:
分类报告:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     31209
           1       0.77      0.99      0.87      1643

    accuracy                           0.99     32852
   macro avg       0.89      0.99      0.93     32852
weighted avg       0.99      0.99      0.99     32852

混淆矩阵:
[[30727   482]
 [   10  1633]]
AUC分数: 0.9982


#### 2、深度学习

多层感知机（MLP）模型

In [60]:
# 显式转换数据类型
X_train = X_train.astype(np.float64)
x_test = x_test.astype(np.float64)

X_train_tensor = torch.FloatTensor(X_train.values)
y_train_tensor = torch.FloatTensor(Y_train.values)
X_test_tensor = torch.FloatTensor(x_test.values)
y_test_tensor = torch.FloatTensor(y_test.values)

# 定义批量大小
batch_size = 64

# 准备训练数据加载器
train_data = []
for i in range(len(y_train_tensor)):
    train_data.append([X_train_tensor[i], y_train_tensor[i]])
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# 准备验证数据加载器
val_data = []
for i in range(len(y_test_tensor)):
    val_data.append([X_test_tensor[i], y_test_tensor[i]])
val_loader = DataLoader(val_data, batch_size=batch_size)

In [61]:
# 模型构建部分（对应 DeepFraudDetector 类）
input_dim = X_train.shape[1]
hidden_dims = [128, 64]
layers = []

# 输入层
prev_dim = input_dim
for hidden_dim in hidden_dims:
    layers.extend([
        nn.Linear(prev_dim, hidden_dim),
        nn.ReLU(),
        nn.BatchNorm1d(hidden_dim),
        nn.Dropout(0.3)
    ])
    prev_dim = hidden_dim

# 输出层
layers.append(nn.Linear(prev_dim, 1))
layers.append(nn.Sigmoid())

model = nn.Sequential(*layers)

In [62]:
# 选择设备
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# 训练参数设置
epochs = 5
lr = 0.001

# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

该模型在基础MLP结构基础上优化了网络结构（更深更合理），还引入了训练机制（早停）、正则化（weight_decay）与激活函数改进（LeakyReLU），整体更加健壮、可扩展且不易过拟合

In [63]:
X_train_tensor = torch.FloatTensor(X_train.values)
y_train_tensor = torch.FloatTensor(Y_train.values)
X_test_tensor = torch.FloatTensor(x_test.values)
y_test_tensor = torch.FloatTensor(y_test.values)

batch_size = 64
train_data = []
for i in range(len(y_train_tensor)):
    train_data.append([X_train_tensor[i], y_train_tensor[i]])
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# 准备验证数据加载器
val_data = []
for i in range(len(y_test_tensor)):
    val_data.append([X_test_tensor[i], y_test_tensor[i]])
val_loader = DataLoader(val_data, batch_size=batch_size)

# 优化后的模型构建部分
class OptimizedMLP(nn.Module):
    def __init__(self, input_dim, hidden_dims=[256, 128, 64], dropout_rate=0.3):
        super(OptimizedMLP, self).__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.LeakyReLU(negative_slope=0.01),  # 使用 LeakyReLU 激活函数
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, 1))
        layers.append(nn.Sigmoid())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x).squeeze()

# 创建优化后的模型
input_dim = X_train.shape[1]
model = OptimizedMLP(input_dim)

# 选择设备
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# 训练参数设置
epochs = 20  # 增加训练轮数
lr = 0.0005  # 降低学习率
weight_decay = 0.0001  # 增加权重衰减

# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# 早停策略参数
patience = 3
best_val_auc = 0
early_stopping_counter = 0

for epoch in range(epochs):
    # 训练阶段
    model.train()
    train_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # 验证阶段
    model.eval()
    val_loss = 0
    val_preds = []
    val_true = []

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            val_loss += criterion(outputs, batch_y).item()

            val_preds.extend(outputs.cpu().numpy())
            val_true.extend(batch_y.cpu().numpy())

    # 计算验证集的 AUC
    val_auc = roc_auc_score(val_true, val_preds)

    print(f'Epoch [{epoch + 1}/{epochs}]')
    print(f'Train Loss: {train_loss / len(train_loader):.4f}')
    print(f'Val Loss: {val_loss / len(val_loader):.4f}')
    print(f'Val AUC: {val_auc:.4f}\n')

    # 早停策略
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        early_stopping_counter = 0
        #保存模型
        save_dir = 'model' 
        save_path = os.path.join(save_dir, 'best_mlp_model.pth')
        torch.save(model, save_path)
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= patience:
            print("Early stopping!")
            break

Epoch [1/20]
Train Loss: 0.1015
Val Loss: 0.0792
Val AUC: 0.9972

Epoch [2/20]
Train Loss: 0.0593
Val Loss: 0.0627
Val AUC: 0.9979

Epoch [3/20]
Train Loss: 0.0516
Val Loss: 0.0284
Val AUC: 0.9984

Epoch [4/20]
Train Loss: 0.0486
Val Loss: 0.0666
Val AUC: 0.9985

Epoch [5/20]
Train Loss: 0.0495
Val Loss: 0.0455
Val AUC: 0.9989

Epoch [6/20]
Train Loss: 0.0452
Val Loss: 0.0530
Val AUC: 0.9988

Epoch [7/20]
Train Loss: 0.0428
Val Loss: 0.0204
Val AUC: 0.9993

Epoch [8/20]
Train Loss: 0.0447
Val Loss: 0.0234
Val AUC: 0.9985

Epoch [9/20]
Train Loss: 0.0448
Val Loss: 0.0277
Val AUC: 0.9993

Epoch [10/20]
Train Loss: 0.0422
Val Loss: 0.0220
Val AUC: 0.9985

Epoch [11/20]
Train Loss: 0.0428
Val Loss: 0.0322
Val AUC: 0.9994

Epoch [12/20]
Train Loss: 0.0419
Val Loss: 0.0235
Val AUC: 0.9994

Epoch [13/20]
Train Loss: 0.0402
Val Loss: 0.0233
Val AUC: 0.9990

Epoch [14/20]
Train Loss: 0.0400
Val Loss: 0.0297
Val AUC: 0.9987

Epoch [15/20]
Train Loss: 0.0405
Val Loss: 0.0245
Val AUC: 0.9993

Earl

In [64]:
model.eval()
predictions = []

with torch.no_grad():
    for batch_X, _ in val_loader:
        batch_X = batch_X.to(device)
        outputs = model(batch_X).squeeze()
        predictions.extend(outputs.cpu().numpy())

# predictions = np.array(predictions)
predictions

[0.00036151454,
 0.00047280936,
 0.00058177137,
 0.00036862108,
 0.0008287228,
 0.00025434038,
 0.00028455537,
 0.0003797828,
 0.026909787,
 0.00033691316,
 0.0001386943,
 0.001123137,
 0.121046394,
 0.0016204665,
 0.0002926016,
 0.00029086438,
 0.082638815,
 8.808821e-05,
 0.000798108,
 0.010247156,
 0.0007709843,
 0.0009000469,
 0.00018894405,
 0.000499702,
 0.0021079597,
 0.00015683213,
 0.96640277,
 0.00026768283,
 0.0007312747,
 0.0034989053,
 0.00011629557,
 0.00023542027,
 0.0009268219,
 0.00020947366,
 0.00025178984,
 0.004600126,
 0.000274662,
 0.00075528974,
 0.99846965,
 0.00047354703,
 0.0010926703,
 0.00072203495,
 0.013530748,
 0.001784602,
 0.0015688924,
 0.00033001803,
 0.98948354,
 0.0023605342,
 0.014290971,
 0.9858477,
 0.0001966951,
 0.00020059502,
 1.6811491e-05,
 0.0002441853,
 0.00062179746,
 0.00077445613,
 8.842012e-05,
 0.002764389,
 0.004840827,
 0.00036670527,
 0.00040960184,
 0.003484156,
 0.00016608232,
 0.00084887753,
 0.00032735761,
 0.99698585,
 0.00051

## 图神经网络分析

In [65]:
from torch_geometric.nn import GCNConv, SAGEConv
from torch_geometric.data import Data
import torch
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.decomposition import IncrementalPCA
TORCH_GEOMETRIC_AVAILABLE = True
matplotlib.use('Agg')

In [66]:
# pip install torch-geometric

In [67]:
transaction_data = data[['nameOrig', 'nameDest', 'amount', 'type','isFraud']]
transaction_data

Unnamed: 0,nameOrig,nameDest,amount,type,isFraud
0,C768507203,C1187433637,421119.33,TRANSFER,0
1,C769629950,C294017772,28947.82,CASH_IN,0
2,C992047135,C785812458,355480.10,CASH_OUT,0
3,C838467987,C312728651,272855.03,TRANSFER,0
4,C658714397,C266963012,308951.31,CASH_IN,0
...,...,...,...,...,...
164254,C204304883,C1192512131,426121.60,TRANSFER,0
164255,C1034098653,C154996261,129035.89,CASH_OUT,0
164256,C1150494225,M1121808623,18265.45,PAYMENT,0
164257,C1625230665,C338448829,145505.30,CASH_OUT,0


In [68]:
class TransactionGraph:
    def __init__(self):
        self.graph = nx.DiGraph()
        self.node_features = {}
        self.edge_features = {}

    #构建交易图
    def build_graph(self, transactions_df):
        required_cols = ['nameOrig', 'nameDest', 'amount', 'type']
        for col in required_cols:
            if col not in transactions_df.columns:
                raise ValueError(f"缺少必要列: {col}")
        unique_accounts = set(transactions_df['nameOrig']).union(transactions_df['nameDest'])
        for account in unique_accounts:
            self.graph.add_node(account)
            self.node_features[account] = {
                'transaction_count': 0,
                'total_sent': 0.0,
                'total_received': 0.0,
                'is_merchant': account.startswith('M'),
                'fraud_involved': False
            }
        for _, row in transactions_df.iterrows():
            src, dst = row['nameOrig'], row['nameDest']
            amount, tx_type = float(row['amount']), row['type']
            is_fraud = row.get('isFraud', 0) == 1
            self.node_features[src]['transaction_count'] += 1
            self.node_features[src]['total_sent'] += amount
            self.node_features[dst]['transaction_count'] += 1
            self.node_features[dst]['total_received'] += amount
            if is_fraud:
                self.node_features[src]['fraud_involved'] = True
                self.node_features[dst]['fraud_involved'] = True
            if self.graph.has_edge(src, dst):
                self.graph[src][dst]['weight'] += amount
                self.graph[src][dst]['count'] += 1
                if is_fraud:
                    self.graph[src][dst]['fraud_count'] += 1
            else:
                self.graph.add_edge(src, dst, weight=amount, count=1, 
                                    fraud_count=1 if is_fraud else 0, type=tx_type)
        return self.graph

    #节点特征构造
    def get_node_features(self):
        nodes = list(self.graph.nodes())
        node_to_idx = {node: i for i, node in enumerate(nodes)}
        features = [
            [
                self.node_features[node]['transaction_count'],
                self.node_features[node]['total_sent'],
                self.node_features[node]['total_received'],
                1 if self.node_features[node]['is_merchant'] else 0,
                1 if self.node_features[node]['fraud_involved'] else 0
            ]
            for node in nodes
        ]
        return np.array(features, dtype=np.float32), node_to_idx

    #边索引和特征 
    def get_edge_index(self, node_to_idx):
        edge_index, edge_attr = [], []
        for src, dst, data in self.graph.edges(data=True):
            edge_index.append([node_to_idx[src], node_to_idx[dst]])
            edge_attr.append([
                data['weight'],
                data['count'],
                data['fraud_count'],
                1 if data['type'] == 'TRANSFER' else 0,
                1 if data['type'] == 'CASH_OUT' else 0,
                1 if data['type'] == 'PAYMENT' else 0,
                1 if data['type'] == 'DEBIT' else 0,
                1 if data['type'] == 'CASH_IN' else 0
            ])
        return np.array(edge_index).T, np.array(edge_attr, dtype=np.float32)

    #转换为 PyTorch Geometric 图结构
    def to_pytorch_geometric(self):
        x, node_to_idx = self.get_node_features()
        edge_index, edge_attr = self.get_edge_index(node_to_idx)
        data = Data(
            x=torch.tensor(x, dtype=torch.float32),
            edge_index=torch.tensor(edge_index, dtype=torch.long),
            edge_attr=torch.tensor(edge_attr, dtype=torch.float32)
        )
        return data, node_to_idx

GraphSAGE 图神经网络模型

In [69]:
class GraphSAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv3(x, edge_index)
        return x

GNN 模型

FraudGNN 构建在 GraphSAGE 之上，训练节点的图嵌入，并通过聚类分析交易图中可能的“诈骗社群”

In [70]:

class FraudGNN:
    def __init__(self, in_channels, hidden_channels=64, out_channels=32):
        self.model = GraphSAGE(in_channels, hidden_channels, out_channels)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.embeddings = None
        self.node_to_idx = None

    def train(self, data, epochs=50, lr=0.005):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        data = data.to(self.device)
        self.model.train()
        try:
            for epoch in range(epochs):
                optimizer.zero_grad()
                out = self.model(data.x, data.edge_index)
                loss = self._link_prediction_loss(out, data.edge_index)
                loss.backward()
                optimizer.step()
                if (epoch + 1) % 10 == 0:
                    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')
        except RuntimeError as e:
            print(f"训练中断，错误: {e}")

    def _link_prediction_loss(self, z, edge_index):
        pos_edge_index = edge_index
        neg_edge_index = self._negative_sampling(edge_index, z.size(0))
        pos_score = (z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim=1)
        neg_score = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=1)
        pos_loss = F.binary_cross_entropy_with_logits(pos_score, torch.ones_like(pos_score))
        neg_loss = F.binary_cross_entropy_with_logits(neg_score, torch.zeros_like(neg_score))
        return pos_loss + neg_loss

    def _negative_sampling(self, edge_index, num_nodes, num_neg_samples=None):
        if num_neg_samples is None:
            num_neg_samples = edge_index.size(1)
        edge_set = set((i.item(), j.item()) for i, j in edge_index.t())
        neg_edges = []
        while len(neg_edges) < num_neg_samples:
            i = torch.randint(0, num_nodes, (1,)).item()
            j = torch.randint(0, num_nodes, (1,)).item()
            if i != j and (i, j) not in edge_set:
                neg_edges.append([i, j])
                edge_set.add((i, j))
        return torch.tensor(neg_edges, device=edge_index.device).t()

    def get_embeddings(self, data):
        self.model.eval()
        data = data.to(self.device)
        with torch.no_grad():
            self.embeddings = self.model(data.x, data.edge_index).cpu().numpy()
        return self.embeddings

    

In [71]:
def main():

    #构建交易图
    transaction_graph = TransactionGraph()
    transaction_graph.build_graph(transaction_data)
    graph_data, node_mapping = transaction_graph.to_pytorch_geometric()

    #构建GNN模型
    gnn_model = FraudGNN(in_channels=graph_data.x.size(1))

    #创建优化器
    gnn_optimizer = torch.optim.Adam(gnn_model.model.parameters(), lr=0.005)

    #训练GNN模型
    gnn_model.train(graph_data, epochs=50,lr=0.005)

    #保存模型和优化器
    torch.save({
        'model_state_dir':gnn_model.model,
        'optimizer_state_dir':gnn_optimizer,
    },r"E:\DevelopmentProject\BigDataCompetition\IntelligentAntiFraud\model\fraud_gnn_model.pth")

    print("=== 图分析完成 ===")
if __name__ == "__main__":
    main()


Epoch 10/50, Loss: 628775168.0000
Epoch 20/50, Loss: 56753288.0000
Epoch 30/50, Loss: 10014223.0000
Epoch 40/50, Loss: 1363010.5000
Epoch 50/50, Loss: 1143635.3750
=== 图分析完成 ===


## 可视化

In [72]:
class ModelVisualizer:
    def __init__(self, output_dir='.\\0417'):
        self.output_dir = output_dir
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    
    def plot_confusion_matrix(self, y_true, y_pred, model_name, normalize=False):
        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(8, 6))
        
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            fmt = '.2f'
            title = f'{model_name} - 归一化混淆矩阵'
        else:
            fmt = 'd'
            title = f'{model_name} - 混淆矩阵'
        
        sns.heatmap(cm, annot=True, fmt=fmt, cmap='Blues', 
                    xticklabels=['正常', '欺诈'], 
                    yticklabels=['正常', '欺诈'])
        
        plt.title(title)
        plt.ylabel('真实标签')
        plt.xlabel('预测标签')
        
        # 保存图像
        output_path = os.path.join(self.output_dir, f'{model_name}_confusion_matrix.png')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        
        print(f"混淆矩阵已保存至: {output_path}")
        return output_path
    
    def plot_roc_curve(self, y_true, y_scores_dict):
        plt.figure(figsize=(10, 8))
        
        for model_name, y_score in y_scores_dict.items():
            fpr, tpr, _ = roc_curve(y_true, y_score)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.4f})')
        
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('假阳性率 (FPR)')
        plt.ylabel('真阳性率 (TPR)')
        plt.title('各模型ROC曲线对比')
        plt.legend(loc="lower right")
        
        # 保存图像
        output_path = os.path.join(self.output_dir, 'roc_curves_comparison.png')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        
        print(f"ROC曲线对比图已保存至: {output_path}")
        return output_path
    
    def plot_feature_importance(self, model, feature_names, model_name):
        if not hasattr(model, 'feature_importances_'):
            print(f"警告: {model_name} 模型没有feature_importances_属性")
            return None
        
        # 获取特征重要性
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1][:]
        
        plt.figure(figsize=(12, 8))
        plt.title(f'{model_name} - 特征重要性')
        plt.bar(range(len(indices)), importances[indices], align='center')
        plt.xticks(range(len(indices)), [feature_names[i] for i in indices], rotation=90)
        plt.tight_layout()
        
        # 保存图像
        output_path = os.path.join(self.output_dir, f'{model_name}_feature_importance.png')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        
        print(f"特征重要性图已保存至: {output_path}")
        return output_path




In [73]:
visualizer = ModelVisualizer('visualizations')
graph_viz = GraphVisualizer('visualizations')

In [74]:
# 绘制混淆矩阵
visualizer.plot_confusion_matrix(y_test, rf_model.predict(x_test), 'RandomForest', normalize=True)
visualizer.plot_confusion_matrix(y_test, gbc_model.predict(x_test), 'GBDT', normalize=True)

混淆矩阵已保存至: visualizations\RandomForest_confusion_matrix.png
混淆矩阵已保存至: visualizations\GBDT_confusion_matrix.png


'visualizations\\GBDT_confusion_matrix.png'

In [75]:
from sklearn.metrics import roc_curve, auc

# 绘制ROC曲线对比
y_scores_dict = {
    'RandomForest': rf_model.predict_proba(x_test)[:, 1],
    'GBDT': gbc_model.predict_proba(x_test)[:, 1]}
visualizer.plot_roc_curve(y_test, y_scores_dict)
    
# 绘制特征重要性
feature_names = [f'Feature_{i}' for i in range(X.shape[1])]
visualizer.plot_feature_importance(rf_model, feature_names, 'RandomForest')
visualizer.plot_feature_importance(gbc_model, feature_names, 'GBDT')

ROC曲线对比图已保存至: visualizations\roc_curves_comparison.png
特征重要性图已保存至: visualizations\RandomForest_feature_importance.png
特征重要性图已保存至: visualizations\GBDT_feature_importance.png


'visualizations\\GBDT_feature_importance.png'