In [1]:
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data


In [2]:
torch.__version__

'2.1.1+cu118'

In [3]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
import torch.optim as optim

# 加载数据
data = np.load('data.npz')
x = data['x']
y = data['y']
edge_index = data['edge_index']
train_mask_t= data['train_mask']
test_mask = data['test_mask']
edge_type = data['edge_type']
np.random.seed(42)
np.random.shuffle(train_mask_t)
train_mask = train_mask_t[:int(len(train_mask_t)/10*8)]
valid_mask = train_mask_t[int(len(train_mask_t)/10*8):]


In [4]:
np.unique(edge_type,return_counts=True)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int64),
 array([18766,  1758,  2949, 35076, 63909, 26108,   107,  5044,  5589,
         5493,  2760], dtype=int64))

### 开始特征工程
#### 1.节点重要度
1.1度中心性
$c_D (v)=\frac{deg(v)}{n-1}$
有向图要计算出度中心性和入度中心性

In [5]:
#构造每个节点的入度和出度
num_nodes = x.shape[0]
in_degree = np.zeros(x.shape[0], dtype=int)
out_degree = np.zeros(x.shape[0], dtype=int)
for src,dst in edge_index:
    in_degree[dst]+=1
    out_degree[src]+=1
indeg_centrality=in_degree/(num_nodes-1)
outdeg_centrality=out_degree/(num_nodes-1)

1.2 接近中心性
衡量节点影响范围和在信息传播中的作用，是节点到其他节点平均距离的倒数
$c_c(v)=\frac{1}{\sum_{u\neq v }d(u,v)}$

In [6]:
import networkx as nx
G = nx.DiGraph()
G.add_edges_from(edge_index)
# 计算所有节点对的最短路径长度
path_lengths = dict(nx.all_pairs_shortest_path_length(G))
closeness_centrality = np.zeros(num_nodes, dtype=float)
for node in path_lengths:
    total_distance = sum(path_lengths[node].values())
    if total_distance > 0:  # 避免除以零
        closeness_centrality[node] =1/ total_distance
print(closeness_centrality)

[0. 0. 0. ... 1. 0. 0.]


1.3 介数中心性
指节点在所有最短路径中出现的次数。该指标可以衡量节点在信息传播和资源流动中的作用。
$C_B(v)=\sum_{s\neq v\neq t}\frac{\sigma(s,t|v)}{\sigma(s,t)}$
其中$\sigma(s,t)$表示s与t之间的最短路径数，$\sigma(s,t|v)$表示v在最短路径中出现的次数

In [7]:
# betweenness_centrality = nx.betweenness_centrality(G)

1.4 特征向量重要度
指节点的重要性与其相邻节点的重要性有关。如果一个节点与其他重要节点相连，那么它的重要性也会提高。
$C_E(v)=\frac{1}{\lambda}\sum_{u\in N(v)}C_E(u)$
其中N(v)表示节点u的邻居节点集合，$\lambda$是常数，满足$Av=\lambda v$A是邻接矩阵，v是特征向量

In [8]:
eigenvector_centrality = nx.eigenvector_centrality_numpy(G)
eigenvector_centrality_array = np.array([eigenvector_centrality.get(node, 0) for node in range(num_nodes)])

1.5 集群系数
计算方法为：该节点的周围节点之间相连数 / 该节点与周围节点相连数，值域为[0,1]


In [9]:
#转为无向图计算
G_undirected = G.to_undirected()
# 计算集群系数
clustering_coefficient = nx.clustering(G_undirected)
# 将集群系数结果转换为数组形式
clustering_coefficient_array = np.array([clustering_coefficient.get(node, 0) for node in range(num_nodes)])

1.6 pagerank节点相对重要性
一个节点向它指向的每个节点“投票”，每个投票的权重是投票者的重要性除以它的出度（即它指向的节点数）

In [10]:
page_rank=nx.pagerank(G, alpha=0.85)
pagerank = np.array([page_rank.get(node, 0) for node in range(num_nodes)])

# 输出PageRank值
print(pagerank)

[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 2.23319579e-06
 4.38268035e-06 0.00000000e+00]


1.7 katz centrality 
Katz中心性的定义基于以下思想：节点的重要性是其所有邻居的重要性的总和，但是每条路径的贡献通过一个衰减因子α逐步减少。这个衰减因子α是一个小于网络所有权重的特征值的正数，用来控制路径长度对中心性计算的影响。
$C_{\text{Katz}}(i) = \sum_{k=1}^{\infty} \sum_{j=1}^{n} \alpha^k \cdot (A^k)_{ji}$

In [11]:
katz_centrality = nx.katz_centrality(G, alpha=0.005)

# 将Katz中心性结果转换为数组形式
katz_centrality_array = np.array([katz_centrality.get(node, 0) for node in range(num_nodes)])

1.8 HITS
算法是一种用于网络分析的算法，主要用于识别网络中的“权威”（Authorities）和“枢纽”（Hubs）
权威（Authorities）：这些是被许多枢纽指向的节点。在互联网环境中，一个权威页面是一个提供特定主题高质量信息的页面，被许多其他页面通过链接引用。
枢纽（Hubs）：枢纽是指向多个权威的节点。一个好的枢纽是指一个包含许多权威链接的页面。

In [12]:
# # 计算HITS的权威和枢纽分数
# hubs, authorities = nx.hits(G)
# 
# # 将结果转换为数组形式
# hubs_array = np.array([hubs.get(node, 0) for node in range(max(edge_index.flatten()) + 1)])
# authorities_array = np.array([authorities.get(node, 0) for node in range(max(edge_index.flatten()) + 1)])

In [13]:
# 计算所有入边的权重和
# 初始化节点特征为0（这里假设节点特征是单一值，如果有多维特征，需要调整这里）
in_weight=np.zeros(num_nodes,dtype=int)
out_weight=np.zeros(num_nodes,dtype=int)
# 累加每个节点的所有入边的边类型
for i, edge in enumerate(edge_index):
    src, dst = edge
    in_weight[dst] += edge_type[i]
    out_weight[src]+=edge_type[i]

In [14]:
triangle_count = nx.triangles(G_undirected)
triangle_count=np.array([triangle_count.get(node, 0) for node in range(num_nodes)])

In [15]:
# # 计算所有可能节点对的杰卡德相似性
# jaccard_coefficients = list(nx.jaccard_coefficient(G_undirected))
# 
# # 初始化计数器
# count_above_threshold =np.zeros(num_nodes,dtype=int)
# 
# # 对于每个节点对的杰卡德相似性，如果大于阈值，则增加计数
# for u, v, coeff in jaccard_coefficients:
#     if coeff > 0.5:
#         count_above_threshold[u] += 1
#         count_above_threshold[v] += 1

### 2.社区属性

In [16]:
import community as community_louvain

# 使用Louvain算法进行社区检测
partition = community_louvain.best_partition(G_undirected)
community_label=np.array([partition.get(node, 0) for node in range(num_nodes)])

In [17]:
constraints = nx.constraint(G)
constraint=np.array([constraints.get(node, 0) for node in range(num_nodes)])
constraint=np.nan_to_num(constraint,nan=0)

In [18]:
def to_column_vector(arr):
    return np.array(arr).reshape(-1, 1)

# 将所有特征转换为列向量
indeg_col = to_column_vector(indeg_centrality)
outdeg_col = to_column_vector(outdeg_centrality)
closeness_col = to_column_vector(closeness_centrality)
eigenvector_col = to_column_vector(eigenvector_centrality_array)
clustering_col = to_column_vector(clustering_coefficient_array)
pagerank_col = to_column_vector(pagerank)
katz_col = to_column_vector(katz_centrality_array)
community_col = to_column_vector(community_label)
triangle_col= to_column_vector(triangle_count)
in_weight_col=to_column_vector(in_weight)
out_weight_col=to_column_vector(out_weight)
constraint_col=to_column_vector(constraint)
x_train=np.hstack((x,indeg_col,outdeg_col,closeness_col,eigenvector_col,clustering_col,pagerank_col,katz_col,community_col,triangle_col,in_weight_col,out_weight_col,constraint_col))

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# 对特征矩阵进行标准化
x_train = scaler.fit_transform(x_train)

In [20]:
# 将数据转换为PyTorch张量
x_tensor = torch.tensor(x_train, dtype=torch.float)
y_tensor = torch.tensor(y, dtype=torch.long)
edge_index_tensor = torch.tensor(edge_index.T, dtype=torch.long)
edge_type_tensor = torch.tensor(edge_type, dtype=torch.long)

# 创建掩码
num_nodes = x.shape[0]
train_mask_tensor = torch.zeros(num_nodes, dtype=torch.bool)
test_mask_tensor = torch.zeros(num_nodes, dtype=torch.bool)
val_mask_tensor=torch.zeros(num_nodes, dtype=torch.bool)
train_mask_tensor[train_mask] = True
test_mask_tensor[test_mask] = True
val_mask_tensor[valid_mask]=True
# 构造PyTorch Geometric的Data对象
data = Data(x=x_tensor, edge_index=edge_index_tensor, edge_type=edge_type_tensor,y=y_tensor)
data.train_mask = train_mask_tensor
data.test_mask = test_mask_tensor
data.val_mask=val_mask_tensor

In [21]:
from torch_geometric.datasets import Entities
from torch_geometric.nn import RGATConv
import os.path as osp
path = osp.join( './', 'Entities')
dataset = Entities(path, 'AIFB')
data_demo = dataset[0]
data_demo.x = torch.randn(data_demo.num_nodes, 16)

In [22]:
data_demo.x[data_demo.train_idx]

tensor([[-0.5699,  0.5517,  1.0438,  ..., -1.2942,  0.5713, -1.0686],
        [-1.1967, -0.9412,  0.1415,  ...,  1.1197,  0.2940, -0.1127],
        [-1.5150,  1.7539, -0.9283,  ...,  1.0184, -1.2091, -0.4294],
        ...,
        [ 1.3246,  0.1571, -0.1912,  ..., -0.3914, -0.0574,  0.9096],
        [ 0.6003, -1.5452, -0.1763,  ...,  0.4012, -0.1707,  0.1311],
        [-0.7009,  0.2053, -1.0368,  ...,  0.6078, -0.5184, -1.2131]])

In [23]:
data.x[data.train_mask]

tensor([[-0.5609,  0.9173, -0.7178,  ..., -0.4412, -0.4761, -0.5409],
        [-0.5609, -0.5913, -0.7178,  ..., -0.4412, -0.4761, -0.5409],
        [-0.5609,  0.4144, -0.7178,  ..., -0.4412, -0.4761, -0.5409],
        ...,
        [-0.5609,  0.4144,  0.0376,  ..., -0.4412, -0.4761, -0.5409],
        [ 1.7001, -0.0885, -0.7178,  ..., -0.4412, -0.1416,  2.0200],
        [ 1.7001, -0.0885,  0.2910,  ..., -0.4412, -0.4761, -0.5409]])

In [24]:
data.y[train_mask].shape

torch.Size([308883])

In [25]:
#这个GATv2还有点问题
from torch import scatter_add
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, MessagePassing,SAGEConv,GCN2Conv,GATv2Conv,GINConv,RGATConv
from torch.nn import Linear, BatchNorm1d, Dropout
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import SAGEConv
torch.cuda.empty_cache()
num_relations=11
class RGAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_relations):
        super().__init__()
        self.conv1 = RGATConv(in_channels, 2*hidden_channels, num_relations)
        self.conv2 = RGATConv(2*hidden_channels, 2*hidden_channels, num_relations)
        self.conv3 = RGATConv(2*hidden_channels, hidden_channels, num_relations)
        self.lin = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_type):
        x = self.conv1(x, edge_index, edge_type).relu()
        x = self.conv2(x, edge_index, edge_type).relu()
        x =self.conv3(x, edge_index, edge_type).relu()
        x = self.lin(x)
        return x


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
model = RGAT(data.num_node_features, 32, 2, 12).to(device)

# 损失函数和优化器
criterion = torch.nn.CrossEntropyLoss()  # 适用于多类分类任务
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# 将数据移动到设备
data = data.to(device)
print(model)

RGAT(
  (conv1): RGATConv(29, 64, heads=1)
  (conv2): RGATConv(64, 64, heads=1)
  (conv3): RGATConv(64, 32, heads=1)
  (lin): Linear(in_features=32, out_features=2, bias=True)
)


In [26]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index, data.edge_type)
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# 训练模型
for epoch in range(200):  # 可根据需要调整epoch数量
    loss = train()
    print(f'Epoch {epoch+1}: Loss {loss:.4f}')


def predict(model, data):
    model.eval()
    with torch.no_grad():
        # 确保传递了edge_type参数
        out = model(data.x, data.edge_index, data.edge_attr)
        predictions = torch.softmax(out, dim=1)
        return predictions


# 使用模型进行预测
predictions = predict(model, data)

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.28 GiB. GPU 0 has a total capacty of 6.00 GiB of which 261.55 MiB is free. Of the allocated memory 3.27 GiB is allocated by PyTorch, and 716.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [20]:
from torch_geometric.nn import GCNConv, GATConv, MessagePassing,SAGEConv,GCN2Conv,GATv2Conv,TransformerConv
from torch import scatter_add
from torch.nn import Linear
import torch.nn.functional as F
from torch.nn import Linear, BatchNorm1d, Dropout
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import SAGEConv
import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

#SAGE是目前跑出来最好的，GAT其次，GCN最差
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GATConv(data.num_node_features, 2*hidden_channels)
        self.bn1 = BatchNorm1d(2*hidden_channels)
        self.conv2 =  GATConv(2*hidden_channels, 2*hidden_channels)
        self.bn2 = BatchNorm1d(2*hidden_channels)
        self.conv3 =  GATConv(2*hidden_channels, hidden_channels)
        self.bn3 = BatchNorm1d(hidden_channels)
        self.conv4 =  GATConv(hidden_channels, hidden_channels)
        self.bn4 = BatchNorm1d(hidden_channels)
        self.conv5 =  GATConv(hidden_channels, hidden_channels)
        self.bn5 = BatchNorm1d(hidden_channels)
        self.lin = Linear(hidden_channels, 2)
        self.dropout1 = Dropout(p=0.2)
        self.dropout2=Dropout(p=0.05)

    def forward(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x=self.dropout2(x)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x=self.dropout2(x)
        x = x.relu()
        x = self.conv3(x, edge_index)
        x = self.bn3(x)
        x=self.dropout2(x)
        x = x.relu()
        x = self.conv4(x, edge_index)
        x = self.bn4(x)
        x = x.relu()
        x = self.conv5(x, edge_index)
        x = self.bn5(x)
        x = x.relu()
        x = self.dropout1(x)
        x = self.lin(x)

        return x
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data.to(device)
model = GCN(hidden_channels=64).to(device)
print(model)

GCN(
  (conv1): GATConv(29, 128, heads=1)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): GATConv(128, 128, heads=1)
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): GATConv(128, 64, heads=1)
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): GATConv(64, 64, heads=1)
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv5): GATConv(64, 64, heads=1)
  (bn5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=64, out_features=2, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (dropout2): Dropout(p=0.05, inplace=False)
)


In [21]:
from torch_geometric.nn import MessagePassing

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# 训练函数
def train():
    model.train()
    optimizer.zero_grad()

    out = model(data.x, data.edge_index,data.edge_attr)
    loss = criterion(out[train_mask], data.y[train_mask])

    loss.backward()
    optimizer.step()
    return loss.item()

# 预测函数
def predict(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data.x,data.edge_index,data.edge_attr)
        predictions = torch.softmax(out, dim=1)
        return predictions

# 训练模型
for epoch in range(350):
    loss = train()
    print(f'Epoch {epoch}: Loss {loss}')

# 使用模型进行预测
predictions = predict(model, data)

OutOfMemoryError: CUDA out of memory. Tried to allocate 142.00 MiB. GPU 0 has a total capacty of 6.00 GiB of which 0 bytes is free. Of the allocated memory 4.42 GiB is allocated by PyTorch, and 221.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# 构造测试集

In [25]:
#用valdata本地测一下分数
from sklearn.metrics import auc,roc_auc_score,roc_curve
correct=0
pred_test=predictions.cpu().numpy()[valid_mask]
auc_score=roc_auc_score(y[valid_mask],pred_test[:,1])
auc_score

0.7333391906653455

In [8]:
from torch_geometric.nn import MessagePassing, global_mean_pool
import torch.nn.functional as F
import torch

class EdgeWeightedConv(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super(EdgeWeightedConv, self).__init__(aggr='add')  # "add" aggregation.
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index, edge_weight):
        # x: [N, in_channels], edge_index: [2, E], edge_weight: [E]
        edge_weight = edge_weight.unsqueeze(-1)  # [E, 1]
        return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x, edge_weight=edge_weight)

    def message(self, x_j, edge_weight):
        return edge_weight * self.lin(x_j)

class Net(torch.nn.Module):
    def __init__(self, num_node_features, hidden_dim):
        super(Net, self).__init__()
        self.conv1 = EdgeWeightedConv(num_node_features, hidden_dim)
        self.conv2 = EdgeWeightedConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, 2)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr

        x = F.relu(self.conv1(x, edge_index, edge_weight))
        x = F.relu(self.conv2(x, edge_index, edge_weight))
        x = F.dropout(x, training=self.training)
        x = self.fc(x)
        return x

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 使用示例
model = Net(17, hidden_dim=64).to(device)
# 判断是否有GPU
data = data.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

# 训练函数
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# 预测函数
def predict(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data)
        predictions = torch.softmax(out, dim=1)
        return predictions

# 训练模型
for epoch in range(200):
    loss = train(model, data, optimizer, criterion)
    print(f'Epoch {epoch}: Loss {loss}')

# 使用模型进行预测
predictions = predict(model, data)



Epoch 0: Loss 1.4810748100280762
Epoch 1: Loss 0.5523459315299988
Epoch 2: Loss 0.5455896258354187
Epoch 3: Loss 0.5382967591285706
Epoch 4: Loss 0.5338159203529358
Epoch 5: Loss 0.5285800695419312
Epoch 6: Loss 0.5176761150360107
Epoch 7: Loss 0.5037001371383667
Epoch 8: Loss 0.48367154598236084
Epoch 9: Loss 0.46988195180892944
Epoch 10: Loss 0.4525890350341797
Epoch 11: Loss 0.4334716200828552
Epoch 12: Loss 0.4127180278301239
Epoch 13: Loss 0.39623966813087463
Epoch 14: Loss 0.37849104404449463
Epoch 15: Loss 0.36127474904060364
Epoch 16: Loss 0.34692028164863586
Epoch 17: Loss 0.3384524881839752
Epoch 18: Loss 0.33358246088027954
Epoch 19: Loss 0.3263548016548157
Epoch 20: Loss 0.31523656845092773
Epoch 21: Loss 0.3050832450389862
Epoch 22: Loss 0.29544883966445923
Epoch 23: Loss 0.28744494915008545
Epoch 24: Loss 0.27970683574676514
Epoch 25: Loss 0.27338606119155884
Epoch 26: Loss 0.2675369381904602
Epoch 27: Loss 0.2599165141582489
Epoch 28: Loss 0.25341030955314636
Epoch 29: L

In [65]:
from sklearn.metrics import auc,roc_auc_score,roc_curve
correct=0
pred_test=predictions.cpu().numpy()[valid_mask]
auc_score=roc_auc_score(y[valid_mask],pred_test[:,1])

In [None]:
prob=torch.nn.functional.softmax(predictions, dim=1)

In [25]:
pred=predictions.cpu().numpy()[test_mask]
pred[:,1]=pred[:,1].astype(float)

In [45]:
predictions.argmax(dim=1)

tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')

In [46]:
(predictions.argmax(dim=1)==0).sum()

tensor(579157, device='cuda:0')

In [25]:

pred = model(data).detach().cpu().numpy()
y

TypeError: GCN.forward() missing 2 required positional arguments: 'edge_index' and 'edge_attr'

In [44]:
predy=pred[test_mask]

IndexError: index 551751 is out of bounds for axis 0 with size 193053

In [43]:
np.where(pred[:,0]<0.5)

(array([], dtype=int64),)

In [26]:
result=pd.DataFrame({'index':test_mask,'predict':pred[:,1]})
result.to_csv('gcn.csv',index=None)