In [5]:
import os
import sys
import pickle as pkl
import networkx as nx
import numpy as np
import scipy.sparse as sp

In [9]:
# Location and naming of the raw dataset files: <data_path>/ind.<dataset_str>.<name>
data_path = "Raw_Data/Pubmed"
dataset_str = "pubmed"
names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']

# Under Python 3 the pickles are decoded with encoding='latin1'
# (presumably because the files were written by Python 2 -- TODO confirm).
load_kwargs = {'encoding': 'latin1'} if sys.version_info > (3, 0) else {}

objects = []
for name in names:
    split_path = os.path.join(data_path, "ind.{}.{}".format(dataset_str, name))
    with open(split_path, 'rb') as f:
        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
        objects.append(pkl.load(f, **load_kwargs))

# Unpack in the same order as `names`.
x, y, tx, ty, allx, ally, graph = tuple(objects)

## Note

通过分析后发现：<br>
np.vstack((ally, ty))就是按照节点标识符顺序（0-19716）所对应的标签。<br>
sp.vstack((allx, tx)).tolil()就是按照节点标识符顺序（0-19716）所对应的特征矩阵。

## 1. 处理并保存标签

In [42]:
# Stack `ally` on top of `ty`, then take the column index of each row's
# maximum as that node's integer label.
labels = np.vstack((ally, ty)).argmax(axis=1)
print(len(labels))

# Output file: one "<node_idx>, <label>" line per node.
label_file = 'Processed_Data/Pubmed/pubmed.labels'

with open(label_file, 'w') as lb_f:
    for node_idx, lb in enumerate(labels):
        lb_temp = f"{node_idx}, {lb}\n"
        lb_f.write(lb_temp)

# Echo the last line written as a quick sanity check, then confirm the path.
print(lb_temp)
print(f"Labels saved to {label_file}")

19717
19716, 2

Labels saved to Processed_Data/Pubmed/pubmed.labels


## 2. 处理并保存特征矩阵

In [50]:
# Stack `allx` on top of `tx` and convert the result to LIL format.
stacked_features = sp.vstack((allx, tx))
features = stacked_features.tolil()
# Bare expression so the notebook displays the matrix summary.
features

<19717x500 sparse matrix of type '<class 'numpy.float32'>'
	with 988031 stored elements in List of Lists format>

In [51]:
# 将稀疏矩阵转换为密集矩阵
dense_features = features.toarray()
print(dense_features.shape)
print(dense_features[1])

(19717, 500)
[0.         0.         0.         0.         0.         0.
 0.         0.0164341  0.         0.         0.         0.
 0.         0.         0.         0.         0.03209745 0.
 0.02216047 0.         0.         0.         0.02784638 0.
 0.         0.         0.         0.02351782 0.         0.
 0.         0.         0.         0.         0.         0.
 0.01473095 0.         0.         0.         0.         0.02362674
 0.         0.         0.         0.         0.         0.02032989
 0.         0.01064451 0.         0.         0.         0.03039469
 0.         0.         0.01828312 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.00836507
 0.         0.         0.         0.         0.         0.
 0.0190884  0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.04266247
 0.         0.         0.         0.         0.         0.0183

In [52]:
# 打开文件以写入模式
feature_file = 'Processed_Data/Pubmed/pubmed.features'

with open(feature_file, 'w') as fea_f:
    
    # 遍历特征矩阵的每一行，将节点索引和特征保存到文件中
    for node_index, feature in enumerate(dense_features):
        formatted_feature = ', '.join(map(str, feature))
        fea_f.write(f"{node_index}: {formatted_feature}\n")

print(f"Features saved to {feature_file}")

Features saved to Processed_Data/Pubmed/pubmed.features


## 3. 处理并保存邻接表

In [77]:
# 将图转换成邻接表形式的字符串，确保不会出现重复边
adjacency_list = set()  # 使用集合来避免重复边
for node, neighbors in graph.items():
    for neighbor in neighbors:
        edge = (int(node), int(neighbor))
        reverse_edge = (int(neighbor), int(node))
        if reverse_edge not in adjacency_list:
            adjacency_list.add(edge)

print(len(adjacency_list))
print(adjacency_list)

44327
{(6690, 11450), (12278, 16698), (2098, 2979), (11354, 11883), (688, 7316), (6715, 6784), (4257, 15130), (4744, 7112), (11784, 16540), (4455, 7009), (17868, 18986), (1275, 8015), (1545, 11848), (7830, 12298), (12955, 17290), (13853, 18440), (10575, 18911), (1582, 2281), (3677, 7092), (8580, 17899), (10650, 15298), (5595, 7103), (15982, 18671), (1763, 14370), (14008, 18728), (8077, 19530), (925, 4411), (1721, 17150), (3715, 17183), (10558, 14019), (2120, 10443), (9185, 9378), (15664, 16295), (3918, 7154), (2945, 9802), (9402, 17714), (9882, 18351), (8414, 12884), (8655, 9563), (8661, 13515), (16918, 18211), (10226, 15128), (500, 3831), (1021, 15611), (5871, 16958), (10162, 11171), (2982, 9842), (6273, 13740), (11070, 13763), (1098, 2999), (1884, 3154), (2493, 7399), (18707, 18751), (12778, 13964), (856, 6095), (1365, 5972), (2523, 4043), (3564, 3781), (15841, 19166), (4620, 6271), (7063, 8382), (4221, 5219), (3572, 15161), (1565, 18276), (4411, 19114), (3489, 4546), (12924, 18111),

In [78]:
# 验证边数是否为44327
sparse_adj_matrix = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

# 保留上三角部分
upper_triangular_matrix = sp.triu(sparse_adj_matrix)

# 获取非零元素的行和列索引
rows, cols = upper_triangular_matrix.nonzero()

# 将边的信息保存到文本文件
edges = []
for row, col in zip(rows, cols):
    edge = (row, col)
    edges.append(edge)

# edges是排序好的边
print(len(edges))
print(edges)

44327
[(0, 1378), (0, 1544), (0, 6092), (0, 7636), (0, 14442), (1, 2943), (1, 8359), (1, 10199), (2, 10471), (2, 11485), (2, 15572), (3, 8249), (4, 14044), (5, 1312), (5, 12968), (6, 767), (6, 2128), (6, 2216), (6, 3150), (6, 3509), (6, 4464), (6, 6572), (6, 6697), (6, 7296), (6, 7335), (6, 7691), (6, 8661), (6, 8981), (6, 9232), (6, 10265), (6, 12098), (6, 13655), (6, 13656), (6, 16720), (6, 17284), (6, 18121), (6, 18614), (7, 1568), (7, 1588), (7, 2019), (7, 2343), (7, 4058), (7, 5564), (7, 6242), (7, 8335), (7, 10243), (7, 14688), (7, 14754), (7, 14843), (7, 15577), (7, 17354), (7, 17955), (7, 18425), (7, 19376), (8, 3157), (9, 221), (9, 1456), (9, 7875), (9, 7956), (9, 8915), (9, 11186), (9, 12664), (9, 16622), (9, 19447), (10, 3224), (10, 3287), (10, 5579), (10, 6631), (10, 12126), (10, 14471), (11, 18813), (12, 389), (12, 863), (12, 1048), (12, 2617), (12, 9366), (12, 10962), (12, 12111), (12, 12597), (12, 17889), (12, 19494), (13, 13940), (14, 2501), (15, 4442), (15, 7267), (15,

In [83]:
# 过滤自环
filtered_edges = [(node1, node2) for node1, node2 in edges if node1 != node2]
print(len(filtered_edges))

44324


In [84]:
# Output file: one "<node1>, <node2>" line per remaining edge.
edge_file = 'Processed_Data/Pubmed/pubmed.edges'

with open(edge_file, 'w') as edge_f:
    for src, dst in filtered_edges:
        e_temp = f"{src}, {dst}\n"
        edge_f.write(e_temp)

# Echo the last line written as a quick sanity check, then confirm the path.
print(e_temp)
print(f"Edges saved to {edge_file}")

19474, 19609

Edges saved to Processed_Data/Pubmed/pubmed.edges


In [85]:
# 查看孤立节点
all_node = np.arange(19717)
list1 = np.unique(all_node)
list2 = np.unique(sorted_edges)
print(len(list1))
print(len(list2))

len(set(list1) - set(list2))

19717
19717


0

In [87]:
# 统计自环的数量
self_loops_count = sum(node1 == node2 for node1, node2 in filtered_edges)
print("Number of Self-loops:", self_loops_count)

Number of Self-loops: 0
