In [160]:
import numpy as np

## 1. 从content文件中提取节点标识符、节点特征、标签

特征文件的一行包含：节点标识(1) + 节点特征(1433) + 标签(1)，<br>
如：'20043\t(1433个数值)\t0\tNeural_Networks'

In [161]:
content_file_path = 'Raw_Data/Cora/cora_v1/cora.content'

# 提取特征、节点标识、标签
features = []
nodes = []
labels = []

with open(content_file_path, 'r') as content_file:

    for line in content_file:
        f = line.strip().split("\t")
        
        # 首位为节点idx
        node = f[0]
        # print(node)
        
        # 除了首位和末位，均为特征
        feature = (f[1:-1])
        # print(feature)
        
        # 末位为标签
        label = f[-1]
        # print(label)
        
        nodes.append(node)
        features.append(feature)
        labels.append(label)
          
# 节点个数
# print(nodes)
print(len(nodes))

# 特征矩阵维度
# print(features[0])
print(f"({len(features)}, {len(features[0])})")

# 标签个数
# print(labels)
print(len(labels))

2708
(2708, 1433)
2708


## 2. 处理并保存节点特征

cora.features文件实质上已经存下了被映射过后的节点标识符<br>
特征矩阵第一行对应的节点映射为0，第二行映射为1，以此类推。

In [162]:
# 打开文件以写入模式
feature_file = 'Processed_Data/Cora/cora.features'

with open(feature_file, 'w') as fea_f:
    
    # 遍历特征矩阵的每一行，将节点索引和特征保存到文件中
    for node_index, feature in enumerate(features):
        formatted_feature = ', '.join(map(str, feature))
        fea_f.write(f"{node_index}: {formatted_feature}\n")

print(f"Features saved to {feature_file}")

Features saved to Processed_Data/Cora/cora.features


## 3. 处理并保存邻接表

重映射邻接表节点标识到为从0开始。cora.features文件实质上已经存下了被映射过后的节点标识符，但是邻接表的节点表示符还没有修改

In [163]:
# 先读取原始的节点标识进行重映射
idx_map = {}

for i in range(len(nodes)):
    idx_map.update({int(nodes[i]): i})
    
# print(idx_map)
print(len(idx_map))

2708


In [164]:
# 读取cites文件中的邻接关系进行节点标识的重映射
cites_file_path = 'Raw_Data/Cora/cora_v1/cora.cites'

edges = []

with open(cites_file_path, 'r') as cites_file:
    
    # 读取每一行，以 {node1}, {node2} 格式保存到列表中
    for line in cites_file:
        
        e = line.strip().split("\t")
        
        edge = (e[0], e[1])
        # print(edge)
        
        edges.append(edge)

# print(edges)
print(len(edges))

5429


In [165]:
# 节点标识的重映射
remapped_edges = [(idx_map[int(node1)], idx_map[int(node2)]) for node1, node2 in edges]
remapped_edges

[(163, 402),
 (163, 659),
 (163, 1696),
 (163, 2295),
 (163, 1274),
 (163, 1286),
 (163, 1544),
 (163, 2600),
 (163, 2363),
 (163, 1905),
 (163, 1611),
 (163, 141),
 (163, 1807),
 (163, 1110),
 (163, 174),
 (163, 2521),
 (163, 1792),
 (163, 1675),
 (163, 1334),
 (163, 813),
 (163, 1799),
 (163, 1943),
 (163, 2077),
 (163, 765),
 (163, 769),
 (163, 781),
 (163, 940),
 (163, 942),
 (163, 1590),
 (163, 1734),
 (163, 1872),
 (163, 2286),
 (163, 390),
 (163, 1717),
 (163, 1030),
 (163, 2274),
 (163, 2518),
 (163, 606),
 (163, 800),
 (163, 1575),
 (163, 546),
 (163, 1070),
 (163, 309),
 (163, 935),
 (163, 1205),
 (163, 1571),
 (163, 1971),
 (163, 1127),
 (163, 530),
 (163, 856),
 (163, 2604),
 (163, 910),
 (163, 2173),
 (163, 191),
 (163, 1253),
 (163, 1728),
 (163, 1729),
 (163, 1206),
 (163, 2177),
 (163, 1136),
 (163, 1457),
 (163, 2265),
 (163, 1225),
 (163, 2563),
 (163, 1689),
 (163, 1498),
 (163, 563),
 (163, 2396),
 (163, 717),
 (163, 1890),
 (163, 188),
 (163, 982),
 (163, 1130),
 (

In [166]:
# 查看未映射前的前两条边的节点标识
print(edges[0])
print(edges[1])

('35', '1033')
('35', '103482')


In [167]:
# 验证映射结果：前两条边
print(idx_map[35], idx_map[1033])
print(idx_map[35], idx_map[103482])
print(remapped_edges[0])
print(remapped_edges[1])

163 402
163 659
(163, 402)
(163, 659)


In [168]:
# 排序邻接表
# 按照元组的第一位进行排序，然后在第一位相同的情况下再按照第二位进行排序
sorted_tuples = sorted(remapped_edges, key=lambda x: (x[0], x[1]))

print("Sorted by First and Second Element:")
# print(sorted_tuples)
print(len(sorted_tuples))

sorted_edges = sorted_tuples
# print(sorted_edges)

# 假设保存文件的路径为 output_file.txt
edge_file = 'Processed_Data/Cora/cora.edges'

# 打开输出文件以写入模式
with open(edge_file, 'w') as edge_f:
    # 将节点对写入到输出文件
    for edge in sorted_edges:
        e_temp = f"{edge[0]}, {edge[1]}\n"
        edge_f.write(e_temp)

print(e_temp)
print(f"Edges saved to {edge_file}")

Sorted by First and Second Element:
5429
2707, 2344

Edges saved to Processed_Data/Cora/cora.edges


## 4. 处理并保存标签

由于标签是字符串，所以要转换成从0开始的数值

In [169]:
# 查看标签详情
np.unique(labels)

array(['Case_Based', 'Genetic_Algorithms', 'Neural_Networks',
       'Probabilistic_Methods', 'Reinforcement_Learning', 'Rule_Learning',
       'Theory'], dtype='<U22')

In [170]:
# 标签映射
label_mapping = {}
for i, label in zip(range(7), np.unique(labels)):
    label_mapping.update({label: i})
    
# print(label_idx)

mapped_values = []
for label in labels:
    mapped_value = label_mapping.get(label, -1)  # 如果找不到映射，默认为-1
    mapped_values.append(mapped_value)

# 打印映射后的结果
# print("Mapped Values:", mapped_values)

In [171]:
# 检查映射情况
print(f"映射表：{label_mapping}")

for i in range(5):
    print(f"{labels[i]}: {mapped_values[i]}")

映射表：{'Case_Based': 0, 'Genetic_Algorithms': 1, 'Neural_Networks': 2, 'Probabilistic_Methods': 3, 'Reinforcement_Learning': 4, 'Rule_Learning': 5, 'Theory': 6}
Neural_Networks: 2
Rule_Learning: 5
Reinforcement_Learning: 4
Reinforcement_Learning: 4
Probabilistic_Methods: 3


In [172]:
# 保存节点及其标签数据(node_idx, label)
label_file = 'Processed_Data/Cora/cora.labels'

with open(label_file, 'w') as lb_f:
    
    # 写入到输出文件
    for node_idx, mv in enumerate(mapped_values):
        lb_temp = f"{node_idx}, {mv}\n"
        lb_f.write(lb_temp)
        
print(lb_temp)
print(f"Labels saved to {label_file}")

2707, 2

Labels saved to Processed_Data/Cora/cora.labels


In [173]:
# 查看最大node_idx
max(sorted_edges)

(2707, 2344)

In [174]:
# 查看孤立节点
all_node = np.arange(2708)
list1 = np.unique(all_node)
list2 = np.unique(sorted_edges)
print(len(list1))
print(len(list2))

set(list1) - set(list2)

2708
2708


set()

In [175]:
# 统计自环的数量
self_loops_count = sum(node1 == node2 for node1, node2 in sorted_edges)
print("Number of Self-loops:", self_loops_count)

Number of Self-loops: 0
