In [1]:
import os

import numpy as np

## 1. 从content文件中提取节点标识符、节点特征、标签

特征文件的一行包含：节点标识(1) + 节点特征(3703) + 标签(1)，<br>
如：'100157\t(3703个数值)\tAgents'

In [2]:
content_file_path = 'Raw_Data/Citeseer/citeseer_v1/citeseer.content'

# Parse the tab-separated content file into three parallel lists; the position
# of a node in these lists later serves as its zero-based index.
features = []
nodes = []
labels = []

# The encoding is pinned so parsing does not depend on the platform default.
with open(content_file_path, 'r', encoding='utf-8') as content_file:
    for line in content_file:
        line = line.strip()
        # A blank line (e.g. at the end of the file) would otherwise be stored
        # as a node with an empty id, no features and an empty label.
        if not line:
            continue

        f = line.split("\t")
        # Row layout: <node id> \t <feature values ...> \t <class label>
        nodes.append(f[0])
        features.append(f[1:-1])
        labels.append(f[-1])

# Number of nodes
print(len(nodes))

# Shape of the feature matrix (rows, columns)
print(f"({len(features)}, {len(features[0])})")

# Number of labels
print(len(labels))

3312
(3312, 3703)
3312


In [15]:
# Display the raw node identifiers parsed from the content file
nodes

['100157',
 '100598',
 '105684',
 '11099',
 '114091',
 '11510',
 '115971',
 '117999',
 '120432',
 '126894',
 '128239',
 '130387',
 '157253',
 '164846',
 '164953',
 '165504',
 '172324',
 '17752',
 '184682',
 '186486',
 '187087',
 '18915',
 '1894',
 '19199',
 '192612',
 '196686',
 '196762',
 '198191',
 '206655',
 '210',
 '2102',
 '21158',
 '21655',
 '226296',
 '22863',
 '233063',
 '23381',
 '236095',
 '239748',
 '241799',
 '242172',
 '243680',
 '243755',
 '243827',
 '250204',
 '250815',
 '257383',
 '267501',
 '267965',
 '270678',
 '271013',
 '271585',
 '273596',
 '276915',
 '279508',
 '282608',
 '28307',
 '28315',
 '284454',
 '284772',
 '286829',
 '292223',
 '292524',
 '29551',
 '295535',
 '296568',
 '298502',
 '298782',
 '300584',
 '300852',
 '301461',
 '302575',
 '302729',
 '303620',
 '30494',
 '30500',
 '306507',
 '306529',
 '310158',
 '314295',
 '315153',
 '318212',
 '319653',
 '32215',
 '323158',
 '323867',
 '32521',
 '32986',
 '329899',
 '332050',
 '336904',
 '340329',
 '340534',
 

## 2. 处理并保存节点特征

features文件实质上已经存下了被映射过后的节点标识符<br>
特征矩阵第一行对应的节点映射为0，第二行映射为1，以此类推。

In [6]:
# Destination of the processed feature matrix
feature_file = 'Processed_Data/Citeseer/citeseer.features'

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(os.path.dirname(feature_file), exist_ok=True)

with open(feature_file, 'w', encoding='utf-8') as fea_f:
    # One line per node: "<row index>: <v1>, <v2>, ...". The row index is the
    # remapped (zero-based) node identifier.
    for node_index, feature in enumerate(features):
        formatted_feature = ', '.join(map(str, feature))
        fea_f.write(f"{node_index}: {formatted_feature}\n")

print(f"Features saved to {feature_file}")

Features saved to Processed_Data/Citeseer/citeseer.features


## 3. 处理并保存邻接表

将邻接表中的节点标识重映射为从0开始的索引。features文件实质上已经存下了被映射过后的节点标识符，但是邻接表的节点标识符还没有修改。

In [9]:
# Build the lookup from raw node identifier to its zero-based position in `nodes`
idx_map = {raw_id: position for position, raw_id in enumerate(nodes)}

print(idx_map)
print(len(idx_map))

{'100157': 0, '100598': 1, '105684': 2, '11099': 3, '114091': 4, '11510': 5, '115971': 6, '117999': 7, '120432': 8, '126894': 9, '128239': 10, '130387': 11, '157253': 12, '164846': 13, '164953': 14, '165504': 15, '172324': 16, '17752': 17, '184682': 18, '186486': 19, '187087': 20, '18915': 21, '1894': 22, '19199': 23, '192612': 24, '196686': 25, '196762': 26, '198191': 27, '206655': 28, '210': 29, '2102': 30, '21158': 31, '21655': 32, '226296': 33, '22863': 34, '233063': 35, '23381': 36, '236095': 37, '239748': 38, '241799': 39, '242172': 40, '243680': 41, '243755': 42, '243827': 43, '250204': 44, '250815': 45, '257383': 46, '267501': 47, '267965': 48, '270678': 49, '271013': 50, '271585': 51, '273596': 52, '276915': 53, '279508': 54, '282608': 55, '28307': 56, '28315': 57, '284454': 58, '284772': 59, '286829': 60, '292223': 61, '292524': 62, '29551': 63, '295535': 64, '296568': 65, '298502': 66, '298782': 67, '300584': 68, '300852': 69, '301461': 70, '302575': 71, '302729': 72, '30362

In [13]:
# Read the citation pairs from the cites file; identifiers are still the raw
# ones at this point and get remapped in a later cell.
cites_file_path = 'Raw_Data/Citeseer/citeseer_v1/citeseer.cites'

edges = []

# The encoding is pinned so parsing does not depend on the platform default.
with open(cites_file_path, 'r', encoding='utf-8') as cites_file:
    for line in cites_file:
        e = line.strip().split("\t")
        # A blank or incomplete row has fewer than two fields and would raise
        # IndexError on e[1]; skip it.
        if len(e) < 2:
            continue
        # Each edge is stored as a (node1, node2) tuple of raw identifiers
        edges.append((e[0], e[1]))

print(edges)

[('100157', '100157'), ('100157', '364207'), ('100157', '38848'), ('100157', 'bradshaw97introduction'), ('100157', 'bylund99coordinating'), ('100157', 'dix01metaagent'), ('100157', 'gray99finding'), ('100157', 'labrou01standardizing'), ('100157', 'labrou99agent'), ('100157', 'nodine98overview'), ('100157', 'nodine99active'), ('100157', 'wagner97artificial'), ('100598', '455651'), ('100598', 'marquez00machine'), ('100598', 'punyakanok01use'), ('101570', 'krasnogor00memetic'), ('10227', '131669'), ('10227', 'sima00computational'), ('10227', 'sima01computational'), ('103027', '140169'), ('105684', 'weiss00building'), ('106003', '104129'), ('106003', 'singhal99document'), ('106339', 'amer-yahia00boundingschemas'), ('106339', 'arlein99making'), ('106339', 'cluet99using'), ('106339', 'shasha02algorithmics'), ('108321', '184462'), ('108321', '73962'), ('108321', '83140'), ('108573', '448486'), ('108573', 'bailey01analysis'), ('108573', 'bailey02eventconditionaction'), ('108573', 'tova99active

In [27]:
# Drop edges that cannot be remapped (an endpoint is missing from `nodes`)
# and drop self-loops.

# Every identifier that appears as an edge endpoint. A set comprehension is
# used because sum(edges, ()) re-copies the growing tuple for every edge.
nodes_in_edges = {node for edge in edges for node in edge}
# Endpoints that have no row in the content file
nodes_to_remove = nodes_in_edges - set(nodes)

# Count self-loops over all edges, before any filtering
self_loops_count = sum(node1 == node2 for node1, node2 in edges)
print("Number of Self-loops:", self_loops_count)

# Keep an edge only if it is not a self-loop and both endpoints are known
filtered_edges = [
    (node1, node2)
    for node1, node2 in edges
    if node1 != node2
    and node1 not in nodes_to_remove
    and node2 not in nodes_to_remove
]

print("Nodes to Remove:", nodes_to_remove)
print("Filtered Edges:", len(filtered_edges))

Number of Self-loops: 124
Nodes to Remove: {'197556', '95786', 'ghani01hypertext', 'weng95shoslifn', '38137', 'khardon99relational', 'raisamo99evaluating', 'gabbard97taxonomy', 'nielsen00designing', 'hahn98ontology', 'flach99database', 'kohrs99using', '293457', 'tobies99pspace', 'wang01process'}
Filtered Edges: 4591


In [28]:
# Translate both endpoints of every surviving edge to zero-based indices
remapped_edges = []
for src, dst in filtered_edges:
    remapped_edges.append((idx_map[src], idx_map[dst]))

print(len(remapped_edges))

4591


In [30]:
# Show the first two edges as they look before remapping (raw identifiers)
for position in (0, 1):
    print(filtered_edges[position])

('100157', '364207')
('100157', '38848')


In [31]:
# Verify the remapping on the first two edges. The raw identifiers are read
# from filtered_edges instead of being typed in, so the check follows the data.
first_raw_edges = filtered_edges[:2]
first_remapped_edges = remapped_edges[:2]

# Expected indices, looked up directly in idx_map
for node1, node2 in first_raw_edges:
    print(idx_map[node1], idx_map[node2])

# Indices actually stored in remapped_edges
for remapped_edge in first_remapped_edges:
    print(remapped_edge)

# Fail loudly on a mismatch instead of relying on comparing the printout by eye
assert first_remapped_edges == [
    (idx_map[node1], idx_map[node2]) for node1, node2 in first_raw_edges
], "remapped_edges disagrees with idx_map"

0 99
0 111
(0, 99)
(0, 111)


In [32]:
# Sort the adjacency list by the first index, ties broken by the second.
# Tuples already compare element by element, so no key function is needed.
sorted_tuples = sorted(remapped_edges)

print("Sorted by First and Second Element:")
print(len(sorted_tuples))

sorted_edges = sorted_tuples

edge_file = 'Processed_Data/Citeseer/citeseer.edges'

# open(..., 'w') does not create missing parent directories
os.makedirs(os.path.dirname(edge_file), exist_ok=True)

with open(edge_file, 'w', encoding='utf-8') as edge_f:
    # One "<first>, <second>" pair per line
    edge_f.writelines(f"{first}, {second}\n" for first, second in sorted_edges)

# Echo the last written line as a spot check. Guarded because with an empty
# edge list there is no last line to show (the loop variable used for this
# before was undefined in that case).
if sorted_edges:
    last_first, last_second = sorted_edges[-1]
    print(f"{last_first}, {last_second}\n")
print(f"Edges saved to {edge_file}")

Sorted by First and Second Element:
4591
3308, 3309

Edges saved to Processed_Data/Citeseer/citeseer.edges


## 4. 处理并保存标签

由于标签是字符串，所以要转换成从0开始的数值

In [33]:
# List the distinct class names found in the labels
np.unique(labels)

array(['AI', 'Agents', 'DB', 'HCI', 'IR', 'ML'], dtype='<U6')

In [35]:
# Map every class name to an integer id, numbered from 0 in the order returned
# by np.unique. enumerate() numbers however many classes exist; zipping with a
# fixed range(7) would silently leave any class past the seventh unmapped.
# Keys are converted to plain str so the mapping prints the same way
# regardless of how NumPy represents its string scalars.
label_mapping = {str(label): i for i, label in enumerate(np.unique(labels))}

# Translate each node's label; -1 is kept as the marker for "no mapping found"
mapped_values = [label_mapping.get(label, -1) for label in labels]

# Print the mapped result
print("Mapped Values:", mapped_values)

Mapped Values: [1, 4, 1, 2, 0, 0, 1, 4, 0, 3, 4, 1, 2, 4, 2, 4, 2, 5, 1, 1, 1, 4, 4, 2, 4, 5, 2, 1, 4, 5, 5, 5, 5, 1, 4, 1, 0, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 4, 1, 3, 4, 2, 5, 2, 4, 1, 2, 3, 4, 0, 1, 4, 4, 4, 2, 4, 1, 1, 1, 2, 5, 1, 1, 5, 5, 4, 2, 1, 4, 5, 1, 4, 2, 0, 4, 1, 5, 3, 1, 4, 4, 4, 5, 0, 4, 1, 4, 1, 5, 4, 1, 4, 4, 4, 2, 1, 1, 1, 2, 0, 1, 1, 1, 2, 4, 1, 2, 1, 4, 5, 0, 1, 0, 4, 2, 3, 5, 1, 1, 1, 0, 3, 1, 1, 1, 1, 0, 1, 1, 3, 1, 1, 1, 4, 1, 2, 2, 4, 4, 4, 4, 4, 4, 4, 1, 5, 1, 3, 5, 1, 2, 1, 1, 4, 2, 3, 1, 1, 5, 0, 1, 5, 4, 1, 4, 5, 4, 1, 4, 5, 4, 1, 1, 1, 1, 4, 1, 5, 1, 1, 4, 3, 1, 1, 1, 1, 4, 0, 1, 1, 1, 4, 4, 3, 1, 1, 3, 1, 4, 4, 1, 5, 4, 0, 5, 4, 4, 4, 4, 4, 4, 3, 1, 4, 4, 1, 1, 4, 4, 4, 4, 4, 4, 1, 2, 1, 1, 1, 1, 5, 4, 1, 1, 1, 4, 3, 1, 4, 1, 4, 4, 1, 4, 4, 1, 4, 1, 4, 5, 0, 1, 4, 5, 4, 5, 0, 0, 1, 5, 0, 5, 4, 4, 1, 5, 3, 0, 1, 1, 0, 5, 5, 5, 1, 0, 0, 0, 4, 4, 3, 0, 4, 4, 5, 0, 4, 3, 2, 4, 4, 4, 5, 1, 1, 2, 3, 4, 2, 1, 1, 1, 1, 1, 4, 4, 1, 1, 4, 4, 1, 2, 2, 2, 2, 

In [38]:
# Spot-check the mapping: print the table, then the first 15 label/id pairs
print(f"映射表：{label_mapping}")

# Slices are used instead of range(15) so that fewer than 15 nodes does not
# raise IndexError.
for label, mapped_value in zip(labels[:15], mapped_values[:15]):
    print(f"{label}: {mapped_value}")

映射表：{'AI': 0, 'Agents': 1, 'DB': 2, 'HCI': 3, 'IR': 4, 'ML': 5}
Agents: 1
IR: 4
Agents: 1
DB: 2
AI: 0
AI: 0
Agents: 1
IR: 4
AI: 0
HCI: 3
IR: 4
Agents: 1
DB: 2
IR: 4
DB: 2


In [39]:
# Save one "<node index>, <label id>" line per node
label_file = 'Processed_Data/Citeseer/citeseer.labels'

# open(..., 'w') does not create missing parent directories
os.makedirs(os.path.dirname(label_file), exist_ok=True)

with open(label_file, 'w', encoding='utf-8') as lb_f:
    lb_f.writelines(
        f"{node_idx}, {mv}\n" for node_idx, mv in enumerate(mapped_values)
    )

# Echo the last written line as a spot check. Guarded because with no labels
# there is no last line to show (the loop variable used for this before was
# undefined in that case).
if mapped_values:
    print(f"{len(mapped_values) - 1}, {mapped_values[-1]}\n")
print(f"Labels saved to {label_file}")

3311, 5

Labels saved to Processed_Data/Citeseer/citeseer.labels


In [40]:
# Show the largest edge under tuple ordering (first element, then second).
# NOTE(review): this is the edge with the largest first index, which is not
# necessarily the largest node index over all endpoints; for that, use
# max(max(edge) for edge in sorted_edges).
max(sorted_edges)

(3308, 3309)

In [43]:
# Count isolated nodes: indices that never appear as an endpoint of any edge.
# The node count is taken from `nodes` instead of a hard-coded 3312, so the
# cell stays correct if the input data changes.
all_node_ids = set(range(len(nodes)))
connected_node_ids = {node for edge in sorted_edges for node in edge}
print(len(all_node_ids))
print(len(connected_node_ids))

len(all_node_ids - connected_node_ids)

3312
3264


48