In [3]:
import pickle as pkl
import networkx as nx
import numpy as np


file_path = 'Raw_Data/Airport/airport.p'
graph = pkl.load(open(file_path, 'rb'))

## 1. 从.p文件中提取节点特征

以特征矩阵为准，进行节点标识符的映射（特征矩阵暗含映射规则）：<br>
例如：特征矩阵第一行对应的可能是节点123的特征，则映射规则为123:0（0是矩阵的行index）

In [4]:
# fea_matrix的一行：features(4) + label(1)
fea_matrix = np.array([graph.nodes[n]['feat'] for n in graph.nodes])
print(fea_matrix.shape)
# print(fea_matrix)

# 抽取features
label_idx = 4
features = fea_matrix[:, :label_idx]
print(features.shape)
print(features)

(3188, 5)
(3188, 4)
[[ 0.51425887  0.26670168 -0.02166667  0.82629479]
 [ 0.61784668  0.27377057  0.137       0.82629479]
 [ 0.49139002  0.2393439   0.35133333  0.82629479]
 ...
 [-0.41009555  0.83278334  0.00233333  0.68952271]
 [-0.39886445  0.83413332  0.00466667  0.68952271]
 [-0.37090776  0.82851664  1.03833333  0.68952271]]


In [5]:
# 打开文件以写入模式
feature_file = 'Processed_Data/Airport/airport.features'
with open(feature_file, 'w') as fea_f:
    # 遍历特征矩阵的每一行，将节点索引和特征保存到文件中
    for node_index, feature in enumerate(features):
        formatted_feature = ', '.join(map(str, feature))
        fea_temp = f"{node_index}: {formatted_feature}\n"
        fea_f.write(fea_temp)

print(fea_temp)
print(f"Features saved to {feature_file}")

3187: -0.37090776231555556, 0.82851664225, 1.0383333333333333, 0.6895227140874354

Features saved to Processed_Data/Airport/airport.features


## 2.处理并保存label

参考：https://github.com/HazyResearch/hgcn/blob/master/utils/data_utils.py的load_data_airport函数

In [6]:
# 通过抽取的特征来构造对应的类别
def bin_feat(feat, bins):
    # 使用 np.digitize 将 feat 值分到 bins 中
    digitized = np.digitize(feat, bins)
    
    # 通过减去最小的 bin 索引来调整 bin 索引
    return digitized - digitized.min()


label_idx = 4
labels = fea_matrix[:, label_idx]
labels = bin_feat(labels, bins=[7.0/7, 8.0/7, 9.0/7])

print(labels.shape)
print(labels)
print(np.unique(labels))

(3188,)
[2 2 2 ... 1 1 1]
[0 1 2 3]


In [7]:
# 保存节点及其标签数据(node_idx, label)
label_file = 'Processed_Data/Airport/airport.labels'

with open(label_file, 'w') as lb_f:
    
    # 写入到输出文件
    for node_idx, lb in enumerate(labels):
        lb_temp = f"{node_idx}, {lb}\n"
        lb_f.write(lb_temp)
        
print(lb_temp)
print(f"Labels saved to {label_file}")

3187, 1

Labels saved to Processed_Data/Airport/airport.labels


## 3.处理邻接表：映射节点标识从0开始

In [8]:
# 从graph中提取nodes
nodes = list(graph.nodes)
# print(nodes)

# 重映射nodes标识符为从0开始
# 先读取原始的节点标识进行重映射
idx_map = {}

for i in range(len(nodes)):
    idx_map.update({int(nodes[i]): i})

print(idx_map)
print(len(idx_map))

{2966: 0, 2990: 1, 2962: 2, 2968: 3, 4078: 4, 4029: 5, 6969: 6, 6160: 7, 6156: 8, 2952: 9, 2922: 10, 2965: 11, 2948: 12, 2975: 13, 2972: 14, 4364: 15, 2936: 16, 2937: 17, 2935: 18, 8944: 19, 6924: 20, 9026: 21, 2923: 22, 2925: 23, 2786: 24, 2789: 25, 2812: 26, 2808: 27, 6067: 28, 2801: 29, 2781: 30, 2806: 31, 253: 32, 247: 33, 246: 34, 248: 35, 1044: 36, 1084: 37, 245: 38, 298: 39, 280: 40, 2709: 41, 2673: 42, 2688: 43, 2715: 44, 6045: 45, 2670: 46, 1506: 47, 1678: 48, 1264: 49, 490: 50, 1665: 51, 1054: 52, 1197: 53, 4197: 54, 1056: 55, 730: 56, 719: 57, 737: 58, 715: 59, 709: 60, 688: 61, 718: 62, 5595: 63, 442: 64, 722: 65, 448: 66, 664: 67, 746: 68, 729: 69, 711: 70, 5599: 71, 435: 72, 460: 73, 644: 74, 3531: 75, 7162: 76, 7161: 77, 6019: 78, 2397: 79, 4200: 80, 2399: 81, 2400: 82, 6024: 83, 2425: 84, 8076: 85, 2402: 86, 2430: 87, 4203: 88, 2426: 89, 4204: 90, 2433: 91, 6026: 92, 6016: 93, 6021: 94, 2404: 95, 7376: 96, 4214: 97, 7373: 98, 2531: 99, 2621: 100, 7367: 101, 2613: 102, 2

In [9]:
# 从graph中提取edges
edges = list(graph.edges())
# print(edges)
print(len(edges))

18631


In [10]:
# 节点标识的重映射
remapped_edges = [(idx_map[int(node1)], idx_map[int(node2)]) for node1, node2 in edges]

# print(remapped_edges)
print(len(remapped_edges))

18631


In [11]:
# 查看未映射前的前两条边的节点标识
print(edges[0])
print(edges[1])

(2966, 2990)
(2966, 2962)


In [12]:
# 验证映射结果：前两条边
print(idx_map[2966], idx_map[2990])
print(idx_map[2966], idx_map[2962])
print(remapped_edges[0])
print(remapped_edges[1])

0 1
0 2
(0, 1)
(0, 2)


In [13]:
# 统计自环的数量
self_loops_count = sum(node1 == node2 for node1, node2 in remapped_edges)
print("Number of Self-loops:", self_loops_count)

Number of Self-loops: 1


In [14]:
filtered_edges = [(node1, node2) for node1, node2 in remapped_edges if node1 != node2]
print(len(filtered_edges))

18630


In [15]:
filtered_edges

[(0, 1),
 (0, 2),
 (0, 5),
 (0, 12),
 (0, 1975),
 (0, 566),
 (0, 225),
 (1, 3),
 (1, 5),
 (1, 8),
 (1, 11),
 (1, 12),
 (1, 13),
 (1, 410),
 (1, 85),
 (1, 754),
 (1, 236),
 (1, 240),
 (1, 1701),
 (1, 748),
 (1, 566),
 (1, 747),
 (1, 749),
 (1, 752),
 (1, 338),
 (1, 750),
 (1, 1898),
 (2, 662),
 (2, 225),
 (2, 1975),
 (2, 154),
 (2, 1701),
 (2, 10),
 (2, 330),
 (2, 5),
 (2, 663),
 (2, 12),
 (2, 566),
 (2, 13),
 (2, 85),
 (2, 747),
 (2, 335),
 (3, 4),
 (3, 454),
 (3, 750),
 (3, 1898),
 (3, 5),
 (3, 747),
 (3, 663),
 (3, 749),
 (3, 12),
 (3, 566),
 (3, 85),
 (3, 238),
 (3, 1701),
 (4, 13),
 (4, 16),
 (4, 17),
 (4, 23),
 (4, 650),
 (4, 12),
 (4, 651),
 (4, 747),
 (4, 749),
 (4, 262),
 (4, 277),
 (4, 238),
 (4, 1701),
 (4, 652),
 (4, 14),
 (4, 316),
 (4, 2356),
 (4, 2362),
 (4, 10),
 (4, 1704),
 (4, 750),
 (4, 1898),
 (4, 962),
 (4, 748),
 (4, 158),
 (4, 1976),
 (4, 5),
 (4, 154),
 (4, 663),
 (4, 236),
 (4, 483),
 (4, 163),
 (4, 18),
 (4, 1879),
 (4, 180),
 (4, 2826),
 (4, 454),
 (4, 2360),


In [18]:
# 去除重复边

c_edges = []
dup_edges = []

# 读取每一行，以 {node1}, {node2} 格式保存到列表中
for e_i in filtered_edges:
    
    edge = (e_i[0], e_i[1])
    # print(edge)

    # 去除重复边
    if (e_i[1], e_i[0]) not in edges:
        c_edges.append(edge)
    else:
        dup_edges.append(edge)

print(len(dup_edges))
print(len(c_edges))

59
18571


In [20]:
# 排序邻接表
# 按照元组的第一位进行排序，然后在第一位相同的情况下再按照第二位进行排序
sorted_tuples = sorted(c_edges, key=lambda x: (x[0], x[1]))

print("Sorted by First and Second Element:")
# print(sorted_tuples)
print(len(sorted_tuples))

sorted_edges = sorted_tuples
# print(sorted_edges)

# 打开文件以写入模式
edge_file = 'Processed_Data/Airport/airport.edges'

with open(edge_file, 'w') as edge_f:
    # 遍历每一条边，将其保存到文件中
    for edge in sorted_edges:
        e_temp = f"{edge[0]}, {edge[1]}\n"
        edge_f.write(e_temp)

print(e_temp)
print(f"Edges saved to {edge_file}")

Sorted by First and Second Element:
18571
3185, 3186

Edges saved to Processed_Data/Airport/airport.edges


In [21]:
# 查看最大node_idx
max(sorted_edges)

(3185, 3186)

In [22]:
# 查看孤立节点
list1 = np.unique(graph.nodes())
list2 = np.unique(graph.edges())
print(len(list1))
print(len(list2))

set(list1) - set(list2)

3188
3184


{6136, 7217, 7309, 7642}

## 5. 数据统计

In [23]:
import networkx as nx

# 读取边文件
edges_file = 'Processed_Data/Airport/airport.edges'
edges = []
with open(edges_file, 'r') as file:
    edges = [tuple(map(int, line.strip().split(","))) for line in file]

# 构建图
G = nx.Graph()
G.add_edges_from(edges)

# 计算节点度数
degree_sequence = list(G.degree())

# 统计最大和最小度数
max_degree = max(degree_sequence, key=lambda x: x[1])
min_degree = min(degree_sequence, key=lambda x: x[1])

print(f"Maximum Degree Node: {max_degree[0]}, Degree: {max_degree[1]}")
print(f"Minimum Degree Node: {min_degree[0]}, Degree: {min_degree[1]}")

Maximum Degree Node: 214, Degree: 246
Minimum Degree Node: 763, Degree: 1
