In [2]:
# 基础模块
import os
import pandas as pd
import numpy as np
from glob import glob
from tqdm.notebook import tqdm
from datetime import datetime
from collections import defaultdict, deque

# 可视化配置
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.family'] = 'STHeiti'         # 中文支持（macOS）
plt.rcParams['axes.unicode_minus'] = False      # 正负号支持
%matplotlib inline

# 轨迹聚类与建模
from sklearn.cluster import DBSCAN
from pyproj import Transformer

# API调用（如POI增强）
import requests
import time
import json

# Neo4j 图数据库
from py2neo import Graph, Node, Relationship  # 若报错先注释，等后面阶段再装
from neo4j import GraphDatabase

# 路径配置
base_dir = os.path.dirname(os.path.abspath("__file__"))  # 当前脚本所在目录
root_dir = os.path.abspath(os.path.join(base_dir, '..'))  # 项目根目录
output_dir = os.path.join(root_dir, 'outputs')
traj_path = os.path.join(output_dir, 'geolife_cleaned_traj.csv')

In [3]:
# Step 1：加载数据
df = pd.read_csv(traj_path)
df['t'] = pd.to_datetime(df['t'])

# Step 2：空间聚类识别热点节点
eps = 0.0006      # 空间阈值（近似50米）
min_samples = 5
db = DBSCAN(eps=eps, min_samples=min_samples)
df['node'] = db.fit_predict(df[['x', 'y']])

# 移除噪声节点
df = df[df['node'] != -1].reset_index(drop=True)

# Step 3：构建轨迹节点序列 & 时间序列
traj_records = []

for (uid, traj_id), group in df.groupby(['uid', 'traj_id']):
    group = group.sort_values('t')
    nodes = group['node'].tolist()
    times = group['t'].tolist()

    # 去除连续重复节点
    clean_nodes = [nodes[0]]
    clean_times = [times[0]]
    for i in range(1, len(nodes)):
        if nodes[i] != clean_nodes[-1]:
            clean_nodes.append(nodes[i])
            clean_times.append(times[i])

    if len(clean_nodes) >= 2:
        traj_records.append({
            'uid': int(uid),
            'traj_id': int(traj_id),
            'start_time': clean_times[0].time(),
            'end_time': clean_times[-1].time(),
            'node_sequence': json.dumps(clean_nodes, ensure_ascii=False),
            'time_sequence': json.dumps([t.strftime("%H:%M:%S") for t in clean_times], ensure_ascii=False)
        })

print(f"构建完成，共记录轨迹数：{len(traj_records)}")

构建完成，共记录轨迹数：1181


In [4]:
traj_meta_df = pd.DataFrame(traj_records)
metadata_path = os.path.join(output_dir, 'traj_metadata.csv')
traj_meta_df.to_csv(metadata_path, index=False)

print(f"轨迹元数据表已保存：{metadata_path}")

轨迹元数据表已保存：/Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/outputs/traj_metadata.csv


In [4]:
base_dir = os.path.dirname(os.path.abspath("__file__"))
root_dir = os.path.abspath(os.path.join(base_dir, '..'))
output_dir = os.path.join(root_dir, 'outputs')
meta_path = os.path.join(output_dir, 'traj_metadata.csv')

# 加载轨迹元数据
df = pd.read_csv(meta_path)
df['node_sequence'] = df['node_sequence'].apply(json.loads)

# 构建1阶路径表：(start_node, end_node) → 轨迹集合（uid-traj_id）
one_degree_path_table = defaultdict(set)

for _, row in df.iterrows():
    uid = int(row['uid'])
    traj_id = int(row['traj_id'])
    nodes = row['node_sequence']

    for i in range(len(nodes) - 1):
        path = (nodes[i], nodes[i + 1])
        one_degree_path_table[path].add(f"{uid}-{traj_id}")

print(f"1阶路径构建完成，共有唯一路径：{len(one_degree_path_table)} 条")

# 输出为CSV标准结构：start_node, end_node, traj_set
one_degree_path_df = pd.DataFrame([
    {
        'start_node': path[0],
        'end_node': path[1],
        'traj_set': json.dumps(sorted(list(traj_set)), ensure_ascii=False)
    }
    for path, traj_set in one_degree_path_table.items()
])

# 保存1阶路径表
output_file = os.path.join(output_dir, 'one_degree_path_table.csv')
one_degree_path_df.to_csv(output_file, index=False)

print(f"1阶路径表已保存至：{output_file}")

1阶路径构建完成，共有唯一路径：1152 条
1阶路径表已保存至：/Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/outputs/one_degree_path_table.csv


In [6]:
# 加载原始轨迹数据
df = pd.read_csv(traj_path)

# DBSCAN聚类参数（保持与之前完全一致）
eps = 0.0006
min_samples = 5
db = DBSCAN(eps=eps, min_samples=min_samples)
df['node'] = db.fit_predict(df[['x', 'y']])
df = df[df['node'] != -1]  # 移除噪声

# 计算每个 node_id 的坐标中心
node_coords = df.groupby('node')[['x', 'y']].mean().reset_index()
node_coords.columns = ['node_id', 'x', 'y']

# 保存为 nodes.csv
nodes_path = os.path.join(output_dir, 'nodes.csv')
node_coords.to_csv(nodes_path, index=False)
print(f"节点中心文件已保存至：{nodes_path}")

节点中心文件已保存至：/Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/outputs/nodes.csv


In [8]:
# 读取 traj_metadata
meta_path = os.path.join(output_dir, 'traj_metadata.csv')
meta_df = pd.read_csv(meta_path)
meta_df['node_sequence'] = meta_df['node_sequence'].apply(json.loads)

# 构建边频率与所属轨迹集合
edge_freq = defaultdict(int)
edge_trajs = defaultdict(set)

for _, row in meta_df.iterrows():
    uid, traj_id = int(row['uid']), int(row['traj_id'])
    nodes = row['node_sequence']

    for i in range(len(nodes) - 1):
        edge = (nodes[i], nodes[i+1])
        edge_freq[edge] += 1
        edge_trajs[edge].add((uid, traj_id))

# 构建输出表
edges_df = pd.DataFrame([
    {
        'source': src,
        'target': tgt,
        'frequency': edge_freq[(src, tgt)],
        'traj_ids': json.dumps(sorted([list(x) for x in edge_trajs[(src, tgt)]]), ensure_ascii=False)
    }
    for (src, tgt) in edge_freq
])

# 保存为 edges.csv
edges_path = os.path.join(output_dir, 'edges.csv')
edges_df.to_csv(edges_path, index=False)
print(f"边文件已保存至：{edges_path}")

边文件已保存至：/Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/outputs/edges.csv


In [12]:
uri = "bolt://localhost:7687"
user = "neo4j"
password = "#020728Ceq"

driver = GraphDatabase.driver(uri, auth=(user, password))

# 清空整个数据库（慎用）
with driver.session() as session:
    session.run("MATCH (n) DETACH DELETE n")

print("Neo4j 已清空所有节点和关系。")

driver.close()

Neo4j 已清空所有节点和关系。


In [13]:
# 路径配置
base_dir = os.path.dirname(os.path.abspath("__file__"))
root_dir = os.path.abspath(os.path.join(base_dir, '..'))
output_dir = os.path.join(root_dir, 'outputs')
nodes_path = os.path.join(output_dir, 'nodes.csv')
edges_path = os.path.join(output_dir, 'edges.csv')

# Neo4j 连接信息
uri = "bolt://localhost:7687"
user = "neo4j"
password = "#020728Ceq"

driver = GraphDatabase.driver(uri, auth=(user, password))

# 节点导入函数
def import_node(tx, node_id, x, y):
    tx.run("""
        MERGE (n:Hotspot {id: $node_id})
        SET n.x = $x, n.y = $y
    """, node_id=node_id, x=x, y=y)

# 边导入函数
def import_edge(tx, source, target, frequency, traj_ids_flat):
    tx.run("""
        MATCH (a:Hotspot {id: $source})
        MATCH (b:Hotspot {id: $target})
        MERGE (a)-[r:TRAJ_EDGE]->(b)
        SET r.frequency = $frequency,
            r.traj_ids = $traj_ids
    """, source=source, target=target, frequency=frequency, traj_ids=traj_ids_flat)

with driver.session() as session:
    print("导入节点中...")
    nodes_df = pd.read_csv(nodes_path)
    for _, row in nodes_df.iterrows():
        session.execute_write(import_node, int(row['node_id']), float(row['x']), float(row['y']))

    print("导入边中...")
    edges_df = pd.read_csv(edges_path)
    for _, row in edges_df.iterrows():
        # 修复嵌套数组问题：将 [[1,2],[2,3]] → ["1_2", "2_3"]
        raw_traj_ids = json.loads(row['traj_ids']) if isinstance(row['traj_ids'], str) else []
        traj_ids_flat = [f"{uid}_{tid}" for uid, tid in raw_traj_ids]

        session.execute_write(
            import_edge,
            int(row['source']),
            int(row['target']),
            int(row['frequency']),
            traj_ids_flat
        )

driver.close()
print("Neo4j 数据导入完成")

导入节点中...
导入边中...
Neo4j 数据导入完成


In [14]:
# ---------- 通用工具 ----------
def safe_parse_json_list(x):
    return json.loads(x) if isinstance(x, str) else x

def preprocess_onedegree_df(df, start_col="start_node", end_col="end_node", traj_col="traj_set"):
    """
    将轨迹集合列转为 set，并统一列名结构。
    """
    rename_map = {start_col: "start_node", end_col: "end_node", traj_col: "traj_set"}
    df = df.rename(columns=rename_map)[["start_node", "end_node", "traj_set"]].copy()
    df["traj_set"] = df["traj_set"].apply(lambda x: set(safe_parse_json_list(x)))
    return df

def save_result(df, filename, output_dir=None):
    if output_dir is None:
        current_dir = os.getcwd()
        output_dir = os.path.join(current_dir, "..", "outputs")
    os.makedirs(output_dir, exist_ok=True)
    if "path" in df.columns:
        df["path"] = df["path"].apply(lambda p: json.dumps(list(p)) if isinstance(p, (list, tuple)) else p)
    out = os.path.join(output_dir, filename)
    df.to_csv(out, index=False)
    print(f"结果已保存到: {out}")

# ---------- NDTTJ ----------
def run_ndttj(one_degree_df, m, k,
              save_as=None, max_depth=None):
    """
    NDTTJ：基于 N 度路径表连接的 Apriori-Join 热点挖掘
    ------------------
    one_degree_df : 1 阶路径表 DataFrame
    m             : 频繁度阈值 (support)
    k             : 最小路径长度
    max_depth     : 最长允许的节点数（含首尾）。None 表示不限制
    """
    df = preprocess_onedegree_df(one_degree_df)
    edge_dict = {(r.start_node, r.end_node): r.traj_set
                 for r in df.itertuples()}

    result = {}
    # 初始队列：所有满足支持度的一阶边
    queue = deque([((u, v), s)
                   for (u, v), s in edge_dict.items()
                   if len(s) >= m])

    while queue:
        path, sg = queue.popleft()

        # 写结果
        if len(path) >= k:
            result[path] = sg

        # 如果已达最大深度就停止向下拼接
        if max_depth is not None and len(path) >= max_depth:
            continue

        tail = path[-1]
        # 用尾节点拼接下一条边
        for (x, y), s2 in edge_dict.items():
            if x != tail:
                continue
            new_sg = sg & s2
            if len(new_sg) < m:
                continue
            new_path = path + (y,)
            # 只有第一次出现时才入队
            if new_path not in result:
                queue.append((new_path, new_sg))

    out_df = pd.DataFrame([{
            "path": p,
            "frequency": len(s),
            "traj_ids": json.dumps(sorted(s))
        } for p, s in result.items()])

    if save_as:
        save_result(out_df, save_as)
    return out_df

# ---------- NDTTT ----------
def run_ndttt(one_degree_df, m, k, save_as=None, max_depth=12):
    df = preprocess_onedegree_df(one_degree_df)
    edge_dict = defaultdict(list)
    for r in df.itertuples():
        if len(r.traj_set) >= m:
            edge_dict[r.start_node].append((r.end_node, r.traj_set))

    result = {}
    # 显式栈元素: (path_tuple, traj_set)
    for u, lst in edge_dict.items():
        for v, sg in lst:
            stack = [((u, v), sg)]
            while stack:
                path, cur_sg = stack.pop()
                if len(path) >= k:
                    result[path] = cur_sg
                if len(path) >= max_depth:
                    continue
                tail = path[-1]
                for nxt, sg2 in edge_dict.get(tail, []):
                    new_sg = cur_sg & sg2
                    if len(new_sg) < m:
                        continue
                    new_path = path + (nxt,)
                    if new_path not in result:
                        stack.append((new_path, new_sg))

    out_df = pd.DataFrame(
        [{"path": p, "frequency": len(s), "traj_ids": json.dumps(sorted(s))} for p, s in result.items()]
    )
    if save_as:
        save_result(out_df, save_as)
    return out_df

# ---------- TTHS ----------
def run_tths_from_neo4j(uri, user, password, m, k, save_as=None):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    results = []
    visited_paths = set()

    def dfs(tx, path, traj_ids):
        if len(path) >= k and len(traj_ids) >= m:
            key = tuple(path)
            if key not in visited_paths:
                visited_paths.add(key)
                results.append({
                    'path': path[:],
                    'frequency': len(traj_ids),
                    'traj_ids': json.dumps(sorted(list(traj_ids)), ensure_ascii=False)
                })
        if len(path) > 12:
            return

        query = """
        MATCH (n:Hotspot {id: $nid})-[r:TRAJ_EDGE]->(m)
        RETURN m.id AS next_id, r.traj_ids AS tids
        """
        result = tx.run(query, nid=path[-1])
        for record in result:
            next_id = record['next_id']
            if next_id in path:
                continue
            tids = set(tuple(map(int, tid.split('_'))) for tid in record['tids'])
            intersected = traj_ids & tids
            if len(intersected) >= m:
                dfs(tx, path + [next_id], intersected)

    with driver.session() as session:
        start_nodes = session.run("MATCH (n:Hotspot) RETURN n.id AS nid")
        for record in start_nodes:
            nid = record['nid']
            edges = session.run("""
                MATCH (n:Hotspot {id: $nid})-[r:TRAJ_EDGE]->(m)
                RETURN m.id AS next_id, r.traj_ids AS tids
            """, nid=nid)
            for edge in edges:
                next_id = edge['next_id']
                tids = set(tuple(map(int, tid.split('_'))) for tid in edge['tids'])
                if len(tids) >= m:
                    dfs(session, [nid, next_id], tids)

    driver.close()
    df_result = pd.DataFrame(results)
    if save_as:
        save_result(df_result, save_as)
    return df_result

In [15]:
# 调用三个算法
base_dir = os.path.dirname(os.path.abspath("__file__"))
root_dir = os.path.abspath(os.path.join(base_dir, '..'))
output_dir = os.path.join(root_dir, 'outputs')
one_degree_path_file = os.path.join(output_dir, "one_degree_path_table.csv")
one_degree_df = pd.read_csv(one_degree_path_file)

ndttj_df = run_ndttj(one_degree_df, m=4, k=3, save_as="ndttj_hotspot_paths.csv", max_depth=8)
ndttt_df = run_ndttt(one_degree_df, m=4, k=3, save_as="ndttt_hotspot_paths.csv", max_depth=12)
tths_df = run_tths_from_neo4j(
    uri="bolt://localhost:7687",
    user="neo4j",
    password="#020728Ceq",
    m=4, k=3,
    save_as="tths_hotspot_paths.csv"
)

print("NDTTJ 示例结果：")
print(ndttj_df.head())

print("NDTTT 示例结果：")
print(ndttt_df.head())

print("TTHS  示例结果：")
print(tths_df.head())

结果已保存到: /Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/scripts/../outputs/ndttj_hotspot_paths.csv
结果已保存到: /Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/scripts/../outputs/ndttt_hotspot_paths.csv
结果已保存到: /Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/scripts/../outputs/tths_hotspot_paths.csv
NDTTJ 示例结果：
          path  frequency                                           traj_ids
0  [0, 1, 116]          6  ["35-13", "35-23", "35-24", "35-38", "35-42", ...
1    [1, 2, 1]          6   ["1-16", "1-35", "1-36", "1-62", "1-64", "1-69"]
2    [3, 0, 3]          7  ["1-13", "179-21", "179-27", "5-18", "5-27", "...
3    [2, 1, 2]          6   ["1-16", "1-35", "1-36", "1-62", "1-64", "1-69"]
4    [2, 1, 0]          5           [

In [19]:
from ast import literal_eval

# 安全解析 traj_ids，TTHS 的格式特殊需转换
def safe_eval_traj_ids(x, convert_uid_traj=False):
    if pd.isna(x):
        return []
    if isinstance(x, list):
        result = x
    else:
        try:
            result = literal_eval(x)
        except Exception:
            return []
    if convert_uid_traj:
        # TTHS 专用：把 [[uid, tid], ...] 转换成 ['uid-tid', ...]
        return [f"{uid}-{tid}" for uid, tid in result if isinstance(uid, int) and isinstance(tid, int)]
    return result

def safe_parse_path(x):
    if pd.isna(x):
        return ()
    if isinstance(x, list):
        return tuple(x)
    if isinstance(x, str) and x.strip().startswith('['):
        try:
            return tuple(json.loads(x))
        except Exception:
            return ()
    return ()

# 加载并标记来源，TTHS 要多传一个参数
def load_with_source(path, source_name, convert_uid_traj=False):
    df = pd.read_csv(path)
    df['path'] = df['path'].apply(safe_parse_path)
    df['traj_ids'] = df['traj_ids'].apply(lambda x: safe_eval_traj_ids(x, convert_uid_traj=convert_uid_traj))
    df['source'] = [[source_name]] * len(df)
    df = df[df['path'].apply(lambda x: isinstance(x, tuple) and len(x) > 0)]
    return df

# 设置输出目录（与 scripts 平行的 outputs 文件夹）
output_dir = "../outputs"
ndttj_df = load_with_source(os.path.join(output_dir, 'ndttj_hotspot_paths.csv'), 'NDTTJ')
ndttt_df = load_with_source(os.path.join(output_dir, 'ndttt_hotspot_paths.csv'), 'NDTTT')
# 注意这里传 convert_uid_traj=True 来处理嵌套格式
tths_df  = load_with_source(os.path.join(output_dir, 'tths_hotspot_paths.csv'), 'TTHS', convert_uid_traj=True)

# 合并前统计
print("NDTTJ:", len(ndttj_df), "NDTTT:", len(ndttt_df), "TTHS:", len(tths_df))

# 合并数据
merged_df = pd.concat([ndttj_df, ndttt_df, tths_df], ignore_index=True)

# 合并规则：同 path 合并 traj_ids 与 source，保留最大 frequency
def merge_groups(group):
    merged_traj_ids = {tid for sublist in group['traj_ids'] for tid in sublist}
    merged_sources = sorted(set(src for sources in group['source'] for src in sources))
    return pd.Series({
        'frequency': max(group['frequency']),
        'traj_ids': json.dumps(sorted(merged_traj_ids)),
        'source': merged_sources
    })

# 分组合并
merged_df = merged_df.groupby('path', group_keys=False).apply(merge_groups).reset_index()

# 输出格式转为 JSON 样式（仅最后转换）
merged_df['path'] = merged_df['path'].apply(list)

# 打印来源统计
print("路径来源统计：")
print(merged_df['source'].explode().value_counts())

# 保存合并结果
output_path = os.path.join(output_dir, 'merged_hotspot_paths.csv')
merged_df.to_csv(output_path, index=False)
print(f"合并完成，输出文件已保存：{output_path}")

NDTTJ: 478 NDTTT: 1134 TTHS: 58
路径来源统计：
source
NDTTT    1134
NDTTJ     478
TTHS       58
Name: count, dtype: int64
合并完成，输出文件已保存：../outputs/merged_hotspot_paths.csv


  merged_df = merged_df.groupby('path', group_keys=False).apply(merge_groups).reset_index()


In [30]:
#  路径级时间 & 空间特征增强
#  ---------------------------------------------------------------
#  - 输入 : merged_hotspot_paths.csv, traj_metadata.csv, nodes.csv
#  - 输出 : merged_hotspot_paths_with_time_space.csv

import os, json, ast, csv, warnings
from math import sqrt
from collections import Counter

import numpy as np
import pandas as pd
from scipy.stats import entropy

warnings.filterwarnings("ignore")

# ------------ 通用路径 ------------
OUTPUT_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "outputs"))

MERGED_FILE = os.path.join(OUTPUT_DIR, "merged_hotspot_paths.csv")
TRAJ_META_FILE = os.path.join(OUTPUT_DIR, "traj_metadata.csv")
NODES_FILE = os.path.join(OUTPUT_DIR, "nodes.csv")
TARGET_FILE = os.path.join(OUTPUT_DIR, "merged_hotspot_paths_with_time_space.csv")

# ------------ 1. 载入 3 个文件 ------------
merged_df = pd.read_csv(MERGED_FILE)
traj_meta_df = pd.read_csv(TRAJ_META_FILE)
nodes_df = pd.read_csv(NODES_FILE)

# ------------ 2. 通用解析函数 ------------
def parse_maybe_list(val):
    """兼容 json / python 表达式 / 已经是 list 的三种情况"""
    if isinstance(val, list):
        return val
    if isinstance(val, (float, int)) and np.isnan(val):
        return []
    if isinstance(val, str):
        txt = val.strip()
        for loader in (json.loads, ast.literal_eval):
            try:
                return loader(txt)
            except Exception:
                continue
    return []

for col in ["path", "traj_ids", "source"]:
    if col in merged_df.columns:
        merged_df[col] = merged_df[col].apply(parse_maybe_list)

# ------------ 3. 轨迹元数据预处理 ------------
traj_meta_df["start_time"] = pd.to_datetime(
    traj_meta_df["start_time"], errors="coerce"
)
traj_meta_df["start_hour"] = (
    traj_meta_df["start_time"].dt.hour.fillna(-1)
    + traj_meta_df["start_time"].dt.minute.fillna(0) / 60
)

traj_meta_df["traj_key"] = list(
    zip(traj_meta_df["uid"].astype(int), traj_meta_df["traj_id"].astype(int))
)
traj_hour_dict = dict(zip(traj_meta_df["traj_key"], traj_meta_df["start_hour"]))
# -------------------------------------------------------------
# 反序列化 path / traj_ids / source
# -------------------------------------------------------------
for col in ["path", "traj_ids", "source"]:
    if col in merged_df.columns:
        merged_df[col] = merged_df[col].apply(parse_maybe_list)

# 把 traj_ids 统一转换成 (uid, traj_id) 键
def to_traj_key(item):
    """
    支持三种格式:
    1) "35-13"        → (35, 13)
    2) [35, 13]       → (35, 13)
    3) (35, 13)       → (35, 13)
    解析失败返回 None
    """
    if isinstance(item, (list, tuple)) and len(item) == 2:
        try:
            return (int(item[0]), int(item[1]))
        except Exception:
            return None
    if isinstance(item, str) and "-" in item:
        a, b = item.split("-", 1)
        try:
            return (int(float(a)), int(float(b)))
        except Exception:
            return None
    return None

merged_df["traj_ids"] = merged_df["traj_ids"].apply(
    lambda lst: [k for k in (to_traj_key(x) for x in lst) if k]
)

# ------------ 4. node 坐标字典 ------------
coord_map = dict(zip(nodes_df["node_id"], zip(nodes_df["x"], nodes_df["y"])))

# ------------ 5. 辅助函数 ------------
def time_entropy(hours):
    if len(hours) <= 1:
        return 0.0
    cnt = Counter(map(int, hours))
    prob = np.array(list(cnt.values())) / sum(cnt.values())
    return round(float(entropy(prob, base=2)), 3)

def peak_period(hours):
    bins = {
        "morning_peak": range(6, 10),
        "midday": range(10, 14),
        "afternoon": range(14, 17),
        "evening_peak": range(17, 21),
        "night": list(range(21, 24)) + list(range(0, 6)),
    }
    h_cnt = Counter(map(int, hours))
    agg = {k: sum(h_cnt[h] for h in v) for k, v in bins.items()}
    return max(agg, key=agg.get) if agg else np.nan

def euclid(p1, p2):
    return sqrt((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2)

def spatial_entropy(coords):
    if len(coords) < 2:
        return 0.0
    xs = [round(x, 3) for x, _ in coords]
    ys = [round(y, 3) for _, y in coords]
    px, py = Counter(xs), Counter(ys)
    probx = np.array(list(px.values())) / sum(px.values())
    proby = np.array(list(py.values())) / sum(py.values())
    return round(float(entropy(probx, base=2) + entropy(proby, base=2)), 3)

# ------------ 6. 计算特征（向量化写入） ------------
def calc_features(row):
    # ---- 时间特征 ----
    keys = [tuple(t) for t in row["traj_ids"] if tuple(t) in traj_hour_dict]
    hours = [traj_hour_dict[k] for k in keys]

    avg_start_hour = round(float(np.mean(hours)), 2) if hours else np.nan
    t_entropy = time_entropy(hours) if hours else np.nan
    p_period = peak_period(hours) if hours else np.nan

    # ---- 空间特征 ----
    coords = [coord_map.get(n) for n in row["path"] if n in coord_map]
    coords = [c for c in coords if c is not None]

    path_length = len(row["path"])
    if len(coords) >= 2:
        spatial_len = round(
            sum(euclid(coords[i], coords[i + 1]) for i in range(len(coords) - 1)), 3
        )
        cx, cy = np.mean([c[0] for c in coords]), np.mean([c[1] for c in coords])
        s_entropy = spatial_entropy(coords)
    else:
        spatial_len, cx, cy, s_entropy = (np.nan,) * 4

    return pd.Series(
        [
            avg_start_hour,
            t_entropy,
            p_period,
            path_length,
            spatial_len,
            round(cx, 6) if not np.isnan(cx) else np.nan,
            round(cy, 6) if not np.isnan(cy) else np.nan,
            s_entropy,
        ],
        index=[
            "avg_start_hour",
            "time_entropy",
            "peak_period",
            "path_length",
            "spatial_length",
            "center_x",
            "center_y",
            "spatial_entropy",
        ],
    )

merged_df = merged_df.join(merged_df.apply(calc_features, axis=1))

# ------------ 7. 重新序列化 & 保存 ------------
for col in ["path", "traj_ids", "source"]:
    merged_df[col] = merged_df[col].apply(json.dumps)

merged_df.to_csv(TARGET_FILE, index=False, quoting=csv.QUOTE_NONNUMERIC)
print(f"路径时空特征已写入: {TARGET_FILE}")

路径时空特征已写入: /Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/outputs/merged_hotspot_paths_with_time_space.csv


In [7]:
# 节点级 POI 信息提取（高德地图 API）
# 功能：读取 nodes.csv（含 node_id, x, y），调用逆地理编码 API
# 输出 nodes_with_poi.csv，含 POI 名称、类型、地址

AMAP_API_KEY = '' # 自行申请
INPUT_FILE = os.path.join('..', 'outputs', 'nodes.csv')
OUTPUT_FILE = os.path.join('..', 'outputs', 'nodes_with_poi.csv')
CACHE_FILE = os.path.join('..', 'outputs', 'poi_cache.csv')
SLEEP_INTERVAL = 0.5

if os.path.exists(CACHE_FILE):
    poi_cache = pd.read_csv(CACHE_FILE).set_index('node_id').to_dict('index')
else:
    poi_cache = {}

nodes_df = pd.read_csv(INPUT_FILE)

# API 请求函数
def query_poi(x, y):
    url = f"https://restapi.amap.com/v3/geocode/regeo"
    params = {
        'location': f"{x},{y}",
        'key': AMAP_API_KEY,
        'output': 'json',
        'radius': 100,
        'extensions': 'all'
    }
    try:
        response = requests.get(url, params=params, timeout=5)
        data = response.json()
        if 'regeocode' in data and 'pois' in data['regeocode'] and len(data['regeocode']['pois']) > 0:
            poi = data['regeocode']['pois'][0]  # 取第一个最相关的POI
            return poi.get('name', None), poi.get('type', None), poi.get('address', None)
        else:
            return None, None, None
    except Exception as e:
        print(f"请求失败: {e}")
        return None, None, None

# 遍历节点并提取 POI
results = []

for row in nodes_df.itertuples():
    nid = row.node_id
    x, y = row.x, row.y

    if nid in poi_cache:
        info = poi_cache[nid]
    else:
        name, typ, addr = query_poi(x, y)
        info = {'poi_name': name, 'poi_type': typ, 'poi_address': addr}
        poi_cache[nid] = info
        time.sleep(SLEEP_INTERVAL)

    results.append({
        'node_id': nid,
        'x': x,
        'y': y,
        'poi_name': info['poi_name'],
        'poi_type': info['poi_type'],
        'poi_address': info['poi_address']
    })

poi_df = pd.DataFrame(results)
poi_df.to_csv(OUTPUT_FILE, index=False)
print(f"POI提取完成，共保存 {len(poi_df)} 条节点信息：{OUTPUT_FILE}")

cache_df = pd.DataFrame.from_dict(poi_cache, orient='index')
cache_df.index.name = 'node_id'
cache_df.reset_index().to_csv(CACHE_FILE, index=False)
print(f"缓存已更新：{CACHE_FILE}")


POI提取完成，共保存 247 条节点信息：../outputs/nodes_with_poi.csv
缓存已更新：../outputs/poi_cache.csv


In [31]:
# 路径级时间 + 空间 + 语义（POI）特征增强模块
# 输入：
# - merged_hotspot_paths_with_time_space.csv
# - nodes_with_poi.csv（含 node_id, poi_type, ...）
# 输出：
# - merged_hotspot_paths_with_time_space_semantic.csv

import os, json, warnings, numpy as np, pandas as pd
from collections import Counter
from scipy.stats import entropy

warnings.filterwarnings("ignore")
CWD = os.getcwd()
output_dir = os.path.abspath(
    os.path.join(CWD, "..", "outputs")
)

FILE_TIME_SPACE = os.path.join(
    output_dir, "merged_hotspot_paths_with_time_space.csv"
)
FILE_POI = os.path.join(output_dir, "nodes_with_poi.csv")

# ---------- 读取 ----------
df  = pd.read_csv(FILE_TIME_SPACE)
poi = pd.read_csv(FILE_POI)

# ---- 反序列化 path / traj_ids（确保是 list）----
if "path" in df.columns and df["path"].dtype == object:
    df["path"] = df["path"].apply(json.loads)

if "traj_ids" in df.columns and df["traj_ids"].dtype == object:
    df["traj_ids"] = df["traj_ids"].apply(json.loads)

# ---------- 处理 POI 表 ----------
# nodes_with_poi.csv 至少要有：
#   node_id, poi_type
# 若 poi_type 多级（例：美食;快餐;汉堡） → 只取第一级
def pick_main_cat(poi_type):
    if pd.isna(poi_type):
        return None
    return poi_type.split(";")[0]

poi["main_poi"] = poi["poi_type"].apply(pick_main_cat)

# node_id -> 一级 POI
node2poi = dict(zip(poi["node_id"], poi["main_poi"]))

# ---------- 辅助函数 ----------
def poi_entropy(pois):
    """离散熵，衡量 POI 多样性"""
    if not pois:
        return np.nan
    c = Counter(pois)
    prob = np.array(list(c.values())) / prob.sum() if (prob := np.array(list(c.values()))) .sum() else prob
    return round(float(entropy(prob, base=2)), 3) if len(prob) > 1 else 0.0

# ---------- 遍历路径 & 计算 ----------
sem_rows = []

for row in df.itertuples():
    pois = [node2poi[n] for n in row.path if n in node2poi and node2poi[n] is not None]
    if pois:
        sem_rows.append({
            "path"        : json.dumps(row.path, ensure_ascii=False),
            "poi_types"   : json.dumps(sorted(set(pois)), ensure_ascii=False),
            "dominant_poi": Counter(pois).most_common(1)[0][0],
            "poi_entropy" : poi_entropy(pois)
        })
    else:                          # 找不到任何 POI
        sem_rows.append({
            "path"        : json.dumps(row.path, ensure_ascii=False),
            "poi_types"   : "[]",
            "dominant_poi": None,
            "poi_entropy" : np.nan
        })

sem_df = pd.DataFrame(sem_rows)

# ---------- 合并 & 输出 ----------
# 注意：df 里的 path 列目前是 list，需要转为 JSON 字符串才能 on='path' merge
df["path_json"]  = df["path"].apply(lambda x: json.dumps(x, ensure_ascii=False))
sem_df.rename(columns={"path": "path_json"}, inplace=True)

merged_sem = pd.merge(df.drop(columns=["path"]), sem_df, on="path_json", how="left")

# 把 path_json 改回名为 path，保持和之前格式一致
merged_sem.rename(columns={"path_json": "path"}, inplace=True)

TARGET_FILE = os.path.join(output_dir, "merged_hotspot_paths_with_time_space_semantic.csv")
merged_sem.to_csv(TARGET_FILE, index=False, encoding="utf-8-sig")
print(f"语义特征已追加，文件保存到：{TARGET_FILE}\n共 {len(merged_sem)} 条路径")

语义特征已追加，文件保存到：/Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/outputs/merged_hotspot_paths_with_time_space_semantic.csv
共 1134 条路径


In [32]:
import os, json, warnings, pandas as pd
import numpy as np

warnings.filterwarnings("ignore", category=FutureWarning)

OUTPUT_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "outputs"))

SRC_FILE = os.path.join(OUTPUT_DIR,
                        "merged_hotspot_paths_with_time_space_semantic.csv")
DST_FILE = os.path.join(OUTPUT_DIR, "cleaned_paths.csv")

# ------------- 读取 -------------
df = pd.read_csv(SRC_FILE)

# ------------- 基础清洗 -------------
# 1) 去掉空 path 或 path_len<=1
df["path"] = df["path"].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
df["path_length"] = df["path"].apply(len)
df = df[df["path_length"] > 1].reset_index(drop=True)

# 2) dominant_poi / poi_types 缺失填充
df["dominant_poi"] = df["dominant_poi"].fillna("无")

# 保证 poi_types 为合法 JSON 字符串
def safe_poi_types(x):
    if pd.isna(x) or x == "" or x == "[]":
        return "[]"
    if isinstance(x, list):
        return json.dumps(x, ensure_ascii=False)
    try:
        _ = json.loads(x)
        return x
    except Exception:
        # 粗糙字符串，例如 "[商场,住宅]" → 转成真 JSON
        items = [s.strip(" '\"") for s in x.strip("[]").split(",") if s.strip()]
        return json.dumps(items, ensure_ascii=False)

df["poi_types"] = df["poi_types"].apply(safe_poi_types)

# ------------- IQR 修剪 spatial_length -------------
# 有些行 spatial_length 可能 NaN —— 先丢再算分位数
df_nonan = df.dropna(subset=["spatial_length"])
q1, q3 = df_nonan["spatial_length"].quantile([0.25, 0.75])
iqr = q3 - q1
upper = q3 + 1.5 * iqr
lower = max(0, q1 - 1.5 * iqr)

df = df[(df["spatial_length"].isna()) |  # 保留无法计算长度的
        ((df["spatial_length"] >= lower) & (df["spatial_length"] <= upper))] \
        .reset_index(drop=True)

# ------------- 再序列化 JSON 列 -------------
json_cols = ["path", "traj_ids", "source"]
for col in json_cols:
    df[col] = df[col].apply(lambda x: json.dumps(x, ensure_ascii=False)
                            if not isinstance(x, str) else x)

# ------------- 保存 -------------
df.to_csv(DST_FILE, index=False, encoding="utf-8")
print(f"高级清洗完成 -> {DST_FILE}")
print(f"剩余路径数: {len(df)}")

高级清洗完成 -> /Users/chenenqiang/Desktop/Undergraduate Life/Undergraduate Life/创新实验2025春/FrequentPatternMiningBasedOnHotspotTrajectories/DataPreprocess/outputs/cleaned_paths.csv
剩余路径数: 1106
