# Segment Data
Segment trajectories to segment_data.pkl

In [4]:
import os
import pandas as pd

# 获取当前脚本的路径
current_dir = os.getcwd()

# 指定主目录路径
source_data_path = os.path.join(current_dir, '..', 'data')

# 输出文件夹路径
segment_data_path = os.path.join(current_dir, '..', 'output')

# 加载数据
filtered_data = pd.read_pickle(source_data_path + '/combined_data.pkl')
filtered_data['post_time'] = pd.to_datetime(filtered_data['post_time'])
filtered_data.sort_values(['mmsi', 'post_time'], inplace=True)

# 计算每个数据点与前一个数据点的时间差
filtered_data['time_diff'] = filtered_data.groupby('mmsi')['post_time'].diff()

# 标记超过30分钟的时间差为新轨迹段的开始
filtered_data['new_segment'] = filtered_data['time_diff'] > pd.Timedelta(minutes=30)

# 确保第一个数据点每个mmsi也被标记为新轨迹的开始
filtered_data.loc[0, 'new_segment'] = True
filtered_data['new_segment'] = filtered_data['new_segment'].fillna(True)

# 生成轨迹段ID
filtered_data['segment_id'] = filtered_data['new_segment'].cumsum()

In [6]:
filtered_data.to_pickle(segment_data_path + '/segmented_data.pkl')

In [5]:
filtered_data.head(10)

Unnamed: 0,mmsi,longitude,latitude,cog,speed,post_time,time_diff,new_segment,segment_id
103887,0.0,106.651237,29.594469,911.0,72.0,2023-05-23 15:11:57,NaT,False,0
471128,0.0,106.586693,29.576618,0.0,0.0,2023-06-01 06:11:53,8 days 14:59:56,True,1
472790,0.0,106.586655,29.576612,0.0,0.0,2023-06-01 06:48:10,0 days 00:36:17,True,2
515656,0.0,106.58667,29.576601,0.0,0.0,2023-06-02 01:46:06,0 days 18:57:56,True,3
540568,0.0,106.58667,29.57662,0.0,0.0,2023-06-02 13:13:09,0 days 11:27:03,True,4
569819,0.0,106.586655,29.576611,0.0,0.0,2023-06-03 01:40:38,0 days 12:27:29,True,5
623242,0.0,106.586693,29.576611,0.0,0.0,2023-06-04 05:48:09,1 days 04:07:31,True,6
2276479,0.0,106.627975,29.594999,0.0,0.0,2023-08-28 15:45:45,85 days 09:57:36,True,7
2276999,0.0,106.622787,29.612057,3332.0,214.0,2023-08-28 15:57:41,0 days 00:11:56,False,7
2724105,0.0,106.64743,29.591801,701.0,254.0,2023-09-06 10:18:17,8 days 18:20:36,True,8
