# Segment Data
Segment trajectories to segment_data.pkl

In [52]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import LineString

# 获取当前脚本的路径
current_dir = os.getcwd()

# 文件夹路径
data_path = os.path.join(current_dir, '..', 'output')

# 加载数据
filtered_data = pd.read_pickle(data_path + '/filtered_data.pkl')
filtered_data['post_time'] = pd.to_datetime(filtered_data['post_time'])
filtered_data.sort_values(['mmsi', 'post_time'], inplace=True)

# 计算每个数据点与前一个数据点的时间差
filtered_data['time_diff'] = filtered_data.groupby('mmsi')['post_time'].diff()

# 标记超过30分钟的时间差为新轨迹段的开始
filtered_data['new_segment'] = filtered_data['time_diff'] > pd.Timedelta(minutes=30)

# 确保每个 mmsi 的第一个点标记为新轨迹的开始
filtered_data.loc[0, 'new_segment'] = True

# 生成轨迹段ID
filtered_data['segment_id'] = filtered_data['new_segment'].cumsum()

In [54]:
import geopandas as gpd
from shapely.geometry import LineString

# 创建 GeoDataFrame
gdf = gpd.GeoDataFrame(filtered_data, geometry=gpd.points_from_xy(filtered_data.longitude, filtered_data.latitude))

# 设置 CRS (假设是 WGS84 坐标系)
gdf.crs = "EPSG:4326"

# 转换为线形 GeoDataFrame
lines = gdf.groupby(['mmsi', 'segment_id']).apply(
    lambda x: LineString(x['geometry'].tolist()) if len(x) > 1 else None, include_groups=False
).dropna()

# 重新生成GeoDataFrame并设置几何列和CRS
line_gdf = gpd.GeoDataFrame(lines, columns=['geometry']).reset_index()
line_gdf = line_gdf.set_geometry('geometry')

# 确保MMSI是字符串
line_gdf['mmsi'] = line_gdf['mmsi'].astype(str)

# 如果有需要, 可以设置CRS
line_gdf.crs = gdf.crs

  lines = gdf.groupby(['mmsi', 'segment_id']).apply(


In [56]:
# 保存 segment_data.pkl
filtered_data.to_pickle(data_path + '/segmented_data.pkl')

# 保存 GeoDataFrame
line_gdf.to_file(os.path.join(data_path + "/saved_trajectories.shp"))

In [47]:
filtered_data.head(10)

Unnamed: 0,mmsi,longitude,latitude,cog,speed,post_time,time_diff,new_segment,segment_id
103887,0.0,106.651237,29.594469,911.0,72.0,2023-05-23 15:11:57,NaT,False,0
471128,0.0,106.586693,29.576618,0.0,0.0,2023-06-01 06:11:53,8 days 14:59:56,True,1
472790,0.0,106.586655,29.576612,0.0,0.0,2023-06-01 06:48:10,0 days 00:36:17,True,2
515656,0.0,106.58667,29.576601,0.0,0.0,2023-06-02 01:46:06,0 days 18:57:56,True,3
540568,0.0,106.58667,29.57662,0.0,0.0,2023-06-02 13:13:09,0 days 11:27:03,True,4
569819,0.0,106.586655,29.576611,0.0,0.0,2023-06-03 01:40:38,0 days 12:27:29,True,5
623242,0.0,106.586693,29.576611,0.0,0.0,2023-06-04 05:48:09,1 days 04:07:31,True,6
2276479,0.0,106.627975,29.594999,0.0,0.0,2023-08-28 15:45:45,85 days 09:57:36,True,7
2276999,0.0,106.622787,29.612057,3332.0,214.0,2023-08-28 15:57:41,0 days 00:11:56,False,7
2724105,0.0,106.64743,29.591801,701.0,254.0,2023-09-06 10:18:17,8 days 18:20:36,True,8


In [41]:
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16922954 entries, 103887 to 3470645
Data columns (total 9 columns):
 #   Column       Dtype          
---  ------       -----          
 0   mmsi         float64        
 1   longitude    float32        
 2   latitude     float32        
 3   cog          float32        
 4   speed        float32        
 5   post_time    datetime64[ns] 
 6   time_diff    timedelta64[ns]
 7   new_segment  bool           
 8   segment_id   int64          
dtypes: bool(1), datetime64[ns](1), float32(4), float64(1), int64(1), timedelta64[ns](1)
memory usage: 1.4 GB


In [63]:
print(filtered_data['longitude'].describe())
print(filtered_data['latitude'].describe())

count    1.692295e+07
mean     1.066427e+02
std      2.123554e-02
min      1.065000e+02
25%      1.066258e+02
50%      1.066350e+02
75%      1.066642e+02
max      1.067000e+02
Name: longitude, dtype: float64
count    1.692295e+07
mean     2.959095e+01
std      1.430824e-02
min      2.950000e+01
25%      2.957572e+01
50%      2.959253e+01
75%      2.960149e+01
max      2.969981e+01
Name: latitude, dtype: float64


In [60]:
line_gdf.head(10)

Unnamed: 0,mmsi,segment_id,geometry
0,0.0,7,"LINESTRING (106.62798 29.595, 106.62279 29.61206)"
1,0.0,9,"LINESTRING (106.64959 29.59242, 106.62792 29.5..."
2,0.0,10,"LINESTRING (106.60794 29.62096, 106.62727 29.5..."
3,18907.0,10,"LINESTRING (106.57319 29.56624, 106.57195 29.5..."
4,18907.0,11,"LINESTRING (106.58002 29.57122, 106.58012 29.5..."
5,18907.0,12,"LINESTRING (106.5806 29.57032, 106.5806 29.57032)"
6,18907.0,13,"LINESTRING (106.59078 29.61503, 106.59078 29.6..."
7,18907.0,14,"LINESTRING (106.5853 29.6074, 106.58534 29.607..."
8,18907.0,15,"LINESTRING (106.58031 29.5699, 106.58031 29.5699)"
9,18907.0,16,"LINESTRING (106.57358 29.56673, 106.56768 29.5..."


In [59]:
line_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 36509 entries, 0 to 36508
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   mmsi        36509 non-null  object  
 1   segment_id  36509 non-null  int64   
 2   geometry    36509 non-null  geometry
dtypes: geometry(1), int64(1), object(1)
memory usage: 855.8+ KB
