In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
# Base directory under which every input and output data file lives
ABSOLUTE_PATH = "C:\\Users\\rudnf\\vscode\\Graduation\\final\\data\\"
# Zero-padded month labels "01".."12"; they are spliced into the monthly GPS file names
MONTHS = [f"{month_no:02d}" for month_no in range(1, 13)]
# Season names, in the order spring -> winter
SEASONS = "spring summer fall winter".split()
# K values that appear in the POI cluster file names (..._cluster_<K>.csv)
KS = list(range(25, 101, 25))

In [4]:
def season_match(season):
    """Return the zero-padded month labels that make up ``season``.

    Parameters
    ----------
    season : str
        One of "spring", "summer", "fall" or "winter".

    Returns
    -------
    list of str
        The three month labels of the season (winter is "12", "01", "02").
        Any other value yields an empty list.
    """
    # The table is rebuilt on every call, so callers always get a fresh list.
    months_by_season = {
        "spring": ["03", "04", "05"],
        "summer": ["06", "07", "08"],
        "fall": ["09", "10", "11"],
        "winter": ["12", "01", "02"],
    }
    return months_by_season.get(season, [])

In [5]:
def load_POI_cluster_data(path):
    """Read a POI cluster CSV and return its coordinates labelled by POI name.

    Parameters
    ----------
    path : str or file-like
        Anything ``pandas.read_csv`` accepts; the file must contain
        ``longitude`` and ``latitude`` columns.

    Returns
    -------
    pandas.DataFrame
        Two columns (``longitude``, ``latitude``) indexed by the labels
        ``POI0``, ``POI1``, ... in file order.
    """
    coordinates = pd.read_csv(path)[["longitude", "latitude"]]
    # Replace the default integer index with readable POI labels.
    coordinates.index = ["POI" + str(position) for position in range(len(coordinates))]
    return coordinates

In [6]:
def load_GPS_data(path):
    """Read a GPS CSV and return it ordered by collection time.

    Parameters
    ----------
    path : str or file-like
        Anything ``pandas.read_csv`` accepts; the file must contain a
        ``collection_dt`` column holding timestamps as text.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``collection_dt`` parsed to datetimes,
        sorted ascending by that column and re-indexed from 0.
    """
    # NOTE(review): an earlier comment here said milliseconds are discarded,
    # but nothing in this function truncates the parsed timestamps.
    parsed = pd.read_csv(path).assign(
        collection_dt=lambda frame: pd.to_datetime(frame['collection_dt'])
    )
    ordered = parsed.sort_values(by='collection_dt', ascending=True)
    return ordered.reset_index(drop=True)

## GPS 데이터로 Trajectory 만들기
    trajectory : (trajectory_id, start_point, end_point, POI_sequence, time_period_sequence)

In [7]:
def _map_group_time_period(rows):
    time_values = rows['collection_dt'].dt.time.values
    time_period_values = {
        'dawn' : datetime.time(6, 0, 0),
        'morning' : datetime.time(12, 0, 0),
        'afternoon' : datetime.time(18, 0, 0),
        'night' : datetime.time(0, 0, 0),
    }
    
    conditions = [
        time_values < time_period_values['dawn'],
        (time_values >= time_period_values['dawn']) & (time_values < time_period_values['morning']),
        (time_values >= time_period_values['morning']) & (time_values < time_period_values['afternoon']),
        (time_values >= time_period_values['afternoon']) | (time_values == time_period_values['night'])
    ]

    rows['time_period'] = np.select(conditions, time_period_values.keys())
        
    return rows

In [8]:
def _haversine_distance(GPS_lon, GPS_lat, locations_lon, locations_lat):
    R = 6371  # 지구의 반지름 (단위: km)
    
    # 모든 rows와 POI의 위도 경도를 벡터화
    GPS_lon_rad = np.radians(GPS_lon)
    GPS_lat_rad = np.radians(GPS_lat)
    POI_lon_rad = np.radians(locations_lon)
    POI_lat_rad = np.radians(locations_lat)
    
    
    diff_lon = POI_lon_rad - GPS_lon_rad
    diff_lat = POI_lat_rad - GPS_lat_rad

    a = np.sin(diff_lon/2)**2 + np.cos(GPS_lat_rad) * np.cos(POI_lat_rad) * np.sin(diff_lat/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    distance = R * c

    return distance

In [9]:
def _calculate_POI_sequence(rows, locations):
    """Map GPS rows to POI labels, keeping only rows that lie near a POI.

    Parameters
    ----------
    rows : pandas.DataFrame
        GPS rows with ``longitude`` and ``latitude`` columns.
    locations : pandas.DataFrame
        POIs with ``longitude`` and ``latitude`` columns; its index supplies
        the labels that are returned.

    Returns
    -------
    list
        One POI label per GPS row that has at least one POI within distance
        1 (kilometres, as returned by ``_haversine_distance``), in row order.
        Rows without such a POI contribute nothing, so the result can be
        shorter than ``rows``.
    """
    # Column vectors (n, 1) against POI vectors (m,) broadcast to an (n, m) matrix.
    gps_lon_column = rows['longitude'].values[:, np.newaxis]
    gps_lat_column = rows['latitude'].values[:, np.newaxis]
    distance_matrix = _haversine_distance(
        gps_lon_column,
        gps_lat_column,
        locations['longitude'].values,
        locations['latitude'].values,
    )

    matched_rows, matched_pois = np.where(distance_matrix <= 1)
    # np.where lists hits row by row, so the first hit of each row is the
    # lowest-numbered POI within range -- not necessarily the nearest one.
    _, first_hit_positions = np.unique(matched_rows, return_index=True)

    return [locations.index[poi_position] for poi_position in matched_pois[first_hit_positions]]

In [10]:
# v.0.5.0 : _remove_duplicated_path에서 시간대가 걸쳐있는 path는 중복된 POI를 제거 x
def _remove_duplicated_rows(path, rows):
    removed_POI_sequence = []
    removed_time_period_sequence = []

    prev_location = path[0]
    start_idx = 0
    end_idx = len(path)-1

    for idx, location in enumerate(path):

        if (location != prev_location) or (idx == len(path)-1) :
            end_idx = idx

            same_POI_rows = rows.iloc[start_idx:end_idx]
            time_period_unique = same_POI_rows['time_period'].unique()
            
            if len(time_period_unique) != 1:
                for each_time_period in time_period_unique:
                    removed_POI_sequence.append(prev_location)
                    removed_time_period_sequence.append(each_time_period)
            else:
                removed_POI_sequence.append(prev_location)
                removed_time_period_sequence.append(time_period_unique.item())
            
            prev_location = location
            start_idx = idx       

    return removed_POI_sequence, removed_time_period_sequence

In [11]:
def make_trajectory(df, locations):
    """Build one POI trajectory per (year, month, day, oid) group of GPS rows.

    Parameters
    ----------
    df : pandas.DataFrame
        GPS rows with a datetime ``collection_dt`` column, an ``oid`` column
        and the ``longitude`` / ``latitude`` columns used by the helpers.
    locations : pandas.DataFrame
        POI coordinates indexed by POI label.

    Returns
    -------
    pandas.DataFrame
        Columns ``trajectory_id`` (the group key), ``start_point``,
        ``end_point``, ``POI_sequence`` and ``time_period_sequence``; one row
        per kept group, empty when no group qualifies.

    A group is dropped when it has 100 or fewer GPS rows, when none of its
    rows maps to a POI, or when its de-duplicated POI sequence has fewer than
    5 entries.
    """
    column_list = ['trajectory_id', 'start_point', 'end_point', 'POI_sequence', 'time_period_sequence']

    # Group the GPS rows by (year, month, day, oid).
    grouped = df.groupby([df['collection_dt'].dt.year,
                          df['collection_dt'].dt.month,
                          df['collection_dt'].dt.day,
                          df['oid']])

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop is quadratic and warns on
    # empty frames in recent pandas versions.
    records = []

    for group_key, rows in grouped:

        # Groups with 100 or fewer GPS rows are assumed too sparse to give a
        # meaningful route.
        if len(rows) <= 100:
            continue

        # A group can contain several rows with the same collection_dt; keep
        # the first. A fresh copy is taken instead of mutating the groupby
        # slice in place, because the next helper adds a column to it.
        rows = rows.drop_duplicates(subset=['collection_dt']).copy()

        # Attach the time_period label to every row of the group.
        rows = _map_group_time_period(rows)

        path = _calculate_POI_sequence(rows, locations)

        if len(path) == 0:
            continue

        # NOTE(review): `path` only has entries for rows that matched a POI,
        # while `rows` still holds every row; the helper below pairs them by
        # position -- confirm this is intended.
        removed_POI_sequence, removed_time_period_sequence = _remove_duplicated_rows(path, rows)

        # Drop trajectories whose final length is below 5.
        if len(removed_POI_sequence) <= 4:
            continue

        records.append({
            'trajectory_id': group_key,
            'start_point': removed_POI_sequence[0],
            'end_point': removed_POI_sequence[-1],
            'POI_sequence': removed_POI_sequence,
            'time_period_sequence': removed_time_period_sequence,
        })

    return pd.DataFrame(records, columns=column_list)

## 계절별 Trajectory 생성

In [13]:
# Build and save one trajectory file per (season, month, K).
for season in SEASONS:
    for month in season_match(season):
        # The monthly GPS file does not depend on K, so it is read once per
        # month instead of once per (month, K) pair.
        df = load_GPS_data(ABSOLUTE_PATH + f"GPS_data\\month_{month}.csv")

        for k in KS:
            print(f"month : {month}, K : {k}")
            locations = load_POI_cluster_data(ABSOLUTE_PATH + f"stay_point\\cluster_data\\{season}\\{season}_cluster_{k}.csv")

            trajectories = make_trajectory(df, locations)

            trajectories.to_csv(ABSOLUTE_PATH + f"trajectory\\{season}\\trajectory_{month}_cluster_{k}.csv", index=False)

month : 03, K : 25
month : 03, K : 50
month : 03, K : 75
month : 03, K : 100
month : 04, K : 25
month : 04, K : 50
month : 04, K : 75
month : 04, K : 100
month : 05, K : 25
month : 05, K : 50
month : 05, K : 75
month : 05, K : 100
month : 06, K : 25
month : 06, K : 50
month : 06, K : 75
month : 06, K : 100


## 전체 Trajectory 생성

In [12]:
# Load all twelve monthly GPS files and stack them into a single frame.
df_list = [
    load_GPS_data(ABSOLUTE_PATH + f"GPS_data\\month_{month}.csv")
    for month in MONTHS
]
# Each monthly frame is indexed from 0; ignore_index gives the combined frame
# a unique index instead of twelve overlapping ones.
df = pd.concat(df_list, ignore_index=True)

# Build and save one whole-year trajectory file per K.
for k in KS:
    print(f"K : {k}")
    locations = load_POI_cluster_data(ABSOLUTE_PATH + f"stay_point\\cluster_data\\total\\total_cluster_{k}.csv")

    trajectories = make_trajectory(df, locations)

    trajectories.to_csv(ABSOLUTE_PATH + f"trajectory\\total\\trajectory_total_cluster_{k}.csv", index=False)

K : 25
K : 50
K : 75
K : 100
