In [8]:
import pandas as pd
import networkx as nx
from scipy.spatial import KDTree
import matplotlib.pyplot as plt
from tqdm import tqdm
from joblib import Parallel, delayed
from folium.plugins import TimestampedGeoJson
from geopy.distance import geodesic
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame
import time
import numpy as np
import folium

## Useful support function

In [2]:
def create_plot(data):
    # Ambil baris pertama dari DataFrame
    first_row = data.iloc[0]

    # Ambil nilai latitude dan longitude dari baris pertama
    latitude = first_row['latitude']
    longitude = first_row['longitude']
    
    m = folium.Map(location=[latitude, longitude], zoom_start=25)

    # Add CircleMarkers for each point
    for index, row in data.iterrows():
        folium.CircleMarker(
            location=[row["latitude"], row["longitude"]],
            radius=5,  # Marker size
            color="blue",  # Marker color
            fill=True,
            fill_color="blue",  # Fill color of the marker
            fill_opacity=0.7,  # Opacity of the marker fill
            popup=f"User ID: {row['maid']}<br>Latitude: {row['latitude']}<br>Longitude: {row['longitude']}",
        ).add_to(m)
    
    return m

In [3]:
def create_pivot(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df_pivot = df['maid'].groupby(df['timestamp'].dt.date).value_counts()
    pivot = df_pivot.unstack().fillna(0).astype(int)

    total_counts = pivot.sum(axis=0)
    sorted_columns = total_counts.sort_values(ascending=False).index
    pivot_sorted = pivot[sorted_columns]
    return pivot_sorted

In [4]:
def create_line(data):
    data = data.to_crs('EPSG:4326')
    data = data['geometry']
    # Buat peta dengan lokasi awal berdasarkan rata-rata koordinat dari data linestring
    avg_lat = data.apply(lambda x: x.centroid.y).mean()
    avg_lon = data.apply(lambda x: x.centroid.x).mean()
    m = folium.Map(location=[avg_lat, avg_lon], zoom_start=15)

    # Tambahkan polyline untuk setiap linestring
    for linestring in data:
        coordinates = [(lat, lon) for lon, lat in linestring.coords]
        folium.PolyLine(locations=coordinates, color='purple', weight=8, opacity=0.7).add_to(m)

    # Tambahkan GeoJson dari data GeoPandas
    folium.GeoJson(data.to_json(), name='Garis Jalan').add_to(m)

    return m

## Modified process_group function to ensure each GPS data addition follows the node sequence of the road network graph

In [5]:
# Reading the gps CSV file into a DataFrame
gps = pd.read_csv('./filter4_20m_60min_immobility_malar_des.csv')
gps.head()

Unnamed: 0,maid,latitude,longitude,timestamp,adjusted_longitude,adjusted_latitude
0,00012afc-6daf-461f-96a8-181e5af69db9,-7.800291,110.364998,2021-12-01 17:54:11,110.364996,-7.80021
1,00012afc-6daf-461f-96a8-181e5af69db9,-7.8003,110.364998,2021-12-01 17:54:22,110.364996,-7.80021
2,00012afc-6daf-461f-96a8-181e5af69db9,-7.8003,110.364998,2021-12-01 17:54:22,110.364996,-7.80021
3,000136a6-c76e-4f76-841d-1329727ad906,-7.78573,110.36664,2021-12-06 18:38:17,110.366636,-7.785796
4,000136a6-c76e-4f76-841d-1329727ad906,-7.78573,110.366638,2021-12-08 12:08:42,110.366636,-7.785796


In [6]:
# Modified process_group function to ensure each GPS data addition follows the node sequence of the road network graph
def process_group(group, graph):
    # Map Matching (Assumed to be Accurate)
    mapped_points_group = group[['adjusted_longitude', 'adjusted_latitude']].values
    
    # Initialize the resulting path and timestamps
    resulting_path_group = [tuple(mapped_points_group[0])]
    new_timestamps = [group['timestamp'].values[0]]
    
    prev_node = tuple(mapped_points_group[0])
    
    for i in range(1, len(mapped_points_group)):
        current_node = tuple(mapped_points_group[i])
        
        # Use Shortest Path on Road Graph to get the sequence of nodes from prev_node to current_node
        path = find_path_on_graph(graph, prev_node, current_node)
        
        # Extend the resulting path with nodes from the graph (no points outside the graph)
        # This also ensures that the added GPS data follows the node sequence of the road network graph
        resulting_path_group.extend(path[1:])
        
        # Time Interpolation (Assuming the path only contains points from the graph)
        if len(path) > 2:
            time_start = group['timestamp'].values[i-1]
            time_end = group['timestamp'].values[i]
            time_duration = (time_end - time_start) // np.timedelta64(1, 's')
            
            num_new_points = len(path) - 2
            time_interval = time_duration // num_new_points if num_new_points > 0 else 0
            
            # Adding validation for interpolated timestamps
            if time_interval < 0:
                print("Negative time interval found, skipping this part.")
                continue
            
            interpolated_timestamps = [time_start + np.timedelta64(int(time_interval * j), 's') for j in range(1, num_new_points + 1)]
            new_timestamps.extend(interpolated_timestamps)
        
        # Append the timestamp of the current node
        new_timestamps.append(group['timestamp'].values[i])
        
        # Update prev_node for the next iteration
        prev_node = current_node
    
    # Use the original 'maid' values
    maids = list(group['maid'].values) + [group['maid'].values[-1]] * (len(resulting_path_group) - len(group))
    
    return [(maids[i], point[1], point[0], new_timestamps[i]) for i, point in enumerate(resulting_path_group)]

In [7]:
# change format to datetime just to make sure
gps['timestamp'] = pd.to_datetime(gps['timestamp'])

grouped_data = gps.groupby(['maid', gps['timestamp'].dt.date])

start = time.time()
processed_results = Parallel(n_jobs=40)(delayed(process_group)(group, G) for _, group in tqdm(grouped_data))
end = time.time()

flattened_results = [item for sublist in processed_results for item in sublist]
resulting_data = pd.DataFrame(flattened_results, columns=['maid', 'latitude', 'longitude', 'timestamp'])

  0%|                                                 | 0/72781 [00:00<?, ?it/s]

NameError: name 'G' is not defined

In [None]:
print("The time of execution of above program is:", (end - start) / 1000, "s")

In [None]:
# Write DataFrame to CSV file
resulting_data.to_csv("filter5_20m_60min_immobility_adjusted_malar_des.csv", index=False)