In [1]:
import pandas as pd
import networkx as nx
from scipy.spatial import KDTree
import matplotlib.pyplot as plt
from tqdm import tqdm
from joblib import Parallel, delayed
from folium.plugins import TimestampedGeoJson
from geopy.distance import geodesic
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame
import time
import numpy as np
import folium

## Useful support functions

In [2]:
def create_plot(data):
    """Plot every row of ``data`` as a blue circle marker on a folium map.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``latitude``, ``longitude`` and ``maid`` columns and at least
        one row, because the map is centred on the first row (``iloc[0]``).

    Returns
    -------
    folium.Map
        Map holding one ``CircleMarker`` (with an informational popup) per row.
    """
    # Appearance shared by all markers: size, outline colour and fill.
    marker_style = dict(
        radius=5,
        color="blue",
        fill=True,
        fill_color="blue",
        fill_opacity=0.7,
    )

    # Centre the initial view on the first observation.
    origin = data.iloc[0]
    m = folium.Map(location=[origin['latitude'], origin['longitude']], zoom_start=25)

    # One marker per observation; the popup shows the id and the coordinates.
    for _, point in data.iterrows():
        popup_text = f"User ID: {point['maid']}<br>Latitude: {point['latitude']}<br>Longitude: {point['longitude']}"
        marker = folium.CircleMarker(
            location=[point["latitude"], point["longitude"]],
            popup=popup_text,
            **marker_style,
        )
        marker.add_to(m)

    return m

In [3]:
def create_pivot(df):
    """Count observations per calendar day and per ``maid``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``maid`` column and a ``timestamp`` column that
        ``pd.to_datetime`` can parse. The frame is NOT modified: the
        timestamps are parsed into a local Series instead of being written
        back into the caller's DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per ``maid``, integer counts (0 where a
        maid has no observation that day). Columns are ordered by their
        total count, largest first.
    """
    # Parse locally so the caller's 'timestamp' column keeps its dtype.
    dates = pd.to_datetime(df['timestamp']).dt.date

    daily_counts = df['maid'].groupby(dates).value_counts()
    pivot = daily_counts.unstack().fillna(0).astype(int)

    # Most active maids first.
    column_order = pivot.sum(axis=0).sort_values(ascending=False).index
    return pivot[column_order]

In [4]:
def create_line(data):
    """Draw the line geometries of ``data`` on a folium map.

    Parameters
    ----------
    data : object exposing ``to_crs`` and a ``geometry`` column
        Presumably a geopandas GeoDataFrame of LineStrings (confirm against
        the caller); it is reprojected to EPSG:4326 before plotting.

    Returns
    -------
    folium.Map
        Map centred on the mean centroid of the geometries, with every line
        drawn both as a purple ``PolyLine`` and as part of a ``GeoJson``
        layer named 'Garis Jalan'.
    """
    lines = data.to_crs('EPSG:4326')['geometry']

    # Start location: average of the centroid coordinates of all lines.
    centroid_lats = lines.apply(lambda geom: geom.centroid.y)
    centroid_lons = lines.apply(lambda geom: geom.centroid.x)
    m = folium.Map(location=[centroid_lats.mean(), centroid_lons.mean()], zoom_start=15)

    # Add one polyline per geometry.
    for line in lines:
        # Coordinates are unpacked as (lon, lat); folium takes (lat, lon).
        latlon_pairs = []
        for lon, lat in line.coords:
            latlon_pairs.append((lat, lon))
        polyline = folium.PolyLine(locations=latlon_pairs, color='purple', weight=8, opacity=0.7)
        polyline.add_to(m)

    # Add the same geometries as a GeoJson layer.
    geojson_layer = folium.GeoJson(lines.to_json(), name='Garis Jalan')
    geojson_layer.add_to(m)

    return m

## Map matching with a KDTree: snapping all GPS points to the road network

In [None]:
# Load the GPS observations from CSV into a DataFrame and preview the first rows.
# The cells below read the columns 'maid', 'latitude', 'longitude' and 'timestamp'.
gps = pd.read_csv('./filter3_20m_60min_malar_des.csv')
gps.head()

In [5]:
# Load the road segments from CSV into a DataFrame and preview the first rows.
# Each row is one segment: (start_x, start_y) -> (end_x, end_y) plus a 'name'.
road = pd.read_csv('./expanded_road_df_DIY.csv')
road.head()

Unnamed: 0.1,Unnamed: 0,name,start_x,start_y,end_x,end_y
0,0,,110.145676,-7.652724,110.145675,-7.652734
1,1,,110.145676,-7.652724,110.145679,-7.652716
2,2,,110.398799,-7.861408,110.398598,-7.861393
3,3,,110.398799,-7.861408,110.398927,-7.861439
4,4,,110.581033,-7.788455,110.580238,-7.788295


In [6]:
# Build the undirected road graph: every CSV row contributes one edge between
# its (start_x, start_y) node and its (end_x, end_y) node, with unit weight.
G = nx.Graph()
for _, segment in road.iterrows():
    segment_start = (segment['start_x'], segment['start_y'])
    segment_end = (segment['end_x'], segment['end_y'])
    G.add_edge(segment_start, segment_end, weight=1, name=segment['name'])

In [None]:
# Map matching: pull every GPS point that is off the road onto the road by
# replacing it with its nearest road-graph node (nearest in raw lon/lat terms).

sorted_gps = gps.sort_values(by=['maid', 'timestamp'])
node_list = list(G.nodes())

# Nodes are (x, y) tuples, so the query points are given as (longitude, latitude).
tree = KDTree(node_list)
gps_lon_lat = sorted_gps[['longitude', 'latitude']].values
distances, indices = tree.query(gps_lon_lat)
all_mapped_points_kdtree = [node_list[node_idx] for node_idx in indices]

sorted_gps['adjusted_longitude'] = [node_lon for node_lon, _ in all_mapped_points_kdtree]
sorted_gps['adjusted_latitude'] = [node_lat for _, node_lat in all_mapped_points_kdtree]
sorted_gps['timestamp'] = pd.to_datetime(sorted_gps['timestamp'])

## Filter immobility data with Mas Arkham's algorithm

In [None]:
def filter_immobility_arkham(name, group):
    """Collapse runs of consecutive identical positions into a single row.

    The rows are ordered by ``timestamp``; a row is kept when it is the first
    row of the group or when its (latitude, longitude) differs from the row
    immediately before it. Each stationary run is therefore represented by
    its first (earliest) row only.

    This replaces a row-by-row loop that (a) never emitted the first row
    unless it started a stationary run, and (b) emitted some rows twice:
    the last row of the group, and any row that both ended a movement and
    started a stationary run.

    Parameters
    ----------
    name : hashable
        Group key supplied by the caller; not used by the filter.
    group : pandas.DataFrame
        Observations of one device with ``latitude``, ``longitude`` and
        ``timestamp`` columns.

    Returns
    -------
    pandas.DataFrame
        The retained rows, in timestamp order, with their original index
        labels, columns and dtypes.
    """
    # Stable sort: rows sharing a timestamp keep their incoming order.
    group = group.sort_values('timestamp', kind='stable')

    lat = group['latitude']
    lon = group['longitude']

    # shift() puts NaN in front of the first row, so the first row always
    # compares as "moved" and is kept.
    moved = lat.ne(lat.shift()) | lon.ne(lon.shift())

    # Positional mask, so duplicate index labels cannot confuse the selection.
    return group.loc[moved.to_numpy()]

In [None]:
def filter_immobility_parallel(data, n_jobs=40):
    """Apply ``filter_immobility_arkham`` to every ``maid`` group in parallel.

    Parameters
    ----------
    data : pandas.DataFrame
        GPS observations with a ``maid`` column (plus whatever columns the
        per-group filter reads).
    n_jobs : int, optional
        Number of joblib workers. Defaults to 40, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-group results with a fresh integer index.
        An empty frame with the input's columns is returned when there is
        nothing to process (``pd.concat`` would raise on an empty list).
    """
    if len(data) == 0:
        return data.iloc[0:0].reset_index(drop=True)

    grouped = data.groupby('maid')
    results = Parallel(n_jobs=n_jobs)(
        delayed(filter_immobility_arkham)(name, group) for name, group in tqdm(grouped)
    )

    # groupby drops rows whose key is missing, so there may be no groups at all.
    if not results:
        return data.iloc[0:0].reset_index(drop=True)

    return pd.concat(results, ignore_index=True)

In [None]:
# Run the parallel immobility filter on the map-matched GPS data and record
# wall-clock timestamps (seconds, from time.time()) before and after the call.
start = time.time()

compressed_data = filter_immobility_parallel(sorted_gps)

end = time.time()

In [None]:
# Row-count difference between the loaded GPS table and the immobility-filtered result.
len(gps) - len(compressed_data)

In [None]:
# Preview the first rows of the immobility-filtered data.
compressed_data.head()

In [None]:
# Daily observation counts per maid for the immobility-filtered data.
create_pivot(compressed_data)

In [None]:
# Write DataFrame to CSV file
# compressed_data.to_csv("filter4_20m_60min_immobility_malar_nov.csv", index=False)

## Adjust paths along the road network (also based on Mas Arkham's algorithm)

In [7]:
# Load the immobility-filtered, map-matched GPS data from CSV and preview it.
# NOTE: this rebinds `gps`, replacing the DataFrame loaded earlier in the notebook.
gps = pd.read_csv('./filter4_20m_60min_immobility_malar_des.csv')
gps.head()

Unnamed: 0,maid,latitude,longitude,timestamp,adjusted_longitude,adjusted_latitude
0,00012afc-6daf-461f-96a8-181e5af69db9,-7.800291,110.364998,2021-12-01 17:54:11,110.364996,-7.80021
1,00012afc-6daf-461f-96a8-181e5af69db9,-7.8003,110.364998,2021-12-01 17:54:22,110.364996,-7.80021
2,00012afc-6daf-461f-96a8-181e5af69db9,-7.8003,110.364998,2021-12-01 17:54:22,110.364996,-7.80021
3,000136a6-c76e-4f76-841d-1329727ad906,-7.78573,110.36664,2021-12-06 18:38:17,110.366636,-7.785796
4,000136a6-c76e-4f76-841d-1329727ad906,-7.78573,110.366638,2021-12-08 12:08:42,110.366636,-7.785796


In [8]:
def find_path_on_graph(graph, start, end):
    """Find a path between two graph nodes with the A* algorithm.

    Parameters
    ----------
    graph : networkx.Graph
        Road graph; edge attribute ``weight`` is used as the edge cost.
    start, end : hashable
        Source and target nodes.

    Returns
    -------
    list
        Node sequence from ``start`` to ``end`` (both included), or ``[]``
        when no path exists or when either endpoint is not a node of
        ``graph``. A missing node is treated like "no path" so that a single
        unmatched coordinate cannot abort a long batch run.
    """
    def heuristic(u, v):
        # Heuristic based on the 'traffic_density' node attribute; nodes
        # without that attribute count as 0, which makes the heuristic 0.
        # NOTE(review): this is not a distance estimate, so the shortest-path
        # guarantee of A* presumably only holds while it stays 0 - confirm
        # before populating 'traffic_density'.
        node_u_density = graph.nodes[u].get('traffic_density', 0)
        node_v_density = graph.nodes[v].get('traffic_density', 0)
        average_density = (node_u_density + node_v_density) / 2
        return 0.1 * average_density  # Adjust the heuristic as needed

    try:
        return nx.astar_path(graph, start, end, heuristic=heuristic, weight='weight')
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return []

In [9]:
# process_group expands one GPS track into the node sequence of the road network graph
def process_group(group, graph):
    """Densify one GPS track by routing between consecutive matched points.

    For every pair of consecutive observations the road-graph path between
    their matched nodes is looked up with ``find_path_on_graph``. The nodes
    strictly between the two endpoints are inserted as extra points, with
    timestamps spread evenly between the two observation times.

    Every output point is emitted together with its own timestamp, so points
    and timestamps cannot drift apart. In particular:

    * when the two observations share a node, or no path is found (empty
      path), the observation itself is still emitted with its timestamp;
    * intermediate timestamps divide the interval into ``n + 1`` steps for
      ``n`` intermediate nodes, so they fall strictly between the endpoints;
    * a segment with a negative duration is skipped as a whole (nothing is
      emitted for it) and routing continues from the last emitted point.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one track with ``maid``, ``timestamp``, ``adjusted_longitude``
        and ``adjusted_latitude`` columns. ``timestamp`` is assumed to be a
        datetime64 column already in chronological order (the caller converts
        and the input file is expected to be sorted - confirm).
    graph : networkx.Graph
        Road graph whose nodes are ``(longitude, latitude)`` tuples.

    Returns
    -------
    list of tuple
        ``(maid, latitude, longitude, timestamp)`` per output point.
        Interpolated points carry the maid of the observation they lead to.
        An empty group yields ``[]``.
    """
    # Map Matching (Assumed to be Accurate)
    mapped_points = group[['adjusted_longitude', 'adjusted_latitude']].values
    if len(mapped_points) == 0:
        return []

    timestamps = group['timestamp'].values
    maid_values = group['maid'].values

    prev_node = tuple(mapped_points[0])
    prev_time = timestamps[0]
    # Nodes are (lon, lat); records are (maid, lat, lon, timestamp).
    records = [(maid_values[0], prev_node[1], prev_node[0], prev_time)]

    for i in range(1, len(mapped_points)):
        current_node = tuple(mapped_points[i])
        current_time = timestamps[i]

        # Node sequence on the road graph from the previous point to this one.
        path = find_path_on_graph(graph, prev_node, current_node)

        # Nodes strictly between the endpoints; empty for paths of length <= 2.
        intermediate_nodes = path[1:-1]

        if intermediate_nodes:
            time_duration = (current_time - prev_time) // np.timedelta64(1, 's')

            # Validate before anything is emitted, so a skipped segment
            # leaves no half-written points behind.
            if time_duration < 0:
                print("Negative time interval found, skipping this part.")
                continue

            step = time_duration / (len(intermediate_nodes) + 1)
            for j, node in enumerate(intermediate_nodes, start=1):
                stamp = prev_time + np.timedelta64(int(step * j), 's')
                records.append((maid_values[i], node[1], node[0], stamp))

        # The observation itself always closes the segment.
        records.append((maid_values[i], current_node[1], current_node[0], current_time))

        prev_node = current_node
        prev_time = current_time

    return records

In [None]:
# Make sure the timestamps are datetimes before grouping by calendar day.
gps['timestamp'] = pd.to_datetime(gps['timestamp'])

# One group per (maid, calendar day).
grouped_data = gps.groupby(['maid', gps['timestamp'].dt.date])

# Route every group on the road graph in parallel, timing the whole run.
start = time.time()
group_jobs = (delayed(process_group)(day_track, G) for _key, day_track in tqdm(grouped_data))
processed_results = Parallel(n_jobs=40)(group_jobs)
end = time.time()

# Flatten the per-group record lists into one table.
flattened_results = []
for group_records in processed_results:
    flattened_results.extend(group_records)
resulting_data = pd.DataFrame(flattened_results, columns=['maid', 'latitude', 'longitude', 'timestamp'])

  0%|                                    | 120/72781 [01:59<28:43:22,  1.42s/it]

In [None]:
# time.time() returns seconds, so the difference is already in seconds
# (the previous division by 1000 under-reported the duration by 1000x).
print("The time of execution of above program is:", end - start, "s")

In [None]:
# Write DataFrame to CSV file
# resulting_data.to_csv("filter5_20m_60min_immobility_adjusted_malar_des.csv", index=False)

## Visualization: movement point by point

In [None]:
# Animated visualization based on timestamp, with lines connecting consecutive points
def visualisasi_pergerakan_animated_with_lines(selected_date, data, user_id, title):
    """Animate one user's movement on one day as a folium time-slider map.

    Parameters
    ----------
    selected_date : str, datetime.date, datetime.datetime or pandas.Timestamp
        Day to display; anything ``pd.to_datetime`` accepts.
    data : pandas.DataFrame
        Needs ``maid``, ``latitude``, ``longitude`` and ``timestamp`` columns.
        The frame is NOT modified: timestamps are parsed into a local Series.
    user_id : str
        Value of ``maid`` to display.
    title : str
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    folium.Map or None
        Map with a ``TimestampedGeoJson`` layer containing one point feature
        per observation and one line feature between each pair of consecutive
        observations. ``None`` (after printing a message) when the user has no
        data on that day.
    """
    # Parse locally instead of overwriting the caller's 'timestamp' column.
    timestamps = pd.to_datetime(data['timestamp'])
    target_day = pd.to_datetime(selected_date).date()

    mask = (timestamps.dt.date == target_day) & (data['maid'] == user_id)
    selected_day_data = data.loc[mask]
    selected_times = timestamps[mask]

    if selected_day_data.empty:
        print(f"No data available for user_id {user_id} on {target_day}")
        return

    first_row = selected_day_data.iloc[0]
    m = folium.Map(location=[first_row['latitude'], first_row['longitude']], zoom_start=15)

    # Prepare data for TimestampedGeoJson
    features = []

    prev_point = None  # Previous point in the loop, used to draw the connecting line
    for (_, row), stamp in zip(selected_day_data.iterrows(), selected_times):
        time_label = stamp.strftime('%Y-%m-%d %H:%M:%S')
        point = [row['longitude'], row['latitude']]  # GeoJSON order: lon, lat

        features.append({
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': point,
            },
            'properties': {
                'time': time_label,  # Timestamp at which the point appears
                'icon': 'circle',
                'iconstyle': {
                    'fillColor': 'blue',
                    'fillOpacity': 0.6,
                    # NOTE(review): this is the string 'false', not a boolean;
                    # confirm it really disables the stroke in the renderer.
                    'stroke': 'false',
                    'radius': 5
                },
                'popup': time_label,
            }
        })

        # If this is not the first point, draw a line from the previous point
        if prev_point is not None:
            features.append({
                'type': 'Feature',
                'geometry': {
                    'type': 'LineString',
                    'coordinates': [prev_point, point]
                },
                'properties': {
                    # Both vertices carry the current timestamp.
                    'times': [time_label] * 2,
                    'style': {'color': 'blue', 'weight': 3}
                }
            })

        # Update previous point
        prev_point = point

    timestamped_geojson = TimestampedGeoJson(
        {'type': 'FeatureCollection', 'features': features},
        period='PT1M',
        add_last_point=True,
        auto_play=False,
        loop=False,
        max_speed=1,
        loop_button=True,
        date_options='YYYY-MM-DD HH:mm:ss',
        time_slider_drag_update=True
    )

    timestamped_geojson.add_to(m)
    return m

In [None]:
# Build the animated map for one user on one day from the immobility-filtered data.
# NOTE(review): the date is in November while the GPS file loaded at the top is
# named '..._des' - confirm the date exists in the data; when it does not, the
# function prints a message and returns None.
map_view = visualisasi_pergerakan_animated_with_lines('2021-11-02', compressed_data, '0bfd22ec-9aee-4a38-939e-6ca3022ef392','uji coba')

In [None]:
# Display the map (the last expression of a cell is rendered as its output).
map_view