In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point
import skmob
from skmob.preprocessing import filtering
from skmob.preprocessing import detection
from skmob.preprocessing import compression
import folium
import os

In [2]:
def count_maid(data):
    filtered_data = data.copy()
    
    # Ganti 'datetime_wib' sesuai nama kolom datetime di data-mu
    filtered_data['datetime_wib'] = pd.to_datetime(filtered_data['datetime_wib'])
    filtered_data['tanggal'] = filtered_data['datetime_wib'].dt.date

    # Hitung jumlah 'maid' unik per hari
    jumlah_maid_unik_per_hari = filtered_data.groupby('tanggal')['maid'].nunique()

    # Print hasilnya
    # print("Jumlah ID user unik: ", data['maid'].nunique())
    maid = data['maid'].nunique()
    return(maid)
#     print(jumlah_maid_unik_per_hari)

In [3]:
def bar_chart(data):
    filtered_data = data.copy()

    # Hapus duplikat 'maid' berdasarkan tanggal
    filtered_data_unique_maid = filtered_data.drop_duplicates(subset=['tanggal', 'maid'])

    # Hitung jumlah unik per hari
    jumlah_unik_per_hari = filtered_data_unique_maid.groupby('tanggal').size()

    # Plotting
    plt.figure(figsize=(10, 6))
    jumlah_unik_per_hari.plot(kind='bar', color='skyblue')
    maid = count_maid(data)
    plt.title(f'Jumlah Data Maid unik per Hari: {maid}')
    plt.xlabel('Tanggal')
    plt.ylabel('Jumlah Data Unik')
    plt.xticks(rotation=75)
    plt.show()

In [4]:
def create_plot(data):
    # Ambil baris pertama dari DataFrame
    first_row = data.iloc[0]

    # Ambil nilai latitude dan longitude dari baris pertama
    latitude = first_row['latitude']
    longitude = first_row['longitude']
    
    m = folium.Map(location=[latitude, longitude], zoom_start=25)

    # Add CircleMarkers for each point
    for index, row in data.iterrows():
        folium.CircleMarker(
            location=[row["latitude"], row["longitude"]],
            radius=5,  # Marker size
            color="blue",  # Marker color
            fill=True,
            fill_color="blue",  # Fill color of the marker
            fill_opacity=0.7,  # Opacity of the marker fill
            popup=f"User ID: {row['maid']}<br>Latitude: {row['latitude']}<br>Longitude: {row['longitude']}",
        ).add_to(m)
    
    return m

In [5]:
def create_pivot(data):
    df1_1 = data.copy()
    df1_pivot = df1_1['maid'].groupby(df1_1['tanggal']).value_counts()
    pivot = df1_pivot.unstack().fillna(0).astype(int)

    total_counts = pivot.sum(axis=0)
    sorted_columns = total_counts.sort_values(ascending=False).index
    pivot_sorted = pivot[sorted_columns]
    return pivot_sorted

In [7]:
def drop_single_occurrences(df, column_name):
    # Count occurrences of each value in the specified column
    value_counts = df[column_name].value_counts()

    # Filter values that occur more than once
    valid_values = value_counts[value_counts > 1].index

    # Drop rows where specified column value occurs only once
    df_filtered = df[df[column_name].isin(valid_values)]

    return df_filtered

In [8]:
def remove_stops(gps, stops):
    stops_df = stops.copy()
    gps_df = gps.copy()

    # Convert datetime columns to datetime objects
    stops_df['datetime_wib'] = pd.to_datetime(stops_df['datetime_wib'])
    stops_df['leaving_datetime'] = pd.to_datetime(stops_df['leaving_datetime'])
    gps_df['datetime_wib'] = pd.to_datetime(gps_df['datetime_wib'])

    # Iterate through each row in Stops DataFrame
    for index, row in stops_df.iterrows():
        uid = row['maid']
        start_time = row['datetime_wib']
        end_time = row['leaving_datetime']

        # Filter GPS DataFrame based on UID and time range
        filtered_gps_df = gps_df[(gps_df['maid'] == uid) & (gps_df['datetime_wib'] >= start_time) & (gps_df['datetime_wib'] < end_time)]
        # Remove corresponding rows from GPS DataFrame
        gps_df = gps_df.drop(filtered_gps_df.index)
    
    # Reset index of GPS DataFrame
    gps_df = gps_df.reset_index(drop=True)
    gps_df_clean = drop_single_occurrences(gps_df, "maid")   

    # Resulting GPS DataFrame without eliminated data
    print("Number of points dropped from original data: %d" %(len(gps) - len(gps_df_clean)))
    
    return gps_df_clean

In [9]:
def third_preprocessing(df):
    # 1. Noise filtering >200 kmh
    df1 = df.copy()

    df1['datetime_wib'].astype(str)
    tdf = skmob.TrajDataFrame(df1, latitude='latitude', longitude='longitude', user_id='maid', datetime='datetime_wib')
    ftdf = filtering.filter(tdf, max_speed_kmh=200.)
    temp = ftdf.copy()
    df2 = pd.DataFrame(temp)
    df2.rename(columns={'uid': 'maid', 'lat': 'latitude', 'lng':'longitude', 'datetime':'datetime_wib'}, inplace=True)

    # 2. Exclude stop points for time threshold <5min and spatial threshold <100meter
    stdf = detection.stay_locations(ftdf, minutes_for_a_stop=5.0, spatial_radius_km=0.1, leaving_time=True)
    temp = stdf.copy()
    stdf_df = pd.DataFrame(temp)
    stdf_df.rename(columns={'uid': 'maid', 'lat': 'latitude', 'lng':'longitude', 'datetime':'datetime_wib'}, inplace=True)

    df3 = remove_stops(df2, stdf_df)    
    
    # 3. Trajectory compression
#     df3_tdf = skmob.TrajDataFrame(df3, latitude='latitude', longitude='longitude', user_id='maid', datetime='datetime_wib')
#     ctdf = compression.compress(df3_tdf, spatial_radius_km = 0.01)
#     temp = ctdf.copy()
#     ctdf_df = pd.DataFrame(temp)
#     ctdf_df.rename(columns={'uid': 'maid', 'lat': 'latitude', 'lng':'longitude', 'datetime':'datetime_wib'}, inplace=True)

    return df3

In [10]:
def stay(data):    
    data['datetime_wib'] = pd.to_datetime(data['datetime_wib'])

    tdf = skmob.TrajDataFrame(data, latitude='latitude', longitude='longitude', datetime='datetime_wib', user_id='maid')

    stays = detection.stay_locations(
        tdf, 
        spatial_radius_km=0.2,
        minutes_for_a_stop=5,
        leaving_time=True)
    stays = pd.DataFrame(stays)
    stays.rename(columns={'uid': 'maid', 'lat': 'latitude', 'lng':'longitude', 'datetime':'datetime_wib'}, inplace=True)

    return stays

In [11]:
data = pd.DataFrame({'maid': [1] * (13),
                     'latitude': [1] * 5  + [2] * 3 + [3, 3] + [1] * 3,  
                     'longitude': [1] * 5  + [2] * 3 + [3, 3] + [1] * 3,
                     'datetime_wib': ['2018-01-01 ' + f'00:{x}' for x in [0, 1, 2, 3, 6, 20, 21, 22, 23, 30, 32, 34, 35]]})
stays = stay(data)
data1 = remove_stops(data, stays)

stays1 = stay(data1)
data2 = remove_stops(data1, stays1)

stays2 = stay(data2)
data3 = remove_stops(data2, stays2)
print(data)
print(data1)
print(data2)

Number of points dropped from original data: 7
Number of points dropped from original data: 3
Number of points dropped from original data: 0
    maid  latitude  longitude        datetime_wib
0      1         1          1 2018-01-01 00:00:00
1      1         1          1 2018-01-01 00:01:00
2      1         1          1 2018-01-01 00:02:00
3      1         1          1 2018-01-01 00:03:00
4      1         1          1 2018-01-01 00:06:00
5      1         2          2 2018-01-01 00:20:00
6      1         2          2 2018-01-01 00:21:00
7      1         2          2 2018-01-01 00:22:00
8      1         3          3 2018-01-01 00:23:00
9      1         3          3 2018-01-01 00:30:00
10     1         1          1 2018-01-01 00:32:00
11     1         1          1 2018-01-01 00:34:00
12     1         1          1 2018-01-01 00:35:00
   maid  latitude  longitude        datetime_wib
0     1         2          2 2018-01-01 00:20:00
1     1         2          2 2018-01-01 00:21:00
2     1     

In [12]:
path = '../DataGPS_Malioboro/filter2_malioboro_des.csv'
df = pd.read_csv(path)
stay_df = stay(df)
filter3_df = remove_stops(df, stay_df)

stay_df1 = stay(filter3_df)
filter4_df = remove_stops(filter3_df, stay_df1)

Number of points dropped from original data: 35821
Number of points dropped from original data: 167


In [13]:
def find_dropped_rows(df1, df2):
    
    # Find rows in df1 that are not in df2
    dropped_rows = df1.merge(df2, indicator=True, how='left').loc[lambda x: x['_merge'] == 'left_only']

    # Drop the indicator column
    dropped_rows.drop(columns='_merge', inplace=True)

    return dropped_rows


In [14]:
# data original
df_maid = df[(df["maid"] == "45ee6ef7-1938-401b-8b7a-9b9260109311")]
df_maid

Unnamed: 0,maid,latitude,longitude,datetime_wib,geometry,Kelurahan/Desa,Kecamatan,Kabupaten,tanggal
2003,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.789932,110.366997,2021-12-04 10:15:17,POINT (110.36699676513672 -7.789932),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
2004,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.78994,110.366997,2021-12-04 10:15:27,POINT (110.36699676513672 -7.78994),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
2005,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.79002,110.366837,2021-12-04 10:16:37,POINT (110.36683654785156 -7.79002),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
2006,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.790017,110.36684,2021-12-04 10:16:47,POINT (110.36684 -7.790017),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
2036,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.7899,110.36623,2021-12-04 12:01:16,POINT (110.36623 -7.7899),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
2037,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.7899,110.366211,2021-12-04 12:01:26,POINT (110.3662109375 -7.7899),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
2038,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.78989,110.366188,2021-12-04 12:02:36,POINT (110.3661880493164 -7.78989),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
2039,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.78989,110.366188,2021-12-04 12:02:46,POINT (110.3661880493164 -7.78989),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
2040,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.78991,110.366211,2021-12-04 12:04:17,POINT (110.3662109375 -7.78991),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
2041,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.78991,110.36621,2021-12-04 12:04:27,POINT (110.36621 -7.78991),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04


In [15]:
# data stay locations 1
filter3_df_maid = filter3_df[(filter3_df["maid"] == "45ee6ef7-1938-401b-8b7a-9b9260109311")]
filter3_df_maid

Unnamed: 0,maid,latitude,longitude,datetime_wib,geometry,Kelurahan/Desa,Kecamatan,Kabupaten,tanggal
171,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.79287,110.366318,2021-12-04 12:10:22,POINT (110.36631774902344 -7.79287),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
1816,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.80107,110.36532,2021-12-04 10:10:34,POINT (110.36532 -7.80107),Kelurahan Ngupasan,Gondomanan,Kota Yogyakarta,2021-12-04
5792,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.78994,110.365379,2021-12-04 15:51:58,POINT (110.3653793334961 -7.78994),Kelurahan Sosromenduran,Gedongtengen,Kota Yogyakarta,2021-12-04


In [16]:
create_plot(filter3_df_maid)

In [17]:
# data stay locations 2
filter4_df_maid = filter4_df[filter4_df["maid"] == "45ee6ef7-1938-401b-8b7a-9b9260109311"]
filter4_df_maid

Unnamed: 0,maid,latitude,longitude,datetime_wib,geometry,Kelurahan/Desa,Kecamatan,Kabupaten,tanggal


In [18]:
a = find_dropped_rows(filter3_df, filter4_df)
a_maid = a[a["maid"] == "45ee6ef7-1938-401b-8b7a-9b9260109311"]
a_maid

Unnamed: 0,maid,latitude,longitude,datetime_wib,geometry,Kelurahan/Desa,Kecamatan,Kabupaten,tanggal
82,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.79287,110.366318,2021-12-04 12:10:22,POINT (110.36631774902344 -7.79287),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-04
925,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.80107,110.36532,2021-12-04 10:10:34,POINT (110.36532 -7.80107),Kelurahan Ngupasan,Gondomanan,Kota Yogyakarta,2021-12-04
2751,45ee6ef7-1938-401b-8b7a-9b9260109311,-7.78994,110.365379,2021-12-04 15:51:58,POINT (110.3653793334961 -7.78994),Kelurahan Sosromenduran,Gedongtengen,Kota Yogyakarta,2021-12-04


In [19]:
df_maid_stay= stay(df_maid)
df_maid_filtered = remove_stops(df_maid, df_maid_stay)

df_maid_stay1 = stay(df_maid_filtered)
df_maid_filtered1 = remove_stops(df_maid_filtered, df_maid_stay1)

Number of points dropped from original data: 23
Number of points dropped from original data: 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['datetime_wib'] = pd.to_datetime(data['datetime_wib'])


In [20]:
df_maid = df[(df["maid"] == "6065d08e-1b9a-45ed-9042-fc0773b25d3a") & (df["tanggal"] == "2021-12-03")]
df_maid

Unnamed: 0,maid,latitude,longitude,datetime_wib,geometry,Kelurahan/Desa,Kecamatan,Kabupaten,tanggal
1664,6065d08e-1b9a-45ed-9042-fc0773b25d3a,-7.79347,110.365753,2021-12-03 07:30:36,POINT (110.36575317382812 -7.79347),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-03
1665,6065d08e-1b9a-45ed-9042-fc0773b25d3a,-7.79349,110.365753,2021-12-03 07:30:46,POINT (110.36575317382812 -7.79349),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-03
1666,6065d08e-1b9a-45ed-9042-fc0773b25d3a,-7.79287,110.365810,2021-12-03 07:32:32,POINT (110.36581 -7.79287),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-03
1667,6065d08e-1b9a-45ed-9042-fc0773b25d3a,-7.79287,110.365810,2021-12-03 07:32:41,POINT (110.36581 -7.79287),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-03
1669,6065d08e-1b9a-45ed-9042-fc0773b25d3a,-7.79102,110.366150,2021-12-03 07:40:04,POINT (110.36614990234376 -7.79102),Kelurahan Suryatmajan,Danurejan,Kota Yogyakarta,2021-12-03
...,...,...,...,...,...,...,...,...,...
34638,6065d08e-1b9a-45ed-9042-fc0773b25d3a,-7.79377,110.364822,2021-12-03 22:43:27,POINT (110.36482238769533 -7.79377),Kelurahan Sosromenduran,Gedongtengen,Kota Yogyakarta,2021-12-03
34639,6065d08e-1b9a-45ed-9042-fc0773b25d3a,-7.79377,110.364822,2021-12-03 22:49:03,POINT (110.36482238769533 -7.79377),Kelurahan Sosromenduran,Gedongtengen,Kota Yogyakarta,2021-12-03
34640,6065d08e-1b9a-45ed-9042-fc0773b25d3a,-7.79378,110.364807,2021-12-03 22:49:54,POINT (110.36480712890624 -7.79378),Kelurahan Sosromenduran,Gedongtengen,Kota Yogyakarta,2021-12-03
34641,6065d08e-1b9a-45ed-9042-fc0773b25d3a,-7.79378,110.364807,2021-12-03 22:59:34,POINT (110.36480712890624 -7.79378),Kelurahan Sosromenduran,Gedongtengen,Kota Yogyakarta,2021-12-03


In [21]:
create_plot(df_maid)

In [26]:
create_pivot(df)

maid,9a63fd0e-d363-478a-8136-99b1eb82693f,f501c8a1-e251-4b8c-87fc-9a1a26d2e19d,6065d08e-1b9a-45ed-9042-fc0773b25d3a,a2b6a27a-35e9-4735-909b-6cfbafd288ac,4a2de4fb-2878-41ae-bc12-2890dc0f62b8,52875375-e02d-4ada-aaa5-10c79e2ac8f9,07c459f8-7331-479a-a109-0e6e0b419a9e,15c7b94d-a39c-4f55-bb9c-585f6afcddf7,e3b468cd-558b-485a-9ffb-19b307798fdf,5fd88d93-817a-4804-a6cd-3392c125e0a9,...,31e3cb72-7c36-40f8-a5f8-f1e6480f1e83,31f0338b-8ce2-4c4b-9578-cc60073cfaac,84023cc6-9052-4a3c-892f-acd277f5080d,56d58bf2-6bfe-42ff-ab2e-946731bd876b,ba13e423-3e6e-4baa-a9b5-98fa147e5e15,b9f91dc7-e98e-4db6-a1a9-d946e71e8344,84085ef1-30fc-44a1-b2db-a28cea80e759,b9e3686d-4361-4f35-b5df-e0b208603b65,840c5677-2922-47a6-8aaa-0c31e11403de,69205336-4054-4a3a-800f-23037bc73e45
tanggal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01,0,34,130,551,0,0,0,0,63,4,...,0,0,0,0,0,0,0,0,0,0
2021-12-02,0,0,285,0,20,0,0,0,125,0,...,0,0,0,0,0,0,0,0,0,0
2021-12-03,0,12,316,0,25,0,0,0,48,0,...,0,0,0,0,0,0,0,0,0,0
2021-12-04,89,18,44,0,13,0,0,0,0,0,...,0,0,2,0,0,0,2,0,0,0
2021-12-05,128,20,0,0,10,0,0,2,3,0,...,0,0,0,0,0,0,0,0,0,0
2021-12-06,76,22,0,0,16,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,0
2021-12-07,84,0,0,0,15,0,0,0,2,2,...,0,0,0,0,0,0,0,0,0,0
2021-12-08,0,0,0,0,17,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2021-12-09,0,0,0,0,31,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2021-12-10,0,0,0,0,19,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
