# Qualitative analysis of outlier detection 

In [1]:
import pandas as pd
import pymove as pm
from pymove import MoveDataFrame
from pymove import filters
import pymove as pm
from pymove.visualization import folium as f, matplotlib as mpl
from pymove.utils import visual
from pymove.preprocessing import stay_point_detection, compression, segmentation
import numpy as np
from pyproj import Geod
from shapely.ops import nearest_points
import json
from shapely.geometry import Point, LineString, Polygon

## Qualitative analysis based on the shapefile

### Trajectory loading

In [2]:
# Read the recorded trajectory points; later cells reuse this frame as `df`.
df = pd.read_csv("../datasets/my_traj.csv")

# Wrap the points in a MoveDataFrame (mapping the CSV column names onto the
# constructor arguments) and plot them with pymove's folium helper.
preview_trajectory = MoveDataFrame(
    data=df,
    latitude='latitude',
    longitude='longitude',
    traj_id='id',
    datetime='time',
)
f.plot_trajectories(preview_trajectory)

### Shapefile loading & Ground truth computation

In [3]:
from pyproj import CRS, Geod

# Select the reference geometry ("Geo Shape" column) of one line/variant from
# the shapefile table and parse it from its JSON text form.
line = "034b"
variante = 1
shapefiles_df = pd.read_csv("datasets/shapefiles.csv", delimiter=";")
geo_shape_json = shapefiles_df[
    (shapefiles_df["LIGNE"] == line) & (shapefiles_df["VARIANTE"] == variante)
]["Geo Shape"].values[0]
shapefile = json.loads(geo_shape_json)

crs_utm = CRS.from_user_input(31370)
geod = crs_utm.get_geod()  # Your data may be from a different Geod.

# A clean 0..n-1 index is required because `distance[index]` below is filled
# positionally from the labels yielded by iterrows().
df = df.reset_index(drop=True)
line_shape = LineString(shapefile["coordinates"])

# For every recorded point, find the closest point on the reference line and
# store the geodesic length of the segment joining the two.
distance = np.zeros(len(df))
nearest_points_list = []
for index, row in df.iterrows():
    point = Point(row["longitude"], row["latitude"])
    closest_points = nearest_points(line_shape, point)
    nearest_points_list.append(closest_points[0])
    distance[index] = geod.geometry_length(LineString(closest_points))

df["distance"] = distance

# Outlier threshold: median distance plus k standard deviations.
mean = df["distance"].mean()
median = df["distance"].median()
std = df["distance"].std()
k = 1.5
threshold = median + k * std
print(mean, std)

# Label rows directly from their own distance. The previous implementation
# looked each timestamp up in the outlier timestamps (quadratic, and it would
# mislabel a normal row sharing a timestamp with an outlier row).
is_outlier = df["distance"] > threshold
outliers = df[is_outlier]
is_normal = ~is_outlier.to_numpy()
df["is normal"] = is_normal
df

6.99931376823816 6.984997264301602


Unnamed: 0,time,latitude,longitude,id,distance,is normal
0,2023-06-30 13:34:13+00:00,50.839978,4.365087,0,6.434879,True
1,2023-06-30 13:34:15+00:00,50.840059,4.365243,0,6.241407,True
2,2023-06-30 13:34:17+00:00,50.840115,4.365356,0,6.274832,True
3,2023-06-30 13:34:55+00:00,50.840184,4.365509,0,6.957367,True
4,2023-06-30 13:34:58+00:00,50.840269,4.365628,0,4.605384,True
...,...,...,...,...,...,...
312,2023-06-30 13:55:35+00:00,50.821950,4.404838,0,4.315869,True
313,2023-06-30 13:55:37+00:00,50.821916,4.405050,0,1.384736,True
314,2023-06-30 13:55:39+00:00,50.821854,4.405199,0,3.199198,True
315,2023-06-30 13:55:41+00:00,50.821825,4.405334,0,2.081433,True


In [4]:
# Build the pymove trajectory from `df`, mapping the CSV column names onto the
# MoveDataFrame constructor arguments.
move_df_columns = dict(
    latitude='latitude',
    longitude='longitude',
    traj_id='id',
    datetime='time',
)
trajectory_df = MoveDataFrame(data=df, **move_df_columns)

#### Speed based method

In [5]:
# Run pymove's speed-based cleaner on the trajectory.
cleaned_df = filters.clean_gps_speed_max_radius(
    trajectory_df,
    speed_max=9,  # NOTE(review): presumably metres/second - confirm against pymove docs
)

# Save the filtered points, then display them as the cell output.
cleaned_df.to_csv('csv_results/cleaned_df_pymove_trip.csv', index=False)
cleaned_df

VBox(children=(HTML(value=''), IntProgress(value=0, max=1)))

Unnamed: 0,id,datetime,lat,lon,distance,is normal,dist_to_prev,time_to_prev,speed_to_prev
0,0,2023-06-30 13:34:13,50.839978,4.365087,6.434879,True,,,
1,0,2023-06-30 13:34:15,50.840059,4.365243,6.241407,True,14.206847,2.0,7.103424
2,0,2023-06-30 13:34:17,50.840115,4.365356,6.274832,True,10.097854,2.0,5.048927
3,0,2023-06-30 13:34:55,50.840184,4.365509,6.957367,True,13.190598,38.0,0.347121
4,0,2023-06-30 13:34:58,50.840269,4.365628,4.605384,True,12.599354,3.0,4.199785
...,...,...,...,...,...,...,...,...,...
312,0,2023-06-30 13:55:35,50.821950,4.404838,4.315869,True,15.924017,2.0,7.962009
313,0,2023-06-30 13:55:37,50.821916,4.405050,1.384736,True,15.397282,2.0,7.698641
314,0,2023-06-30 13:55:39,50.821854,4.405199,3.199198,True,12.475083,2.0,6.237542
315,0,2023-06-30 13:55:41,50.821825,4.405334,2.081433,True,10.032891,2.0,5.016446


In [6]:
# Left-join the original points against the de-duplicated cleaned points; the
# `_merge` indicator column records where each row was found.
# NOTE(review): the join key is `datetime` only - with several trajectory ids
# sharing timestamps this may need `id` as well; confirm.
original_points = pd.DataFrame(trajectory_df)
kept_points = pd.DataFrame(cleaned_df).drop_duplicates()
joined_points = original_points.merge(
    kept_points,
    on=['datetime'],
    how='left',
    indicator=True,
)

# Rows present only on the left side are the points the filter removed.
removed_mask = joined_points["_merge"] == "left_only"
outlier_df = joined_points.loc[removed_mask]
outlier_df

Unnamed: 0,datetime,lat_x,lon_x,id_x,distance_x,is normal_x,id_y,lat_y,lon_y,distance_y,is normal_y,dist_to_prev,time_to_prev,speed_to_prev,_merge
32,2023-06-30 13:37:09,50.839815,4.369780,0,3.761563,True,,,,,,,,,left_only
33,2023-06-30 13:37:11,50.839754,4.369984,0,2.121975,True,,,,,,,,,left_only
38,2023-06-30 13:37:20,50.839453,4.370871,0,9.762360,True,,,,,,,,,left_only
39,2023-06-30 13:37:21,50.839367,4.370965,0,16.845646,False,,,,,,,,,left_only
40,2023-06-30 13:37:23,50.839255,4.371156,0,24.545542,False,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,2023-06-30 13:55:07,50.823109,4.402318,0,4.971081,True,,,,,,,,,left_only
298,2023-06-30 13:55:09,50.823024,4.402457,0,4.354680,True,,,,,,,,,left_only
302,2023-06-30 13:55:16,50.822703,4.403070,0,4.721914,True,,,,,,,,,left_only
303,2023-06-30 13:55:17,50.822627,4.403169,0,2.393939,True,,,,,,,,,left_only


In [7]:
# Confusion matrix with "normal point kept by the filter" as the positive class:
#   TP: normal point kept      FP: outlier kept
#   TN: outlier removed        FN: normal point removed
kept_labels = cleaned_df["is normal"]
removed_labels = outlier_df["is normal_x"]
TP = int((kept_labels == True).sum())
FP = int((kept_labels == False).sum())
TN = int((removed_labels == False).sum())
FN = int((removed_labels == True).sum())

# Guard every ratio: an empty class (e.g. the filter keeps nothing) would
# otherwise raise ZeroDivisionError; such a metric is reported as 0.0.
total = TP + TN + FP + FN
P = TP / (TP + FP) if (TP + FP) else 0.0
A = (TP + TN) / total if total else 0.0
R = TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * (P * R) / (P + R) if (P + R) else 0.0

print("TP:{}\nFP:{}\nTN:{}\nFN:{}".format(TP,FP,TN,FN))
print("Precision:", round(P,3))
print("Accuracy::",round(A,3))
print("Recall:", round(R,3))
print("F1:", round(F1,3))

TP:219
FP:29
TN:6
FN:63
Precision: 0.883
Accuracy:: 0.71
Recall: 0.777
F1: 0.826


## Qualitative analysis based on the introduction of outliers

In [9]:
# Load the second dataset (per its file name, BerlinMOD data with outliers)
# and wrap it in a MoveDataFrame for the pymove filters.
df = pd.read_csv("datasets/berlinMOD_with_outlier.csv")
trajectory_df = MoveDataFrame(
    data=df,
    latitude='latitude',
    longitude='longitude',
    traj_id='id',
    datetime='time',
)

#### Speed based method

In [10]:
# Run pymove's speed-based cleaner on the trajectory.
cleaned_df = filters.clean_gps_speed_max_radius(
    trajectory_df,
    speed_max=25,  # NOTE(review): presumably metres/second - confirm against pymove docs
)

# Save the filtered points first, then end the cell with the frame so it is
# actually displayed (a bare expression only renders when it is the last line).
cleaned_df.to_csv('csv_results/cleaned_df_pymove_MOD.csv', index=False)
cleaned_df

VBox(children=(HTML(value=''), IntProgress(value=0, max=1)))

In [11]:
# Anti-join: keep the original points that have no counterpart (by timestamp)
# among the de-duplicated cleaned points, i.e. the points the filter removed.
# NOTE(review): the join key is `datetime` only - with several trajectory ids
# sharing timestamps this may need `id` as well; confirm.
all_points = pd.DataFrame(trajectory_df)
retained_points = pd.DataFrame(cleaned_df).drop_duplicates()
merged_points = all_points.merge(
    retained_points,
    on=['datetime'],
    how='left',
    indicator=True,
)
only_in_original = merged_points["_merge"] == "left_only"
outlier_df = merged_points.loc[only_in_original]
outlier_df

Unnamed: 0,id_x,datetime,lat_x,lon_x,OLD_Latitude_x,OLD_Longitude_x,is normal_x,lat_var_x,lon_var_x,id_y,...,lon_y,OLD_Latitude_y,OLD_Longitude_y,is normal_y,lat_var_y,lon_var_y,dist_to_prev,time_to_prev,speed_to_prev,_merge
11,8,2020-06-01 08:56:28.127688,50.861891,4.520971,50.861891,4.465701,False,0.000000,0.055269,,...,,,,,,,,,,left_only
12,8,2020-06-01 08:56:29.028663,50.861911,4.465593,50.861911,4.465593,True,0.000000,0.000000,,...,,,,,,,,,,left_only
13,8,2020-06-01 08:56:32.628663,50.861987,4.465161,50.861987,4.465161,True,0.000000,0.000000,,...,,,,,,,,,,left_only
81,8,2020-06-01 08:58:00.120432,50.918525,4.463031,50.863289,4.463031,False,0.055235,0.000000,,...,,,,,,,,,,left_only
82,8,2020-06-01 08:58:01.200432,50.863343,4.462927,50.863343,4.462927,True,0.000000,0.000000,,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4859,8,2020-06-04 21:55:29.454902,50.863380,4.462846,50.863380,4.462846,True,0.000000,0.000000,,...,,,,,,,,,,left_only
4860,8,2020-06-04 21:55:29.829902,50.863364,4.462883,50.863364,4.462883,True,0.000000,0.000000,,...,,,,,,,,,,left_only
4871,8,2020-06-04 21:55:44.586342,50.863171,4.557394,50.863171,4.463224,False,0.000000,0.094170,,...,,,,,,,,,,left_only
4872,8,2020-06-04 21:55:50.239350,50.863164,4.463236,50.863164,4.463236,True,0.000000,0.000000,,...,,,,,,,,,,left_only


In [12]:
# Confusion matrix with "normal point kept by the filter" as the positive class:
#   TP: normal point kept      FP: outlier kept
#   TN: outlier removed        FN: normal point removed
kept_labels = cleaned_df["is normal"]
removed_labels = outlier_df["is normal_x"]
TP = int((kept_labels == True).sum())
FP = int((kept_labels == False).sum())
TN = int((removed_labels == False).sum())
FN = int((removed_labels == True).sum())

# Guard every ratio: an empty class (e.g. the filter keeps nothing) would
# otherwise raise ZeroDivisionError; such a metric is reported as 0.0.
total = TP + TN + FP + FN
P = TP / (TP + FP) if (TP + FP) else 0.0
A = (TP + TN) / total if total else 0.0
R = TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * (P * R) / (P + R) if (P + R) else 0.0

print("TP:{}\nFP:{}\nTN:{}\nFN:{}".format(TP,FP,TN,FN))
print("Precision:", round(P,3))
print("Accuracy::",round(A,3))
print("Recall:", round(R,3))
print("F1:", round(F1,3))

TP:4650
FP:0
TN:99
FN:193
Precision: 1.0
Accuracy:: 0.961
Recall: 0.96
F1: 0.98
