# Qualitative analysis of outlier detection 

In [1]:
import pandas as pd
from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.features.temporal_features import TemporalFeatures as temporal
from ptrail.features.kinematic_features import KinematicFeatures as kinematic
from ptrail.preprocessing.filters import Filters as filt
from ptrail.visualization import TrajPlotter
import ptrail.utilities.constants as const
import numpy as np
import folium
from pyproj import Geod
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points
import json

In [2]:
LAT = "lat"
LONG = "lng"
ID = 'tid'
def visualization(trajectory_df,color="blue"):    
    m2 = folium.Map(location=[(np.max(trajectory_df[const.LAT])+np.min(trajectory_df[const.LAT]))/2, (np.max(trajectory_df[const.LONG])+np.min(trajectory_df[const.LONG]))/2], zoom_start=14, width="70%", height="100") # create the map
    ids = trajectory_df.index.get_level_values('traj_id').unique() # get all uniques ids
    for i in ids: # add a line per trajectory and avoid linked trajectories
        tmp_df = trajectory_df[trajectory_df.index.get_level_values('traj_id') == i]
        coords = [zip(tmp_df[const.LAT], tmp_df[const.LONG])]
        folium.PolyLine(coords,
                        color=color,
                        weight=4,
                        opacity=0.7).add_to(m2)
    return m2

## Qualititative analysis based on the shapefile

### Trajectory loading

In [3]:
threshold = 3
df= pd.read_csv("datasets/my_traj.csv")
visualization(PTRAILDataFrame(data_set = df, latitude='latitude', longitude='longitude', datetime='time', traj_id='id'),"#0080FF")

### Shapefile loading & Ground truth computation

In [4]:
from pyproj import CRS, Geod
line = "034b"
variante = 1
shapefile= pd.read_csv("datasets/shapefiles.csv", delimiter=";")
shapefile = shapefile[(shapefile["LIGNE"]==line) & (shapefile["VARIANTE"]==variante )]["Geo Shape"].values[0]
shapefile = json.loads(shapefile)

crs_utm = CRS.from_user_input(31370)
geod = crs_utm.get_geod()  # Your data may be from a different Geod.
distance = np.zeros(len(df))
nearest_points_list = []
df = df.reset_index(drop=True)
line_shape = LineString(shapefile["coordinates"]) 
for index, row in df.iterrows():
    point = Point(row["longitude"], row["latitude"])
    closest_points = nearest_points(line_shape, point)
    nearest_points_list.append(closest_points[0])
    distance[index] = geod.geometry_length(LineString(closest_points))
    
df["distance"] = distance 
#df["nearest_point"] = nearest_points_list
mean = df["distance"].mean()
median = df["distance"].median()
std = df["distance"].std()
k=1.5
threshold = median + k * std
#df["nearest_point"] = nearest_points_list
print(mean, std)
outliers = df[df["distance"] > threshold]
is_normal = np.zeros(len(df))
for i in range(len(df['time'].values)):
    if df['time'].values[i] not in outliers['time'].values:
        is_normal[i] = 1
df["is normal"] = is_normal > 0
df

6.99931376823816 6.984997264301602


Unnamed: 0,time,latitude,longitude,id,distance,is normal
0,2023-06-30 13:34:13+00:00,50.839978,4.365087,0,6.434879,True
1,2023-06-30 13:34:15+00:00,50.840059,4.365243,0,6.241407,True
2,2023-06-30 13:34:17+00:00,50.840115,4.365356,0,6.274832,True
3,2023-06-30 13:34:55+00:00,50.840184,4.365509,0,6.957367,True
4,2023-06-30 13:34:58+00:00,50.840269,4.365628,0,4.605384,True
...,...,...,...,...,...,...
312,2023-06-30 13:55:35+00:00,50.821950,4.404838,0,4.315869,True
313,2023-06-30 13:55:37+00:00,50.821916,4.405050,0,1.384736,True
314,2023-06-30 13:55:39+00:00,50.821854,4.405199,0,3.199198,True
315,2023-06-30 13:55:41+00:00,50.821825,4.405334,0,2.081433,True


In [5]:
trajectory_df = PTRAILDataFrame(data_set = df, latitude='latitude', longitude='longitude', datetime='time', traj_id='id')
trajectory_df

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,distance,is normal
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2023-06-30 13:34:13+00:00,50.839978,4.365087,6.434879,True
0,2023-06-30 13:34:15+00:00,50.840059,4.365243,6.241407,True
0,2023-06-30 13:34:17+00:00,50.840115,4.365356,6.274832,True
0,2023-06-30 13:34:55+00:00,50.840184,4.365509,6.957367,True
0,2023-06-30 13:34:58+00:00,50.840269,4.365628,4.605384,True
0,...,...,...,...,...
0,2023-06-30 13:55:35+00:00,50.821950,4.404838,4.315869,True
0,2023-06-30 13:55:37+00:00,50.821916,4.405050,1.384736,True
0,2023-06-30 13:55:39+00:00,50.821854,4.405199,3.199198,True
0,2023-06-30 13:55:41+00:00,50.821825,4.405334,2.081433,True


#### Hampel based method

In [6]:
trajectory_df = kinematic.create_speed_column(trajectory_df)
trajectory_df

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,distance,is normal,Distance,Speed
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2023-06-30 13:34:13+00:00,50.839978,4.365087,6.434879,True,,
0,2023-06-30 13:34:15+00:00,50.840059,4.365243,6.241407,True,14.206847,7.103424
0,2023-06-30 13:34:17+00:00,50.840115,4.365356,6.274832,True,10.097854,5.048927
0,2023-06-30 13:34:55+00:00,50.840184,4.365509,6.957367,True,13.190598,0.347121
0,2023-06-30 13:34:58+00:00,50.840269,4.365628,4.605384,True,12.599354,4.199785
0,...,...,...,...,...,...,...
0,2023-06-30 13:55:35+00:00,50.821950,4.404838,4.315869,True,15.924017,7.962009
0,2023-06-30 13:55:37+00:00,50.821916,4.405050,1.384736,True,15.397282,7.698641
0,2023-06-30 13:55:39+00:00,50.821854,4.405199,3.199198,True,12.475083,6.237542
0,2023-06-30 13:55:41+00:00,50.821825,4.405334,2.081433,True,10.032891,5.016446


In [7]:
cleaned_df = filt.hampel_outlier_detection(trajectory_df, column_name='Speed')
cleaned_df
cleaned_df.to_csv('csv_results/cleaned_df_ptrail_hampel_trip.csv', index=False)

#visualization(cleaned_df,"#FF00FF")

Process Process-2:
multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/pierre-cedric/miniconda3/envs/ptrail/lib/python3.9/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/home/pierre-cedric/miniconda3/envs/ptrail/lib/python3.9/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "/home/pierre-cedric/miniconda3/envs/ptrail/lib/python3.9/site-packages/ptrail/preprocessing/helpers.py", line 390, in hampel_help
    to_return = df.drop(df.index[outlier_indices])
  File "/home/pierre-cedric/miniconda3/envs/ptrail/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 5382, in __getitem__
    result = getitem(key)
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call 

ValueError: No objects to concatenate

In [11]:
outlier_df = trajectory_df.reset_index().merge(cleaned_df.reset_index(), on=['DateTime'], how='left', indicator=True)
outlier_df = outlier_df[outlier_df["_merge"] == "left_only"]
outlier_df

NameError: name 'cleaned_df' is not defined

In [12]:
TP = len(cleaned_df[cleaned_df["is normal"] == True])
FP = len(cleaned_df[cleaned_df["is normal"] == False])
TN = len(outlier_df[outlier_df["is normal_x"] == False])
FN = len(outlier_df[outlier_df["is normal_x"] == True])

print("TP:{}\nFP:{}\nTN:{}\nFN:{}".format(TP,FP,TN,FN))
print("Precision:", TP / (TP + FP))
print("Accuracy::",(TP + TN) / (TP + TN + FP + FN))
print("Recall:", TP / (TP + FN))

NameError: name 'cleaned_df' is not defined

#### IQR based  method

In [16]:
cleaned_df = filt.filter_outliers_by_consecutive_speed(dataframe=trajectory_df)
visualization(cleaned_df,"#FF00FF")
cleaned_df.to_csv('csv_results/cleaned_df_ptrail_iqr_trip.csv', index=False)

In [17]:
outlier_df = trajectory_df.reset_index().merge(cleaned_df.reset_index(), on=['DateTime'], how='left', indicator=True)
outlier_df = outlier_df[outlier_df["_merge"] == "left_only"]
outlier_df

Unnamed: 0,traj_id_x,DateTime,lat_x,lon_x,distance_x,is normal_x,Distance_x,Speed_x,traj_id_y,lat_y,lon_y,distance_y,is normal_y,Distance_y,Speed_y,_merge
0,0,2023-06-30 13:34:13+00:00,50.839978,4.365087,6.434879,False,,,,,,,,,,left_only
45,0,2023-06-30 13:37:32+00:00,50.839175,4.37187,16.647453,False,14.536831,14.536831,,,,,,,,left_only
64,0,2023-06-30 13:38:53+00:00,50.837625,4.372092,11.784644,False,19.286196,19.286196,,,,,,,,left_only
98,0,2023-06-30 13:41:56+00:00,50.835857,4.374173,0.858093,True,29.308207,14.654104,,,,,,,,left_only
120,0,2023-06-30 13:43:03+00:00,50.835689,4.377672,13.56484,False,15.98328,15.98328,,,,,,,,left_only
132,0,2023-06-30 13:43:41+00:00,50.835871,4.379739,6.307232,False,16.253435,16.253435,,,,,,,,left_only
152,0,2023-06-30 13:44:49+00:00,50.834753,4.383009,0.930106,True,16.437426,16.437426,,,,,,,,left_only
292,0,2023-06-30 13:55:01+00:00,50.823541,4.401665,8.955597,False,16.051997,16.051997,,,,,,,,left_only


In [18]:
TP = len(cleaned_df[cleaned_df["is normal"] == True])
FP = len(cleaned_df[cleaned_df["is normal"] == False])
TN = len(outlier_df[outlier_df["is normal_x"] == False])
FN = len(outlier_df[outlier_df["is normal_x"] == True])

print("TP:{}\nFP:{}\nTN:{}\nFN:{}".format(TP,FP,TN,FN))
print("Precision:", TP / (TP + FP))
print("Accuracy::",(TP + TN) / (TP + TN + FP + FN))
print("Recall:", TP / (TP + FN))

TP:99
FP:210
TN:6
FN:2
Precision: 0.32038834951456313
Accuracy:: 0.3312302839116719
Recall: 0.9801980198019802


## Qualititative analysis based on the introduction of outliers

### Ground truth computation

In [19]:
PROB = 0.99
EPS = 10**(-3)
df= pd.read_csv("datasets/cars_sample.csv")
variation_lat = np.zeros(len(df))
variation_lon = np.zeros(len(df))
is_normal = np.zeros(len(df))
for i in range(len(variation_lat)):
    p = np.random.random()
    if p > PROB:
        variation_lat[i] = np.random.random()*EPS
    p = np.random.random()
    if p > PROB:
        variation_lon[i] = np.random.random()*EPS
    if variation_lat[i] == 0 and variation_lon[i] == 0:
        is_normal[i] = 1
df["Latitude"] += variation_lat
df["Longitude"] += variation_lon
df["is normal"] = is_normal > 0
df = df.rename(columns={"Longitude": "longitude", "Latitude": "latitude", "Timestamp": "time"})
df

Unnamed: 0,id,time,latitude,longitude,is normal
0,8,2020-06-01 08:56:08.148+00,50.861073,4.465373,True
1,8,2020-06-01 08:56:09.648+00,50.861100,4.465386,True
2,8,2020-06-01 08:56:10.398+00,50.861127,4.465398,True
3,8,2020-06-01 08:56:10.797745+00,50.861146,4.465406,True
4,8,2020-06-01 08:56:24.229685+00,50.861756,4.465684,True
...,...,...,...,...,...
40318,141,2020-06-01 22:13:04.924541+00,50.860548,4.493296,True
40319,141,2020-06-01 22:13:06.924541+00,50.860522,4.493314,True
40320,141,2020-06-01 22:13:07.781683+00,50.860496,4.493333,True
40321,141,2020-06-01 22:13:09.04435+00,50.860442,4.493372,True


In [20]:
df= pd.read_csv("datasets/berlinMOD_sample_.csv")
trajectory_df = PTRAILDataFrame(data_set = df, latitude='latitude', longitude='longitude', datetime='time', traj_id='id')

#### Hampel based method

In [21]:
trajectory_df = kinematic.create_speed_column(trajectory_df)
cleaned_df= filt.hampel_outlier_detection(dataframe=trajectory_df, column_name='Speed')
cleaned_df

Process Process-15:
multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/pierre-cedric/miniconda3/envs/ptrail/lib/python3.9/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/home/pierre-cedric/miniconda3/envs/ptrail/lib/python3.9/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "/home/pierre-cedric/miniconda3/envs/ptrail/lib/python3.9/site-packages/ptrail/preprocessing/helpers.py", line 390, in hampel_help
    to_return = df.drop(df.index[outlier_indices])
  File "/home/pierre-cedric/miniconda3/envs/ptrail/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 5382, in __getitem__
    result = getitem(key)
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call

ValueError: No objects to concatenate

In [22]:
outlier_df = trajectory_df.reset_index().merge(cleaned_df.reset_index(), on=['DateTime'], how='left', indicator=True)
outlier_df = outlier_df[outlier_df["_merge"] == "left_only"]
outlier_df

Unnamed: 0,traj_id_x,DateTime,lat_x,lon_x,is normal_x,Distance_x,Speed_x,traj_id_y,lat_y,lon_y,distance,is normal_y,Distance_y,Speed_y,_merge
0,141,2020-06-01 08:25:35.332000+00:00,50.860205,4.493563,True,,,,,,,,,,left_only
1,141,2020-06-01 08:25:36.832000+00:00,50.860231,4.493543,True,3.152541,2.101694,,,,,,,,left_only
2,141,2020-06-01 08:25:37.582000+00:00,50.860256,4.493522,True,3.152539,4.203386,,,,,,,,left_only
3,141,2020-06-01 08:25:39.382000+00:00,50.860332,4.493461,True,9.457607,5.254226,,,,,,,,left_only
4,141,2020-06-01 08:25:40.305076+00:00,50.860357,4.493440,True,3.152532,3.415247,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40318,8,2020-06-04 21:56:59.514958+00:00,50.861842,4.465724,True,1.174545,4.203256,,,,,,,,left_only
40319,8,2020-06-04 21:56:59.943340+00:00,50.861822,4.465716,True,2.250746,5.254062,,,,,,,,left_only
40320,8,2020-06-04 21:57:01.415195+00:00,50.861756,4.465684,True,7.733216,5.254061,,,,,,,,left_only
40321,8,2020-06-04 21:57:14.847135+00:00,50.861146,4.465406,True,70.572705,5.254096,,,,,,,,left_only


In [23]:
TP = len(cleaned_df[cleaned_df["is normal"] == True])
FP = len(cleaned_df[cleaned_df["is normal"] == False])
TN = len(outlier_df[outlier_df["is normal_x"] == False])
FN = len(outlier_df[outlier_df["is normal_x"] == True])

print("TP:{}\nFP:{}\nTN:{}\nFN:{}".format(TP,FP,TN,FN))
print("Precision:", TP / (TP + FP))
print("Accuracy::",(TP + TN) / (TP + TN + FP + FN))
print("Recall:", TP / (TP + FN))

TP:99
FP:210
TN:845
FN:39478
Precision: 0.32038834951456313
Accuracy:: 0.02323291986611538
Recall: 0.0025014528640371933


#### IQR based  method

In [24]:
cleaned_df = filt.filter_outliers_by_consecutive_speed(dataframe=trajectory_df)

In [25]:
outlier_df = trajectory_df.reset_index().merge(cleaned_df.reset_index(), on=['DateTime'], how='left', indicator=True)
outlier_df = outlier_df[outlier_df["_merge"] == "left_only"]
outlier_df

Unnamed: 0,traj_id_x,DateTime,lat_x,lon_x,is normal_x,Distance_x,Speed_x,traj_id_y,lat_y,lon_y,is normal_y,Distance_y,Speed_y,_merge
0,141,2020-06-01 08:25:35.332000+00:00,50.860205,4.493563,True,,,,,,,,,left_only
11,141,2020-06-01 08:26:00.854047+00:00,50.861914,4.492638,False,101.645187,160.938454,,,,,,,left_only
120,141,2020-06-01 08:29:14.471114+00:00,50.856244,4.485793,False,64.764284,86.352379,,,,,,,left_only
121,141,2020-06-01 08:29:14.971114+00:00,50.856216,4.484857,True,65.753658,131.507315,,,,,,,left_only
351,141,2020-06-01 08:34:07.872222+00:00,50.850572,4.480175,False,102.275599,94.698401,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40071,8,2020-06-04 20:48:31.659883+00:00,50.862079,4.465729,False,26.376513,47.195729,,,,,,,left_only
40100,8,2020-06-04 20:49:17.632206+00:00,50.862688,4.464107,False,50.780428,70.528372,,,,,,,left_only
40118,8,2020-06-04 20:49:40.899044+00:00,50.862854,4.463572,True,20.785274,34.642123,,,,,,,left_only
40130,8,2020-06-04 20:49:50.090916+00:00,50.864079,4.463321,False,109.289269,1041.971545,,,,,,,left_only


In [26]:
TP = len(cleaned_df[cleaned_df["is normal"] == True])
FP = len(cleaned_df[cleaned_df["is normal"] == False])
TN = len(outlier_df[outlier_df["is normal_x"] == False])
FN = len(outlier_df[outlier_df["is normal_x"] == True])

print("TP:{}\nFP:{}\nTN:{}\nFN:{}".format(TP,FP,TN,FN))
print("Precision:", TP / (TP + FP))
print("Accuracy::",(TP + TN) / (TP + TN + FP + FN))
print("Recall:", TP / (TP + FN))

TP:38849
FP:196
TN:649
FN:629
Precision: 0.9949801511076962
Accuracy:: 0.979540212781787
Recall: 0.9840670753330969
