# Qualitative analysis of outlier detection 

In [1]:
import skmob
import pandas as pd
import json
import numpy as np
import folium
from shapely.geometry import Point, LineString, Polygon
from skmob.preprocessing import compression, filtering, detection
from shapely.ops import nearest_points

ModuleNotFoundError: No module named 'skmob'

## Qualitative analysis based on the shapefile

### Trajectory loading

In [None]:
# Load the raw trajectory points and preview them on an OpenStreetMap basemap.
df = pd.read_csv("datasets/my_traj.csv")
# Explicit timestamp parsing is left disabled:
# df["time"] = pd.to_datetime(df["time"], format="mixed")
preview_tdf = skmob.TrajDataFrame(
    data=df,
    latitude='latitude',
    longitude='longitude',
    trajectory_id='id',
    datetime='time',
)
preview_tdf.plot_trajectory(tiles="OpenStreetMap")

### Shapefile loading & Ground truth computation

In [None]:
from pyproj import CRS, Geod

# --- Reference route -------------------------------------------------------
# Pick the GeoJSON geometry of one line/variant pair from the shapefile table.
line = "034b"
variante = 1
shapefile = pd.read_csv("datasets/shapefiles.csv", delimiter=";")
route_rows = shapefile[(shapefile["LIGNE"] == line) & (shapefile["VARIANTE"] == variante)]
if len(route_rows) == 0:
    # Fail with an explicit message instead of an opaque "index 0 is out of
    # bounds" coming from `.values[0]`.
    raise IndexError(
        "no shapefile entry for line {!r}, variant {!r}".format(line, variante)
    )
shapefile = json.loads(route_rows["Geo Shape"].values[0])

# --- Distance of every point to the route ----------------------------------
# The geod (ellipsoid) is taken from EPSG:31370.
# NOTE(review): the name says "utm" but 31370 is not a UTM zone, and the
# points are lon/lat -- confirm this ellipsoid is the intended one.
crs_utm = CRS.from_user_input(31370)
geod = crs_utm.get_geod()  # Your data may be from a different Geod.
distance = np.zeros(len(df))
nearest_points_list = []
df = df.reset_index(drop=True)  # positions 0..n-1 line up with `distance`
line_shape = LineString(shapefile["coordinates"])
for index, (lon, lat) in enumerate(zip(df["longitude"], df["latitude"])):
    point = Point(lon, lat)
    # closest_points[0] lies on the route, closest_points[1] is the point itself.
    closest_points = nearest_points(line_shape, point)
    nearest_points_list.append(closest_points[0])
    # Geodesic length of the segment joining the point to the route.
    distance[index] = geod.geometry_length(LineString(closest_points))

df["distance"] = distance
# df["nearest_point"] = nearest_points_list

# --- Ground-truth labelling ------------------------------------------------
mean = df["distance"].mean()
median = df["distance"].median()
std = df["distance"].std()
k = 1.5
# The cut-off is centred on the median (not the mean printed below).
threshold = median + k * std
print(mean, std)

# Label each row from its own distance. Matching rows back through their
# `time` value (as was done before) is quadratic and mislabels a normal row
# that shares a timestamp with an outlier.
outlier_mask = df["distance"] > threshold
outliers = df[outlier_mask]
is_normal = np.zeros(len(df))
is_normal[~outlier_mask.values] = 1
df["is normal"] = is_normal > 0
df

In [None]:
# Wrap the labelled points in a TrajDataFrame, mapping the CSV column names
# onto the roles skmob expects.
trajectory_df = skmob.TrajDataFrame(
    data=df,
    latitude='latitude',
    longitude='longitude',
    trajectory_id='id',
    datetime='time',
)

#### Speed based method

In [None]:
# Speed filter: 9 * 3.6 presumably converts a 9 m/s limit into the km/h
# value that skmob's filter expects.
speed_limit_kmh = 9 * 3.6
cleaned_df = filtering.filter(tdf=trajectory_df, max_speed_kmh=speed_limit_kmh)

# Persist the filtered points, then draw them in blue.
cleaned_df.to_csv('csv_results/cleaned_df_skmob_trip.csv', index=False)
cleaned_df.plot_trajectory(
    weight=4,
    tiles="OpenStreetMap",
    zoom=16,
    hex_color="#0000FF",
)

In [None]:
# Left-join the full trajectory with the filtered one on the timestamp; rows
# tagged "left_only" have no counterpart in cleaned_df, i.e. the filter
# removed them.
# NOTE(review): the join key is `datetime` alone -- confirm timestamps are
# unique, otherwise rows can be matched to the wrong point.
merged_df = trajectory_df.merge(
    cleaned_df.drop_duplicates(),
    how='left',
    on=['datetime'],
    indicator=True,
)
outlier_df = merged_df[merged_df["_merge"] == "left_only"]
outlier_df


In [None]:
# Confusion matrix of the speed filter against the "is normal" ground truth.
# "Positive" means "kept by the filter":
#   TP: kept and labelled normal        FP: kept but labelled outlier
#   TN: removed and labelled outlier    FN: removed but labelled normal
# outlier_df comes from a merge, hence the "_x" suffix on its label column.
TP = len(cleaned_df[cleaned_df["is normal"] == True])
FP = len(cleaned_df[cleaned_df["is normal"] == False])
TN = len(outlier_df[outlier_df["is normal_x"] == False])
FN = len(outlier_df[outlier_df["is normal_x"] == True])

# Each ratio falls back to 0.0 when its denominator is zero (for instance
# when the filter keeps nothing) instead of raising ZeroDivisionError.
total = TP + TN + FP + FN
P = TP / (TP + FP) if (TP + FP) else 0.0
A = (TP + TN) / total if total else 0.0
R = TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * (P * R) / (P + R) if (P + R) else 0.0

print("TP:{}\nFP:{}\nTN:{}\nFN:{}".format(TP, FP, TN, FN))
print("Precision:", round(P, 3))
print("Accuracy::", round(A, 3))
print("Recall:", round(R, 3))
print("F1:", round(F1, 3))

## Qualitative analysis based on the introduction of outliers

In [None]:
# Load the BerlinMOD points with outliers and wrap them in a TrajDataFrame.
df = pd.read_csv("datasets/berlinMOD_with_outliers.csv")
trajectory_df = skmob.TrajDataFrame(
    data=df,
    latitude='latitude',
    longitude='longitude',
    trajectory_id='id',
    datetime='time',
)
trajectory_df

#### Speed based method

In [None]:
# Speed filter: 25 * 3.6 presumably converts a 25 m/s limit into the km/h
# value that skmob's filter expects.
speed_limit_kmh = 25 * 3.6
cleaned_df = filtering.filter(tdf=trajectory_df, max_speed_kmh=speed_limit_kmh)

# Persist the filtered points for later inspection.
cleaned_df.to_csv('csv_results/cleaned_df_skmob_MOD.csv', index=False)

In [None]:
# Left-join the full trajectory with the filtered one on the timestamp; rows
# tagged "left_only" have no counterpart in cleaned_df, i.e. the filter
# removed them.
# NOTE(review): the join key is `datetime` alone -- confirm timestamps are
# unique across trajectories, otherwise rows can be matched to the wrong point.
merged_df = trajectory_df.merge(
    cleaned_df.drop_duplicates(),
    how='left',
    on=['datetime'],
    indicator=True,
)
outlier_df = merged_df[merged_df["_merge"] == "left_only"]
len(outlier_df)

In [None]:
# Confusion matrix of the speed filter against the "is normal" label
# (presumably shipped with the BerlinMOD CSV -- confirm).
# "Positive" means "kept by the filter":
#   TP: kept and labelled normal        FP: kept but labelled outlier
#   TN: removed and labelled outlier    FN: removed but labelled normal
# outlier_df comes from a merge, hence the "_x" suffix on its label column.
TP = len(cleaned_df[cleaned_df["is normal"] == True])
FP = len(cleaned_df[cleaned_df["is normal"] == False])
TN = len(outlier_df[outlier_df["is normal_x"] == False])
FN = len(outlier_df[outlier_df["is normal_x"] == True])

# Each ratio falls back to 0.0 when its denominator is zero (for instance
# when the filter keeps nothing) instead of raising ZeroDivisionError.
total = TP + TN + FP + FN
P = TP / (TP + FP) if (TP + FP) else 0.0
A = (TP + TN) / total if total else 0.0
R = TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * (P * R) / (P + R) if (P + R) else 0.0

print("TP:{}\nFP:{}\nTN:{}\nFN:{}".format(TP, FP, TN, FN))
print("Precision:", round(P, 3))
print("Accuracy::", round(A, 3))
print("Recall:", round(R, 3))
print("F1:", round(F1, 3))

In [None]:
# Dump the current `df` to disk without the index column.
df.to_csv(path_or_buf="test.csv", index=False)