In [None]:
import os
import sys

sys.path.append("..")

from ast import literal_eval

import numpy as np
import pandas as pd
from tqdm import tqdm

from pipelines.utils import PLAIN_DATASET_NAME, PRE_MAP_DATASET_NAME, ROOT_DIR, DATASET_NAME, PEPROCESSED_DATASET_NAME, load_road_network
from preprocessing.utils import PREPROCESS_MAP

from preprocessing.rs_mapping import create_road_mapping_df, post_processing_mapped_df, merge_preprocessed_and_fmm

from preprocessing.cell_mapping import clean_and_output_data
from preprocessing.visualize import plot_gps_traj, plot_cpath
from pipelines.utils import load_config, generate_train_test_split, generate_train_val_test_split

config = load_config(name="porto", ctype="dataset")

### Full Preprocessing ###

We start from `plain_dataset.parquet`, the original dataset exactly as it was downloaded.

In [None]:
# Read the original (plain) dataset of the configured city from disk
plain_dataset_path = os.path.join(
    ROOT_DIR, "datasets/trajectory", config["city"], PLAIN_DATASET_NAME
)
df = pd.read_parquet(plain_dataset_path)

In [None]:
df.head()

In [None]:
# Optional city-specific preprocessing step:
# makes a Linestring from POLYLINE, adds a coords column and a timestamps column.
# Leave the switch off when the loaded frame has already been through this step.
need_preprocessing = False
if need_preprocessing:
    # PREPROCESS_MAP is keyed by city name and yields the matching preprocessing callable
    preprocess_city = PREPROCESS_MAP[config["city"]]
    df = preprocess_city(df)

In [None]:
# Run clean_and_output_data (from preprocessing.cell_mapping) on the loaded frame with the
# dataset config; the result is written to PRE_MAP_DATASET_NAME in the next cell.
df_preprocessed = clean_and_output_data(df, config)

In [None]:
# Checkpoint: store the cleaned frame as the pre-map-matching dataset
pre_map_out_path = os.path.join(
    ROOT_DIR, "datasets/trajectory", config["city"], PRE_MAP_DATASET_NAME
)
df_preprocessed.to_parquet(pre_map_out_path)

In [None]:
# Resume from the checkpoint: read the pre-map-matching dataset back from disk
pre_map_in_path = os.path.join(
    ROOT_DIR, "datasets/trajectory", config["city"], PRE_MAP_DATASET_NAME
)
df_preprocessed = pd.read_parquet(pre_map_in_path)

In [None]:
# Call create_road_mapping_df (from preprocessing.rs_mapping) on the pre-map frame for the configured city.
# NOTE(review): its outputs/side effects are not visible from this notebook — presumably it
# writes the input files used by the map-matching step; confirm against the implementation.
create_road_mapping_df(df_preprocessed, config["city"])

In [None]:
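# Next step (outside this notebook): map-match the trajectories with FMM. See mapping.sh for how to run it.
# Presumably this produces the mr.txt file that the next cell reads — confirm before re-running.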

In [None]:
# Read mr.txt (semicolon-delimited text file) for the configured city into df_fmm
mr_txt_path = os.path.join(ROOT_DIR, "datasets/trajectory", config["city"], "mr.txt")
df_fmm = pd.read_csv(mr_txt_path, delimiter=";")

In [None]:
# Combine the pre-map frame with the frame read from mr.txt, then post-process the merged result.
# Note: `df` is rebound here — from this cell on it holds the post-processed merged frame,
# no longer the plain dataset loaded at the top of the notebook.
df_merged = merge_preprocessed_and_fmm(df_preprocessed, df_fmm)
df = post_processing_mapped_df(df_merged)

In [None]:
# Checkpoint: store the post-processed, merged frame under PEPROCESSED_DATASET_NAME
preprocessed_out_path = os.path.join(
    ROOT_DIR, "datasets/trajectory", config["city"], PEPROCESSED_DATASET_NAME
)
df.to_parquet(preprocessed_out_path)

In [None]:
# Resume from the checkpoint: read the preprocessed dataset back from disk
preprocessed_in_path = os.path.join(
    ROOT_DIR, "datasets/trajectory", config["city"], PEPROCESSED_DATASET_NAME
)
df = pd.read_parquet(preprocessed_in_path)

In [None]:
df.head()

In [None]:
# For the final dataset, keep only the columns that are needed downstream.
# .copy() turns the selection into an independent frame, so the TAXI_ID assignment
# below does not write into a slice of the previous frame (SettingWithCopyWarning).
df = df[['TAXI_ID', 'coord_seq', 'opath', 'timestamps', 'cpath', 'speed', 'road_timestamps']].copy()

# Converting the weird array(array([...])) values to lists of lists [[...]] does not work:
# after loading the parquet file they are still arrays of arrays.
#df['merc_seq'] = df['merc_seq'].apply(lambda x: [list(y) for y in x])
#df['coord_seq'] = df['coord_seq'].apply(lambda x: [list(y) for y in x])

# For the classification task TAXI_ID has to be transformed to numeric labels.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['TAXI_ID'] = le.fit_transform(df['TAXI_ID'])

df.to_parquet(
    os.path.join(ROOT_DIR, "datasets/trajectory", config["city"], DATASET_NAME)
)

# Largest encoded label. Kept as the last expression of the cell so the notebook
# actually displays it (a bare expression in the middle of a cell is discarded).
df["TAXI_ID"].max()

In [None]:
# Read the final dataset back from disk
final_dataset_path = os.path.join(
    ROOT_DIR, "datasets/trajectory", config["city"], DATASET_NAME
)
df = pd.read_parquet(final_dataset_path)

In [None]:
## Split data into train, val, test
# The split helper is given only the city name and the seed from the config, not `df`.
# NOTE(review): presumably it loads the saved dataset from disk itself — confirm, and make
# sure the final dataset has been written before running this cell.
train, val, test = generate_train_val_test_split(config['city'], config['seed'])

In [None]:
# Save train, val, test
# Each split is written to datasets/trajectory/<city>/<split>/<split>_<seed>.parquet.
for split_name, split_df in (("train", train), ("val", val), ("test", test)):
    split_dir = os.path.join(ROOT_DIR, "datasets/trajectory", config["city"], split_name)
    # to_parquet does not create missing parent directories, so create the
    # split folder first (no-op when it already exists).
    os.makedirs(split_dir, exist_ok=True)
    split_df.to_parquet(
        os.path.join(split_dir, f"{split_name}_{config['seed']}.parquet")
    )

In [None]:
df.head()

In [None]:
df.timestamps

In [None]:
n = 4

In [None]:
len(df.cpath[n])

In [None]:
len(df.coord_seq[n])

In [None]:
len(df.opath[n])

In [None]:
# Keeps the coord_seq entries at those positions i whose opath[i] is contained in target_path.
# NOTE(review): in the visible part of this notebook `coord_seq` and `target_path` are never
# assigned, and `opath` is only assigned in a later cell, so this cell fails with NameError
# on a fresh kernel (Restart & Run All). TODO: define the inputs before this cell
# (presumably from df.coord_seq[n] / df.opath[n]) or remove the cell.
target_coord_seq = [coord_seq[i] for i in range(len(opath)) if opath[i] in target_path]

### Plot GPS trajectory and road-segment trajectory

In [None]:
# Pick one trajectory (by index label) to visualise
idx = 3

# Swap the two components of every coordinate pair. The plot_gps_traj defined in this
# notebook reads element 0 as latitude, so coord_seq presumably stores (lon, lat) — TODO confirm.
traj = [[second, first] for first, second in df.coord_seq[idx]]
cpath, opath = df.cpath[idx], df.opath[idx]

# Road network of the configured city, needed to draw road segments on the map
edge_df, nodes_df, G, LG = load_road_network(config["city"])

In [None]:
plot_gps_traj(traj)

In [None]:
plot_cpath(opath, edge_df)

In [None]:
import folium
import geopandas as gpd
def plot_cpath(cpath, edge_df, zoom=14, width=1000, height=500, tiles='cartodbpositron'):
    """Draw the road segments of a path as polylines on a Folium map.

    Note: this definition shadows the ``plot_cpath`` imported from
    ``preprocessing.visualize`` at the top of the notebook.

    Args:
        cpath: Iterable of integer row positions; used with ``edge_df.iloc``.
        edge_df: Frame with a ``geometry`` column. The coordinates of each
            geometry are unpacked as ``(lon, lat)`` pairs.
        zoom: Initial zoom level of the map.
        width: Width passed to the Folium figure.
        height: Height passed to the Folium figure.
        tiles: Tile set passed to ``folium.Map``.

    Returns:
        The ``folium.Figure`` containing the map.
    """
    # Select the geometries of the requested rows and wrap them in a GeoDataFrame
    segments = edge_df.iloc[list(cpath)]['geometry'].reset_index(drop=True)
    gdf = gpd.GeoDataFrame(geometry=segments)

    # total_bounds is (minx, miny, maxx, maxy); centre the map on the middle of that box
    min_lon, min_lat, max_lon, max_lat = gdf.total_bounds
    lat_center = (min_lat + max_lat) / 2
    lon_center = (min_lon + max_lon) / 2

    fig = folium.Figure(width=width, height=height)
    road_map = folium.Map(
        location=(lat_center, lon_center), zoom_start=zoom, tiles=tiles
    ).add_to(fig)

    # One polyline per geometry; Folium expects (lat, lon), so flip each pair
    for geom in gdf.geometry:
        lat_lon_points = [(lat, lon) for lon, lat in geom.coords]
        folium.PolyLine(locations=lat_lon_points).add_to(road_map)

    return fig

In [None]:
def plot_gps_traj(coords, zoom=14, width=1000, height=500, tiles='cartodbpositron'):
    """Draw a GPS trajectory as small circle markers on a Folium map.

    Note: this definition shadows the ``plot_gps_traj`` imported from
    ``preprocessing.visualize`` at the top of the notebook.

    Args:
        coords: Sized sequence of points; element 0 of each point is used as
            latitude and element 1 as longitude.
        zoom: Initial zoom level of the map.
        width: Width passed to the Folium figure.
        height: Height passed to the Folium figure.
        tiles: Tile set passed to ``folium.Map``.

    Returns:
        The ``folium.Figure`` containing the map.
    """
    # Centre the map on the arithmetic mean of all points
    n_points = len(coords)
    lat_center = sum(point[0] for point in coords) / n_points
    lon_center = sum(point[1] for point in coords) / n_points

    fig = folium.Figure(width=width, height=height)
    traj_map = folium.Map(
        location=(lat_center, lon_center), zoom_start=zoom, tiles=tiles
    ).add_to(fig)

    # One marker per GPS point
    for point in coords:
        folium.CircleMarker(location=point, radius=2).add_to(traj_map)

    return fig

### OSMnx network tests ###

In [None]:
import osmnx as ox

# Road network as prepared by the project loader for Porto.
gdf_edges, gdf_nodes, G, line_G = load_road_network("porto")

# NOTE: G is rebound here — the graph returned by load_road_network above is replaced
# by a graph fetched through OSMnx for the place query 'Porto, Portugal'.
G = ox.graph_from_place('Porto, Portugal')
fig, ax = ox.plot_graph(G, node_color='b', node_zorder=3)

# Use the top-level OSMnx aliases instead of the `ox.speed` submodule: the submodule
# was deprecated in favour of `routing` in newer OSMnx releases, while the top-level
# names are available across versions.
G2 = ox.add_edge_speeds(G)
G2 = ox.add_edge_travel_times(G2)