In [None]:
!pip install trackintel

Collecting trackintel
  Downloading trackintel-1.3.1-py3-none-any.whl.metadata (8.7 kB)
Collecting geoalchemy2 (from trackintel)
  Downloading GeoAlchemy2-0.15.2-py3-none-any.whl.metadata (2.1 kB)
Collecting osmnx (from trackintel)
  Downloading osmnx-1.9.4-py3-none-any.whl.metadata (4.9 kB)
Collecting similaritymeasures (from trackintel)
  Downloading similaritymeasures-1.2.0.tar.gz (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.3/401.3 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting geopandas>=0.12.0 (from trackintel)
  Downloading geopandas-0.14.4-py3-none-any.whl.metadata (1.5 kB)
Collecting networkx (from trackintel)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting fiona>=1.8.21 (from geopandas>=0.12.0->trackintel)
  Downloading fiona-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import trackintel as ti
import gzip
import os
import geopandas as gpd


In [None]:
# Gzipped CSV input for each city, keyed by the city letter.
# Paths are relative to the notebook's working directory.
data_files = dict(
    A="./cityA_groundtruthdata.csv.gz",
    B="./cityB_challengedata.csv.gz",
    C="./cityC_challengedata.csv.gz",
    D="./cityD_challengedata.csv.gz",
)


In [None]:
def load_gz_file(file_path):
    """Read a gzip-compressed CSV file into a pandas DataFrame.

    Decompression is delegated to pandas instead of wrapping the file in a
    text-mode ``gzip.open``: the text wrapper decodes with the platform's
    preferred encoding, whereas ``read_csv`` decodes as UTF-8 everywhere.

    Args:
        file_path: Path to the gzip-compressed CSV file.

    Returns:
        pandas.DataFrame with the parsed file contents.
    """
    # compression is given explicitly so the file is treated as gzip
    # regardless of its extension.
    return pd.read_csv(file_path, compression="gzip")


In [None]:
def preprocess_data(city):
    """Load one city's raw data, derive trackintel-style columns and save them as CSV.

    Steps:
      1. Load the gzipped CSV registered for ``city`` in ``data_files``.
      2. Drop rows whose ``x`` or ``y`` equals -999 (the missing-data marker).
      3. Build a UTC ``tracked_at`` timestamp from the day index ``d`` and the
         slot index ``t`` (each slot counts as 30 minutes).
      4. Rename ``uid`` -> ``user_id``, ``x`` -> ``longitude``, ``y`` -> ``latitude``.
      5. Write the result to ``data_{city}_preprocessed.csv`` (no index column).

    Args:
        city: Key into ``data_files`` (e.g. ``'A'``).

    Returns:
        None. The preprocessed table is written to disk.
    """
    raw = load_gz_file(data_files[city])

    # .copy() makes the filtered rows an independent frame, so adding columns
    # below does not operate on a slice of ``raw`` (no SettingWithCopyWarning).
    df = raw[(raw['x'] != -999) & (raw['y'] != -999)].copy()

    # Day index d is mapped to 1899-12-31 + d days, i.e. d=1 -> 1900-01-01.
    # For d in 1..366 this is exactly what parsing with format='%j' produced
    # before; d=0 (or d>366), which '%j' cannot parse and which was silently
    # coerced to NaT, now yields a valid timestamp as well.
    day_offset = pd.to_timedelta(pd.to_numeric(df['d'], errors='coerce'), unit='D')
    slot_offset = pd.to_timedelta(df['t'] * 30, unit='m')
    tracked_at = pd.Timestamp('1899-12-31') + day_offset + slot_offset
    df['tracked_at'] = tracked_at.dt.tz_localize('UTC')

    # NOTE(review): x/y are only renamed here, not converted. If they are grid
    # indices rather than degrees, distances computed downstream from
    # longitude/latitude are not real-world meters - confirm.
    df = df.rename(columns={'uid': 'user_id', 'x': 'longitude', 'y': 'latitude'})

    df.to_csv(f'data_{city}_preprocessed.csv', index=False)
    print("Data saved to csv")

In [None]:
def custom_write_triplegs_csv(triplegs, filename, **kwargs):
    """Write triplegs to a CSV file with their geometries serialized as WKT.

    Args:
        triplegs: Object exposing ``to_wkt(...)`` whose result exposes
            ``to_csv(...)`` (the triplegs table produced by the pipeline).
        filename: Destination path of the CSV file.
        **kwargs: Forwarded unchanged to ``to_csv`` (e.g. ``index=False``).

    Returns:
        None.
    """
    # Options handed to to_wkt when turning geometries into text.
    wkt_options = {"rounding_precision": -1, "trim": False}
    triplegs.to_wkt(**wkt_options).to_csv(filename, **kwargs)



In [None]:
def gen_triplegs(city, dist_threshold=1, time_threshold=90,
                 staypoint_gap_threshold=300, tripleg_gap_threshold=90):
    """Run the full pipeline for one city and write its triplegs to CSV.

    The function
      1. calls ``preprocess_data(city)``, which writes ``data_{city}_preprocessed.csv``;
      2. reads that file back as trackintel positionfixes;
      3. generates staypoints with the ``'sliding'`` method;
      4. generates triplegs with the ``'between_staypoints'`` method;
      5. writes the triplegs to ``triplegs{city}.csv`` via ``custom_write_triplegs_csv``.

    The thresholds are exposed as keyword arguments so different values can be
    tried without editing the function; the defaults are the values that were
    previously hard-coded.

    Args:
        city: City key understood by ``preprocess_data`` (e.g. ``'A'``).
        dist_threshold: ``dist_threshold`` passed to staypoint generation.
        time_threshold: Minimum stay duration (minutes) for a staypoint.
        staypoint_gap_threshold: ``gap_threshold`` (minutes) passed to
            staypoint generation.
        tripleg_gap_threshold: ``gap_threshold`` (minutes) passed to tripleg
            generation.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Preprocess the raw data and save it as CSV.
    print(f"Preprocessing city {city} data")
    preprocess_data(city)

    # Load the preprocessed data as positionfixes.
    print(f"Loading preprocessed data: city {city}")
    pfs = ti.read_positionfixes_csv(f'data_{city}_preprocessed.csv')

    # Generate staypoints.
    # NOTE(review): the threshold was originally annotated as "meters
    # (1 cell = 500 meters)". With distance_metric='haversine' the coordinates
    # are presumably interpreted as longitude/latitude degrees - confirm that a
    # value of 1 matches the intended cell distance.
    print("Generating staypoints")
    pfs, sp = pfs.as_positionfixes.generate_staypoints(
        method='sliding',
        dist_threshold=dist_threshold,
        time_threshold=time_threshold,  # minutes
        gap_threshold=staypoint_gap_threshold,  # minutes
        distance_metric='haversine',
        include_last=True,  # Include the last staypoint
        print_progress=True,
        exclude_duplicate_pfs=True,  # Exclude duplicate position fixes
        n_jobs=-1,  # Use all available workers
    )

    # Generate triplegs from the positionfixes and the staypoints found above.
    # A larger gap threshold yields fewer, more continuous triplegs; it should
    # be tuned together with time_threshold and staypoint_gap_threshold.
    print("Generating triplegs")
    pfs, tpls = ti.preprocessing.generate_triplegs(
        pfs,
        sp,
        method='between_staypoints',
        gap_threshold=tripleg_gap_threshold,
    )

    print("Saving Triplegs to csv")
    custom_write_triplegs_csv(tpls, f'triplegs{city}.csv', index=False)
    print("Triplegs saved to csv")




'\ntime_threshold: This is the minimum amount of time a person needs to stay within a small area (defined by dist_threshold) for that area to be considered a staypoint. If the time spent in that spot meets or exceeds time_threshold, it qualifies as a staypoint.\n\ngap_threshold: This is the maximum time allowed between consecutive position fixes for them to be considered part of a continuous sequence. If the time gap between two position fixes is larger than gap_threshold, they are considered discontinuous, meaning they won’t be grouped into the same staypoint.\n'

In [None]:
# Run the whole pipeline for city A: preprocessing, staypoint generation,
# tripleg generation and CSV export. In the recorded run below, the staypoint
# step alone took roughly 30 minutes.
gen_triplegs('A')

Preprocessing city A data
Data saved to csv
Loading preprocessed data: city A




Generating staypoints


100%|██████████| 100000/100000 [29:49<00:00, 55.89it/s]


Generating triplegs


  pfs["tripleg_id"] = pfs["tripleg_id"].ffill()


Saving Triplegs to csv
Triplegs saved to csv
