In [33]:
# Python standard library imports
import time
from geopy.distance import geodesic

# Third-party imports for database connection and data manipulation
from sqlalchemy import create_engine
import pandas as pd

# Third-party imports for mapping
import folium

Section 2: Connection

In [34]:
# Database connection parameters
dbname = 'DataMining'
user = 'postgres'
password = 'datamining'
host = 'localhost'  # localhost or the server address
port = '5433'  # default PostgreSQL port is 5432

# Establish a connection to the database
connection_str = f"postgresql://{user}:{password}@{host}:{port}/{dbname}"
engine = create_engine(connection_str)

The first function gets all the data for a certain veh_id sorted on timestamp
The second function checks for a dataframe that the concurrent entries are correct in the sense that they are very close location wise

In [42]:
def fetch_data(veh_id):
    # Define the query with placeholders for parameters
    query = f"""
    SELECT * FROM vehicle_data 
    WHERE mapped_veh_id = {veh_id}
    ORDER BY timestamps_UTC;
    """

    # Start timing
    start_time = time.time()

    # Execute the query and fetch the data into a DataFrame
    df = pd.read_sql_query(query, engine)

    # End timing
    end_time = time.time()
    print(f"Query took {end_time - start_time} seconds to run.")
    # Close the database connection
    engine.dispose()
    
    return df

def entries_not_close(df, treshold_speed):
    """
    Check if consecutive entries in a DataFrame are not within a certain distance of each other
    and the time difference between them is less than a specified threshold. Also, calculate and 
    return the time difference, location difference, and speed (in km/h) between these entries.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the location and time data.
    threshold_distance (float): The distance threshold in meters.
    time_threshold (str): The time difference threshold in Pandas timedelta string format.

    Returns:
    list of dicts: A list containing details of pairs of rows that are not close to each other 
                   and have a time difference less than the specified threshold.
    """
    not_close_entries = []

    for i in range(len(df) - 1):
        current_row = df.iloc[i]
        next_row = df.iloc[i + 1]

        # Calculate time difference in seconds
        time_diff = abs(pd.to_datetime(next_row['timestamps_utc']) - pd.to_datetime(current_row['timestamps_utc']))
        time_diff_seconds = time_diff.total_seconds()

        if time_diff_seconds <= 60:
            # Calculate the distance in meters
            distance = geodesic((current_row['lat'], current_row['lon']), (next_row['lat'], next_row['lon'])).meters
            
            speed_kmh = (distance / time_diff_seconds) * 3.6 if time_diff_seconds > 0 else 0

            # Check if the distance exceeds the threshold
            if speed_kmh > treshold_speed:
                # Calculate speed in km/h (meters per second to km/h conversion)

                # Append the result with details
                not_close_entries.append({
                    'indices': (i, i + 1),
                    'time_diff_seconds': time_diff_seconds,
                    'distance_meters': distance,
                    'speed_kmh': speed_kmh
                })

    return not_close_entries


In [36]:
df = fetch_data(181)

Query took 8.514650821685791 seconds to run.


In [43]:
print(entries_not_close(df, 10000))

[{'indices': (18339, 18340), 'time_diff_seconds': 1.0, 'distance_meters': 2968.634126572451, 'speed_kmh': 10687.082855660823}, {'indices': (18343, 18344), 'time_diff_seconds': 1.0, 'distance_meters': 4052.4783512860686, 'speed_kmh': 14588.922064629847}, {'indices': (18345, 18346), 'time_diff_seconds': 1.0, 'distance_meters': 4480.233072526793, 'speed_kmh': 16128.839061096454}, {'indices': (50160, 50161), 'time_diff_seconds': 1.0, 'distance_meters': 5088.420746354056, 'speed_kmh': 18318.3146868746}, {'indices': (50162, 50163), 'time_diff_seconds': 1.0, 'distance_meters': 5889.42138002374, 'speed_kmh': 21201.916968085465}, {'indices': (50170, 50171), 'time_diff_seconds': 1.0, 'distance_meters': 8862.308238027852, 'speed_kmh': 31904.309656900266}, {'indices': (50177, 50178), 'time_diff_seconds': 2.0, 'distance_meters': 8865.54202842724, 'speed_kmh': 15957.975651169034}, {'indices': (50736, 50737), 'time_diff_seconds': 1.0, 'distance_meters': 5653.663076806266, 'speed_kmh': 20353.187076502

Here we observe that there are quite some entries where the speed can not be correct. After we checked on the map we saw that the points did not make sense. 

In [47]:
# Index values identified by the function
index_pair = (240542, 240543)
print(df.iloc[index_pair[0]])
print(df.iloc[index_pair[1]])

mapped_veh_id                         181
timestamps_utc        2023-08-23 08:36:25
lat                             51.164911
lon                              4.735919
rs_e_inairtemp_pc1                   46.0
rs_e_inairtemp_pc2                   41.0
rs_e_oilpress_pc1                   403.0
rs_e_oilpress_pc2                   351.0
rs_e_rpm_pc1                       1888.0
rs_e_rpm_pc2                       1930.0
rs_e_wattemp_pc1                     88.0
rs_e_wattemp_pc2                     87.0
rs_t_oiltemp_pc1                     84.0
rs_t_oiltemp_pc2                     87.0
Name: 240542, dtype: object
mapped_veh_id                         181
timestamps_utc        2023-08-23 08:36:30
lat                             51.171492
lon                              4.480429
rs_e_inairtemp_pc1                   46.0
rs_e_inairtemp_pc2                   41.0
rs_e_oilpress_pc1                   410.0
rs_e_oilpress_pc2                   351.0
rs_e_rpm_pc1                       1918.0
rs_e_r

In [46]:
belgium_center = [50.5039, 4.4699]  # Roughly the center of Belgium
map_belgium = folium.Map(location=belgium_center, zoom_start=8)

folium.CircleMarker(
        location=[51.168687, 4.786373],
        radius=3,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(map_belgium)

folium.CircleMarker(
        location=[51.171492, 4.480429],
        radius=3,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(map_belgium)
map_belgium

Here for example the distance is more than 20km but the difference in time is 5 seconds.