In [1]:
# Python standard library imports
import os
import time
from urllib.parse import quote_plus

# Third-party imports for database connection and data manipulation
from sqlalchemy import create_engine
import pandas as pd

# Third-party imports for mapping
import folium

Section 2: Connection

In [2]:
# Database connection parameters.
# Each setting is read from an environment variable so that credentials do not
# have to live in the notebook; the literal defaults keep the original local
# setup working when the variables are not set.
dbname = os.environ.get('DATAMINING_DB_NAME', 'DataMining')
user = os.environ.get('DATAMINING_DB_USER', 'postgres')
password = os.environ.get('DATAMINING_DB_PASSWORD', 'datamining')
host = os.environ.get('DATAMINING_DB_HOST', 'localhost')  # localhost or the server address
port = os.environ.get('DATAMINING_DB_PORT', '5433')  # default PostgreSQL port is 5432

# Build the connection URL. The user name and password are URL-quoted so that
# characters such as '@', ':' or '/' cannot be mistaken for URL delimiters.
connection_str = (
    f"postgresql://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{dbname}"
)

# Create the engine; no connection is opened until the first query runs
engine = create_engine(connection_str)

The following SQL query is designed to identify and count duplicate rows in the vehicle_data table.

In [3]:
# Duplicate check: group vehicle_data on every selected column and keep only
# the groups that occur more than once. The SQL is static (nothing is
# interpolated into it), so it is a plain string rather than an f-string.
query = """
SELECT 
    mapped_veh_id, 
    timestamps_UTC, 
    lat, 
    lon, 
    RS_E_InAirTemp_PC1, 
    RS_E_InAirTemp_PC2, 
    RS_E_OilPress_PC1, 
    RS_E_OilPress_PC2, 
    RS_E_RPM_PC1, 
    RS_E_RPM_PC2, 
    RS_E_WatTemp_PC1, 
    RS_E_WatTemp_PC2, 
    RS_T_OilTemp_PC1, 
    RS_T_OilTemp_PC2, 
    COUNT(*) AS duplicate_count
FROM 
    vehicle_data
GROUP BY 
    mapped_veh_id, 
    timestamps_UTC, 
    lat, 
    lon, 
    RS_E_InAirTemp_PC1, 
    RS_E_InAirTemp_PC2, 
    RS_E_OilPress_PC1, 
    RS_E_OilPress_PC2, 
    RS_E_RPM_PC1, 
    RS_E_RPM_PC2, 
    RS_E_WatTemp_PC1, 
    RS_E_WatTemp_PC2, 
    RS_T_OilTemp_PC1, 
    RS_T_OilTemp_PC2
HAVING 
    COUNT(*) > 1;

"""

try:
    # Start timing. perf_counter() is a monotonic clock, so the measured
    # duration cannot be skewed by system clock adjustments mid-query.
    start_time = time.perf_counter()

    # Execute the query and fetch the duplicate groups into a DataFrame
    df = pd.read_sql_query(query, engine)

    # End timing
    end_time = time.perf_counter()
    print(f"Query took {end_time - start_time} seconds to run.")
finally:
    # Release the engine's pooled connections even if the query failed
    engine.dispose()

Query took 27.09993028640747 seconds to run.


In [7]:
# DataFrame.size is the total number of cells (rows x columns), so a value of
# 0 here means the duplicate query returned no rows.
df.size

0

The query returned an empty result (the resulting DataFrame has size zero), so we can conclude that the vehicle_data table contains no exact duplicate rows.