In [None]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

Add two new columns (`longitude` and `latitude`) holding the x and y coordinates extracted from each point geometry.


In [None]:
# Source GeoParquet file and the plain-Parquet copy that this cell writes
input_file_path = 'Data/gdf_24-01-01_24-02-01.parquet'
output_file_path = 'Data/gdf_24-01-01_24-02-01_modified.parquet'

# gpd.read_parquet returns a GeoDataFrame, so no further type check is needed
gdf = gpd.read_parquet(input_file_path)

# Build the output as a plain pandas DataFrame (explicit copy) instead of
# overwriting the GeoDataFrame's geometry column with strings: geopandas warns
# when its geometry column stops holding geometries, and `gdf` itself stays
# intact for further use in this cell.
df = pd.DataFrame(gdf, copy=True)

# Coordinates are read from the real geometries; `.x` / `.y` require Point geometries
df['longitude'] = gdf['geometry'].x
df['latitude'] = gdf['geometry'].y

# Store the geometry as Well-Known Text (per the original author: to avoid
# GEOS version issues). Missing geometries are kept as None instead of
# failing on `.wkt`.
df['geometry'] = gdf['geometry'].apply(
    lambda geom: None if geom is None else geom.wkt
)

# Save the DataFrame to a new Parquet file
df.to_parquet(output_file_path)

# Optional: show the first few rows to confirm the changes
print(df[['longitude', 'latitude', 'geometry']].head())

In [None]:
# Convert a WGS84 longitude/latitude pair into a point in UTM zone 32N
# Sample position in decimal degrees
latitude = 48.772624
longitude = 11.441808

# shapely points are built as (x, y), i.e. (longitude, latitude)
point_geo = Point(longitude, latitude)

# One-row GeoDataFrame tagged with the geographic CRS (EPSG:4326)
gdf = gpd.GeoDataFrame({'geometry': [point_geo]}, crs='EPSG:4326')

# Reproject to EPSG:32632 (UTM zone 32N)
gdf_utm = gdf.to_crs('EPSG:32632')

# Show the reprojected point
print("Converted Point in UTM 32N:", gdf_utm['geometry'].iloc[0])

In [None]:
#convert point to longitude and latitude
from pyproj import Transformer
# always_xy=True makes the transformer take (easting, northing) and return
# (longitude, latitude). Without it pyproj follows the axis order declared by
# each CRS, and EPSG:4326 declares (latitude, longitude), so the two results
# would be stored in the wrong variables and printed under the wrong names.
transformer = Transformer.from_crs("EPSG:32632", "EPSG:4326", always_xy=True)
longitude, latitude = transformer.transform(679701.9162724982, 5405649.375394682)
print(latitude, longitude)

In [None]:
#edit specific point
import geopandas as gpd
from shapely.geometry import Point

# Read the Parquet file into a GeoDataFrame
gdf = gpd.read_parquet('Data/stop_lines_cut.parquet')

# Stop whose geometry is replaced. The name was previously written with a
# mis-decoded "ß", which cannot match a correctly encoded 'Stop Name' value.
stop_name = 'Goethestraße'

# Look the stop up and fail with a clear message when it is missing
matching_rows = gdf.index[gdf['Stop Name'] == stop_name]
if len(matching_rows) == 0:
    raise ValueError(f"No row with 'Stop Name' == {stop_name!r} found")
index = matching_rows[0]  # first match, as before

# New geometry for that stop (replace with the desired coordinates).
# NOTE(review): the values look like UTM zone 32N metres - confirm they match gdf.crs.
new_geometry = Point(679423.6852358369, 5405094.876865458)

# Update the geometry for the specified row
gdf.at[index, 'geometry'] = new_geometry

# Save the updated GeoDataFrame to a Parquet file
gdf.to_parquet('Data/stop_lines_cut_1.parquet')

# Show only the updated row to verify the change
print(gdf.loc[[index]])

In [None]:
# Remove specific number of rows

import geopandas as gpd

# Load the Parquet file into a GeoDataFrame
gdf = gpd.read_parquet('Data/stop_lines_cut.parquet')

# Number of leading rows that are kept; every row after them is discarded.
# NOTE(review): this keeps the first seven rows. If the intent is to drop the
# first three rows instead, use gdf.iloc[3:] - confirm which one is wanted.
ROWS_TO_KEEP = 7
gdf_modified = gdf.iloc[:ROWS_TO_KEEP].reset_index(drop=True)

# Save the modified GeoDataFrame to a Parquet file.
# NOTE: the "edit specific point" cell writes to this same path, so whichever
# cell runs last overwrites the other's output.
gdf_modified.to_parquet('Data/stop_lines_cut_1.parquet')


In [None]:
# Filter and clean data (select runs between two points)
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from shapely.ops import nearest_points

def filter_and_trim_runs(group, start_point, end_point, max_distance=5):
    """Cut one run down to the stretch between two reference points.

    The rows are ordered by ``utcTime`` and re-indexed from zero. The row
    closest to ``start_point`` and the row closest to ``end_point`` are
    located via ``group.geometry.distance``. The run is kept only when both
    closest distances are at most ``max_distance`` and the start row comes
    strictly before the end row.

    Parameters
    ----------
    group : rows of a single run; needs a ``utcTime`` column and a
        ``geometry`` accessor that provides ``distance``.
    start_point, end_point : targets handed to ``geometry.distance``.
    max_distance : largest accepted distance to either target, in the units
        of the geometry coordinates (presumably metres for UTM data - confirm).

    Returns
    -------
    The rows from the start row to the end row (both inclusive), or ``None``
    when the run does not qualify.
    """
    ordered = group.sort_values('utcTime').reset_index(drop=True)

    # Distance of every row to each of the two targets
    dist_to_start = ordered.geometry.distance(start_point)
    dist_to_end = ordered.geometry.distance(end_point)

    # Positions (index was reset above) of the closest rows
    first_row = dist_to_start.idxmin()
    last_row = dist_to_end.idxmin()

    close_enough = (dist_to_start.min() <= max_distance
                    and dist_to_end.min() <= max_distance)
    if not (close_enough and first_row < last_row):
        return None

    # .loc slicing is label based, so the end row is included
    return ordered.loc[first_row:last_row]

# Define the target points using the coordinates.
# TODO: X and Y are placeholders - replace them with real coordinates. The two
# points must differ: with identical points the nearest start row and nearest
# end row coincide, and filter_and_trim_runs rejects every run.
target_start_point = Point(X, Y)  # UTM coordinates
target_end_point = Point(X, Y)

# Load your data
gdf = gpd.read_parquet('INSERT YOUR PARQUET FILE')

# If the geometry column holds WKT strings, parse them into geometry objects
if isinstance(gdf['geometry'].iloc[0], str):
    from shapely import wkt  # not imported elsewhere in this cell
    gdf['geometry'] = gdf['geometry'].apply(wkt.loads)

# Move the existing index into a column, then order the rows by run and time
gdf_reset = gdf.reset_index()
gdf_sorted = gdf_reset.sort_values(by=['run', 'utcTime'])

# Trim every run separately. The groups are walked explicitly instead of using
# groupby(...).apply(...): pandas 2.2 deprecated passing the grouping column to
# the applied function, and without it the 'run' column would be missing from
# the result.
trimmed_runs = []
for _, run_rows in gdf_sorted.groupby('run'):
    trimmed = filter_and_trim_runs(run_rows, target_start_point, target_end_point,
                                   max_distance=50)
    if trimmed is not None:  # None marks a run that does not qualify
        trimmed_runs.append(trimmed)

if trimmed_runs:
    # Clean, sequential index for the combined result
    valid_runs = (pd.concat(trimmed_runs, ignore_index=True)
                  .dropna(how='all')
                  .reset_index(drop=True))
else:
    # No run qualified: keep the columns, but no rows
    valid_runs = gdf_sorted.iloc[0:0].reset_index(drop=True)

# Save the filtered and trimmed data to a new Parquet file
output_file = 'CLEANED PARQUET FILE'
valid_runs.to_parquet(output_file)




In [None]:
# Reverse the row order
# The module is imported under the usual `gpd` alias (as in the other cells);
# aliasing it as `gdf` and then assigning the loaded frame to `gdf` replaced
# the module reference with the data.
import geopandas as gpd

# Read the data from Parquet
gdf = gpd.read_parquet('Data/stop_lines.parquet')

# Reverse the order of the rows and renumber the index from zero
reversed_df = gdf.iloc[::-1].reset_index(drop=True)

# Save the reversed GeoDataFrame to a new Parquet file
reversed_df.to_parquet('Data/stop_lines3.parquet')

In [None]:
#Creating new data without dwell time

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import pyarrow as pa

# Read the runs (replace the placeholder with the path to your Parquet file)
# and parse the 'utcTime' column into pandas datetimes in the same step
runs = gpd.read_parquet('ENTER YOUR PARQUET FILE').assign(
    utcTime=lambda frame: pd.to_datetime(frame['utcTime'])
)

# Function to adjust utcTime based on waiting periods
def adjust_utc_time(run):
    """Shift the timestamps of one run so that standing time is removed.

    The rows are walked in their given order (they are not sorted here, so
    they are assumed to be chronological - TODO confirm at the call site).
    A stop begins at the first row with ``speed == 0`` and ends at the next
    row with a non-zero speed; the time between those two rows is added to a
    running total. Every row's ``utcTime`` is reduced by the total accumulated
    up to and including that row.

    Parameters
    ----------
    run : DataFrame with a datetime ``utcTime`` column and a ``speed`` column.

    Returns
    -------
    A copy of ``run`` with the adjusted ``utcTime`` values. The input frame is
    left unmodified.
    """
    adjusted_times = []
    accumulated_waiting_period = pd.Timedelta(0)
    stop_start_time = None  # timestamp of the first zero-speed row of the current stop

    # Only these two columns are needed, so iterate them directly instead of
    # building a full row object per record
    for current_time, speed in zip(run['utcTime'], run['speed']):
        if speed == 0:
            if stop_start_time is None:
                stop_start_time = current_time
        elif stop_start_time is not None:
            # First moving row after a stop: the stop lasted until now
            accumulated_waiting_period += current_time - stop_start_time
            stop_start_time = None

        # Rows inside a stop are shifted only by the stops completed before it
        adjusted_times.append(current_time - accumulated_waiting_period)

    # Write into a copy so the caller's frame (or groupby group) is not mutated
    adjusted = run.copy()
    adjusted['utcTime'] = adjusted_times
    return adjusted

# Adjust utcTime run by run. The groups are walked explicitly instead of using
# groupby('run').apply(...): pandas 2.2 deprecated passing the grouping column
# to the applied function, and without it the 'run' column would be lost once
# the index is dropped.
# NOTE(review): adjust_utc_time walks rows in their given order; rows within a
# run are assumed to be chronological already - confirm or sort beforehand.
adjusted_groups = [adjust_utc_time(run_rows) for _, run_rows in runs.groupby('run')]
if adjusted_groups:
    adjusted_runs = pd.concat(adjusted_groups, ignore_index=True)
else:
    # No runs at all: keep the columns, but no rows
    adjusted_runs = runs.iloc[0:0].reset_index(drop=True)

# Remove all frames (rows) with zero speed
adjusted_runs = adjusted_runs[adjusted_runs['speed'] != 0]

# Save the adjusted data to a new parquet file
adjusted_runs.to_parquet('ENTER YOUR NEW PARQUET FILE')