In [None]:
import pandas as pd
import os
import numpy as np

# Define the base folder where all the GTFS folders are stored.
# Provide the GTFS folder; place your sub folders (one per agency) in this folder.
base_folder = r'C:\Users\rsiddiq2\Downloads\FBCENC-GTFS'

# Identifier and route-name columns are read as text. Letting pandas infer
# them can give the same join key different dtypes in two files (which breaks
# the merges), strips leading zeros from ids, and turns a numeric short name
# such as 10 into 10.0 when other routes leave the field blank.
GTFS_TEXT_COLUMNS = {
    'agency_id': str,
    'route_id': str,
    'trip_id': str,
    'stop_id': str,
    'route_short_name': str,
    'route_long_name': str,
}

# Columns taken from routes.txt and stops.txt; together they form the
# per-agency output (before 'agency_name' and 'Route_name' are appended).
ROUTE_COLUMNS = ['agency_id', 'route_id', 'route_long_name', 'route_short_name']
STOP_COLUMNS = ['stop_id', 'stop_name', 'stop_lat', 'stop_lon']


def read_gtfs_table(path):
    """Read one GTFS text file into a DataFrame.

    Returns an empty DataFrame when the file does not exist or contains no
    data, so callers can treat both cases as "table unavailable".
    """
    if not os.path.exists(path):
        return pd.DataFrame()
    try:
        return pd.read_csv(path, dtype=GTFS_TEXT_COLUMNS)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row to parse.
        return pd.DataFrame()


def build_agency_stops(folder_path, folder_name):
    """Build the route/stop table for the GTFS feed stored in one folder.

    Parameters
    ----------
    folder_path : str
        Folder holding routes.txt, trips.txt, stop_times.txt and stops.txt.
    folder_name : str
        Name of that folder. It is used as 'agency_name', and as 'agency_id'
        when routes.txt has no such column.

    Returns
    -------
    pandas.DataFrame or None
        One row per (agency_id, route_id, stop_id) with the route and stop
        columns plus 'agency_name' and 'Route_name'. None when a required
        file or join column is missing or no rows survive the joins; a
        message explaining the skip is printed in that case.
    """
    routes_df = read_gtfs_table(os.path.join(folder_path, 'routes.txt'))
    stops_df = read_gtfs_table(os.path.join(folder_path, 'stops.txt'))
    trips_df = read_gtfs_table(os.path.join(folder_path, 'trips.txt'))
    stop_times_df = read_gtfs_table(os.path.join(folder_path, 'stop_times.txt'))

    if routes_df.empty or trips_df.empty or stops_df.empty or stop_times_df.empty:
        print(f"Skipping {folder_name} due to missing critical files.")
        return None

    # The joins below cannot run without these key columns.
    required_keys = (
        ('routes.txt', routes_df, ['route_id']),
        ('trips.txt', trips_df, ['route_id', 'trip_id']),
        ('stop_times.txt', stop_times_df, ['trip_id', 'stop_id']),
        ('stops.txt', stops_df, ['stop_id']),
    )
    for file_name, table, key_columns in required_keys:
        missing = [col for col in key_columns if col not in table.columns]
        if missing:
            print(f"Skipping {folder_name}: {file_name} is missing column(s) {missing}.")
            return None

    # If agency_id or route_short_name is missing, fall back to the folder name
    if 'agency_id' not in routes_df.columns:
        routes_df['agency_id'] = folder_name
    if 'route_short_name' not in routes_df.columns:
        routes_df['route_short_name'] = f"{folder_name} route"

    # reindex() keeps only the needed columns and creates any absent one
    # (e.g. route_long_name) filled with NaN instead of raising KeyError later.
    routes_slim = routes_df.reindex(columns=ROUTE_COLUMNS)
    stops_slim = stops_df.reindex(columns=STOP_COLUMNS)

    # routes -> trips -> stop_times -> stops. Only the key columns of trips
    # and stop_times are carried along, and the route/stop pairs are reduced
    # to unique rows before the stop attributes are attached.
    agency_stops_df = (
        routes_slim
        .merge(trips_df[['route_id', 'trip_id']], on='route_id', how='inner')
        .merge(stop_times_df[['trip_id', 'stop_id']], on='trip_id', how='inner')
        .drop(columns='trip_id')
        .drop_duplicates()
        .merge(stops_slim, on='stop_id', how='inner')
        .drop_duplicates(subset=['agency_id', 'route_id', 'stop_id'])
        .reset_index(drop=True)
    )

    if agency_stops_df.empty:
        print(f"Skipping {folder_name}: no stops could be linked to its routes.")
        return None

    # Use the folder name as the agency name
    agency_stops_df['agency_name'] = folder_name

    # Step 1: Replace NaN in route_short_name with route_long_name
    agency_stops_df['route_short_name'] = np.where(
        agency_stops_df['route_short_name'].isna(),
        agency_stops_df['route_long_name'],
        agency_stops_df['route_short_name'],
    )

    # Step 2: Build 'Route_name' as "<agency_name>_<route_short_name>"
    agency_stops_df['Route_name'] = (
        agency_stops_df['agency_name'].astype(str)
        + '_'
        + agency_stops_df['route_short_name'].astype(str)
    )
    return agency_stops_df


# List all subfolders (each subfolder represents an agency); stray files in
# the base folder are ignored, and the list is sorted for a repeatable order.
if os.path.isdir(base_folder):
    agency_folders = sorted(
        entry for entry in os.listdir(base_folder)
        if os.path.isdir(os.path.join(base_folder, entry))
    )
else:
    print(f"Base folder not found: {base_folder}")
    agency_folders = []

# Iterate over each folder and collect the per-agency route/stop tables
all_agency_stops = []

for folder_name in agency_folders:
    agency_stops_df = build_agency_stops(os.path.join(base_folder, folder_name), folder_name)
    if agency_stops_df is not None:
        all_agency_stops.append(agency_stops_df)

# Concatenate all stops from all agencies into a single DataFrame
if all_agency_stops:
    final_agency_stops = pd.concat(all_agency_stops, ignore_index=True)

    # Drop duplicates from the final DataFrame. 'agency_name' (the feed's
    # folder) is part of the key because agency_id / route_id / stop_id come
    # from each feed independently, so two feeds may reuse the same values;
    # without it, rows of one agency could be dropped as "duplicates" of another.
    final_agency_stops = final_agency_stops.drop_duplicates(
        subset=['agency_name', 'agency_id', 'route_id', 'stop_id']
    )

    # Nothing is written to disk here; the DataFrame is only kept in memory
    # as `final_agency_stops` so it can be saved in a separate step.
    print(
        f"Combined {len(final_agency_stops)} route/stop rows from "
        f"{len(all_agency_stops)} agencies, including 'Route_name'."
    )
else:
    print("No valid data to save.")


In [None]:
# Save the combined route/stop table as CSV.
# The target must be a file path (a bare folder such as C:\Users cannot be
# written to); change the name or location as needed.
output_csv = 'output_all_agency_stops.csv'
final_agency_stops.to_csv(output_csv, index=False)

In [None]:
import pandas as pd
import json
import os

# Define the base folder where all the GTFS folders are stored.
# Placeholder: point this at the GTFS folder (one sub-folder per agency).
# NOTE: a raw string literal cannot end with a single backslash, so the path
# is written without a trailing separator; os.path.join adds separators.
base_folder = r'C:\Users'

# Identifier and route-name columns are read as text so that join keys have
# the same dtype in every file and ids keep any leading zeros.
SHAPE_TEXT_COLUMNS = {
    'route_id': str,
    'shape_id': str,
    'route_short_name': str,
    'route_long_name': str,
}

# Columns of shapes.txt that the export reads.
SHAPE_COLUMNS = ['shape_id', 'shape_pt_lat', 'shape_pt_lon', 'shape_pt_sequence']


def text_or_default(value, default):
    """Return `value` as a str, or `default` when it is missing (None/NaN)."""
    return default if pd.isna(value) else str(value)


def build_shape_features(routes_df, shapes_df, trips_df, agency_name):
    """Turn one feed's shapes into GeoJSON features, one per shape_id.

    Parameters
    ----------
    routes_df : pandas.DataFrame
        routes.txt; must contain 'route_id'. 'route_short_name' and
        'route_long_name' are used when present.
    shapes_df : pandas.DataFrame
        shapes.txt with 'shape_id', 'shape_pt_lat', 'shape_pt_lon' and
        'shape_pt_sequence'.
    trips_df : pandas.DataFrame
        trips.txt with 'route_id' and 'shape_id'; links shapes to routes.
    agency_name : str
        Stored in every feature's 'agency_name' property.

    Returns
    -------
    list of dict
        GeoJSON Feature dicts with a MultiLineString geometry holding the
        shape's points in shape_pt_sequence order. Missing route values are
        reported as 'unknown_route_id' / 'N/A'.
    """
    # Exactly one row of route attributes per shape (the first trip that uses
    # it). Joining shapes against every trip would repeat each shape point
    # once per trip and put duplicated coordinates into the geometry.
    route_by_shape = (
        trips_df[['route_id', 'shape_id']]
        .dropna(subset=['shape_id'])
        .drop_duplicates(subset='shape_id')
        .merge(routes_df, on='route_id', how='left')
        .drop_duplicates(subset='shape_id')  # guards against repeated route_id rows
        .set_index('shape_id')
        .to_dict('index')
    )

    # Rows lacking an id or a coordinate cannot be placed on a line.
    ordered_points = (
        shapes_df
        .dropna(subset=['shape_id', 'shape_pt_lon', 'shape_pt_lat'])
        .sort_values(by=['shape_id', 'shape_pt_sequence'])
    )

    features = []
    for shape_id, points in ordered_points.groupby('shape_id'):
        coordinates = points[['shape_pt_lon', 'shape_pt_lat']].values.tolist()
        route = route_by_shape.get(shape_id, {})

        route_id = text_or_default(route.get('route_id'), 'unknown_route_id')
        route_long_name = text_or_default(route.get('route_long_name'), 'N/A')
        route_short_name = text_or_default(route.get('route_short_name'), 'N/A')

        # Combined route name: short name if available, otherwise long name
        route_name = route_short_name if route_short_name != 'N/A' else route_long_name

        features.append({
            "type": "Feature",
            "properties": {
                "shape_id": str(shape_id),
                "agency_name": agency_name,  # Using folder name as agency name
                "route_id": route_id,
                "route_name": route_name,
                "route_long_name": route_long_name,
                "route_short_name": route_short_name,
            },
            "geometry": {
                "type": "MultiLineString",
                "coordinates": [coordinates]  # Wrap in a list to match MultiLineString format
            }
        })
    return features


# List all subfolders (each subfolder represents an agency)
if os.path.isdir(base_folder):
    agency_folders = sorted(
        entry for entry in os.listdir(base_folder)
        if os.path.isdir(os.path.join(base_folder, entry))
    )
else:
    print(f"Base folder not found: {base_folder}")
    agency_folders = []

# Initialize a GeoJSON structure
geojson = {
    "type": "FeatureCollection",
    "features": []
}

# Iterate over each folder, read the GTFS files, and use the folder name as the agency name
for folder_name in agency_folders:
    folder_path = os.path.join(base_folder, folder_name)

    try:
        routes_df = pd.read_csv(os.path.join(folder_path, 'routes.txt'), dtype=SHAPE_TEXT_COLUMNS)
        shapes_df = pd.read_csv(os.path.join(folder_path, 'shapes.txt'), dtype=SHAPE_TEXT_COLUMNS)
        trips_df = pd.read_csv(os.path.join(folder_path, 'trips.txt'), dtype=SHAPE_TEXT_COLUMNS)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A zero-byte file is treated like a missing one.
        print(f"GTFS files not found in {folder_name}")
        continue

    # Check if trips.txt has 'route_id' and 'shape_id'
    if 'route_id' not in trips_df.columns or 'shape_id' not in trips_df.columns:
        print(f"'route_id' or 'shape_id' missing in trips.txt for {folder_name}. Skipping this folder.")
        continue

    missing_shape_cols = [col for col in SHAPE_COLUMNS if col not in shapes_df.columns]
    if missing_shape_cols or 'route_id' not in routes_df.columns:
        print(f"Required columns missing in routes.txt or shapes.txt for {folder_name}. Skipping this folder.")
        continue

    geojson["features"].extend(
        build_shape_features(routes_df, shapes_df, trips_df, folder_name)
    )

# Save the final GeoJSON to a file (skipped when there is nothing to write)
output_file = 'combined_routes_with_shapes_and_names_all_agencies.geojson'
if geojson["features"]:
    with open(output_file, 'w') as f:
        json.dump(geojson, f, indent=2)

    print(f"All agency routes have been combined and saved to '{output_file}'")
else:
    print("No route shapes found; nothing was written.")


In [None]:
import geopandas as gpd

# Load the GeoJSON file produced by the route-shape export.
# That export writes this file name relative to the working directory, so the
# same relative name is used here. If the file lives elsewhere, use a raw
# string (r"...") for Windows paths so backslashes are not read as escapes.
geojson_file = "combined_routes_with_shapes_and_names_all_agencies.geojson"
gdf = gpd.read_file(geojson_file)

# Save to a shapefile
shapefile_path = "output_shapefile.shp"
gdf.to_file(shapefile_path, driver='ESRI Shapefile')

print(f"GeoJSON file has been converted to Shapefile and saved at {shapefile_path}")
