In [174]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import geopandas as gpd

In [206]:
folder_path = Path('stops_files')
transit_dfs = {}
for file in folder_path.glob("*.txt"):
    df = pd.read_csv(file)
    transit_dfs[file.name] = df

common_cols = set(transit_dfs['1206.txt'].columns)
for df in transit_dfs.values():
    common_cols = common_cols.intersection(df.columns)
    
for yymm in transit_dfs:
    df = transit_dfs[yymm]
    df = df[list(common_cols)]
    df = df.drop(['stop_code', 'stop_place', 'reference_place', 'parent_station', 'wheelchair_boarding', 'intersection_code', 'stop_name'], axis=1)
    transit_dfs[yymm] = df

In [208]:
transit_gdfs = {}
for yymm in transit_dfs:
    transit_gdfs[yymm] = gpd.GeoDataFrame(transit_dfs[yymm], geometry=gpd.points_from_xy(y=transit_dfs[yymm].stop_lon, x=transit_dfs[yymm].stop_lat), crs="EPSG:4326")

In [209]:
neighborhoods = gpd.read_file("SDPD_Beats_shapefile/SDPD_Beats.shp")
neighborhoods['area'] = neighborhoods.geometry.area
neighborhoods = neighborhoods[neighborhoods['area'] > 150000]
zones = gpd.read_file("Zoning_Base_SD_shapefile/Zoning_Base_SD.shp")
uncounted_zones = zones[zones["ZONE_NAME"].isin(["AR-1-1", "AG-1-1", "AR-1-2"])]
neighborhoods = neighborhoods.to_crs(epsg=32611)
uncounted_zones = uncounted_zones.to_crs(epsg=32611)
neighborhoods_cleaned = gpd.overlay(neighborhoods, uncounted_zones, how='difference')
zones_c = zones[~zones["ZONE_NAME"].isin(["AR-1-1", "AG-1-1", "AR-1-2"])]
zones_c['zone_id'] = zones_c.index.astype(str)
zones_c = zones_c.to_crs(epsg=32611)
neighborhoods_cleaned = neighborhoods_cleaned.to_crs(epsg=32611)

  neighborhoods_cleaned = gpd.overlay(neighborhoods, uncounted_zones, how='difference')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [210]:

zones_with_neighborhoods = gpd.overlay(zones_c, neighborhoods_cleaned[['NAME', 'geometry']], how='intersection')
zones_with_neighborhoods = zones_with_neighborhoods.rename(columns={'NAME': 'neighborhood'})

  zones_with_neighborhoods = gpd.overlay(zones_c, neighborhoods_cleaned[['NAME', 'geometry']], how='intersection')


In [211]:
zones_with_neighborhoods['centroid'] = zones_with_neighborhoods.geometry.centroid
zones_centroids = zones_with_neighborhoods.set_geometry('centroid')
zones_centroids['buffer_1000m'] = zones_centroids.geometry.buffer(1000)
zones_buffers = zones_centroids.set_geometry('buffer_1000m')

In [212]:
for yymm in transit_gdfs:
    transit_gdfs[yymm] = transit_gdfs[yymm].to_crs(epsg=32611)


In [213]:
transit_gdf = transit_gdfs['1410.txt']
transit_gdf = transit_gdf.to_crs(zones_buffers.crs)
stops_within_buffers = gpd.sjoin(transit_gdf, zones_buffers[['zone_id', 'buffer_1000m']], predicate='within', how='inner')
stops_within_buffers

Unnamed: 0,stop_name_short,stop_lon,stop_lat,location_type,stop_id,geometry,index_right,zone_id


In [216]:
print("Invalid geometries count:", (~transit_gdf.geometry.is_valid).sum())

Invalid geometries count: 4700


In [None]:
stop_counts = stops_within_buffers.groupby('zone_id').size().reset_index(name='stop_count')
zones_with_counts = zones_with_neighborhoods.merge(stop_counts, on='zone_id', how='left')
zones_with_counts['stop_count'] = zones_with_counts['stop_count'].fillna(0).astype(int)

zones_with_counts[zones_with_counts['stop_count'] != 0]

In [None]:
#NOT WORKING
neighborhoods_cleaned = neighborhoods_cleaned.to_crs(epsg=26911)
zones_c = zones_c.to_crs(epsg=26911)

zones_c['centroid'] = zones_c.geometry.centroid
zones_centroids = zones_c
zones_centroids['centroid'] = zones_centroids.geometry.centroid
zones_centroids = zones_centroids.set_geometry('centroid')
zones_centroids['buffer_850m'] = zones_centroids.geometry.buffer(850)
buffers = zones_centroids.set_geometry('buffer_850m')
zone_centroids = zones_centroids.set_geometry('centroid')
zones_with_nhoods = gpd.sjoin(zone_centroids, neighborhoods_cleaned, predicate='within', how='left')[['zone_id', 'NAME']]
zones_c = zones_c.merge(zones_with_nhoods, on='zone_id', how='left')


for yymm in transit_gdfs:
    transit_gdf = transit_gdfs[yymm]
    stops_with_buffers = gpd.sjoin(transit_gdf, buffers, predicate='within', how='left')
    stops_with_buffers = stops_with_buffers.rename(columns={'zone_id': 'zone_id'})
    
    stop_zone_counts = stops_with_buffers.groupby(['stop_id', 'NAME']).size().reset_index(name='n_zone_buffers')
    stops_with_weights = stops_with_buffers.merge(stop_zone_counts, on=['stop_id', 'NAME'], how='left')
    stops_with_weights['weighted_contribution'] = 1 / stops_with_weights['n_zone_buffers']

    zone_scores = stops_with_weights.groupby(['zone_id', 'NAME'])['weighted_contribution'].sum().reset_index()
    neighborhood_scores = zone_scores.groupby('NAME')['weighted_contribution'].sum().reset_index(name='neighborhood_score')

    neighborhoods_cleaned_l = neighborhoods_cleaned.merge(neighborhood_scores, on='NAME', how='left').fillna({'neighborhood_score': 0})
    neighborhoods_cleaned_l.plot(column='neighborhood_score', cmap='plasma', legend=True, figsize=(12,12))
    plt.title("Transit Stop Score (10 min walk adjusted)", fontsize=16)
    plt.axis("off")
    plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


KeyError: 'NAME'

In [101]:
print(zones_centroids.columns)

Index(['ZONE_NAME', 'IMP_DATE', 'ORDNUM', 'Shape_Leng', 'Shape_Area',
       'geometry', 'zone_id', 'centroid', 'buffer_850m'],
      dtype='object')


In [None]:
# Load zoning and classify
zones = gpd.read_file("Zoning_Base_SD_shapefile/Zoning_Base_SD.shp")

def classify_zone(z):
    if z.startswith("RS") or z.startswith("RM") or z.startswith("RX"):
         return "Residential"
    elif z.startswith("CO") or z.startswith("CN") or z.startswith("CC"):
         return "Commercial"
    elif z.startswith ("IP") or z.startswith("IL") or z.startswith("IH"):
         return "Industrial"
    elif z.startswith("AG") or z.startswith("AR"):
         return "Agricultural"
    else:
        return "Other"

zones["ZONE_NAME"] = zones["ZONE_NAME"].apply(classify_zone)

neighborhoods = gpd.read_file("SDPD_Beats_shapefile/SDPD_Beats.shp")
if neighborhoods.crs != zones.crs:
    zones = zones.to_crs(neighborhoods.crs)

zone_in_neighborhoods = gpd.sjoin(zones,neighborhoods, how="inner", predicate="intersects")
zone_counts = zone_in_neighborhoods.groupby(["NAME","ZONE_NAME"]).size().unstack(fill_value=0).reset_index()

for yymm in transit_gdfs:
    transit_gdfs[yymm] = transit_gdfs[yymm].to_crs(neighborhoods.crs)

    stops_in_neighborhoods = gpd.sjoin(transit_gdfs[yymm], neighborhoods, how="inner", predicate="within")
    stop_counts = stops_in_neighborhoods.groupby("NAME").size().reset_index(name="stop_count")

    neighborhood_stats = zone_counts.merge(stop_counts, on="NAME", how="left")
    neighborhood_stats["stop_count"].fillna(0, inplace=True)

    melted = neighborhood_stats.melt(
        id_vars=["NAME","stop_count"],
        value_vars=["Residential","Commercial","Industrial","Agricultural", "Other"],
        var_name="ZONE_NAME",
        value_name="zone_count"
    )

    melted["stops_per_zone_unit"] = melted["stop_count"] / (melted["zone_count"] + 1e-6)

    plt.figure(figsize=(10,6))
    sns.boxplot(data=melted, x="ZONE_NAME", y="stops_per_zone_unit")
    plt.title("Transit Stop Density by Zone Type in Neighborhoods")
    plt.ylabel("Transit Stops per Zone Unit")
    plt.xlabel("Zone Type")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
