In [102]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import geopandas as gpd

In [103]:
folder_path = Path('stops_files')
transit_dfs = {}
for file in folder_path.glob("*.txt"):
    print(file.name + " loaded into transit_dfs")
    df = pd.read_csv(file)
    transit_dfs[file.name] = df

1709.txt loaded into transit_dfs
1906.txt loaded into transit_dfs
2406.txt loaded into transit_dfs
2201.txt loaded into transit_dfs
2001.txt loaded into transit_dfs
1509.txt loaded into transit_dfs
2206.txt loaded into transit_dfs
2004.txt loaded into transit_dfs
2401.txt loaded into transit_dfs
1901.txt loaded into transit_dfs
1309.txt loaded into transit_dfs
2006.txt loaded into transit_dfs
2101.txt loaded into transit_dfs
1409.txt loaded into transit_dfs
1609.txt loaded into transit_dfs
1806.txt loaded into transit_dfs
2301.txt loaded into transit_dfs
2310.txt loaded into transit_dfs
2106.txt loaded into transit_dfs
1209.txt loaded into transit_dfs
2306.txt loaded into transit_dfs
1801.txt loaded into transit_dfs
2111.txt loaded into transit_dfs
2501.txt loaded into transit_dfs
1401.txt loaded into transit_dfs
2109.txt loaded into transit_dfs
1206.txt loaded into transit_dfs
2309.txt loaded into transit_dfs
1601.txt loaded into transit_dfs
1406.txt loaded into transit_dfs
1410.txt l

In [104]:
common_cols = set(transit_dfs['1206.txt'].columns)
for df in transit_dfs.values():
    common_cols = common_cols.intersection(df.columns)
print(common_cols)

{'stop_place', 'stop_code', 'reference_place', 'parent_station', 'intersection_code', 'stop_name', 'wheelchair_boarding', 'stop_name_short', 'stop_lon', 'stop_lat', 'location_type', 'stop_id'}


Now, we need to standardize the columns across our dataframes.

In [105]:
for yymm in transit_dfs:
    df = transit_dfs[yymm]
    df = df[list(common_cols)]
    df = df.drop(['stop_code', 'stop_place', 'reference_place', 'parent_station', 'wheelchair_boarding', 'intersection_code', 'stop_name'], axis=1)
    transit_dfs[yymm] = df
    print(df.isna().sum())

stop_name_short    0
stop_lon           0
stop_lat           0
location_type      0
stop_id            0
dtype: int64
stop_name_short    0
stop_lon           0
stop_lat           0
location_type      0
stop_id            0
dtype: int64
stop_name_short    0
stop_lon           0
stop_lat           0
location_type      0
stop_id            0
dtype: int64
stop_name_short    0
stop_lon           0
stop_lat           0
location_type      0
stop_id            0
dtype: int64
stop_name_short    0
stop_lon           0
stop_lat           0
location_type      0
stop_id            0
dtype: int64
stop_name_short    0
stop_lon           0
stop_lat           0
location_type      0
stop_id            0
dtype: int64
stop_name_short    0
stop_lon           0
stop_lat           0
location_type      0
stop_id            0
dtype: int64
stop_name_short    0
stop_lon           0
stop_lat           0
location_type      0
stop_id            0
dtype: int64
stop_name_short    0
stop_lon           0
stop_lat      

In [106]:
transit_gdfs = {}
for yymm in transit_dfs:
    transit_gdfs[yymm] = gpd.GeoDataFrame(transit_dfs[yymm], geometry=gpd.points_from_xy(y=transit_dfs[yymm].stop_lat, x=transit_dfs[yymm].stop_lon), crs="EPSG:26911")

In [107]:
neighborhoods = gpd.read_file("SDPD_Beats_shapefile/SDPD_Beats.shp")
neighborhoods['area'] = neighborhoods.geometry.area
neighborhoods = neighborhoods[neighborhoods['area'] > 150000]

In [108]:
zones = gpd.read_file("Zoning_Base_SD_shapefile/Zoning_Base_SD.shp")
uncounted_zones = zones[zones["ZONE_NAME"].isin(["AR-1-1", "AG-1-1", "AR-1-2"])]

In [109]:
neighborhoods = neighborhoods.to_crs(epsg=26911)
uncounted_zones = uncounted_zones.to_crs(epsg=26911)


In [110]:
neighborhoods_cleaned = gpd.overlay(neighborhoods, uncounted_zones, how='difference')

  neighborhoods_cleaned = gpd.overlay(neighborhoods, uncounted_zones, how='difference')


In [None]:
#NOT WORKING

zones_c = zones[~zones["ZONE_NAME"].isin(["AR-1-1", "AG-1-1", "AR-1-2"])]
zones_c['zone_id'] = zones_c.index.astype(str)
neighborhoods_cleaned = neighborhoods_cleaned.to_crs(epsg=26911)
zones_c = zones_c.to_crs(epsg=26911)

zones_c['centroid'] = zones_c.geometry.centroid
zones_centroids = zones_c
zones_centroids['centroid'] = zones_centroids.geometry.centroid
zones_centroids = zones_centroids.set_geometry('centroid')
zones_centroids['buffer_850m'] = zones_centroids.geometry.buffer(850)
buffers = zones_centroids.set_geometry('buffer_850m')
zone_centroids = zones_centroids.set_geometry('centroid')
zones_with_nhoods = gpd.sjoin(zone_centroids, neighborhoods_cleaned, predicate='within', how='left')[['zone_id', 'NAME']]
zones_c = zones_c.merge(zones_with_nhoods, on='zone_id', how='left')


for yymm in transit_gdfs:
    transit_gdf = transit_gdfs[yymm]
    stops_with_buffers = gpd.sjoin(transit_gdf, buffers, predicate='within', how='left')
    stops_with_buffers = stops_with_buffers.rename(columns={'zone_id': 'zone_id'})
    
    stop_zone_counts = stops_with_buffers.groupby(['stop_id', 'NAME']).size().reset_index(name='n_zone_buffers')
    stops_with_weights = stops_with_buffers.merge(stop_zone_counts, on=['stop_id', 'NAME'], how='left')
    stops_with_weights['weighted_contribution'] = 1 / stops_with_weights['n_zone_buffers']

    zone_scores = stops_with_weights.groupby(['zone_id', 'NAME'])['weighted_contribution'].sum().reset_index()
    neighborhood_scores = zone_scores.groupby('NAME')['weighted_contribution'].sum().reset_index(name='neighborhood_score')

    neighborhoods_cleaned_l = neighborhoods_cleaned.merge(neighborhood_scores, on='NAME', how='left').fillna({'neighborhood_score': 0})
    neighborhoods_cleaned_l.plot(column='neighborhood_score', cmap='plasma', legend=True, figsize=(12,12))
    plt.title("Transit Stop Score (10 min walk adjusted)", fontsize=16)
    plt.axis("off")
    plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


KeyError: 'NAME'

In [101]:
print(zones_centroids.columns)

Index(['ZONE_NAME', 'IMP_DATE', 'ORDNUM', 'Shape_Leng', 'Shape_Area',
       'geometry', 'zone_id', 'centroid', 'buffer_850m'],
      dtype='object')


In [None]:
# Load zoning and classify
zones = gpd.read_file("Zoning_Base_SD_shapefile/Zoning_Base_SD.shp")

def classify_zone(z):
    if z.startswith("RS") or z.startswith("RM") or z.startswith("RX"):
         return "Residential"
    elif z.startswith("CO") or z.startswith("CN") or z.startswith("CC"):
         return "Commercial"
    elif z.startswith ("IP") or z.startswith("IL") or z.startswith("IH"):
         return "Industrial"
    elif z.startswith("AG") or z.startswith("AR"):
         return "Agricultural"
    else:
        return "Other"

zones["ZONE_NAME"] = zones["ZONE_NAME"].apply(classify_zone)

neighborhoods = gpd.read_file("SDPD_Beats_shapefile/SDPD_Beats.shp")
if neighborhoods.crs != zones.crs:
    zones = zones.to_crs(neighborhoods.crs)

zone_in_neighborhoods = gpd.sjoin(zones,neighborhoods, how="inner", predicate="intersects")
zone_counts = zone_in_neighborhoods.groupby(["NAME","ZONE_NAME"]).size().unstack(fill_value=0).reset_index()

for yymm in transit_gdfs:
    transit_gdfs[yymm] = transit_gdfs[yymm].to_crs(neighborhoods.crs)

    stops_in_neighborhoods = gpd.sjoin(transit_gdfs[yymm], neighborhoods, how="inner", predicate="within")
    stop_counts = stops_in_neighborhoods.groupby("NAME").size().reset_index(name="stop_count")

    neighborhood_stats = zone_counts.merge(stop_counts, on="NAME", how="left")
    neighborhood_stats["stop_count"].fillna(0, inplace=True)

    melted = neighborhood_stats.melt(
        id_vars=["NAME","stop_count"],
        value_vars=["Residential","Commercial","Industrial","Agricultural", "Other"],
        var_name="ZONE_NAME",
        value_name="zone_count"
    )

    melted["stops_per_zone_unit"] = melted["stop_count"] / (melted["zone_count"] + 1e-6)

    plt.figure(figsize=(10,6))
    sns.boxplot(data=melted, x="ZONE_NAME", y="stops_per_zone_unit")
    plt.title("Transit Stop Density by Zone Type in Neighborhoods")
    plt.ylabel("Transit Stops per Zone Unit")
    plt.xlabel("Zone Type")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
