# Processing our 211info.org dataset
We manually pulled this data from 211info.org.  I also manually coded the "description" field into other fields we are using here.
- library: 0 = not a library, 1 = library
- open: 1 = open for business, 0 = currently closed.  Some libraries are closed for construction as of 8/10/2025
- temporary_shelter: 0 = no temp sleeping facilities, 1 = sleep facilities
- day_center: 0 = not a day center, 1 = open to the public for day use
- coord_reentry: 0 = not a designated reentry service center, 1 = is a designated reentry service center
- group_therapy: 0 = not primarilly a therapy provider, 1 = group therapy sessions available
- showers: 0 = showers not available, 1 = showers available
- meals: 0 = meals not available, 1 = some meals available
- food_box: 0 = no boxed food provided, 1 = food bank style food service available
- laundry: 0 = no laundry services, 1 = laundry services available
- health_services: 0 = no health services available, 1 = some health services available
- misc_services: 0 = no other services listed, 1 = a catch all for additional services not otherwise currently coded into this dataset
- URL: TBD = waiting on data
  
8/10/2025

by stephen.peters@gmail.com

In [1]:
# first, let's make sure the pandas library is installed so we have access to dataframes
!pip install pandas
print("pandas installed!")

pandas installed!


In [2]:
# now we'll import our libraries, and including some graphing ones, just in case
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
print("libraries imported!")

libraries imported!


In [5]:
# let's take a look at our current dataset
# first we set the path to our files:
# you'll need to edit this for your system
base_dir = Path("C:/Users/Steph/local/OIT-class/project-files/datasets/original/project_data")
df = pd.read_csv(base_dir / "211_Info_data_compiled-csv.csv")
# The "head" command will show us an excel-style display of our data with the columns across the top and the first four rows.
# "df" stands for "dataframe" and you'll see it's a very common generic variable name used in these cases.
df.head()

Unnamed: 0,Resource Name,Address,Zip,hours,library,open,temporary_shelter,day_center,coord_reentry,group_therapy,showers,meals,food_box,laundry,health_services,misc_services,description,URL
0,Albina Library,205 NE Russell Street,97212,TBD,1,1,0,0,0,0,0,0,0,0,0,0,public library,TBD
1,Capitol Hill Library,10723 SW Capitol Highway,97219,TBD,1,1,0,0,0,0,0,0,0,0,0,0,public library,TBD
2,Central Library,801 SW 10th Avenue,97205,TBD,1,1,0,0,0,0,0,0,0,0,0,0,public library,TBD
3,Gregory Heights Library,7921 NE Sandy Boulevard,97213,TBD,1,1,0,0,0,0,0,0,0,0,0,0,public library,TBD
4,Hillsdale Library,1525 SW Sunset Boulevard,97239,TBD,1,1,0,0,0,0,0,0,0,0,0,0,public library,TBD


In [7]:
# and let's look at the last records in our dataset
df.tail()

Unnamed: 0,Resource Name,Address,Zip,hours,library,open,temporary_shelter,day_center,coord_reentry,group_therapy,showers,meals,food_box,laundry,health_services,misc_services,description,URL
59,Pathfinder Network,10373 NE Hancock Street,97220,Monday-Friday 8:30am-7pm,0,1,0,0,0,1,0,0,0,0,0,0,Provides strength-based group sessions through...,TBD
60,Cascadia Health,6200 SE King Road,97222,Varies by service,0,1,0,0,0,1,0,0,0,0,0,0,"Offers mental health counseling for families, ...",TBD
61,Northwest Family Services,3231 SE 50th Avenue,97206,"Monday-Thursday 8am-6pm, Friday 8am-5pm",0,1,0,0,0,1,0,0,0,0,0,0,Professional counseling provided by licensed c...,TBD
62,Quest Center For Integrative Health,12636 SE Stark Street,97233,Monday-Friday 8:30am-5pm,0,1,0,0,0,1,0,0,0,0,0,0,"Provide services, counseling, support groups, ...",TBD
63,Lifestance Health,12636 SE Stark Street,97233,Monday-Friday 8:30am-5pm,0,1,0,0,0,1,0,0,0,0,0,0,Provides mental health evaluations and individ...,TBD


In [10]:
# and let's check datatypes
df.info()
print("\nHopefully we don't see anything odd here.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Resource Name      64 non-null     object
 1   Address            64 non-null     object
 2   Zip                64 non-null     int64 
 3   hours              64 non-null     object
 4   library            64 non-null     int64 
 5   open               64 non-null     int64 
 6   temporary_shelter  64 non-null     int64 
 7   day_center         64 non-null     int64 
 8   coord_reentry      64 non-null     int64 
 9   group_therapy      64 non-null     int64 
 10  showers            64 non-null     int64 
 11  meals              64 non-null     int64 
 12  food_box           64 non-null     int64 
 13  laundry            64 non-null     int64 
 14  health_services    64 non-null     int64 
 15  misc_services      64 non-null     int64 
 16  description        64 non-null     object
 17 

In [16]:
# check for missing data
# this command will count up any nulls in our columns
df.isnull().sum()
# if we have not sneaky nulls, the numbers column will be all zeros.

Resource Name        0
Address              0
Zip                  0
hours                0
library              0
open                 0
temporary_shelter    0
day_center           0
coord_reentry        0
group_therapy        0
showers              0
meals                0
food_box             0
laundry              0
health_services      0
misc_services        0
description          0
URL                  0
dtype: int64

In [18]:
# let's take a peek at our summary statistics
df.describe()
# at the moment I don't see any negative numbers, 64 total records and otherwise is looking OK so far.

Unnamed: 0,Zip,library,open,temporary_shelter,day_center,coord_reentry,group_therapy,showers,meals,food_box,laundry,health_services,misc_services
count,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0
mean,97217.71875,0.25,0.90625,0.1875,0.25,0.078125,0.21875,0.296875,0.125,0.078125,0.1875,0.125,0.421875
std,14.086432,0.436436,0.293785,0.393398,0.436436,0.27049,0.416667,0.460493,0.333333,0.27049,0.393398,0.333333,0.497763
min,97202.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,97206.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,97213.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,97227.0,0.25,1.0,0.0,0.25,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
max,97266.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Let's save our current file as a .csv so we have it in case we need it for something.
# Define the base directory. Remember to update this for your machine.
base_dir_save = Path("C:/Users/Steph/local/oit-class/project-files/datasets/processed")

# Ensure the directory exists (create it if it doesn't)
base_dir_save.mkdir(parents=True, exist_ok=True)

# Define the full output file path
output_file = base_dir_save / "211_Info_Data_clean.csv"

# Save the DataFrame to CSV
df.to_csv(output_file, index=False)

print(f"File saved to: {output_file}")

File saved to: C:\Users\Steph\local\OIT-class\project-files\datasets\processed\211_Info_Data_clean.csv


In [24]:
# Make a new dataframe so we don't have to re-run previous cells if we make mistakes in this one
df_latlong = df.copy()
print("df copied to df_latlong")

df copied to df_latlong


In [33]:
# address_latlon_match_fixed.py

import pandas as pd
import geopandas as gpd
import re
from pathlib import Path
from rapidfuzz import process, fuzz
from shapely.geometry import Point

# --- 1. Helper: normalize addresses ---
def normalize_address(addr):
    if pd.isna(addr):
        return ""
    addr = str(addr).upper()
    addr = re.sub(r"[^\w\s]", "", addr)   # remove punctuation
    addr = re.sub(r"\bST\b", "STREET", addr)
    addr = re.sub(r"\bRD\b", "ROAD", addr)
    addr = re.sub(r"\bAVE\b", "AVENUE", addr)
    addr = re.sub(r"\s+", " ", addr).strip()
    return addr

print("Libraries imported!")
print("Processing addresses to lat/long...")

# --- 2. Load shapefile ---
base_dir = Path("C:/Users/Steph/local/oit-class/project-files/datasets/original/PDX_Active_Address_Points-shapefile-expanded")
gdf_addresses = gpd.read_file(base_dir / "Active_Address_Points.shp")

# Ensure geometry is in WGS84 for lat/lon extraction
try:
    if gdf_addresses.crs is not None and gdf_addresses.crs.to_epsg() != 4326:
        gdf_addresses = gdf_addresses.to_crs(epsg=4326)
except Exception:
    # If CRS is missing/unknown, you can set it if you know it, e.g.:
    # gdf_addresses = gdf_addresses.set_crs("EPSG:XXXXX").to_crs(epsg=4326)
    pass

# Normalize shapefile fields
gdf_addresses["ADDRESS_clean"] = gdf_addresses["ADDRESS_FU"].apply(normalize_address)
gdf_addresses["ZIP_clean"] = gdf_addresses["ZIP_CODE"].astype(str).str.strip()

# --- 3. Load your CSV (set your path) ---
# Replace with your actual CSV path:
# input_csv = "C:/path/to/your_file.csv"
# df_latlong = pd.read_csv(input_csv)

# Expecting columns: 'Address' and 'Zip' in df_latlong
df_latlong["Address_clean"] = df_latlong["Address"].apply(normalize_address)
df_latlong["Zip_clean"] = df_latlong["Zip"].astype(str).str.strip()

# --- 4. Exact match merge ---
df_merged = df_latlong.merge(
    gdf_addresses[["ADDRESS_clean", "ZIP_clean", "geometry"]],
    left_on=["Address_clean", "Zip_clean"],
    right_on=["ADDRESS_clean", "ZIP_clean"],
    how="left",
    indicator=True
)

# Stable method column (categorical) + numeric score
method_map = {"both": "exact", "left_only": "none", "right_only": "none"}
df_merged["match_method"] = pd.Categorical(
    df_merged["_merge"].map(method_map).fillna("none"),
    categories=["exact", "fuzzy", "none"]
)
df_merged.drop(columns=["_merge"], inplace=True)

# Numeric score column
df_merged["match_score"] = pd.Series([pd.NA] * len(df_merged), dtype="Float64")

# --- 5. Fuzzy match for those still missing geometry ---
missing_mask = df_merged["geometry"].isna()
missing_df = df_merged[missing_mask].copy()

# Limit candidates to same ZIP where possible for better accuracy
# Build a dict: zip -> candidate list
zip_groups = gdf_addresses.groupby("ZIP_clean")["ADDRESS_clean"].apply(lambda s: s.dropna().astype(str).unique().tolist()).to_dict()

for idx, row in missing_df.iterrows():
    q = row["Address_clean"]
    if not isinstance(q, str) or not q:
        continue

    # Prefer candidates from the same ZIP; fall back to all addresses
    zip_code = row.get("Zip_clean", None)
    candidates = zip_groups.get(zip_code, None)
    if not candidates:
        candidates = gdf_addresses["ADDRESS_clean"].dropna().astype(str).unique().tolist()

    result = process.extractOne(q, candidates, scorer=fuzz.token_sort_ratio)
    if result is None:
        continue

    match, score, _ = result  # score is 0..100 (int)
    if score >= 90:
        # Get geometry for the matched address (prefer same ZIP if available)
        if zip_code in zip_groups:
            match_row = gdf_addresses[(gdf_addresses["ZIP_clean"] == zip_code) &
                                      (gdf_addresses["ADDRESS_clean"] == match)]
            if match_row.empty:
                match_row = gdf_addresses[gdf_addresses["ADDRESS_clean"] == match]
        else:
            match_row = gdf_addresses[gdf_addresses["ADDRESS_clean"] == match]

        if not match_row.empty:
            geom = match_row.iloc[0]["geometry"]
            df_merged.at[idx, "geometry"] = geom
            df_merged.at[idx, "match_method"] = "fuzzy"       # stable categorical value
            df_merged.at[idx, "match_score"] = float(score)   # numeric; no string formatting

# Optional: display-only column (plain string), useful for exports/UI
def _fmt_method(r):
    if r["match_method"] == "fuzzy" and pd.notna(r["match_score"]):
        return f"fuzzy ({r['match_score']:.1f}%)"
    return str(r["match_method"])

df_merged["match_method_display"] = df_merged.apply(_fmt_method, axis=1).astype("string")

# --- 6. Safe lat/lon extraction ---
def get_lat(geom):
    if isinstance(geom, Point):
        return geom.y
    return None

def get_lon(geom):
    if isinstance(geom, Point):
        return geom.x
    return None

df_merged["Latitude"] = df_merged["geometry"].apply(get_lat)
df_merged["Longitude"] = df_merged["geometry"].apply(get_lon)

# --- 7. Diagnostics ---
total = len(df_merged)
exact_count = (df_merged["match_method"] == "exact").sum()
fuzzy_count = (df_merged["match_method"] == "fuzzy").sum()
unmatched_count = total - (exact_count + fuzzy_count)

print("\n--- MATCHING SUMMARY ---")
print(f"Total records: {total}")
print(f"Exact matches: {exact_count}")
print(f"Fuzzy matches: {fuzzy_count}")
print(f"Unmatched: {unmatched_count}")
print("\nSample of unmatched addresses:")
print(df_merged[df_merged["geometry"].isna()][["Address", "Zip"]].head(10))

# --- 8. (Optional) Save results ---
# df_merged.to_csv("C:/path/to/output_with_latlon.csv", index=False)


Libraries imported!
Processing addresses to lat/long...

--- MATCHING SUMMARY ---
Total records: 64
Exact matches: 47
Fuzzy matches: 7
Unmatched: 10

Sample of unmatched addresses:
                              Address    Zip
3             7921 NE Sandy Boulevard  97213
4            1525 SW Sunset Boulevard  97239
5           7905 SE Holgate Boulevard  97206
10  1038 SE César E. Chávez Boulevard  97214
18           9715 SE Powell Boulevard  97266
20        4033 SE Woodstock Boulevard  97202
24                  3 NW Third Avenue  97209
44           9715 SE Powell Boulevard  97266
48                       PO Box 42610  97242
53           4445 SW Barbur Boulevard  97239


In [34]:
df_merged.head()

Unnamed: 0,Resource Name,Address,Zip,hours,library,open,temporary_shelter,day_center,coord_reentry,group_therapy,...,Latitude,Longitude,Address_clean,Zip_clean,ADDRESS_clean,ZIP_clean,geometry,match_method,match_score,match_method_display
0,Albina Library,205 NE Russell Street,97212,TBD,1,1,0,0,0,0,...,45.541366,-122.663134,205 NE RUSSELL STREET,97212,205 NE RUSSELL STREET,97212.0,POINT (-122.66313 45.54137),exact,,exact
1,Capitol Hill Library,10723 SW Capitol Highway,97219,TBD,1,1,0,0,0,0,...,45.448042,-122.72543,10723 SW CAPITOL HIGHWAY,97219,,,POINT (-122.72543 45.44804),fuzzy,90.909091,fuzzy (90.9%)
2,Central Library,801 SW 10th Avenue,97205,TBD,1,1,0,0,0,0,...,45.519056,-122.682257,801 SW 10TH AVENUE,97205,801 SW 10TH AVENUE,97205.0,POINT (-122.68226 45.51906),exact,,exact
3,Gregory Heights Library,7921 NE Sandy Boulevard,97213,TBD,1,1,0,0,0,0,...,,,7921 NE SANDY BOULEVARD,97213,,,,none,,none
4,Hillsdale Library,1525 SW Sunset Boulevard,97239,TBD,1,1,0,0,0,0,...,,,1525 SW SUNSET BOULEVARD,97239,,,,none,,none


In [35]:
# I let ChatGPT run away with me... at this point it's created a script that finds exact matches, "fuzzy" matches and a handful
# of addresses it can't match.  With this last run there's only 10 of those.
# I'm going to export to .csv and fix these manually

base_dir_save = Path("C:/Users/Steph/local/oit-class/project-files/datasets/processed")

# Ensure the directory exists (create it if it doesn't)
base_dir_save.mkdir(parents=True, exist_ok=True)

# Define the full output file path
output_file = base_dir_save / "211_Info_Data_tmp.csv"

# Save the DataFrame to CSV
df_merged.to_csv(output_file, index=False)

print(f"File saved to: {output_file}")


File saved to: C:\Users\Steph\local\oit-class\project-files\datasets\processed\211_Info_Data_tmp.csv


In [None]:
# What I found by looking at these locations manually; is one place was a P.O. box, not an address, some were not addresses of buildings 
# A more automated process would have flagged these and found another way to get lat/long
# and at least one address had special characters, which could be fixed in an automated way
# ...and that's what we'd do if we had hundreds or thousands of records to process.
# I've left all the various columns ChatGPT added, they don't take up much space.
# 
# new file: 211_Info_Data_latlong-csv

In [41]:
# load our exciting new data into a new dataframe
base_dir = Path("C:/Users/Steph/local/OIT-class/project-files/datasets/processed")
df_LL = pd.read_csv(base_dir / "211_Info_Data_latlong-csv.csv")
df_LL.head()


Unnamed: 0,Resource Name,Address,Zip,hours,library,open,temporary_shelter,day_center,coord_reentry,group_therapy,...,Latitude,Longitude,Address_clean,Zip_clean,ADDRESS_clean,ZIP_clean,geometry,match_method,match_score,match_method_display
0,Albina Library,205 NE Russell Street,97212,TBD,1,1,0,0,0,0,...,45.541366,-122.663134,205 NE RUSSELL STREET,97212,205 NE RUSSELL STREET,97212.0,POINT (-122.6631344519686 45.541366439963305),exact,,exact
1,Capitol Hill Library,10723 SW Capitol Highway,97219,TBD,1,1,0,0,0,0,...,45.448042,-122.72543,10723 SW CAPITOL HIGHWAY,97219,,,POINT (-122.72543030106553 45.44804216118064),fuzzy,90.909091,fuzzy (90.9%)
2,Central Library,801 SW 10th Avenue,97205,TBD,1,1,0,0,0,0,...,45.519056,-122.682257,801 SW 10TH AVENUE,97205,801 SW 10TH AVENUE,97205.0,POINT (-122.68225705740664 45.51905572713763),exact,,exact
3,Gregory Heights Library,7921 NE Sandy Boulevard,97213,TBD,1,1,0,0,0,0,...,45.5518,-122.581373,7921 NE SANDY BOULEVARD,97213,,,,none,,none
4,Hillsdale Library,1525 SW Sunset Boulevard,97239,TBD,1,1,0,0,0,0,...,45.479906,-122.694098,1525 SW SUNSET BOULEVARD,97239,,,,none,,none


In [48]:
# now that we have our lat/long, let us go ahead and add in our neighborhood for each record.  We already have zip code.
# I'm borrowing the code I used in the homeless_campsites-convert notebook

from shapely.geometry import Point
print("libraries imported!")

# Step 1: Load your df_latlong
# already did this
#df_latlong = pd.read_csv("C:/Users/Steph/local/OIT-class/datasets/processed/IRP_Campsite_Reports_latlong.csv")

# Step 2: Convert df_latlong to a GeoDataFrame
geometry = [Point(xy) for xy in zip(df_LL["Longitude"], df_LL["Latitude"])]
gdf = gpd.GeoDataFrame(df_LL, geometry=geometry, crs="EPSG:4326")

# Step 3: Load shapefiles for ZIP codes and neighborhoods
# don't need the zips shapefile, so turning that off
# zip_shapefile = "C:/Users/Steph/local/OIT-class/datasets/original/zip-code-extracted/portland-oregon-zip-code-boundaries.shp"
# hood_shapefile = "C:/Users/Steph/local/oit-class/datasets/original/Neighborhoods_regions-extracted/Neighborhoods_regions.shp"

#base_dir = Path("C:/Users/Steph/local/oit-class/project-files/datasets/original/Neighborhoods_regions-extracted")
# I'm going to have to come back and fix all these dumb pathing issues later.
hood_shapefile = "C:/Users/Steph/local/oit-class/project-files/datasets/original/Neighborhoods_regions-extracted/Neighborhoods_regions.shp"

#gdf_zip = gpd.read_file(zip_shapefile).to_crs("EPSG:4326")
gdf_hood = gpd.read_file(hood_shapefile).to_crs("EPSG:4326")
print("shape files loaded!")

libraries imported!
shape files loaded!


In [49]:
# I had to solve a bunch of problems related to column names last time, so let's see what columns are in our gdf_hood
# So let's print these again
print("gdf_hood columns:", list(gdf_hood.columns))


gdf_hood columns: ['OBJECTID', 'NAME', 'COMMPLAN', 'SHARED', 'COALIT', 'HORZ_VERT', 'MAPLABEL', 'ID', 'Shape_Leng', 'Shape_Area', 'nbh_distri', 'geometry']


In [56]:
# continue our script

# --- Neighborhood-only spatial join ---

# Make sure both layers share the same CRS
if gdf.crs != gdf_hood.crs:
    gdf_hood = gdf_hood.to_crs(gdf.crs)

# Pick the neighborhood name column (supports NAME or Name)
hood_name_col = "NAME" if "NAME" in gdf_hood.columns else ("Name" if "Name" in gdf_hood.columns else None)
if hood_name_col is None:
    raise KeyError("Could not find a neighborhood name column ('NAME' or 'Name') in gdf_hood.")

# Spatial join: points -> neighborhoods
gdf_full = gpd.sjoin(
    gdf,
    gdf_hood[["geometry", hood_name_col]],
    how="left",
    predicate="within"
)

# Drop the sjoin index if present
if "index_right" in gdf_full.columns:
    gdf_full = gdf_full.drop(columns=["index_right"])

# Create a clean neighborhood column in title case
gdf_full["neighborhood"] = gdf_full[hood_name_col].astype("string").str.title()

# (Optional) drop original geometry and source name column
df_with_neighborhoods = gdf_full.drop(columns=["geometry", hood_name_col], errors="ignore")

print("Neighborhood join complete!")
# Preview a few rows:
print(df_with_neighborhoods[["neighborhood"]].head())


# this space intentionally left blank
print("\nThis means it worked!")
print("Well, maybe.  It didn't produce an error message.  That's not the same as it doing what we want.")
print("Let's take a closer look.")

Neighborhood join complete!
         neighborhood
0               Eliot
1  West Portland Park
2   Portland Downtown
3             Roseway
4           Hillsdale

This means it worked!
Well, maybe.  It didn't produce an error message.  That's not the same as it doing what we want.
Let's take a closer look.


In [52]:
# let's take a look at what we have now:
df_with_neighborhoods.head()

Unnamed: 0,Resource Name,Address,Zip,hours,library,open,temporary_shelter,day_center,coord_reentry,group_therapy,...,Latitude,Longitude,Address_clean,Zip_clean,ADDRESS_clean,ZIP_clean,match_method,match_score,match_method_display,neighborhood
0,Albina Library,205 NE Russell Street,97212,TBD,1,1,0,0,0,0,...,45.541366,-122.663134,205 NE RUSSELL STREET,97212,205 NE RUSSELL STREET,97212.0,exact,,exact,Eliot
1,Capitol Hill Library,10723 SW Capitol Highway,97219,TBD,1,1,0,0,0,0,...,45.448042,-122.72543,10723 SW CAPITOL HIGHWAY,97219,,,fuzzy,90.909091,fuzzy (90.9%),West Portland Park
2,Central Library,801 SW 10th Avenue,97205,TBD,1,1,0,0,0,0,...,45.519056,-122.682257,801 SW 10TH AVENUE,97205,801 SW 10TH AVENUE,97205.0,exact,,exact,Portland Downtown
3,Gregory Heights Library,7921 NE Sandy Boulevard,97213,TBD,1,1,0,0,0,0,...,45.5518,-122.581373,7921 NE SANDY BOULEVARD,97213,,,none,,none,Roseway
4,Hillsdale Library,1525 SW Sunset Boulevard,97239,TBD,1,1,0,0,0,0,...,45.479906,-122.694098,1525 SW SUNSET BOULEVARD,97239,,,none,,none,Hillsdale


In [54]:
# double-check for missing data
# this command will count up any nulls in our columns
df_with_neighborhoods.isnull().sum()
# if we have not sneaky nulls, the numbers column will be all zeros.

Resource Name            0
Address                  0
Zip                      0
hours                    0
library                  0
open                     0
temporary_shelter        0
day_center               0
coord_reentry            0
group_therapy            0
showers                  0
meals                    0
food_box                 0
laundry                  0
health_services          0
misc_services            0
description              0
URL                      0
full_address             0
Latitude                 0
Longitude                0
Address_clean            0
Zip_clean                0
ADDRESS_clean           16
ZIP_clean               16
match_method             0
match_score             56
match_method_display     0
neighborhood             6
dtype: int64

In [58]:
# we have six blank neighborhoods.
# now what?
# I asked ChatGPT to give me code that would attempt to fix this, but due to some kind of network error, it wasn't doing it
# Since there were only six to fix, I did it manually by looking up the addresses with Google and entering them 
# into excel into this file: 211_Info_Data_latlong_hood-neighborhood-fix
# also turns out that <cr> in the description field was a problem, removed those manually.
# now this is a csv filed named: 211_Info_Data_latlong_hoods_all-csv.csv.

print("Our final resource data file is now in processed/211_Info_Data_latlong_hoods_all-csv.csv")



Our final resource data file is now in processed/211_Info_Data_latlong_hoods_all-csv.csv


In [None]:
# ok, if I feel like it, here's the script ChatGPT suggested for doing the cleanup
# which I didn't use because I did it manually

In [59]:
print("for now, all done with making our compiled resource file with lat/long, neighborhood name")

for now, all done with making our compiled resource file with lat/long, neighborhood name


In [None]:
# the script I didn't use, but could if someone feels like it
# ----- NEIGHBORHOOD JOIN + FALLBACKS + SAVE CSV -----

import pandas as pd
import geopandas as gpd

# If needed, build gdf from df_LL (uncomment if you haven't already):
# from shapely.geometry import Point
# df_LL["Longitude"] = pd.to_numeric(df_LL["Longitude"], errors="coerce")
# df_LL["Latitude"]  = pd.to_numeric(df_LL["Latitude"],  errors="coerce")
# gdf = gpd.GeoDataFrame(
#     df_LL.copy(),
#     geometry=gpd.points_from_xy(df_LL["Longitude"], df_LL["Latitude"]),
#     crs="EPSG:4326"
# )

# If needed, load the neighborhoods (uncomment and adjust path):
# gdf_hood = gpd.read_file("C:/Users/Steph/local/OIT-class/datasets/original/Neighborhoods_regions-extracted/Neighborhoods_regions.shp")

# Where to save the result:
output_csv = "C:/Users/Steph/local/OIT-class/datasets/processed/df_with_neighborhoods.csv"

# --- 0) Align CRS & pick the hood name column ---
if gdf.crs != gdf_hood.crs:
    gdf_hood = gdf_hood.to_crs(gdf.crs)

hood_name_col = "NAME" if "NAME" in gdf_hood.columns else ("Name" if "Name" in gdf_hood.columns else None)
if hood_name_col is None:
    raise KeyError("Neighborhood layer must have a name column 'NAME' or 'Name'.")

# Repair invalid polygons (common cause of misses)
gdf_hood = gdf_hood.copy()
gdf_hood["geometry"] = gdf_hood.geometry.buffer(0)

# --- 1) First pass: within ---
gdf_full = gpd.sjoin(
    gdf,
    gdf_hood[["geometry", hood_name_col]],
    how="left",
    predicate="within"
)

if "index_right" in gdf_full.columns:
    gdf_full = gdf_full.drop(columns=["index_right"])

gdf_full["neighborhood"] = gdf_full[hood_name_col].astype("string").str.title()
gdf_full["neighborhood_source"] = gdf_full["neighborhood"].notna().map({True: "within", False: pd.NA})

# --- 2) Fallbacks for unmatched ---
unmatched_idx = gdf_full.index[gdf_full["neighborhood"].isna()]
print(f"Unmatched after 'within': {len(unmatched_idx)}")

if len(unmatched_idx) > 0:
    # Use metric CRS for buffers (Portland ≈ UTM 10N)
    metric_crs = "EPSG:32610"
    pts_m  = gdf.to_crs(metric_crs)
    hood_m = gdf_hood.to_crs(metric_crs)

    # 2a) 1 m buffer + intersects
    pts_unmatched_m = pts_m.loc[unmatched_idx].copy()
    pts_unmatched_m["geom_buf"] = pts_unmatched_m.geometry.buffer(1.0)

    retry_buf = gpd.sjoin(
        pts_unmatched_m.set_geometry("geom_buf")[["geom_buf"]],
        hood_m[["geometry", hood_name_col]],
        how="left",
        predicate="intersects"
    )
    got = retry_buf.dropna(subset=[hood_name_col])
    if not got.empty:
        gdf_full.loc[got.index, "neighborhood"] = got[hood_name_col].astype("string").str.title()
        gdf_full.loc[got.index, "neighborhood_source"] = "buffer_1m_intersects"

    # 2b) Plain intersects (no buffer)
    still_unmatched = gdf_full.index[gdf_full["neighborhood"].isna()]
    if len(still_unmatched) > 0:
        pts_left_m = pts_m.loc[still_unmatched]
        retry_inter = gpd.sjoin(
            pts_left_m,
            hood_m[["geometry", hood_name_col]],
            how="left",
            predicate="intersects"
        )
        got2 = retry_inter.dropna(subset=[hood_name_col])
        if not got2.empty:
            gdf_full.loc[got2.index, "neighborhood"] = got2[hood_name_col].astype("string").str.title()
            gdf_full.loc[got2.index, "neighborhood_source"] = "intersects"

    # 2c) Nearest polygon ≤ 100 m
    still_unmatched = gdf_full.index[gdf_full["neighborhood"].isna()]
    if len(still_unmatched) > 0:
        pts_left_m = pts_m.loc[still_unmatched]
        nearest = gpd.sjoin_nearest(
            pts_left_m,
            hood_m[["geometry", hood_name_col]],
            how="left",
            distance_col="dist_m",
            max_distance=100
        )
        got3 = nearest.dropna(subset=[hood_name_col])
        if not got3.empty:
            gdf_full.loc[got3.index, "neighborhood"] = got3[hood_name_col].astype("string").str.title()
            gdf_full.loc[got3.index, "neighborhood_source"] = "nearest_<=100m"
            gdf_full.loc[got3.index, "neighborhood_nearest_dist_m"] = got3["dist_m"]

# --- 3) Finalize & save ---
# Keep original columns + the three we promised
keep_extra = ["neighborhood", "neighborhood_source", "neighborhood_nearest_dist_m"]
for col in keep_extra:
    if col not in gdf_full.columns:
        gdf_full[col] = pd.NA

# Drop helper columns if present
drop_cols = [hood_name_col, "index_right"]
gdf_out = gdf_full.drop(columns=[c for c in drop_cols if c in gdf_full.columns], errors="ignore").copy()

# Save
gdf_out.drop(columns=["geometry"], errors="ignore").to_csv(output_csv, index=False)

print("Neighborhood join complete.")
print("Neighborhood fill summary:")
print(gdf_out["neighborhood_source"].value_counts(dropna=False))
print(f"\nSaved to: {output_csv}")

# Optional: show a few that stayed unmatched (if any)
leftovers = gdf_out[gdf_out["neighborhood"].isna()]
if not leftovers.empty:
    show_cols = [c for c in ["Latitude", "Longitude", "Address"] if c in leftovers.columns]
    print("\nStill unmatched after all passes — sample:")
    print(leftovers[show_cols].head(10))
