In [None]:
# -------------------------------------------------------
# 📦 Install dependencies before running:
!pip install --upgrade pdfminer.six pandas geopy folium
# -------------------------------------------------------



In [None]:
# -------------------------------------------------------
# 📦 Install dependencies before running:
# !pip install --upgrade pdfminer.six pandas geopy folium
# -------------------------------------------------------

import html
import re
import time

import folium
import pandas as pd
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
from geopy.geocoders import Nominatim
from pdfminer.high_level import extract_text

# Source PDF and the files this script writes.
PDF_PATH = "/content/Food.pdf"
CSV_PARSED = "/content/food_bank_parsed.csv"
CSV_COUNTIES = "/content/cities_with_counties.csv"
MAP_HTML = "/content/food_bank_map.html"

# --- Extract text and split blocks ---
# Trailing whitespace is stripped from every line first, so a line holding
# only spaces becomes empty and counts toward a blank-line separator.
raw_lines = extract_text(PDF_PATH).splitlines()
text = "\n".join([line.rstrip() for line in raw_lines])
# One block per run of text separated by two or more consecutive newlines.
blocks = re.split(r"\n{2,}", text)

# --- Regex patterns ---
# Phone number: optional "tel:" prefix, optional +1 country code, optional
# parentheses around the area code, optional "ext"/"x" extension.
# The leading anchor is a negative lookbehind rather than \b: \b cannot match
# in front of "(" or "+", which made the match drop the opening parenthesis
# (e.g. "(856) 555-1234" was captured as "856) 555-1234").
phone_re = re.compile(r"(?:tel:)?(?<!\w)(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?:\s*(?:ext\.?|x)\s*\d+)?\b", re.I)
# Whole line of the form "<city>, <ST>, <5-digit zip>".
city_state_zip_re = re.compile(r"^(?P<city>.+?),\s*(?P<state>[A-Z]{2}),\s*(?P<zip>\d{5})$")
# "h:mm am - h:mm pm" style range; groups 1 and 2 are the start and end times.
time_range_re = re.compile(r"\b(\d{1,2}:\d{2}\s*[ap]m)\s*-\s*(\d{1,2}:\d{2}\s*[ap]m)\b", re.I)
# A single "h:mm am/pm" time anywhere in the line.
single_time_re = re.compile(r"\b\d{1,2}:\d{2}\s*[ap]m\b", re.I)
# A weekday name, "Every <word>", an ordinal (1st-5th) or "First"/"Last",
# followed by the rest of the line.
days_re = re.compile(r"\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Every\s+\w+|1st|2nd|3rd|4th|5th|Last|First)\b.*", re.I)
# "<number> Miles"; group 1 is the integer or decimal number.
miles_re = re.compile(r"([0-9]+(?:\.[0-9]+)?)\s+Miles\b", re.I)
# Service labels recognised in a listing.
service_types = {"Pantry", "Soup Kitchen", "Mobile Food Pantry"}

# --- City → County mapping ---
# Keys must be spelled the way str.title() renders the city name, because the
# city column is title-cased before it is mapped through this dict.
# str.title() capitalises the letter after an apostrophe ("Carney's Point"
# becomes "Carney'S Point"), so those title-cased spellings are listed too.
city_to_county = {
    "Browns Mills": "Burlington County","Pemberton": "Burlington County","Fieldsboro": "Burlington County",
    "Mt. Holly": "Burlington County","Mt Holly": "Burlington County","Camden": "Camden County",
    "Burlington": "Burlington County","Westampton": "Burlington County","Florence": "Burlington County",
    "Tabernacle": "Burlington County","Willingboro": "Burlington County","Medford": "Burlington County",
    "Mt. Laurel": "Burlington County","Edgewater Park": "Burlington County","Beverly": "Burlington County",
    "Delran": "Burlington County","Marlton": "Burlington County","Atco": "Camden County","Voorhees": "Camden County",
    "West Berlin": "Camden County","Berlin": "Camden County","Palmyra": "Burlington County","Pennsauken": "Camden County",
    "Merchantville": "Camden County","Clementon": "Camden County","Somerdale": "Camden County","Lawnside": "Camden County",
    "Haddon Heights": "Camden County","Barrington": "Camden County","Laurel Springs": "Camden County",
    "Blackwood": "Camden County","Audubon": "Camden County","Collingswood": "Camden County","Woodlynne": "Camden County",
    "Williamstown": "Gloucester County","Gloucester City": "Camden County","Deptford": "Gloucester County",
    "Turnersville": "Gloucester County","Westville": "Gloucester County","Sewell": "Gloucester County",
    "Woodbury": "Gloucester County","Glassboro": "Gloucester County","Pitman": "Gloucester County","Clayton": "Gloucester County",
    "Paulsboro": "Gloucester County","Gibbstown": "Gloucester County","Elmer": "Salem County","Swedesboro": "Gloucester County",
    "Woodstown": "Salem County","Pedricktown": "Salem County","Carney's Point": "Salem County","Carneyâ€™S Point": "Salem County",
    "Carney'S Point": "Salem County","Carney’S Point": "Salem County",
    "Salem": "Salem County","Pennsville": "Salem County","Cherry Hill": "Camden County","Chesilhurst": "Camden County",
    "Lindenwold": "Camden County","Delair": "Camden County","Pine Hill": "Camden County","Mt. Ephraim": "Camden County",
    "Glendora": "Camden County","West Deptford": "Gloucester County"
}

def parse_block(block: str):
    """Parse one text block into a listing record.

    Args:
        block: Text of a single block; blank lines and surrounding
            whitespace on each line are ignored.

    Returns:
        ``None`` when the block has no non-blank lines. Otherwise a dict with
        the keys ``name`` (the first line), ``address``, ``city``, ``state``,
        ``zip``, ``country`` (always ``"US"``), ``full_address``, ``phone``,
        ``time_hours``, ``days``, ``service_type`` and ``distance_miles``.
        Fields that cannot be found stay ``None``.
    """
    lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
    if not lines:
        return None

    rec = {
        "name": lines[0], "address": None, "city": None, "state": None, "zip": None, "country": "US",
        "full_address": None, "phone": None, "time_hours": None, "days": None,
        "service_type": None, "distance_miles": None
    }

    # Phone: first phone-like token in the block. The pattern is
    # case-insensitive, so the "tel:" prefix is stripped case-insensitively.
    for ln in lines:
        if m := phone_re.search(ln):
            rec["phone"] = re.sub(r"^tel:", "", m.group(0), flags=re.I).strip()
            break

    # City/state/ZIP: only the first 8 lines are considered.
    for i, ln in enumerate(lines[:8]):
        if m := city_state_zip_re.match(ln):
            rec.update({"city": m.group("city"), "state": m.group("state"), "zip": m.group("zip")})
            # The line above is taken as the street address, unless that line
            # is the name line (index 0).
            if i >= 2:
                rec["address"] = lines[i - 1]
            break

    address_parts = [rec["address"], rec["city"], rec["state"], rec["zip"], rec["country"]]
    rec["full_address"] = ", ".join(part for part in address_parts if part)

    # Every line mentioning a time / a day expression is kept verbatim.
    times = [ln for ln in lines if time_range_re.search(ln) or single_time_re.search(ln)]
    days = [ln for ln in lines if days_re.search(ln)]
    rec["time_hours"] = "; ".join(times) or None
    rec["days"] = "; ".join(days) or None

    # Service type: looked for in the last 5 lines. Labels are tried longest
    # first because "Pantry" is a substring of "Mobile Food Pantry" and set
    # iteration order is arbitrary; without the ordering the result for such
    # a line could differ from run to run.
    labels = sorted(service_types, key=lambda label: (-len(label), label))
    for ln in lines[-5:]:
        lowered = ln.lower()
        rec["service_type"] = next((st for st in labels if st.lower() in lowered), None)
        if rec["service_type"]:
            break

    # Distance: looked for in the last 3 lines. The pattern only captures
    # digits with an optional decimal part, so float() cannot fail here.
    for ln in lines[-3:]:
        if m := miles_re.search(ln):
            rec["distance_miles"] = float(m.group(1))
            break

    return rec

# --- Parse, clean, and map counties ---
records = list(map(parse_block, blocks))
parsed_rows = [rec for rec in records if rec]
df = pd.DataFrame(parsed_rows).drop_duplicates().reset_index(drop=True)

# Keep rows that have a non-empty street address and whose name is not just
# a bare service label.
has_address = df["address"].notna() & df["address"].ne("")
is_label_only = df["name"].isin(["Soup Kitchen", "Pantry"])
df = df[has_address & ~is_label_only]

# Normalise the city spelling, then look the county up from it.
df["city"] = df["city"].astype(str).str.strip().str.title()
df["County"] = df["city"].map(city_to_county)
df.to_csv(CSV_PARSED, index=False)

# --- Geocoding with retry ---
# Single Nominatim client reused for every lookup.
geolocator = Nominatim(
    user_agent="colab_app",
    timeout=10,
)
def geocode_address(address, retries=3, base_delay=2):
    """Look up coordinates for *address* with the module-level geolocator.

    Args:
        address: Address text to geocode. Anything that is not a non-blank
            string is rejected without a lookup.
        retries: Maximum number of lookup attempts.
        base_delay: Seconds to wait after the first failed attempt; the wait
            after attempt *k* is ``base_delay * k``.

    Returns:
        ``(latitude, longitude)`` on success, otherwise ``(None, None)``.
    """
    not_found = (None, None)
    if not (isinstance(address, str) and address.strip()):
        return not_found

    for attempt in range(1, retries + 1):
        try:
            location = geolocator.geocode(address)
            # Pause for a second after every request that completes.
            time.sleep(1)
            if not location:
                return not_found
            return location.latitude, location.longitude
        except (GeocoderTimedOut, GeocoderUnavailable):
            # Timeout / unavailable: wait longer each time, then try again.
            time.sleep(base_delay * attempt)
        except Exception:
            # Any other failure ends the lookup for this address.
            return not_found
    return not_found

# Geocode each full address into latitude/longitude columns, then keep only
# the rows for which both coordinates were found.
lat_lon = df["full_address"].apply(lambda addr: pd.Series(geocode_address(addr)))
df[["latitude", "longitude"]] = lat_lon
df = df.dropna(subset=["latitude", "longitude"])
df = df.reset_index(drop=True)
df.to_csv(CSV_COUNTIES, index=False)

# --- Map visualization (includes name + address + time + days) ---
def _tooltip_field(value, default="N/A"):
    """Return *value* as HTML-safe text, or *default* when it is missing.

    Missing means ``None``, NaN or the empty string. Escaping matters because
    the tooltip is an HTML fragment: a raw "&" or "<" in a value would
    otherwise be interpreted as markup.
    """
    if value is None or value == "" or pd.isna(value):
        return default
    return html.escape(str(value))


# Centre the map on the mean coordinate of all geocoded rows.
center_lat, center_lon = df["latitude"].mean(), df["longitude"].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=11)
for _, r in df.iterrows():
    tooltip = (
        f"<b>{_tooltip_field(r['name'])}</b><br>"
        f"{_tooltip_field(r['full_address'])}<br>"
        f"Time: {_tooltip_field(r['time_hours'])}<br>"
        f"Days: {_tooltip_field(r['days'])}"
    )
    folium.Marker([r["latitude"], r["longitude"]], tooltip=tooltip).add_to(m)
m.save(MAP_HTML)

print(f"âœ… Parsed: {CSV_PARSED}\nâœ… Counties+Coords: {CSV_COUNTIES}\nâœ… Map: {MAP_HTML}")


âœ… Parsed: /content/food_bank_parsed.csv
âœ… Counties+Coords: /content/cities_with_counties.csv
âœ… Map: /content/food_bank_map.html
