In [1]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import os
import geopy
import plotly
#import plotly.express as px
import plotly.graph_objects as go
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import time 

# Constants
DATA_URL = "https://www.phoenixopendata.com/dataset/cc08aace-9ca9-467f-b6c1-f0879ab1a358/resource/0ce3411a-2fc6-4302-a33f-167f68608a20/download/crime-data_crime-data_crimestat.csv"
LOCAL_DATA_PATH = "/Users/natebender/Desktop/repo/phx_crime_heat/data/crime_data.csv"
CACHE_FILE_PATH = "/Users/natebender/Desktop/repo/phx_crime_heat/data/geocode_cache.csv"


In [2]:
# Try to load the geocode cache if it exists; otherwise, initialize an empty dictionary
try:
    cache_df = pd.read_csv(CACHE_FILE_PATH, index_col='address_zip')
    geocode_cache = cache_df.to_dict(orient='index')
except FileNotFoundError:
    geocode_cache = {}

In [3]:
cache_df.head()

Unnamed: 0_level_0,latitude,longitude
address_zip,Unnamed: 1_level_1,Unnamed: 2_level_1
"1300 E ALMERIA RD, Phoenix, AZ",33.466771,-112.054083
"5100 N 15TH ST, Phoenix, AZ",33.512281,-112.050423
"1400 E HIGHLAND AVE, Phoenix, AZ",33.505666,-112.051749
"6900 W WOOD ST, Phoenix, AZ",33.410824,-112.209052
"N 43RD AVE & W CACTUS RD, Phoenix, AZ",,


In [4]:
def download_data(url):
    """
    Download CSV data from the provided URL and return it as a pandas DataFrame.
    Adjust the dtype of 'INC NUMBER' to string to avoid mixed types warning.
    """
    response = requests.get(url)
    if response.status_code == 200:
        data = StringIO(response.content.decode('utf-8'))
        # Specify dtype for 'INC NUMBER' column to ensure it's read as a string
        df = pd.read_csv(data, dtype={'INC NUMBER': str})
        return df
    else:
        raise Exception(f"Failed to download data: HTTP {response.status_code}")

def save_data(df, path):
    """
    Save the DataFrame to the specified local path.
    """
    df.to_csv(path, index=False)

def update_data(url, local_path):
    """
    Update the local dataset with new entries from the dataset at the provided URL.
    Adjust the dtype of 'INC NUMBER' to string when loading existing data.
    """
    # Download the latest data
    new_data = download_data(url)
    new_data = clean_addresses(new_data)
    
    if os.path.exists(local_path):
        # Load the existing data, specifying dtype for 'INC NUMBER'
        existing_data = pd.read_csv(local_path, dtype={'inc_number': str})
        
        # Combine the new data with the existing data, avoiding duplicates
        updated_data = pd.concat([existing_data, new_data]).drop_duplicates(subset=['inc_number', 'occurred_on'])
        
        # Save the combined dataset back to the local path
        save_data(updated_data, local_path)
        print("Data has been updated.")
    else:
        # If the local file does not exist, just save the new data
        save_data(new_data, local_path)
        print("Data saved.")

# Example usage
# update_data(DATA_URL, LOCAL_DATA_PATH)

In [5]:
def clean_addresses(df):
    """
    Clean the '100 BLOCK ADDR' column in the DataFrame.
    """
    # Example cleaning step: Replace "XX" with "00" in addresses
    df['100_block_addr'] = df['100_block_addr'].str.replace('XX', '00')
    return df


In [6]:
def robust_geocode(address, zip_code=None):
    """
    Attempt to geocode an address using an external geocoding service.
    Appends zip code information to the address to improve accuracy.
    Caches results using a combination of address and zip code as the key.

    Parameters:
    - address: The street address to geocode.
    - zip_code: Optional zip code to include in the geocoding request.
    """
    # Formulate the query with the zip code, if provided
    address_query = f"{address}, Phoenix, AZ"
#     if zip_code:
#         zip_code = str(zip_code)
#         address_query += f", {zip_code}"
    
    # Use address_query as the cache key to uniquely identify each geocode request
    cache_key = address_query
    
    # Check cache first to avoid redundant geocoding requests
    if cache_key in geocode_cache:
        return geocode_cache[cache_key]
    
    try:
       # address_query = f"{address}, {zip_code}" if zip_code else address
        location = geocode(address_query, timeout=10)
        if location:
            lat_lon = (location.latitude, location.longitude)
            geocode_cache[cache_key] = lat_lon
            return lat_lon
        else:
            geocode_cache[cache_key] = (None, None)
            return (None, None)
    except Exception as e:
        print(f"Error geocoding address '{address_query}': {e}")
        geocode_cache[cache_key] = (None, None)
        return (None, None)

In [7]:
def save_cache_to_file(cache, file_path):
    """
    Save the geocode cache to a CSV file, including handling cache entries
    that are keyed with address and zip code combinations.

    Parameters:
    - cache: The geocode cache dictionary.
    - file_path: Path to the CSV file where the cache is saved.
    """
    cache_data = [{
        'address_zip': key,  # The combined address and zip code key
        'latitude': lat_lon[0] if lat_lon else None,
        'longitude': lat_lon[1] if lat_lon else None
    } for key, lat_lon in cache.items()]
    
    cache_df = pd.DataFrame(cache_data)
    # Optionally split 'address_zip' into separate 'address' and 'zip' columns here
    cache_df.to_csv(file_path, index=False)

In [8]:
def batch_addr_processing(df, address_column, cache_file_path, batch_size=2000):
    """
    Processes addresses in batches, applying geocoding, providing progress updates,
    and saving the cache periodically.

    Parameters:
    - df: DataFrame containing the addresses to be processed.
    - address_column: The name of the column in df that contains the addresses.
    - cache_file_path: The file path where the geocode cache will be saved.
    - batch_size: The number of addresses to process in each batch.
    """
    start_time = time.time()  # Start timer

    geocoded_results = df.apply(lambda x: robust_geocode(x[address_column], x['zip']) if pd.notnull(x[address_column]) else (None, None), axis=1)

    # Splitting the tuple results into two separate series for latitude and longitude
    df['latitude'], df['longitude'] = zip(*geocoded_results)

    #print(df)

    # Save cache periodically after each batch
    save_cache_to_file(geocode_cache, cache_file_path)
        
    end_time = time.time()  # End timer
    elapsed_time = end_time - start_time  # Calculate elapsed time
    print(f"Batch address processing completed in {elapsed_time:.2f} seconds.")
     
    return df

In [9]:
geolocator = Nominatim(user_agent="nb_phx_test_app")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, error_wait_seconds=10, max_retries=2, swallow_exceptions=False)

In [10]:

# Adjust the rate limiter to use a more conservative delay and a longer timeout
# Increasing min_delay_seconds to avoid hitting rate limits
# Setting a longer timeout for each geocode call

#location = geolocator.geocode("175 5th Avenue NYC")
location = geolocator.geocode("1300 E ALMERIA RD, Phoenix, AZ, 85006")

print(location.address)
print((location.latitude, location.longitude))

1300, East Almeria Road, Palms Trailer Park, Phoenix, Maricopa County, Arizona, 85006, United States
(33.466771, -112.054083)


In [11]:
# df = download_data(DATA_URL)

In [12]:
# save_data(df, LOCAL_DATA_PATH)
# df_clean = clean_addresses(df)

In [13]:
df_test = pd.read_csv("/Users/natebender/Desktop/crimestat_copy.csv",dtype={'inc_number': str})

  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
len(df_test)

527233

In [15]:
df_test.columns = [col.lower().replace(' ', '_') for col in df_test.columns]
df_test['zip'] = df_test['zip'].astype(str)
df_test = df_test.dropna(subset=['inc_number'])
# Ensure 'occurred_on' is datetime and create a string version for display
df_test['occurred_on'] = pd.to_datetime(df_test['occurred_on'])
df_test['occurred_on_str'] = df_test['occurred_on'].dt.strftime('%Y-%m-%d')

In [16]:
df_test = df_test.iloc[:100]
len(df_test)

100

In [17]:
df_test = clean_addresses(df_test)
print(df_test.isnull().sum())

inc_number             0
occurred_on            0
occurred_to           18
ucr_crime_category     0
100_block_addr         0
zip                    0
premise_type           0
grid                   0
occurred_on_str        0
dtype: int64


In [18]:
df_test.dropna(subset=['occurred_on'], inplace=True)

In [19]:
df_test = batch_addr_processing(df_test, '100_block_addr', CACHE_FILE_PATH)

KeyError: 0

In [None]:
df_test["occurred_on"]

In [None]:
df_test.head()

In [None]:
# Create figure
fig = go.Figure()

# Add a trace for each unique date
for date in unique_dates:
    df_filtered = df_test[df_test['occurred_on_str'] == date]
    if not df_filtered.empty:  # Check if filtered DataFrame is not empty
        # Create the text for the hover tooltip
        hover_text = df_filtered.apply(lambda row: f"Address: {row['100_block_addr']}<br>Latitude: {row['latitude']}, Longitude: {row['longitude']}<br>Date: {date}<br>Crime Category: {row['ucr_crime_category']}", axis=1)
        
        fig.add_trace(
            go.Scattermapbox(
                lat=df_filtered['latitude'],
                lon=df_filtered['longitude'],
                mode='markers',
                marker=go.scattermapbox.Marker(size=9),
                name=date,
                text=hover_text,  # Use the custom hover text
                hoverinfo='text',  # Ensure only the custom text is displayed on hover
                visible=False
            )
        )

# Debug: Ensure at least one trace is set to visible
if fig.data:
    fig.data[0].visible = True  # Making the first trace visible

# Add and configure the slider
steps = []
for i, date in enumerate(unique_dates):
    date_str = str(date)  # Explicitly convert `date` to string to avoid TypeError
    step = dict(
        method="update",
        args=[{"visible": [False] * len(fig.data)}, {"title": "Date: " + date_str}],
    )
    step["args"][0]["visible"][i] = True  # Toggle visibility
    steps.append(step)

fig.update_layout(
    sliders=[{"active": 0, "steps": steps}],
    mapbox_style="open-street-map",
    mapbox_zoom=4
)

In [None]:
fig.show()