## Rohan Gore
##### ~rmg9725
###
###### _Collaborators: Claude 3.7 Sonnet_
###

In [7]:
# !pip install pyarrow

## Question 1

In [1]:
import json
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
from haversine import haversine, Unit

# Read the restaurant CSV with plain pandas first; it is wrapped in a Dask
# DataFrame afterwards because Dask with PyArrow was causing issues.
restaurants_path = 'shared/hw4/Restaurants_in_Durham_County_NC.csv'

# Explicit dtypes for the columns listed here, so pandas does not infer them.
restaurant_dtypes = {
    'Seats': 'float64',
    'geolocation': 'object',
    'Status': 'object',
    'Rpt_Area_Desc': 'object',
}

# The file is semicolon-delimited rather than comma-delimited.
restaurants_df = pd.read_csv(restaurants_path, sep=';', dtype=restaurant_dtypes)

# Wrap the in-memory pandas frame as a 4-partition Dask DataFrame so the
# filtering in Q1.1 can be expressed with Dask operations.
restaurants_ddf = dd.from_pandas(restaurants_df, npartitions=4)

# Load the foreclosure records from JSON.
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text on systems
# whose default is not UTF-8.
foreclosures_path = 'shared/hw4/durham-nc-foreclosure-2006-2016.json'
with open(foreclosures_path, 'r', encoding='utf-8') as f:
    foreclosures_data = json.load(f)


In [2]:

# Flatten each record's point geometry into top-level 'longitude'/'latitude'
# keys so they become ordinary DataFrame columns. coordinates[0] is stored as
# the longitude and coordinates[1] as the latitude.
for item in foreclosures_data:
    # The `or` fallbacks cover both a missing key and an explicit null value;
    # a null geometry would otherwise raise TypeError on the membership test.
    geometry = item.get('geometry') or {}
    coordinates = geometry.get('coordinates') or ()
    # Skip truncated coordinate lists instead of raising IndexError.
    if len(coordinates) >= 2:
        item['longitude'] = coordinates[0]
        item['latitude'] = coordinates[1]

# Build the pandas frame from the (now augmented) records. Records that were
# skipped above have no longitude/latitude keys of their own.
foreclosures_df = pd.DataFrame(foreclosures_data)

# Wrap as a 4-partition Dask DataFrame.
foreclosures_ddf = dd.from_pandas(foreclosures_df, npartitions=4)


#### Question 1.1

In [3]:

# Q1.1: locate the active food-service restaurant nearest to this point.
# The tuple is ordered (latitude, longitude).
target_coord = (35.994914, -78.897133)

# Build the two row conditions separately, then combine them lazily in Dask.
is_active = restaurants_ddf['Status'] == 'ACTIVE'
is_food_service = restaurants_ddf['Rpt_Area_Desc'] == 'Food Service'
filtered_restaurants_ddf = restaurants_ddf[is_active & is_food_service]

# Materialise the filtered rows as a pandas DataFrame for the distance step.
filtered_restaurants = filtered_restaurants_ddf.compute()

# Parse geolocation and calculate distances
def parse_coords(geo_str):
    """Parse a ``"<lat>, <lon>"`` string into a ``(lat, lon)`` tuple of floats.

    Parameters
    ----------
    geo_str : object
        Raw value from the ``geolocation`` column. Anything that is not a
        string containing a comma (e.g. ``None`` or ``NaN``) is treated as
        missing.

    Returns
    -------
    tuple[float, float] or None
        The parsed pair, or ``None`` when the value is missing, does not split
        into exactly two comma-separated parts, or a part is not a valid float.
    """
    if not (isinstance(geo_str, str) and ',' in geo_str):
        return None
    try:
        lat_str, lon_str = geo_str.split(',')
        return (float(lat_str.strip()), float(lon_str.strip()))
    except ValueError:
        # Both malformed numbers and a wrong number of parts raise ValueError.
        # Catching only that keeps KeyboardInterrupt/SystemExit and genuine
        # programming errors from being silently swallowed.
        return None

def _miles_from_target(coords):
    """Return the haversine distance in miles from ``target_coord`` to ``coords``.

    Unparseable locations (``None``) get ``inf`` so they sort after every
    restaurant that has a real distance.
    """
    if x_is_missing := coords is None:
        return float('inf')
    return haversine(target_coord, coords, unit=Unit.MILES)


# Parsed (lat, lon) tuple per restaurant, or None when the string is unusable.
filtered_restaurants['coords'] = filtered_restaurants['geolocation'].apply(parse_coords)
# Distance in miles from the target point to each restaurant.
filtered_restaurants['distance'] = filtered_restaurants['coords'].apply(_miles_from_target)



In [4]:
# Rank the candidates by distance and keep the nearest row; it is reported
# here and reused as the search centre in Q1.2.
restaurants_by_distance = filtered_restaurants.sort_values('distance')
closest_restaurant = restaurants_by_distance.iloc[0]

report_lines = [
    "\nQ1.1: Closest active food service restaurant:",
    f"Name: {closest_restaurant['Premise_Name']}",
    f"Address: {closest_restaurant['Premise_Address1']}",
    f"Distance: {closest_restaurant['distance']:.4f} miles",
]
for report_line in report_lines:
    print(report_line)



Q1.1: Closest active food service restaurant:
Name: OLD HAVANA SANDWICH SHOP
Address: 310 E. MAIN ST.
Distance: 0.1258 miles


#### Question 1.2

In [5]:

# Q1.2: measure how far every foreclosure is from the closest restaurant.
restaurant_coord = closest_restaurant['coords']

# Materialise the Dask frame as pandas for the row-wise distance calculation.
foreclosures_pd = foreclosures_ddf.compute()


def _miles_from_restaurant(row):
    """Return the haversine distance in miles from the restaurant to one row.

    Rows whose latitude or longitude is missing get ``inf`` so they can never
    fall inside a radius filter.
    """
    if pd.isna(row['latitude']) or pd.isna(row['longitude']):
        return float('inf')
    return haversine(restaurant_coord, (row['latitude'], row['longitude']), unit=Unit.MILES)


foreclosures_pd['distance_from_restaurant'] = foreclosures_pd.apply(
    _miles_from_restaurant, axis=1
)


In [6]:

# Select the rows at or inside the 1-mile radius and count them.
within_1mile_mask = foreclosures_pd['distance_from_restaurant'] <= 1
foreclosures_within_1mile = len(foreclosures_pd[within_1mile_mask])
print(f"\nQ1.2: Number of foreclosures within 1 mile of the restaurant: {foreclosures_within_1mile}")



Q1.2: Number of foreclosures within 1 mile of the restaurant: 320


In [7]:
# Emit a short marker string from the final cell.
# NOTE(review): looks like a leftover scratch cell — consider removing.
print("GG")

GG
