In [1]:
import pandas as pd

In [None]:
data = pd.read_csv('')

In [3]:
import pandas as pd
import numpy as np
import math
import statistics
import datetime
import matplotlib as plt
import matplotlib.pyplot
from scipy import optimize
from sklearn.linear_model import HuberRegressor, LinearRegression

hotspots=pd.read_csv('NYC_Wi-Fi_Hotspot_Locations.csv')
print("Question 1: {:d}".format(len(hotspots["Provider"].drop_duplicates())))

bronx_hotspots = hotspots[hotspots["Borough Name"]=="Bronx"]
print("Question 2: {:d}".format(bronx_hotspots["Provider"].value_counts()[1]))

# Convert to upper to ensure capitalization doesn't matter. >-1 will include NaN's and cases where "park" isn't found.
# Ignore blanks in total number because we don't know if they are in parks or not.
print("Question 3: {:.10f}".format(float(len(hotspots[hotspots["Name"].str.upper().str.find("PARK")>-1]))/len(hotspots.dropna(subset=["Name"]))))

# Use "not found" or -1 comparison here- we cannot know the hotspot is in a library if there is no location type.
not_library_hotspots = hotspots[hotspots["Location_T"].str.upper().str.find("LIBRARY")==-1]
# It is free if remarks are blank, or just say the serial number. Otherwise, the remarks explain time/data/device limitations.
n_free = len(not_library_hotspots[not_library_hotspots["Type"].str.upper()=="FREE"])
print("Question 4: {:.10f}".format(float(n_free)/len(not_library_hotspots)))

population = pd.read_csv("Census_Demographics_at_the_Neighborhood_Tabulation_Area__NTA__level.csv")
nta_counts = hotspots["Neighborhood Tabulation Area Code (NTACODE)"].value_counts()
nta_counts_trimmed = nta_counts[nta_counts>=30]
hotspots_per_capita = []
# Save hotspots per capita for each population value in which the NTA code matches one of the codes with at least 30 hotspots
for ind in nta_counts_trimmed.index:
    p = population[population["Geographic Area - Neighborhood Tabulation Area (NTA)* Code"]==ind]
    hotspots_per_capita.append(float(nta_counts_trimmed[ind])/p.loc[p.index[0]]["Total Population 2010 Number"])
# Find n where length of list is 2n+1
n=math.floor(len(hotspots_per_capita)/2)
hotspots_per_capita.sort()
Q1=statistics.median(hotspots_per_capita[:n])
Q3=statistics.median(hotspots_per_capita[-n:])
print("Question 5: {:.10f}".format(Q3-Q1))

# Start by eliminating hotspots at the same location- nearest three hotspots shouldn't include ones at the same
# exact point where distance=0, and I'm interpreting the question as asking for the three nearest hotspot points,
# so that if there were three hotspots in the same place at 100 feet away, they would only be counted once.
# I'm also interpreting "How far do you need to walk between hotspots" as "Don't weight multiple hotspots in one
# location as more important than one hotspot", which is why I'm dropping duplicates now instead of after finding
# distances for each hotspot.
hotspot_locs = hotspots.drop_duplicates(subset="Location (Lat, Long)")
avg_three_nearest = []
for ind in hotspot_locs.index:
    latitude=hotspot_locs["Latitude"][ind]
    longitude=hotspot_locs["Longitude"][ind]
    # delta phi and delta lambda from the wikipedia formula. latitudes are clearly given in degrees, so convert
    # to radians.
    dphi = (hotspot_locs["Latitude"]-latitude)*np.pi/180.0
    dlambda = (hotspot_locs["Longitude"]-longitude)*np.pi/180.0
    phi_m = (hotspot_locs["Latitude"]+latitude)*(np.pi/180.0)/2.0
    # Convert Earth's radius to feet
    earth_radius = 6371*3280.84
    D=earth_radius*np.sqrt(dphi**2 + (dlambda * np.cos(phi_m))**2)
    # Sorting is less efficient than searching for mins, but only costs a second or two and is easier code to read
    D.sort_values(inplace=True, ignore_index=True)
    # Ignore first value because this is always zero (self)
    avg_three_nearest.append(np.mean(D[1:3]))
print("Question 6: {:.10f}".format(np.median(avg_three_nearest)))

# Start by eliminating dates that don't make sense
valid_dates = pd.to_datetime(hotspots["Activated"],errors="coerce").dropna()
print("Question 7: {:.10f}".format(float(valid_dates.dt.weekday.value_counts().max())/len(valid_dates)))

# Bin all months into the first day and count up values
monthyear = pd.DataFrame(pd.to_datetime(valid_dates.dt.strftime("%m-%Y")).value_counts().sort_index())
monthyear_trimmed = monthyear[monthyear.index<pd.to_datetime("2018-07-01")]
# Get days from start by comparing difference between months. Binning by first day will weight each month by
# its length relative to 30.5 days, which is correct because this will eliminate inaccuracies due to month length
# variation.
first_date=monthyear.index[0]
x = ((monthyear_trimmed.index-first_date).days)/30.5
y = np.array(monthyear_trimmed.Activated.to_list())
# Polyfit seemed to be too sensitive to outliers, so I used a Huber regressor which is supposed to
# be insensitive to outliers.
huber = HuberRegressor(epsilon=1.0)
huber.fit(np.expand_dims(x,1), np.expand_dims(y,1).ravel())
print("Question 8: {:.10f}".format(huber.coef_[0]))


Question 1: 17
Question 2: 120
Question 3: 0.1294917449
Question 4: 0.8121778351
Question 5: 0.0008650662
Question 6: 241.4857557857
Question 7: 0.2436882547
Question 8: 2.4097928147
