# Determining Boro and Neighborhood by station coordinates + EDA
This exercise was done using the `data/NY_2019.csv` dataset (the path loaded below). Revisit the data cleaning section if a different dataset is used.

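To run only the EDA, run the imports cell, skip the "Generate Data" section, and jump to "Docking Station - Geo EDA", which loads the previously exported locations file.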

TODO:
- figure out why the Bronx does not appear among the boros

In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Generate Data

## Import Station Data & Reverse Geocode

In [None]:
# Load the start-station columns of the trip data and reduce them to one row per station.
datapath = "data/NY_2019.csv"

# source column -> short column name used in the rest of the notebook
station_columns = {
    "startstationid": "stationid",
    "startstationname": "stationname",
    "startstationlatitude": "latitude",
    "startstationlongitude": "longitude",
}
col_select = list(station_columns)
# ids and names are read as categoricals
col_types = {"startstationid": "category", "startstationname": "category"}

stations = (
    pd.read_csv(datapath, index_col=False, usecols=col_select, dtype=col_types)
    .rename(columns=station_columns)
    # keep the first row seen for every station id ...
    .drop_duplicates(subset=["stationid"])
    # ... then discard the row whose station id is missing
    .dropna(subset=["stationid"])
    .set_index("stationid")
)
stations

In [None]:
# Reverse-geocode every station's coordinates into an address dict.
# Requests go through a rate limiter (at least 1 second apart, no retries).
geolocator = Nominatim(user_agent="bikegeocode")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1, max_retries=0)

# One address dict per row of `stations`, in the same order, so the list can
# later be aligned with `stations.index`.
locations_lst = []
for _, row in stations.iterrows():
    result = reverse("{}, {}".format(row["latitude"], row["longitude"]))
    # The rate limiter swallows request errors by default and hands back None,
    # and the geocoder itself returns None when nothing matches. Store an empty
    # record in that case (it becomes an all-NaN row) instead of crashing
    # part-way through a long run and losing row alignment.
    if result is None:
        locations_lst.append({})
    else:
        locations_lst.append(result.raw.get("address", {}))

# preview the raw address fields of the first few stations
pd.DataFrame(locations_lst[:10])

In [None]:
# Keep only the address fields of interest, keyed by station id, and attach
# the station name and coordinates.

# geocoder field -> column name used in this notebook
geo_fields = {"neighbourhood": "neighborhood", "suburb": "boro", "postcode": "zipcode"}

locations = pd.DataFrame(
    data=locations_lst,
    index=stations.index,
    columns=list(geo_fields),
).rename(columns=geo_fields)

for label_col in ("neighborhood", "boro"):
    locations[label_col] = locations[label_col].astype("category")

station_cols = ["stationname", "latitude", "longitude"]
locations[station_cols] = stations[station_cols]

# keep only the first five characters of the postcode
locations["zipcode"] = locations["zipcode"].str[:5]

# turn the stationid index back into an ordinary column
locations = locations.reset_index()
locations

Where did the Bronx go?

## Clean Locations Data

!!!CAUTION!!!

The manual fixes in this section were derived from the `data/NY_2019.csv` data. If a new dataset is provided, this section must be revisited.

In [None]:
# summarize column dtypes and non-null counts to see which columns need cleaning
locations.info()

#### Stations

In [None]:
# show any rows that have no station id
locations.loc[locations.stationid.isna()]
# these should have been dropped when importing stations dataframe - this is just a confirmation

#### Zipcode

In [None]:
# show stations whose geocoded address did not include a postcode
locations.loc[locations.zipcode.isna()]

In [None]:
# Zip codes looked up by hand on Google Maps (picking a location near the
# street intersection) for the stations that came back without one.
manual_zipcodes = {
    "524.0": "10036",
    "3263.0": "10003",
    "3443.0": "10019",
}
for station, zipcode in manual_zipcodes.items():
    locations.loc[locations.stationid == station, "zipcode"] = zipcode

In [None]:
# re-run the missing-zipcode check after the manual fill (expected to show no rows)
locations.loc[locations.zipcode.isna()]

#### Neighborhoods

In [None]:
# create a dictionary for imputing neighborhood with mode value of a given zip
# (keys are zipcodes; each value is whatever pd.Series.mode yields for that zip's neighborhoods)
neighborhood_dict = (
    locations.groupby("zipcode")["neighborhood"].agg(pd.Series.mode).to_dict()
)
# display the mapping for manual inspection
neighborhood_dict

# There still appears to be a lot of uncertainty in which neighborhood is associated with a given zip:
# some zips have several mode values, while some have none.
# No further work with neighborhoods at this time.

#### Boros

In [None]:
# create a dictionary for imputing boro with mode value of a given zip
# NOTE(review): pd.Series.mode can yield zero or several values for a group, so an
# entry is not guaranteed to be a single boro name - inspect the output before mapping
boro_dict = locations.groupby("zipcode")["boro"].agg(pd.Series.mode).to_dict()
# display the mapping for manual inspection
boro_dict

In [None]:
# Zip codes whose boro could not be derived from the data; set by hand.
boro_dict.update(
    {
        "11209": "Brooklyn",
        "11227": "Brooklyn",
        "11232": "Brooklyn",
    }
)
boro_dict

In [None]:
# Fill missing boro values from the zipcode -> boro lookup, then list whatever
# is still missing.
boro_from_zip = locations["zipcode"].map(boro_dict)
locations["boro"] = locations["boro"].fillna(boro_from_zip)
locations[locations["boro"].isna()]

In [None]:
# review value counts: number of stations per boro label, to spot inconsistent labels
locations.boro.value_counts()

In [None]:
# Fold the "Queens County" label into "Queens".
is_queens_county = locations["boro"] == "Queens County"
locations.loc[is_queens_county, "boro"] = "Queens"

# Round-trip through string so the categorical is rebuilt from the values
# that are actually present.
locations["boro"] = locations["boro"].astype("string").astype("category")
locations["boro"].value_counts()

## Review

In [None]:
# final check of dtypes and non-null counts after cleaning
locations.info()

## Export to file

In [None]:
# update export path as needed

# drop the last four characters of the data path (its ".csv" ending) and tag
# the result as the locations output
export_stem = datapath[:-4] + "_locations"

# CSV copy, without the row index
exportpath = export_stem + ".csv"
locations.to_csv(exportpath, index=False)

# Parquet copy
exportpath = export_stem + ".parquet"
locations.to_parquet(exportpath)

# Where did the Bronx Go?

In [None]:
# Reload the start-station columns, this time keeping every row
# (no de-duplication, no dropping of missing ids).
datapath = "data/NY_2019.csv"

col_select = [
    "startstationid",
    "startstationname",
    "startstationlatitude",
    "startstationlongitude",
]
col_types = {name: "category" for name in ("startstationid", "startstationname")}

# short column names, listed in the same order as col_select
short_names = ["stationid", "stationname", "latitude", "longitude"]

stations = pd.read_csv(
    datapath, index_col=False, usecols=col_select, dtype=col_types
).rename(columns=dict(zip(col_select, short_names)))

In [None]:
# select the rows that have no station id
# NOTE(review): `stations` is not de-duplicated in this section, so this can hold many
# rows for the same coordinates; each row is geocoded below at >= 1 second per request
wherebronx = stations.loc[stations.stationid.isna()]

In [None]:
# (rows, columns): how many records have no station id
wherebronx.shape

In [None]:
# Reverse-geocode the coordinates of every record that has no station id.
# Requests go through a rate limiter (at least 1 second apart, no retries).
geolocator = Nominatim(user_agent="bikegeocode")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1, max_retries=0)

# One address dict per row of `wherebronx`, in the same order.
locations_lst = []
for _, row in wherebronx.iterrows():
    result = reverse("{}, {}".format(row["latitude"], row["longitude"]))
    # The rate limiter swallows request errors by default and hands back None,
    # and the geocoder itself returns None when nothing matches. Store an empty
    # record in that case instead of crashing part-way through the run.
    if result is None:
        locations_lst.append({})
    else:
        locations_lst.append(result.raw.get("address", {}))

In [None]:
# tabulate the address dicts returned by the geocoder, one row per missing-id record
miss_stationid = pd.DataFrame(locations_lst)
miss_stationid

In [None]:
# count the missing-id records per "suburb" value (the address field used as boro above)
miss_stationid.suburb.value_counts()

# Docking Station - Geo EDA

In [None]:
# load data if not generated above
# (this is the parquet file written by the export cell when datapath is "data/NY_2019.csv")
locations = pd.read_parquet("data/NY_2019_locations.parquet")

In [None]:
# preview the first rows of the locations table
locations.head()

In [None]:
# column dtypes and non-null counts of the loaded table
locations.info()

## Neighborhood

In [None]:
# Distribution of neighborhood sizes: how many neighborhoods have a given
# number of docking stations.
plt.figure(figsize=(15, 8))

# number of docking stations in each neighborhood (computed once, used twice)
stations_per_neighborhood = locations.neighborhood.value_counts()

ax = sns.histplot(
    stations_per_neighborhood,
    # one bin per distinct station count
    bins=stations_per_neighborhood.nunique(),
)
# The histogram's x axis carries the plotted values (stations per neighborhood)
# and its y axis counts how many neighborhoods fall in each bin.
ax.set(xlabel="Number of Docking Stations", ylabel="Count of Neighborhoods")

In [None]:
# Bar chart of the 20 neighborhoods with the most docking stations.
plt.figure(figsize=(15, 8))

# value_counts sorts by frequency, so the first 20 index labels are the top 20
top_neighborhoods = locations.neighborhood.value_counts().index[:20]

ax = sns.countplot(x=locations.neighborhood, order=top_neighborhoods)
# slant the tick labels and anchor them on their right end
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment="right",
)
ax.set(title="Count of Stations per neighborhood [top 20]")

## Boro

In [None]:
# Bar chart of docking stations per boro, most common boro first.
plt.figure(figsize=(15, 8))

boro_order = locations.boro.value_counts().index
ax = sns.countplot(x=locations.boro, order=boro_order)
# tick labels are left unrotated here
ax.set(title="Count of Stations per boro")