In [54]:
import os
import pandas as pd
import requests as rq
import itertools
from bs4 import BeautifulSoup
from dotenv import load_dotenv

In [57]:
# Read secrets from the local .env file so the API key never appears in the notebook.
load_dotenv('./.env')
gkey = os.getenv('gkey')

# Fail fast: without a key every geocoding request made with `gkey` would be
# rejected, and the failure would only show up later while parsing responses.
if not gkey:
    raise RuntimeError(
        "Google API key not found: define 'gkey' in ./.env or in the environment."
    )

# Scraping boroughs, neighborhoods, and ZIP codes for New York, NY

In [6]:
# Download the page whose table cells are parsed below and build a
# BeautifulSoup tree from it.
url = 'https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm'

# A timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as data.
response = rq.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')

In [49]:
# Table cells are selected by their `headers` attribute:
# header1 -> boroughs, header2 -> neighborhoods, header3 -> comma-separated zipcodes.
# Surrounding whitespace (spaces, newlines, tabs) is trimmed from every value.
boroughs = [cell.text.strip() for cell in soup.find_all('td', headers='header1')]
neighborhoods = [cell.text.strip() for cell in soup.find_all('td', headers='header2')]

# zipcodes grouped by neighborhood (2d); empty tokens (e.g. from a trailing
# comma) are dropped so every entry is an actual code
zipcodes = [
    [code.strip() for code in cell.text.split(',') if code.strip()]
    for cell in soup.find_all('td', headers='header3')
]

# flattened zipcodes (1d): document order is preserved and duplicates are NOT removed
zipcodes_1d = [code for group in zipcodes for code in group]

In [167]:
# One-column frame of borough names, in the order they were scraped.
borough_df = pd.DataFrame({'borough_name': boroughs})
borough_df

Unnamed: 0,borough_name
0,Bronx
1,Brooklyn
2,Manhattan
3,Queens
4,Staten Island


In [191]:
# One-column frame of neighborhood names, in the order they were scraped.
neighborhood_df = pd.DataFrame({'neighborhood_name': neighborhoods})

In [170]:
# One-column frame built from the flattened zipcode list.
zipcode_df = pd.DataFrame({'zipcode': zipcodes_1d})
zipcode_df

Unnamed: 0,zipcode
0,10453
1,10457
2,10460
3,10458
4,10467
...,...
173,10312
174,10301
175,10304
176,10305


# Getting New York subway station addresses and ZIP codes via the Google reverse geocoding API

In [172]:
# Load the subway station export; the `the_geom` column holds each station's
# location as a "POINT (<lon> <lat>)" string.
subway_csv_path = 'Resources/DOITT_SUBWAY_STATION_01_13SEPT2010.csv'
subway_df = pd.read_csv(subway_csv_path)
subway_df

Unnamed: 0,URL,OBJECTID,NAME,the_geom,LINE,NOTES
0,http://web.mta.info/nyct/service/,1,Astor Pl,POINT (-73.99106999861966 40.73005400028978),4-6-6 Express,"4 nights, 6-all times, 6 Express-weekdays AM s..."
1,http://web.mta.info/nyct/service/,2,Canal St,POINT (-74.00019299927328 40.71880300107709),4-6-6 Express,"4 nights, 6-all times, 6 Express-weekdays AM s..."
2,http://web.mta.info/nyct/service/,3,50th St,POINT (-73.98384899986625 40.76172799961419),1-2,"1-all times, 2-nights"
3,http://web.mta.info/nyct/service/,4,Bergen St,POINT (-73.97499915116808 40.68086213682956),2-3-4,"4-nights, 3-all other times, 2-all times"
4,http://web.mta.info/nyct/service/,5,Pennsylvania Ave,POINT (-73.89488591154061 40.66471445143568),3-4,"4-nights, 3-all other times"
...,...,...,...,...,...,...
468,http://web.mta.info/nyct/service/,469,Coney Island - Stillwell Av,POINT (-73.9812359981396 40.57728100006751),D-F-N-Q,"D,F,N,Q-all times"
469,http://web.mta.info/nyct/service/,470,34th St - Hudson Yards,POINT (-74.00219709442206 40.75544635961596),7-7 Express,"7-all times, 7 Express-rush hours AM westbound..."
470,http://web.mta.info/nyct/service/,641,72nd St,POINT (-73.95836178682246 40.76880251014895),Q,Q-all times
471,http://web.mta.info/nyct/service/,642,86th St,POINT (-73.95177090964917 40.77786104333163),Q,Q-all times


In [140]:
# Reverse geocode every station.
# `the_geom` is stored as "POINT (<lon> <lat>)", but the Geocoding API's
# `latlng` parameter wants "<lat>,<lon>", so the two numbers are swapped.
geocode_endpoint = 'https://maps.googleapis.com/maps/api/geocode/json'

subway_responses = []
station_total = len(subway_df)

# One Session reuses the underlying connection across all requests.
with rq.Session() as session:
    for station_no, coord in enumerate(subway_df.the_geom, start=1):
        # Drop the "POINT" tag and the parentheses; split() without arguments
        # tolerates repeated whitespace between the two numbers.
        lon, lat = coord.replace('POINT', '').strip(' ()').split()
        coords = f'{lat},{lon}'

        print(f'Reverse geocoding station {station_no}/{station_total}')

        # `params=` lets requests build and escape the query string, and the
        # timeout stops one stalled request from blocking the whole run.
        # One response is stored per station, in row order, because a later
        # cell pairs them with `subway_df` by position.
        subway_responses.append(
            session.get(
                geocode_endpoint,
                params={'latlng': coords, 'key': gkey},
                timeout=30,
            )
        )

In [188]:
def _address_and_zip(payload):
    """Extract ``(street_address, zipcode)`` from one geocoding JSON payload.

    Only the first entry of ``payload['results']`` is used. Returns
    ``(None, None)`` when there are no results, so the caller can still emit
    exactly one row per response.
    """
    results = payload.get('results') or []
    if not results:
        return None, None

    best = results[0]
    parts = best.get('formatted_address', '').split(',')

    # Street address: the first comma-separated piece alone when the first two
    # pieces already mention 'New York'; otherwise the first two pieces
    # concatenated (e.g. "601 Livonia Ave" + " Brooklyn").
    leading = ''.join(parts[:2])
    street = parts[0] if 'New York' in leading else leading

    # Zipcode: the second-to-last piece normally looks like " NY 10003".
    zipcode = None
    if len(parts) >= 2:
        candidate = parts[-2].strip('NY ')
        if candidate.isdigit():
            zipcode = candidate
    if zipcode is None:
        # Fall back to the structured component tagged 'postal_code', if any.
        zipcode = next(
            (
                component.get('long_name')
                for component in best.get('address_components', [])
                if 'postal_code' in component.get('types', [])
            ),
            None,
        )

    return (street or None), zipcode


subway_address_cols = {
    'Address': [],
    'Zipcode': []
}

for resp in subway_responses:
    # A failed HTTP call is treated as "no results" so the two lists stay
    # aligned, row for row, with the responses (and therefore the stations).
    payload = resp.json() if resp.ok else {}
    street, zipcode = _address_and_zip(payload)
    subway_address_cols['Address'].append(street)
    subway_address_cols['Zipcode'].append(zipcode)

In [189]:
# Turn the parsed columns into a frame and attach each station's OBJECTID.
# The ids are matched by row index, which relies on the parsed rows being in
# the same order as `subway_df`.
subway_addresses_df = pd.DataFrame(subway_address_cols).assign(
    OBJECTID=subway_df['OBJECTID']
)

In [190]:
# Join station metadata with the geocoded address data on OBJECTID, keep only
# the columns of interest, and give them presentation-friendly names.
subway_final_df = (
    subway_df
    .merge(subway_addresses_df, on='OBJECTID', how='inner')
    .loc[:, ['OBJECTID', 'NAME', 'Address', 'Zipcode', 'LINE', 'NOTES']]
    .rename(
        columns={
            'OBJECTID': 'StationID',
            'NAME': 'StationName',
            'LINE': 'Line',
            'NOTES': 'Notes',
        }
    )
)
subway_final_df

Unnamed: 0,StationID,StationName,Address,Zipcode,Line,Notes
0,1,Astor Pl,E 8th St & Lafayette St,10003,4-6-6 Express,"4 nights, 6-all times, 6 Express-weekdays AM s..."
1,2,Canal St,127A Lafayette St,10013,4-6-6 Express,"4 nights, 6-all times, 6 Express-weekdays AM s..."
2,3,50th St,1638 Broadway,10019,1-2,"1-all times, 2-nights"
3,4,Bergen St,Bergen Street Station Brooklyn,11217,2-3-4,"4-nights, 3-all other times, 2-all times"
4,5,Pennsylvania Ave,601 Livonia Ave Brooklyn,11207,3-4,"4-nights, 3-all other times"
...,...,...,...,...,...,...
468,469,Coney Island - Stillwell Av,Coney Island - Stillwell Av Brooklyn,11224,D-F-N-Q,"D,F,N,Q-all times"
469,470,34th St - Hudson Yards,11 Av/W 34 St,10001,7-7 Express,"7-all times, 7 Express-rush hours AM westbound..."
470,641,72nd St,2 Av/E 72 St,10021,Q,Q-all times
471,642,86th St,86th St 300-398 East 86th St,10028,Q,Q-all times
