In [152]:
import os
import pandas as pd
import numpy as np
import requests as rq
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [75]:
load_dotenv('./.env')
gkey = os.getenv('gkey')
pkey = os.getenv('pkey')
engine = create_engine(f'postgresql://postgres:{pkey}@localhost:5432/etl')

# Scraping for borough, neighborhood, and zipcodes in New York, NY 

In [133]:
url = 'https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm'
response = rq.get(url)
soup = BeautifulSoup(response.text, 'lxml')

In [134]:
boroughs = [b.text for b in soup.find_all('td', headers='header1')]
neighborhoods = [n.text for n in soup.find_all('td', headers='header2')]

# zipcodes grouped by neighborhood (2d)
zipcodes = [z.text.split(',') for z in soup.find_all('td', headers='header3')] 

# unique zipcodes (1d)
zipcodes_1d = [z.strip(' ') for zip_list in zipcodes for z in zip_list]

In [135]:
borough_df = pd.DataFrame(boroughs, columns=['borough'])
borough_df['boroughID'] = np.arange(borough_df.shape[0])
borough_df = borough_df[['boroughID', 'borough']]
borough_df

Unnamed: 0,boroughID,borough
0,0,Bronx
1,1,Brooklyn
2,2,Manhattan
3,3,Queens
4,4,Staten Island


In [136]:
neighborhood_df = pd.DataFrame(neighborhoods, columns=['neighborhood_name'])

In [137]:
zipcode_df = pd.DataFrame(zipcodes_1d, columns=['zipcode'])
zipcode_df = zipcode_df.sort_values('zipcode')
zipcode_df['zipcodeID'] = np.arange(zipcode_df.shape[0])
zipcode_df = zipcode_df[['zipcodeID', 'zipcode']]
zipcode_df.to_csv('Resources/zipcodes.csv')
zipcode_df.head()

Unnamed: 0,zipcodeID,zipcode
67,0,10001
88,1,10002
89,2,10003
82,3,10004
83,4,10005


In [153]:
def matchZipID(df, colname):
    ids = []
    for zipcode in df[colname]:
        for i, code in enumerate(zipcode_df.zipcode):
            if zipcode == code:
                ids.append(zipcode_df.iloc[i, 0].round())
#                 print(zipcode_df.iloc[i, 0].round())
    return pd.DataFrame(ids)

# Getting New York Subway addresses/zipcodes via Google reverse geocoding API

In [85]:
subway_df = pd.read_csv('Resources/DOITT_SUBWAY_STATION_01_13SEPT2010.csv')
subway_df

Unnamed: 0,URL,OBJECTID,NAME,the_geom,LINE,NOTES
0,http://web.mta.info/nyct/service/,1,Astor Pl,POINT (-73.99106999861966 40.73005400028978),4-6-6 Express,"4 nights, 6-all times, 6 Express-weekdays AM s..."
1,http://web.mta.info/nyct/service/,2,Canal St,POINT (-74.00019299927328 40.71880300107709),4-6-6 Express,"4 nights, 6-all times, 6 Express-weekdays AM s..."
2,http://web.mta.info/nyct/service/,3,50th St,POINT (-73.98384899986625 40.76172799961419),1-2,"1-all times, 2-nights"
3,http://web.mta.info/nyct/service/,4,Bergen St,POINT (-73.97499915116808 40.68086213682956),2-3-4,"4-nights, 3-all other times, 2-all times"
4,http://web.mta.info/nyct/service/,5,Pennsylvania Ave,POINT (-73.89488591154061 40.66471445143568),3-4,"4-nights, 3-all other times"
...,...,...,...,...,...,...
468,http://web.mta.info/nyct/service/,469,Coney Island - Stillwell Av,POINT (-73.9812359981396 40.57728100006751),D-F-N-Q,"D,F,N,Q-all times"
469,http://web.mta.info/nyct/service/,470,34th St - Hudson Yards,POINT (-74.00219709442206 40.75544635961596),7-7 Express,"7-all times, 7 Express-rush hours AM westbound..."
470,http://web.mta.info/nyct/service/,641,72nd St,POINT (-73.95836178682246 40.76880251014895),Q,Q-all times
471,http://web.mta.info/nyct/service/,642,86th St,POINT (-73.95177090964917 40.77786104333163),Q,Q-all times


In [87]:
#long lat 
subway_responses = []
for i, coord in enumerate(subway_df.the_geom):
    coords = coord.strip('POINT ()').split(' ')
    coords = (',').join([coords[1], coords[0]]).strip(' ') #ensuring no spacing in coords 
    
    url = f'https://maps.googleapis.com/maps/api/geocode/json?latlng={coords}&key={gkey}'
    print(f'Reverse geocoding station with ID {subway_df.iloc[i,1]}')
    subway_responses.append(rq.get(url))

Reverse geocoding station with ID 1
Reverse geocoding station with ID 2
Reverse geocoding station with ID 3
Reverse geocoding station with ID 4
Reverse geocoding station with ID 5
Reverse geocoding station with ID 6
Reverse geocoding station with ID 7
Reverse geocoding station with ID 8
Reverse geocoding station with ID 9
Reverse geocoding station with ID 10
Reverse geocoding station with ID 11
Reverse geocoding station with ID 12
Reverse geocoding station with ID 13
Reverse geocoding station with ID 14
Reverse geocoding station with ID 15
Reverse geocoding station with ID 16
Reverse geocoding station with ID 17
Reverse geocoding station with ID 18
Reverse geocoding station with ID 19
Reverse geocoding station with ID 20
Reverse geocoding station with ID 21
Reverse geocoding station with ID 22
Reverse geocoding station with ID 23
Reverse geocoding station with ID 24
Reverse geocoding station with ID 25
Reverse geocoding station with ID 26
Reverse geocoding station with ID 27
Reverse ge

Reverse geocoding station with ID 220
Reverse geocoding station with ID 221
Reverse geocoding station with ID 222
Reverse geocoding station with ID 223
Reverse geocoding station with ID 224
Reverse geocoding station with ID 225
Reverse geocoding station with ID 226
Reverse geocoding station with ID 227
Reverse geocoding station with ID 228
Reverse geocoding station with ID 229
Reverse geocoding station with ID 230
Reverse geocoding station with ID 231
Reverse geocoding station with ID 232
Reverse geocoding station with ID 233
Reverse geocoding station with ID 234
Reverse geocoding station with ID 235
Reverse geocoding station with ID 236
Reverse geocoding station with ID 237
Reverse geocoding station with ID 238
Reverse geocoding station with ID 239
Reverse geocoding station with ID 240
Reverse geocoding station with ID 241
Reverse geocoding station with ID 242
Reverse geocoding station with ID 243
Reverse geocoding station with ID 244
Reverse geocoding station with ID 245
Reverse geoc

Reverse geocoding station with ID 436
Reverse geocoding station with ID 437
Reverse geocoding station with ID 438
Reverse geocoding station with ID 439
Reverse geocoding station with ID 440
Reverse geocoding station with ID 441
Reverse geocoding station with ID 442
Reverse geocoding station with ID 443
Reverse geocoding station with ID 444
Reverse geocoding station with ID 445
Reverse geocoding station with ID 446
Reverse geocoding station with ID 447
Reverse geocoding station with ID 448
Reverse geocoding station with ID 449
Reverse geocoding station with ID 450
Reverse geocoding station with ID 451
Reverse geocoding station with ID 452
Reverse geocoding station with ID 453
Reverse geocoding station with ID 454
Reverse geocoding station with ID 455
Reverse geocoding station with ID 456
Reverse geocoding station with ID 457
Reverse geocoding station with ID 458
Reverse geocoding station with ID 459
Reverse geocoding station with ID 460
Reverse geocoding station with ID 461
Reverse geoc

In [88]:
subway_address_cols = {
    'Address': [],
    'Zipcode': []
}

for resp in subway_responses:
    json = resp.json()
    try:
        full_address = json['results'][0]['formatted_address'].split(',')
        # Street full_address
        if 'New York' in full_address[0] + full_address[1]:
            subway_address_cols['Address'].append(full_address[0])
        else:
            subway_address_cols['Address'].append(full_address[0] + full_address[1])
        # zipcode    
        subway_address_cols['Zipcode'].append(full_address[-2].strip('NY '))
    except IndexError:
        subway_address_cols['Zipcode'].append(np.nan)
        subway_address_cols['Address'].append(np.nan)

In [89]:
subway_addresses_df = pd.DataFrame(subway_address_cols)
subway_addresses_df['OBJECTID'] = subway_df['OBJECTID']

In [151]:
subway_final_df = pd.merge(subway_df, subway_addresses_df, on='OBJECTID', how='inner')
subway_final_df = subway_final_df[['OBJECTID', 'NAME', 'Address', 'Zipcode', 'LINE']]
subway_final_df = subway_final_df[subway_final_df.Zipcode != '']
subway_final_df = subway_final_df.rename(columns={
    'OBJECTID': 'stationID', 
    'NAME': 'stationname',
    'Zipcode': 'zipcodeID',
    'Address': 'address',
    'LINE': 'line'})
# subway_final_df = subway_final_df.dropna()
subway_final_df['zipcodeID'] = matchZipID(subway_final_df, 'zipcodeID')
subway_final_df = subway_final_df.dropna()
subway_final_df['zipcodeID'] = subway_final_df['zipcodeID'].apply(round)
subway_final_df

Unnamed: 0,stationID,stationname,address,zipcodeID,line
0,1,Astor Pl,E 8th St & Lafayette St,2,4-6-6 Express
1,2,Canal St,127A Lafayette St,11,4-6-6 Express
2,3,50th St,1638 Broadway,16,1-2
3,4,Bergen St,Bergen Street Station Brooklyn,103,2-3-4
4,5,Pennsylvania Ave,601 Livonia Ave Brooklyn,93,3-4
...,...,...,...,...,...
452,453,High St,High Street - Brooklyn Bridge Brooklyn,32,A-C
453,454,Lafayette Ave,Fulton St/S Portland Av Brooklyn,110,A-C
454,455,President St,845A Nostrand Ave. Brooklyn,0,2-5
456,457,Bleecker St,Bleecker St,25,4-6-6 Express


## Uploading to postgres

In [69]:
borough_df.to_sql(name='boroughs', con=engine, if_exists='replace', index=False)

In [71]:
zipcode_df.to_sql(name='zipcodes', con=engine, if_exists='replace', index=False)

In [72]:
subway_final_df.to_sql(name='subways', con=engine, if_exists='replace', index=False)