In [48]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests


In [49]:
nyc_raw = requests.get('https://github.com/nychealth/coronavirus-data/blob/master/tests-by-zcta.csv').text
nyc_soup = BeautifulSoup(nyc_raw, 'lxml')

In [50]:
soup_content = nyc_soup.find('div', class_='application-main')
nyc_table = soup_content.find('table', class_='js-csv-data csv-data js-file-line-container')
table = nyc_table.find_all('tbody')
rows = table[0].find_all('tr')

In [51]:
zipcodes = []
positive_tests = []
percentage_total = []

for row in rows:    
    columns = row.find_all('td')
    
    zipcode = columns[1].text
    positive_test = columns[2].text
    percentage = columns[4].text
    
    zipcodes.append(zipcode)
    positive_tests.append(positive_test)
    percentage_total.append(percentage)

In [52]:
df_covid19 = pd.DataFrame(list(zip(zipcodes, positive_tests, percentage_total)), \
                  columns =['Zip Code', 'POSITIVES', 'PERCENT_POSI']) 
df_covid19.drop([0], axis = 0, inplace = True)
df_covid19.reset_index(drop=True).head(10)

Unnamed: 0,Zip Code,POSITIVES,PERCENT_POSI
0,10001,320,28.52
1,10002,930,38.05
2,10003,414,27.02
3,10004,28,26.17
4,10005,54,21.18
5,10006,23,21.3
6,10007,48,19.92
7,10009,631,30.09
8,10010,242,20.88
9,10011,477,30.04


In [53]:
zipcodes.pop(0)

'NA'

In [7]:
def url_creator(zip_code,category):
    url = 'https://census.missouri.edu/acs/profiles/report.php?p=37&g=86000US' + zip_code + '&s=' + str(category)
    return url

In [9]:
degrees = []
schools = []

for zipcode in zipcodes:
    
    link = requests.get(url_creator(zipcode,'Social')).text
    link_soup = BeautifulSoup(link, 'lxml')
    link_content = link_soup.find('main')
    zip_table = link_content.find('table', class_='profileTable')
    table = zip_table.find_all('tr')

    school_raw = table[69].find_all('td')
    school = float(school_raw[2].text)

    degree_raw = table[80].find_all('td')
    degree = float(degree_raw[2].text)

    degrees.append(degree)
    schools.append(school)
     

In [10]:
len(zipcodes) == len(degrees) == len(schools)

True

In [11]:
# degrees 
# schools

In [12]:
elderlys = []
whites = []
blacks = []
asians = []

for zipcode in zipcodes:
    link = requests.get(url_creator(zipcode,'Demographic')).text
    link_soup = BeautifulSoup(link, 'lxml')
    link_content = link_soup.find('main')
    zip_table = link_content.find('table', class_='profileTable')
    table = zip_table.find_all('tr')

    elderly_raw = table[24].find_all('td')
    elderly = float(elderly_raw[2].text)

    white_raw = table[37].find_all('td')
    white = float(white_raw[2].text)

    black_raw = table[38].find_all('td')
    black = float(black_raw[2].text)

    asian_raw = table[40].find_all('td')
    asian = float(asian_raw[2].text)

    elderlys.append(elderly)
    whites.append(white)
    blacks.append(black)
    asians.append(asian)


In [13]:
len(zipcodes) == len(elderlys) == len(whites) == len(blacks) == len(asians)

True

In [14]:
# elderlys
# whites
# blacks
# asians

In [37]:
minorities = list(np.around(np.add(blacks, asians),2))
len(minorities)

177

In [16]:
def clean_money(element):
    element = str(element)
    element = element.replace('$','') # removes $ sign
    element = element.replace(',','') # removes comma 
    element = float(element)          # changes data type to float
    return element

In [18]:
mean_incomes = []
med_incomes = []
povertys = []
unemploys = []


for zipcode in zipcodes:
    link = requests.get(url_creator(zipcode,'Economic')).text
    link_soup = BeautifulSoup(link, 'lxml')
    link_content = link_soup.find('main')
    zip_table = link_content.find('table', class_='profileTable')
    table = zip_table.find_all('tr')

    med_income_raw = table[20].find_all('td')
    med_income = clean_money(med_income_raw[1].text)

    mean_income_raw = table[21].find_all('td')
    mean_income = clean_money(mean_income_raw[1].text)
    
    poverty_raw = table[54].find_all('td')
    poverty = float(poverty_raw[2].text)

    unemploy_raw = table[75].find_all('td')
    unemploy = float(unemploy_raw[2].text)

    mean_incomes.append(mean_income)
    med_incomes.append(med_income)
    povertys.append(poverty)
    unemploys.append(unemploy)


In [20]:
len(zipcodes) == len(med_incomes) == len(povertys) == len(unemploys) == len(mean_incomes)

True

In [24]:
# med_incomes
# mean_incomes
# povertys
# unemploys

## Used for bs4 scraping

In [168]:
# Bachelor Degree or over percentage 
# degree = table[80].find_all('td')[2].text
# degree[2].text

In [69]:
# Enrolled in college or graduate school percentage 
# school = table[69].find_all('td')
# school[2].text

'63.5'

In [122]:
# 62 years or older percentage 
# elderly = table[24].find_all('td')
# elderly[2].text

'18.5'

In [120]:
# White population percentage 
# white = table[37].find_all('td')
# white[2].text

'62.9'

In [119]:
# Black or African population percentage 
# black = table[38].find_all('td')
# black[2].text

'6.2'

In [121]:
# Asian population percentage
# asian = table[40].find_all('td')
# asian[2].text

'25.2'

In [145]:
# Median household income 
# income = table[20].find_all('td')
# income[1].text

'$88,526'

In [150]:
# Persons below poverty percentage 
# poverty = table[54].find_all('td')
# poverty[2].text

'13.3'

In [152]:
# unemployment rate 
# unemployment = table[75].find_all('td')
# unemployment[2].text

'4.5'

In [39]:
df_covid19['DEGREES'] = degrees
df_covid19['ENROLLS'] = schools
df_covid19['ELDER'] = elderlys
df_covid19['WHITES'] = whites
df_covid19['BLACKS'] = blacks
df_covid19['ASIANS'] = asians
df_covid19['MINORITIES'] = minorities
df_covid19['MED_INCOMES'] = med_incomes
df_covid19['MEAN_INCOMES'] = mean_incomes
df_covid19['POVERTY'] = povertys
df_covid19['UNEMPLOYMENT'] = unemploys


In [40]:
df_covid19.head()

Unnamed: 0,Zip Code,POSITIVES,PERCENT_POSI,DEGREES,ENROLLS,ELDER,WHITES,BLACKS,ASIANS,MINORITIES,MED_INCOMES,MEAN_INCOMES,POVERTY,UNEMPLOYMENT
1,10001,318,29.17,68.0,63.5,18.5,62.9,6.2,25.2,31.4,88526.0,151628.0,13.3,4.5
2,10002,924,38.48,34.0,36.8,24.9,32.2,8.4,41.5,49.9,35859.0,68315.0,27.7,7.0
3,10003,413,27.46,80.2,84.2,15.1,75.9,5.8,13.5,19.3,112131.0,189885.0,8.5,5.7
4,10004,28,27.72,90.4,25.1,12.4,62.8,1.7,30.4,32.1,157645.0,218650.0,2.2,4.5
5,10005,54,21.6,88.4,52.2,1.3,73.4,2.5,17.0,19.5,173333.0,208186.0,9.5,3.1


In [41]:
API_KEY = '5e232949c48244eb888c484b236c9534'

In [42]:
import json 


latitudes = [] #initializes empty list to append latitude data
longitudes = [] #initializes empty list to append longitude data

for zipcode in zipcodes:
    location_input = str(zipcode) + " New York City, NY" # correct location input format 
    url = 'https://api.opencagedata.com/geocode/v1/json?q={}&key={}'.format(location_input, API_KEY) # correct url format
    obj = json.loads(requests.get(url).text)  # Gets the json file, stores in a dictionary

    results = obj['results'] # gets the 'results' key
    latitude = results[0]['geometry']['lat'] # gets the 'latitude' data 
    longitude = results[0]['geometry']['lng'] #gets the 'longitude' data

    latitudes.append(latitude) # Appends data to the list of latitudes
    longitudes.append(longitude) # Appends data to the list of longitudes

In [43]:
len(zipcodes) == len(latitudes) == len(longitudes)

True

In [44]:
df_covid19['Latitude'] = latitudes
df_covid19['Longitude'] = longitudes

In [45]:
df_covid19.tail()

Unnamed: 0,Zip Code,POSITIVES,PERCENT_POSI,DEGREES,ENROLLS,ELDER,WHITES,BLACKS,ASIANS,MINORITIES,MED_INCOMES,MEAN_INCOMES,POVERTY,UNEMPLOYMENT,Latitude,Longitude
173,11691,2031,49.13,25.8,22.8,15.3,38.9,46.8,3.5,50.3,46147.0,65467.0,21.4,9.4,40.608086,-73.752489
174,11692,536,45.0,23.4,22.5,15.3,22.6,60.9,5.7,66.6,44024.0,63908.0,25.4,12.9,40.59385,-73.796758
175,11693,299,42.84,26.4,30.9,16.8,63.3,19.9,4.0,23.9,59431.0,77799.0,14.0,5.2,40.598526,-73.817816
176,11694,645,41.24,43.4,27.6,25.2,81.3,6.8,3.5,10.3,80976.0,111206.0,9.9,5.4,40.580607,-73.83534
177,11697,86,38.74,50.8,26.8,30.1,95.4,0.6,0.6,1.2,103580.0,131013.0,3.1,4.8,40.559684,-73.915031


In [46]:
df_covid19 = df_covid19[['Zip Code','Latitude','Longitude','POSITIVES','PERCENT_POSI','DEGREES',  
                       'ENROLLS', 'ELDER', 'WHITES','BLACKS','ASIANS','MINORITIES','MED_INCOMES',
                         'MEAN_INCOMES','POVERTY','UNEMPLOYMENT']]
df_covid19.head()

Unnamed: 0,Zip Code,Latitude,Longitude,POSITIVES,PERCENT_POSI,DEGREES,ENROLLS,ELDER,WHITES,BLACKS,ASIANS,MINORITIES,MED_INCOMES,MEAN_INCOMES,POVERTY,UNEMPLOYMENT
1,10001,40.729825,-73.960752,318,29.17,68.0,63.5,18.5,62.9,6.2,25.2,31.4,88526.0,151628.0,13.3,4.5
2,10002,40.722313,-73.987709,924,38.48,34.0,36.8,24.9,32.2,8.4,41.5,49.9,35859.0,68315.0,27.7,7.0
3,10003,40.731603,-73.988488,413,27.46,80.2,84.2,15.1,75.9,5.8,13.5,19.3,112131.0,189885.0,8.5,5.7
4,10004,40.700741,-74.013467,28,27.72,90.4,25.1,12.4,62.8,1.7,30.4,32.1,157645.0,218650.0,2.2,4.5
5,10005,40.705636,-74.0089,54,21.6,88.4,52.2,1.3,73.4,2.5,17.0,19.5,173333.0,208186.0,9.5,3.1


In [47]:
df_covid19.to_csv('NYC_compiled_data (ACS 5 years).csv', index=False)

In [239]:
with open('nyc_zip_code.geojson', 'r') as jsonFile:
    geodata = json.load(jsonFile)
tmp = geodata
zips = df_covid19['Zip Code'].tolist()

geozips = []
for i in range(len(tmp['features'])):
    if tmp['features'][i]['properties']['postalcode'] in zips:
        geozips.append(tmp['features'][i])
        
new_json = dict.fromkeys(['type','features'])
new_json['type'] = 'FeatureCollection'
new_json['features'] = geozips

open('updated-file.json', 'w').write(
    json.dumps(new_json, sort_keys = True, indent = 4, separators = (',', ': '))
)

1677371