# Data Preprocessing and Cleaning

In [1]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import json
!pip install lxml



In [2]:
#Downloading the loading the NYC geographical data
!wget -q -O "newyork_data.json" https://cocl.us/new_york_dataset
with open("newyork_data.json") as json_data:
    nyc_data = json.load(json_data)

In [3]:
#Defining dataframe columns and instantiating the dataframe
column_names = ["Borough", "Neighborhood", "Latitude", "Longitude"]
neighborhoods = pd.DataFrame(columns = column_names)

In [4]:
#Filling the neighborhoods dataframe using a for loop
nyc_features = nyc_data["features"]

for data in nyc_features:
    borough = neighborhood_name = data["properties"]["borough"]
    neighborhood_name = data["properties"]["name"]
    
    neighborhood_latlon = data["geometry"]["coordinates"]
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({"Borough": borough, 
                                         "Neighborhood": neighborhood_name, 
                                         "Latitude": neighborhood_lat,
                                         "Longitude": neighborhood_lon}, 
                                        ignore_index = True)

In [5]:
#Examining the results of the dataframe
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [6]:
#Loading neighborhood names and zip code information into dataframe'
zipcode_df = pd.read_html("https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm")[0]

#Checking the zipcode dataframe
zipcode_df.head()

Unnamed: 0,Borough,Neighborhood,ZIP Codes
0,Bronx,Central Bronx,"10453, 10457, 10460"
1,Bronx,Bronx Park and Fordham,"10458, 10467, 10468"
2,Bronx,High Bridge and Morrisania,"10451, 10452, 10456"
3,Bronx,Hunts Point and Mott Haven,"10454, 10455, 10459, 10474"
4,Bronx,Kingsbridge and Riverdale,"10463, 10471"


In [7]:
#Separating the multiple zipcodes into separate rows
#Separating zip code with comma and space
modzip_df = zipcode_df.drop("ZIP Codes", axis = 1)
modzip_df = modzip_df.join(zipcode_df["ZIP Codes"].str.split(", |\\,", expand = True).stack().reset_index(level = 1, drop = True).rename("zip code"))

#Converting zip code column to type int
modzip_df["zip code"] = modzip_df["zip code"].astype(int)

#Checking the dataframe
modzip_df.head()

Unnamed: 0,Borough,Neighborhood,zip code
0,Bronx,Central Bronx,10453
0,Bronx,Central Bronx,10457
0,Bronx,Central Bronx,10460
1,Bronx,Bronx Park and Fordham,10458
1,Bronx,Bronx Park and Fordham,10467


In [8]:
#Separating the neighborhood names in the Neighborhood column
modzip_df = modzip_df.drop("Neighborhood", axis = 1)
modzip_df = modzip_df.join(zipcode_df["Neighborhood"].str.split(" and ", expand = True).stack().reset_index(level = 1, drop = True).rename("neighborhood"))
modzip_df.head()

Unnamed: 0,Borough,zip code,neighborhood
0,Bronx,10453,Central Bronx
0,Bronx,10457,Central Bronx
0,Bronx,10460,Central Bronx
1,Bronx,10458,Bronx Park
1,Bronx,10458,Fordham


In [9]:
#Generating zipcode to MODZCTA conversion dataframe
modzcta_df = pd.read_csv("https://raw.githubusercontent.com/nychealth/coronavirus-data/master/Geography-resources/ZCTA-to-MODZCTA.csv")
modzcta_df.head()

Unnamed: 0,ZCTA,MODZCTA
0,10001,10001
1,10002,10002
2,10003,10003
3,10004,10004
4,10005,10005


In [10]:
#Converting zipcode to MODZCTA using the conversion dataframe
modzip_df["zip code"] = modzip_df["zip code"].map(modzcta_df.set_index("ZCTA")["MODZCTA"])

#Removing rows that contain NA values as the covid-19 data is based on modzcta. Any zip code that can't be converted is eliminated
modzip_df.dropna(inplace = True)

#Converting zip code column back to type int
modzip_df["zip code"] = modzip_df["zip code"].astype(int)

In [11]:
#Importing and loading covid-19 data by modzcta
covid19_df = pd.read_csv("https://raw.githubusercontent.com/nychealth/coronavirus-data/master/tests-by-zcta.csv")
covid19_df.head()

Unnamed: 0,modzcta,Positive,Total,modzcta_cum_perc_pos
0,,6727,7603,88.5
1,10001.0,365,2373,15.4
2,10002.0,1053,5047,20.9
3,10003.0,450,3697,12.2
4,10004.0,31,256,12.1


In [12]:
#Removing modzcta_cum_perc_pos and Positive columns as we only need the total number of cases for the purpose of this project
covid19_df = covid19_df.drop(["Positive", "modzcta_cum_perc_pos"], axis = 1)

#Dropping NA values and converting modzcta columnn to type int
covid19_df.dropna(inplace = True)
covid19_df["modzcta"] = covid19_df["modzcta"].astype(int)

In [13]:
#Generating a separate dataframe for merging
merge1_df = covid19_df.rename(columns = {"modzcta": "zip code", "Total": "covid19 cases"})
merge1_df.head()

Unnamed: 0,zip code,covid19 cases
1,10001,2373
2,10002,5047
3,10003,3697
4,10004,256
5,10005,567


In [14]:
#Merging dataframes to generate a dataframe that contains both zip code and covid case information
merge1_df = merge1_df.merge(modzip_df, how = "inner", on = "zip code")

In [15]:
#Renaming columns in preparation for generating the compiled dataframe
merge2_df = merge1_df.rename(columns = {"neighborhood": "Neighborhood"})
compiled_df = merge2_df.merge(neighborhoods, how = "inner", on = ["Borough", "Neighborhood"])

#Checking the compiled dataframe
compiled_df[:10]

Unnamed: 0,zip code,covid19 cases,Borough,Neighborhood,Latitude,Longitude
0,10001,2373,Manhattan,Chelsea,40.744035,-74.003116
1,10011,3829,Manhattan,Chelsea,40.744035,-74.003116
2,10018,1247,Manhattan,Chelsea,40.744035,-74.003116
3,10019,4008,Manhattan,Chelsea,40.744035,-74.003116
4,10019,4008,Manhattan,Chelsea,40.744035,-74.003116
5,10036,2667,Manhattan,Chelsea,40.744035,-74.003116
6,10001,2373,Manhattan,Clinton,40.759101,-73.996119
7,10011,3829,Manhattan,Clinton,40.759101,-73.996119
8,10018,1247,Manhattan,Clinton,40.759101,-73.996119
9,10019,4008,Manhattan,Clinton,40.759101,-73.996119


In [16]:
#Reformatting the compiled dataframe before exporting the dataframe as .csv file
compiled_df["zip code"] = compiled_df["zip code"].astype(str)
formatdf1 = compiled_df.groupby("Neighborhood")["zip code"].apply(', '.join)
formatdf2 = compiled_df.groupby("Neighborhood")["covid19 cases"].sum()
cleaned_data_df = neighborhoods.merge(formatdf1, on = "Neighborhood")
cleaned_data_df = cleaned_data_df.merge(formatdf2, on = "Neighborhood")
cleaned_data_df = cleaned_data_df.rename(columns = {"zip code": "Zip Code", "covid19 cases": "Total Covid-19 Cases"})
cleaned_data_df[:10]

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Zip Code,Total Covid-19 Cases
0,Bronx,Riverdale,40.890834,-73.912585,"10463, 10471",12000
1,Bronx,Kingsbridge,40.881687,-73.902818,"10463, 10471",12000
2,Bronx,Fordham,40.860997,-73.896427,"10458, 10467, 10468",27981
3,Bronx,Mott Haven,40.806239,-73.9161,"10454, 10455, 10459, 10474",14367
4,Bronx,Hunts Point,40.80973,-73.883315,"10454, 10455, 10459, 10474",14367
5,Bronx,Morrisania,40.823592,-73.901506,"10451, 10452, 10456",22884
6,Brooklyn,Sunset Park,40.645103,-74.010316,"11220, 11232",7947
7,Brooklyn,Greenpoint,40.730201,-73.954241,"11211, 11222",13919
8,Brooklyn,Flatbush,40.636326,-73.958401,"11203, 11210, 11225, 11226",26174
9,Brooklyn,Williamsburg,40.707144,-73.958115,"11206, 11221, 11237",17095


In [17]:
#Exporting the cleaned and compiled dataframe as a .csv file
cleaned_data_df.to_csv("compiled_data.csv")