# Data Creation Process

In [1]:
# import requirements
import pandas as pd
import numpy as np
import string

## Sources

[NY County Level Test Data](https://health.data.ny.gov/Health/New-York-State-Statewide-COVID-19-Testing/xdss-u53e/data)

[County Level Economic Variables](https://raw.githubusercontent.com/JieYingWu/COVID-19_US_County-level_Summaries/master/data/counties.csv)

[NY Municipality Codes](https://data.ny.gov/Government-Finance/NY-Municipalities-and-County-FIPS-codes/79vr-2kdi)

[Social deprivation index (SDI) - Robert Graham Center](https://www.graham-center.org/rgc/maps-data-tools/sdi/social-deprivation-index.html)

In [2]:
# ny covid data
ny_data = pd.read_csv("./raw_data/New_York_State_Statewide_COVID-19_Testing.csv")

# county name - county code df
county_codes = pd.read_csv("./raw_data/NY_Municipalities_and_County_FIPS_codes.csv")

# other county level variables
county_data = pd.read_csv("https://raw.githubusercontent.com/JieYingWu/COVID-19_US_County-level_Summaries/master/data/counties.csv")

# social deprivation index data
sdi = pd.read_csv("./raw_data/ACS2015_countyallvars.csv")

## Creating County Level Testing and Economic Data 

In [3]:
ny_data.head()

Unnamed: 0,Test Date,County,New Positives,Cumulative Number of Positives,Total Number of Tests Performed,Cumulative Number of Tests Performed
0,06/07/2020,Albany,17,1978,1194,32194
1,06/07/2020,Allegany,0,51,149,3117
2,06/07/2020,Bronx,72,46052,3718,222329
3,06/07/2020,Broome,4,624,745,22789
4,06/07/2020,Cattaraugus,1,95,245,7036


### County testing data has no FIPS codes - we're merging with a county codes dataframe to get these

In [4]:
# formatting 'County' column for easy merging
#lowercase, strip whitespace, remove punctuation
ny_data['County'] = ny_data['County'].apply(lambda x:x.lower().strip().translate(str.maketrans('', '', string.punctuation)))

# formatting 'County Name' column for easy merging
county_codes['County Name'] = county_codes['County Name'].apply(lambda x:x.lower().strip().translate(str.maketrans('', '', string.punctuation)))

In [5]:
# selecting relevant columns and getting rid of municipality level data
county_codes = county_codes.groupby(['County Name','County FIPS']).count().reset_index()[['County Name','County FIPS']]

In [6]:
# st lawrence has the wrong county code - it's supposed to be 36089
county_codes[county_codes['County Name']=='st lawrence']

Unnamed: 0,County Name,County FIPS
50,st lawrence,36099


In [7]:
# fixing st lawrence error
county_codes.drop(50,inplace=True)
county_codes = county_codes.append(pd.DataFrame([['st lawrence',36089]],columns=county_codes.columns))

#### Jennifer's Code: Joining SDI data with county code data

In [8]:
# joining sdi data w NY FIPS codes to restrict to NY county SDI scores 
sdi_ny = sdi.merge(county_codes, left_on='county', right_on='County FIPS', how='inner').drop('county',axis=1)

In [9]:
sdi_ny.head()

Unnamed: 0,population,sdi_score,fpl_100_score,sing_parent_fam_score,black_score,dropout_score,no_car_score,rent_occup_score,crowding_score,nonemp_score,...,percnt_rentoccup,percnt_crowding,percnt_nonemp,percnt_unemp,percnt_highneeds,percnt_hispanic,percnt_frgnborn,percnt_lingisol,County Name,County FIPS
0,307463,36,37,47,64,11,90,73,5,20,...,0.416773,0.009324,0.06431,0.054552,0.412642,0.055249,0.088144,0.020089,albany,36001
1,48070,44,58,44,13,43,74,21,40,68,...,0.268635,0.020727,0.093737,0.08987,0.407135,0.015269,0.022363,0.005414,allegany,36003
2,1428357,100,99,100,91,99,100,100,100,98,...,0.810285,0.120878,0.141479,0.116415,0.413613,0.546264,0.344042,0.1838,bronx,36005
3,198093,52,68,59,35,32,90,47,22,47,...,0.342971,0.014735,0.078596,0.07129,0.41641,0.037654,0.062617,0.017161,broome,36007
4,78962,54,76,54,15,49,87,26,42,53,...,0.286866,0.021274,0.084671,0.079327,0.403561,0.019313,0.019946,0.008377,cattaraugus,36009


#### Joining SDI data with testing data

In [10]:
# joining ny data w SDI data (on county names)
ny_data_fips = ny_data.set_index("County").join(sdi_ny.set_index("County Name")).astype({'County FIPS': 'int32'})

In [11]:
ny_data_fips.head()

Unnamed: 0,Test Date,New Positives,Cumulative Number of Positives,Total Number of Tests Performed,Cumulative Number of Tests Performed,population,sdi_score,fpl_100_score,sing_parent_fam_score,black_score,...,percnt_hhnocar,percnt_rentoccup,percnt_crowding,percnt_nonemp,percnt_unemp,percnt_highneeds,percnt_hispanic,percnt_frgnborn,percnt_lingisol,County FIPS
albany,06/07/2020,17,1978,1194,32194,307463,36,37,47,64,...,0.127581,0.416773,0.009324,0.06431,0.054552,0.412642,0.055249,0.088144,0.020089,36001
albany,06/06/2020,8,1961,751,31000,307463,36,37,47,64,...,0.127581,0.416773,0.009324,0.06431,0.054552,0.412642,0.055249,0.088144,0.020089,36001
albany,06/05/2020,12,1953,1164,30249,307463,36,37,47,64,...,0.127581,0.416773,0.009324,0.06431,0.054552,0.412642,0.055249,0.088144,0.020089,36001
albany,06/04/2020,11,1941,1179,29085,307463,36,37,47,64,...,0.127581,0.416773,0.009324,0.06431,0.054552,0.412642,0.055249,0.088144,0.020089,36001
albany,06/03/2020,10,1930,817,27906,307463,36,37,47,64,...,0.127581,0.416773,0.009324,0.06431,0.054552,0.412642,0.055249,0.088144,0.020089,36001


In [12]:
# checking the join
ny_data_fips.shape[0] == ny_data.shape[0]

True

#### Collecting a list of columns that have the relevant infection and socioeconomic data we want to explore

In [13]:
relevant_infection_columns = ['Test Date','New Positives',\
                              'Cumulative Number of Positives',
                              'Total Number of Tests Performed','Cumulative Number of Tests Performed', 'County FIPS']
relevant_ses_columns = ['POP_ESTIMATE_2018', 'sdi_score']

#### Joining testing data with socioeconomic data

In [29]:
# joining infection w socioeconomic
joined_df = county_data[county_data['State']=='NY'].merge(ny_data_fips,left_on='FIPS', right_on='County FIPS')

# selecting relevant columns
df = joined_df[relevant_ses_columns+relevant_infection_columns]

In [30]:
df['Test Date'] = pd.to_datetime(df['Test Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [31]:
df.head()

Unnamed: 0,POP_ESTIMATE_2018,sdi_score,Test Date,New Positives,Cumulative Number of Positives,Total Number of Tests Performed,Cumulative Number of Tests Performed,County FIPS
0,307117,36,2020-06-07,17,1978,1194,32194,36001
1,307117,36,2020-06-06,8,1961,751,31000,36001
2,307117,36,2020-06-05,12,1953,1164,30249,36001
3,307117,36,2020-06-04,11,1941,1179,29085,36001
4,307117,36,2020-06-03,10,1930,817,27906,36001


### Feature Creation

#### Creating outcome variable

In [32]:
# 7 day rolling average of new cases - avg of new cases from current day + 6 previous days 
df['Rolling Avg New Cases'] = df['New Positives'].rolling(7).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [33]:
# rolling average normalized by population*100,000 - per capita rate of new cases
df['Per Capita Rate'] = (df['Rolling Avg New Cases']/df['POP_ESTIMATE_2018'])*100000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#### Creating quartiles for social deprivation index
##### Jennifer's code

In [37]:
# create SDI quartile variable
df['sdi_quartile'] = pd.qcut(df['sdi_score'], q=4)

# label the bins
bin_labels_4 = ['Q1', 'Q2', 'Q3', 'Q4']
df['sdi_quartile_label'] = pd.qcut(df['sdi_score'],
                              q=[0, .25, .50, .75, 1],labels=bin_labels_4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [38]:
df.head()

Unnamed: 0,POP_ESTIMATE_2018,sdi_score,Test Date,New Positives,Cumulative Number of Positives,Total Number of Tests Performed,Cumulative Number of Tests Performed,County FIPS,Rolling Avg New Cases,Per Capita Rate,sdi_quartile,sdi_quartile_label
0,307117,36,2020-06-07,17,1978,1194,32194,36001,,,"(24.0, 37.5]",Q2
1,307117,36,2020-06-06,8,1961,751,31000,36001,,,"(24.0, 37.5]",Q2
2,307117,36,2020-06-05,12,1953,1164,30249,36001,,,"(24.0, 37.5]",Q2
3,307117,36,2020-06-04,11,1941,1179,29085,36001,,,"(24.0, 37.5]",Q2
4,307117,36,2020-06-03,10,1930,817,27906,36001,,,"(24.0, 37.5]",Q2


In [39]:
df.to_csv("./data/ny_county_data_v2.csv")