# Combining HealthLandscapes and COVID-19 case data sets
# Sidney Cannon-Bailey
# 10/25/2020

This notebook was created to begin stitching all the data sources together. The data that should be combined is:

- Covid-19 data, currently separated by city and organized by zip code
- Health Landscapes data, currently separated by city and organized by census tract
- Racial population data, all cities together, organized by zip code
- Census tracts by zip code, to allow for conversion back and forth between the two


V2: Reorganized the order in which the data sources are read in, to make it easier to convert census tracts to zip codes and to average the health landscapes data.


Before running this notebook, raw data was altered as follows:
- Downloaded each city's covid-19 data and excluded all fields except zip code and number of confirmed cases. NOTE: Boston's covid-19 data was sometimes reported for several zip codes combined into one category. I used zip code population to divide the total number of cases among the zip codes in each category, assuming that each zip code contributed cases in proportion to its share of the population (e.g., if zip codes 12345 (population of 60 people) and 12346 (population of 40 people) had 10 cases total, I assigned the number of cases as 6 and 4, respectively).
- Changed column names in the racial data to make them more intuitive

The setup of this notebook is as follows:
1. Load every city's health landscapes data into one dataframe
2. Add zip code info from the zip code <--> tracts csv to the health landscapes dataframe
3. Since there are multiple census tracts per zip code, average the health landscapes data over all census tracts in each zip code (weighted by each tract's residential ratio, RES_RATIO)
4. Load every city's covid-19 data into one dataframe
5. Merge the health landscapes and covid-19 dataframes by zip code
6. Load the racial dataset and merge with the health/covid-19 dataframe by zip code

In [1]:
# Root of the project data directory -- edit this one line to match your local machine.
# It must end with '/' because every path below is built by appending to it.
dataLoc = '../data/'

# Input locations derived from the data root.
healthLandFolder = f'{dataLoc}healthLandscapesData/'
covidFolder = f'{dataLoc}covid19Data/'
# (an earlier version of the crosswalk was named 'zipTractConversion.csv')
zipTractConvertLoc = f'{dataLoc}zipTractConversion2.csv'
racialDataLoc = f'{dataLoc}nhgisRacialData.csv'

In [2]:
# import libraries
import os
import pandas as pd

In [3]:
# Notebook display preferences (comment out if not wanted):
# render every column, and at most 200 rows, when a dataframe is shown.
pd.set_option('display.max_columns', None,
              'display.max_rows', 200)

In [4]:
############## FUNCTIONS ##############

def loadAllHealthLandscapesData(healthDataFolder):
    """Load every health landscapes CSV in a folder into one dataframe.

    Parameters
    ----------
    healthDataFolder : str
        Folder holding the per-city CSV files. A trailing path separator is
        optional. Files whose name does not end in '.csv' (e.g. the README)
        are ignored.

    Returns
    -------
    pandas.DataFrame
        The rows of all CSV files stacked in sorted file-name order (each
        file keeps its own row index), with the raw 'TractFIPS' and
        'Population2010' columns dropped and the 30 remaining columns
        renamed by position.

    Raises
    ------
    KeyError
        If the folder has no CSV files, or the data lacks a 'TractFIPS' or
        'Population2010' column.
    ValueError
        If the number of remaining columns is not 30 (raised by pandas when
        the new column names are assigned).
    """
    # sorted() makes the row order reproducible; os.listdir order is arbitrary
    csvNames = sorted(name for name in os.listdir(healthDataFolder)
                      if name.endswith('.csv'))  # skip over README

    # read everything first and concatenate once, instead of re-copying the
    # growing dataframe on every iteration
    cityFrames = [pd.read_csv(os.path.join(healthDataFolder, name))
                  for name in csvNames]
    healthDf = pd.concat(cityFrames) if cityFrames else pd.DataFrame()

    # remove unnecessary columns
    healthDf = healthDf.drop(columns=['TractFIPS', 'Population2010'])

    # rename columns BY POSITION. Note that the raw column called 'TractFIPS'
    # was just dropped; the name is reassigned here to the first remaining
    # column (presumably a second tract-identifier column in the raw export --
    # TODO confirm against the raw files).
    healthDf.columns = ['TractFIPS', 'PlaceName', 'BINGE', 'CSMOKING', 'LPA', 'OBESITY', 'SLEEP', 'ARTHRITIS',
                        'CASTHMA', 'BPHIGH', 'CANCER', 'HIGHCHOL', 'KIDNEY', 'COPD', 'CHD', 'DIABETES', 'MHLTH',
                        'PHLTH', 'TEETHLOST', 'STROKE', 'ACCESS2', 'CHECKUP', 'DENTAL', 'BPMED', 'CHOLSCREEN',
                        'MAMMOUSE', 'PAPTEST', 'COLON_SCREEN', 'COREM', 'COREW']

    return healthDf



def loadAllCovidData(covidDataFolder):
    """Load every covid-19 CSV in a folder into one dataframe.

    Parameters
    ----------
    covidDataFolder : str
        Folder holding the per-city covid-19 CSV files. A trailing path
        separator is optional. Files whose name does not end in '.csv' are
        ignored.

    Returns
    -------
    pandas.DataFrame
        The rows of all CSV files stacked in sorted file-name order (each
        file keeps its own row index). An empty dataframe is returned when
        the folder contains no CSV files.
    """
    # find every csv file in the covid-19 folder; sorted() makes the row
    # order reproducible because os.listdir order is arbitrary
    csvNames = sorted(name for name in os.listdir(covidDataFolder)
                      if name.endswith('.csv'))

    # load each dataset, then concatenate once instead of re-copying the
    # growing dataframe on every iteration
    cityFrames = [pd.read_csv(os.path.join(covidDataFolder, name))
                  for name in csvNames]

    if not cityFrames:
        return pd.DataFrame()
    return pd.concat(cityFrames)


def averageHealthTractsToZip(healthDf, zipTractDf):
    """Average (unweighted) the health landscapes data of all tracts in each zip code.

    Parameters
    ----------
    healthDf : pandas.DataFrame
        Tract-level data with a 'TractFIPS' column.
    zipTractDf : pandas.DataFrame
        Zip <--> tract conversion table with 'TRACT' and 'ZIP' columns.

    Returns
    -------
    pandas.DataFrame
        One row per zip code, in order of first appearance, indexed 0..n-1,
        with the same columns as the merged tract/zip table. Every numeric
        column is replaced by its mean over the zip code's rows (this
        includes 'TractFIPS' and any numeric conversion-table column, whose
        averages carry no meaning). Every non-numeric column (e.g.
        'PlaceName') takes the first non-null value found in the zip code.
        Tracts with no zip code in the conversion table are left out.
    """
    # merge the health landscapes data to see the corresponding zip code for each census tract
    healthZipDf = pd.merge(healthDf, zipTractDf, left_on='TractFIPS', right_on='TRACT', how='left')

    # remove the now duplicated census tract column
    del healthZipDf['TRACT']

    columnOrder = list(healthZipDf)

    # sort=False keeps zip codes in order of first appearance; groupby drops
    # rows whose ZIP is missing (tracts absent from the conversion table)
    byZip = healthZipDf.groupby('ZIP', sort=False)

    # average every numeric column; strings cannot be averaged
    avgHealthDf = byZip.mean(numeric_only=True)

    # carry the non-numeric columns over from the first row of each zip code
    for colName in columnOrder:
        if colName != 'ZIP' and colName not in avgHealthDf.columns:
            avgHealthDf[colName] = byZip[colName].first()

    # turn ZIP back into a regular column and restore the original column order
    return avgHealthDf.reset_index()[columnOrder]


def weighted_average(df, data_col, weight_col, by_col):
    """Return the weighted average of one column within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.
    data_col : str
        Column holding the values to average.
    weight_col : str
        Column holding the weight of each row.
    by_col : str or list
        Grouping key(s), passed straight to ``DataFrame.groupby``.

    Returns
    -------
    pandas.Series
        Indexed by group: sum(value * weight) / sum(weight), where rows with
        a missing value contribute neither to the numerator nor to the
        denominator. A group whose usable weights sum to 0 yields NaN.
    """
    # Work on a shallow copy so the scratch columns never appear on the
    # caller's frame (not even when groupby raises), and so that passing a
    # filtered frame does not trigger SettingWithCopyWarning.
    work = df.copy(deep=False)
    work['_data_times_weight'] = df[data_col] * df[weight_col]
    # the weight counts only where the value is present
    work['_weight_where_notnull'] = df[weight_col] * pd.notnull(df[data_col])

    grouped = work.groupby(by_col)
    return grouped['_data_times_weight'].sum() / grouped['_weight_where_notnull'].sum()

def weightedAverageHealthTractsToZip(healthDf, zipTractDf):
    """Aggregate tract-level health data to zip codes, weighting by RES_RATIO.

    Parameters
    ----------
    healthDf : pandas.DataFrame
        Tract-level data with 'TractFIPS', 'PlaceName' and the 28 health
        metric columns listed below.
    zipTractDf : pandas.DataFrame
        Zip <--> tract conversion table with 'TRACT', 'ZIP' and 'RES_RATIO'
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (ZIP, PlaceName) pair, indexed 0..n-1, with
        columns 'ZIP', 'PlaceName' and the 28 metrics. Each metric is
        sum(value * RES_RATIO) / sum(RES_RATIO) over the zip code's rows,
        ignoring rows where the value is missing (the same formula as
        weighted_average). Tract/zip combinations with RES_RATIO of 0 and
        tracts with no zip code in the conversion table are left out.

    Notes
    -----
    The averages are computed per ZIP only; a zip code that appears under
    two place names yields two rows carrying the same averages.
    """
    metricCols = ['BINGE', 'CSMOKING', 'LPA', 'OBESITY', 'SLEEP', 'ARTHRITIS', 'CASTHMA', 'BPHIGH', 'CANCER',
                  'HIGHCHOL', 'KIDNEY', 'COPD', 'CHD', 'DIABETES', 'MHLTH', 'PHLTH', 'TEETHLOST', 'STROKE', 'ACCESS2',
                  'CHECKUP', 'DENTAL', 'BPMED', 'CHOLSCREEN', 'MAMMOUSE', 'PAPTEST', 'COLON_SCREEN', 'COREM', 'COREW']

    # merge the health landscapes data to see the corresponding zip code for each census tract
    healthZipDf = pd.merge(healthDf, zipTractDf, left_on='TractFIPS', right_on='TRACT', how='left')

    # remove the now duplicated census tract column
    del healthZipDf['TRACT']

    # keep only rows that have a zip code (the left merge leaves ZIP missing
    # for tracts absent from the conversion table) and a non-zero RES_RATIO
    # (0 means no people living in that tract-zip combo)
    hasResidents = healthZipDf['ZIP'].notna() & (healthZipDf['RES_RATIO'] != 0.0)
    healthZipDf = healthZipDf[hasResidents]

    # one output row per zip code / place name
    newHealthDf = healthZipDf[['ZIP', 'PlaceName']].drop_duplicates().reset_index(drop=True)

    # weighted average of every metric in a single pass
    weights = healthZipDf['RES_RATIO']
    metrics = healthZipDf[metricCols]
    zipKey = healthZipDf['ZIP']
    weightedSums = metrics.mul(weights, axis=0).groupby(zipKey).sum()
    # a row's weight only counts for the metrics it actually reports
    weightTotals = metrics.notna().mul(weights, axis=0).groupby(zipKey).sum()
    zipAverages = weightedSums / weightTotals

    return newHealthDf.merge(zipAverages, how='left', left_on='ZIP', right_index=True)

In [5]:
# Read the zip <--> census tract conversion csv. The mapping is many-to-many:
# one zip code may contain multiple tracts and one tract may span multiple zip codes.
zipTractDf = pd.read_csv(filepath_or_buffer=zipTractConvertLoc)
# preview the first five rows
zipTractDf.head(n=5)

Unnamed: 0,TRACT,ZIP,RES_RATIO
0,1001020100,36067,1.0
1,1001020200,36067,1.0
2,1001020300,36067,1.0
3,1001020400,36066,0.979601
4,1001020400,36067,0.020399


In [6]:
# Stack the health landscapes data of every city into a single dataframe
healthDf = loadAllHealthLandscapesData(healthDataFolder=healthLandFolder)

# preview the first five rows
healthDf.head(n=5)

Unnamed: 0,TractFIPS,PlaceName,BINGE,CSMOKING,LPA,OBESITY,SLEEP,ARTHRITIS,CASTHMA,BPHIGH,CANCER,HIGHCHOL,KIDNEY,COPD,CHD,DIABETES,MHLTH,PHLTH,TEETHLOST,STROKE,ACCESS2,CHECKUP,DENTAL,BPMED,CHOLSCREEN,MAMMOUSE,PAPTEST,COLON_SCREEN,COREM,COREW
0,17031010100,"Chicago, IL",20.6,20.3,28,36.4,39.0,20.3,10.0,31.9,4.5,28.2,3.0,6.5,4.8,10.7,14.2,12.3,16.2,3.1,14.6,68.6,54.7,70.3,82.2,79.8,87,54.5,29.4,23.8
1,17031010201,"Chicago, IL",21.0,22.5,30,36.2,39.0,18.9,9.9,30.2,4.0,27.8,3.0,6.8,4.8,10.8,15.3,13.3,18.3,3.1,20.4,66.0,50.3,67.5,79.6,78.6,86,50.5,28.0,22.1
2,17031010202,"Chicago, IL",21.0,20.1,28,33.9,36.8,22.0,9.5,32.9,5.3,30.4,3.2,7.3,5.8,11.6,13.4,12.9,17.0,3.5,18.4,68.2,55.6,72.4,82.5,78.4,86,56.5,31.2,24.2
3,17031010300,"Chicago, IL",21.0,19.0,27,31.8,35.1,22.8,9.1,32.5,6.1,31.0,3.3,7.3,6.2,10.8,12.8,12.5,15.4,3.5,15.9,68.3,59.7,73.3,83.0,77.0,86,59.3,33.6,25.2
4,17031010400,"Chicago, IL",25.3,15.1,21,25.0,32.2,13.1,9.1,19.5,3.3,21.5,2.0,4.3,3.0,5.7,13.6,8.3,10.8,1.7,11.3,62.8,65.7,62.2,74.7,78.6,84,58.7,35.3,27.4


In [7]:
# Sanity check: list the distinct place names to confirm that every city was loaded
healthDf.loc[:, 'PlaceName'].unique()

array(['Chicago, IL', 'Houston, TX', 'Atlanta, GA', 'Miami, FL',
       'San Francisco, CA', 'Boston, MA'], dtype=object)

In [8]:
# now find each corresponding zip code for the census tracts
# iterate through each zip code and average all census tracts in that zip code
#newHealthDf = averageHealthTractsToZip(healthDf, zipTractDf)
#newHealthDf.head()


In [9]:
# Map each census tract to its zip code(s), then collapse the tracts of every
# zip code into one row using an average weighted by RES_RATIO
newHealthDf = weightedAverageHealthTractsToZip(healthDf=healthDf, zipTractDf=zipTractDf)
newHealthDf.head(n=5)

Unnamed: 0,ZIP,PlaceName,BINGE,CSMOKING,LPA,OBESITY,SLEEP,ARTHRITIS,CASTHMA,BPHIGH,CANCER,HIGHCHOL,KIDNEY,COPD,CHD,DIABETES,MHLTH,PHLTH,TEETHLOST,STROKE,ACCESS2,CHECKUP,DENTAL,BPMED,CHOLSCREEN,MAMMOUSE,PAPTEST,COLON_SCREEN,COREM,COREW
0,60626,"Chicago, IL",22.409075,18.507466,26.024114,31.38636,35.730079,18.171554,9.101819,27.667354,4.453979,27.196605,2.802491,6.009283,4.751435,9.352056,13.539646,11.286156,15.414419,2.766435,15.882525,65.496102,56.897428,67.931447,79.610679,78.008543,85.119727,55.139355,31.1585,24.561061
1,60645,"Chicago, IL",19.442819,17.193151,26.844575,28.673986,34.596204,21.531676,8.425354,30.346922,5.887454,31.332466,3.109202,6.601026,5.780704,10.566467,12.15083,11.879903,14.378911,3.119535,15.263973,67.75776,59.249705,73.085461,83.0449,76.131089,83.965212,56.168687,33.112585,25.837961
2,60660,"Chicago, IL",23.124917,15.740977,23.069889,27.632124,33.809852,17.672593,8.281761,26.126963,4.784825,27.392308,2.55101,5.183105,4.466401,8.535328,11.671725,9.785494,11.670886,2.438207,13.034434,65.686384,62.676151,68.572338,81.38282,77.83432,84.906159,58.569299,34.060313,27.069775
3,60659,"Chicago, IL",18.52878,17.219101,27.398051,26.708414,34.744762,21.029521,8.053602,30.038804,5.730133,31.991165,3.11573,6.576212,5.931617,11.184024,12.010375,11.959188,13.55866,3.109463,15.892934,67.454903,58.895987,73.129784,82.375043,74.979397,82.039306,55.285519,32.676208,25.919074
4,60640,"Chicago, IL",23.84652,15.333711,22.077723,27.36702,33.165992,17.547783,8.089063,25.714847,4.945263,27.206407,2.517053,4.972571,4.380928,8.245342,11.03956,9.446483,10.545113,2.405663,12.108133,65.320458,65.028418,67.535452,82.685756,77.875108,86.326651,60.002147,35.480995,27.912054


In [10]:

# Stack the covid-19 case counts of every city into a single dataframe.
# NOTE: the Boston dataset reported some zip codes together in one category.
# Before loading, the positive cases of each such category were divided among
# its zip codes, weighting by each zip code's population.
covidDf = loadAllCovidData(covidDataFolder=covidFolder)

# preview the first five rows
covidDf.head(n=5)

Unnamed: 0,Zip Code,Confirmed Cases
0,94130,39
1,94158,102
2,94107,380
3,94105,94
4,94134,961


In [11]:
# Join covid-19 counts with the zip-level health data; the default inner join
# keeps only the zip codes present in both datasets
allDf = covidDf.merge(newHealthDf, left_on='Zip Code', right_on='ZIP')
allDf.head(n=5)

Unnamed: 0,Zip Code,Confirmed Cases,ZIP,PlaceName,BINGE,CSMOKING,LPA,OBESITY,SLEEP,ARTHRITIS,CASTHMA,BPHIGH,CANCER,HIGHCHOL,KIDNEY,COPD,CHD,DIABETES,MHLTH,PHLTH,TEETHLOST,STROKE,ACCESS2,CHECKUP,DENTAL,BPMED,CHOLSCREEN,MAMMOUSE,PAPTEST,COLON_SCREEN,COREM,COREW
0,94130,39,94130,"San Francisco, CA",22.301356,20.097949,21.997914,24.9992,36.498853,11.000348,10.099235,20.300556,2.300382,20.901008,2.199896,4.399583,2.70007,6.299965,17.797705,11.598957,18.696592,1.89993,15.797879,60.200765,48.706189,50.102191,64.804451,81.700417,82.001043,52.004277,23.502608,20.403373
1,94158,102,94158,"San Francisco, CA",25.014369,10.196252,12.990629,15.407184,29.892816,9.902499,7.200625,16.398751,3.401249,21.500312,1.499688,2.199688,2.099063,4.597189,9.197813,6.198751,4.79469,1.199375,7.19344,61.999063,75.121241,55.389692,77.913432,81.707809,84.015618,70.110933,36.116556,34.31187
2,94107,380,94107,"San Francisco, CA",24.22019,10.934772,14.71918,18.834588,29.150231,14.876086,7.75069,22.649484,5.041688,26.589586,2.273159,3.407267,3.750579,7.051803,9.699597,8.362983,6.652906,2.069219,7.614763,64.973699,73.817504,61.770194,81.482032,81.852477,86.003055,69.829734,35.832336,32.77794
3,94105,94,94105,"San Francisco, CA",25.8,9.2,12.0,16.2,29.3,11.0,7.1,17.2,3.8,22.7,1.6,2.3,2.2,4.5,8.7,6.2,4.8,1.2,6.1,62.5,76.6,56.7,80.1,82.2,85.0,71.5,38.3,36.4
4,94134,961,94134,"San Francisco, CA",15.103789,13.926066,23.139504,17.927599,34.94597,17.884536,7.82218,29.640441,5.101614,32.729771,3.17272,4.583115,5.296648,12.699528,11.371218,12.030176,12.802422,3.172291,14.505529,69.509535,59.353555,72.347502,79.885493,77.377915,76.330589,58.628222,27.140427,25.893235


In [12]:
# Read the racial population counts, organized by zip code
racialDf = pd.read_csv(filepath_or_buffer=racialDataLoc)

In [13]:
# Attach the racial data to the combined covid/health dataframe. The left join
# keeps every zip code already in allDf and no others; zip codes without racial
# data get missing values in the racial columns.
mergedRacialCovidHealthDf = allDf.merge(racialDf, on='Zip Code', how='left')


In [14]:
# preview the first five rows of the combined dataset
mergedRacialCovidHealthDf.head(n=5)


Unnamed: 0,Zip Code,Confirmed Cases,ZIP,PlaceName,BINGE,CSMOKING,LPA,OBESITY,SLEEP,ARTHRITIS,CASTHMA,BPHIGH,CANCER,HIGHCHOL,KIDNEY,COPD,CHD,DIABETES,MHLTH,PHLTH,TEETHLOST,STROKE,ACCESS2,CHECKUP,DENTAL,BPMED,CHOLSCREEN,MAMMOUSE,PAPTEST,COLON_SCREEN,COREM,COREW,totalPop,whiteAlonePop,blackAlonePop,nativeAlonePop,asianAlonePop,hawaiiAlonePop,otherAlonePop,twoOrMorePop
0,94130,39,94130,"San Francisco, CA",22.301356,20.097949,21.997914,24.9992,36.498853,11.000348,10.099235,20.300556,2.300382,20.901008,2.199896,4.399583,2.70007,6.299965,17.797705,11.598957,18.696592,1.89993,15.797879,60.200765,48.706189,50.102191,64.804451,81.700417,82.001043,52.004277,23.502608,20.403373,2880.0,1011.0,724.0,37.0,525.0,28.0,260.0,295.0
1,94158,102,94158,"San Francisco, CA",25.014369,10.196252,12.990629,15.407184,29.892816,9.902499,7.200625,16.398751,3.401249,21.500312,1.499688,2.199688,2.099063,4.597189,9.197813,6.198751,4.79469,1.199375,7.19344,61.999063,75.121241,55.389692,77.913432,81.707809,84.015618,70.110933,36.116556,34.31187,4792.0,2280.0,177.0,21.0,1939.0,16.0,112.0,247.0
2,94107,380,94107,"San Francisco, CA",24.22019,10.934772,14.71918,18.834588,29.150231,14.876086,7.75069,22.649484,5.041688,26.589586,2.273159,3.407267,3.750579,7.051803,9.699597,8.362983,6.652906,2.069219,7.614763,64.973699,73.817504,61.770194,81.482032,81.852477,86.003055,69.829734,35.832336,32.77794,26599.0,15753.0,1792.0,113.0,6447.0,154.0,1098.0,1242.0
3,94105,94,94105,"San Francisco, CA",25.8,9.2,12.0,16.2,29.3,11.0,7.1,17.2,3.8,22.7,1.6,2.3,2.2,4.5,8.7,6.2,4.8,1.2,6.1,62.5,76.6,56.7,80.1,82.2,85.0,71.5,38.3,36.4,5846.0,3313.0,131.0,18.0,2038.0,19.0,75.0,252.0
4,94134,961,94134,"San Francisco, CA",15.103789,13.926066,23.139504,17.927599,34.94597,17.884536,7.82218,29.640441,5.101614,32.729771,3.17272,4.583115,5.296648,12.699528,11.371218,12.030176,12.802422,3.172291,14.505529,69.509535,59.353555,72.347502,79.885493,77.377915,76.330589,58.628222,27.140427,25.893235,40798.0,6620.0,3903.0,198.0,23007.0,640.0,4761.0,1669.0


Dataset with the multiple sources combined! There is still some work to do, mainly using the provided racial populations to decide which zip codes are racially diverse and which are racially uniform. Then we should have a pretty clean, comprehensive dataset to work with.

In [15]:
# show the rows whose case count was reported as the text '<10' instead of a number
mergedRacialCovidHealthDf.loc[mergedRacialCovidHealthDf['Confirmed Cases'].eq('<10')]

Unnamed: 0,Zip Code,Confirmed Cases,ZIP,PlaceName,BINGE,CSMOKING,LPA,OBESITY,SLEEP,ARTHRITIS,CASTHMA,BPHIGH,CANCER,HIGHCHOL,KIDNEY,COPD,CHD,DIABETES,MHLTH,PHLTH,TEETHLOST,STROKE,ACCESS2,CHECKUP,DENTAL,BPMED,CHOLSCREEN,MAMMOUSE,PAPTEST,COLON_SCREEN,COREM,COREW,totalPop,whiteAlonePop,blackAlonePop,nativeAlonePop,asianAlonePop,hawaiiAlonePop,otherAlonePop,twoOrMorePop
228,31131,<10,31131,"Atlanta, GA",12.3,19.9,36.0,38.8,43.5,25.9,11.2,45.7,6.5,34.2,4.5,8.3,7.5,17.7,13.4,14.2,20.2,5.7,24.0,80.4,56.7,83.9,85.2,84.8,86.0,64.7,24.0,30.2,,,,,,,,


In [16]:
# A count reported as '<10' is not a usable number: blank it out with None
mergedRacialCovidHealthDf.loc[
    mergedRacialCovidHealthDf['Confirmed Cases'].eq('<10'),
    'Confirmed Cases',
] = None

In [17]:
# case rate: confirmed cases divided by total population.
# 'Confirmed Cases' is not purely numeric: it can hold text such as '<10' and,
# presumably, counts written with thousands separators (hence the comma
# removal). Strip the commas and coerce whatever still is not a number to NaN,
# so a leftover token cannot abort the conversion.
confirmedCases = pd.to_numeric(
    mergedRacialCovidHealthDf['Confirmed Cases'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
mergedRacialCovidHealthDf['COVID_perc'] = confirmedCases / mergedRacialCovidHealthDf['totalPop']

# share of the total population in each racial group.
# NOTE: despite the '_perc' suffix these are fractions of totalPop (0-1), not percentages.
for popCol, percCol in [('whiteAlonePop', 'white_perc'),      # white
                        ('blackAlonePop', 'black_perc'),      # black
                        ('nativeAlonePop', 'native_perc'),    # native
                        ('asianAlonePop', 'asian_perc'),      # asian
                        ('hawaiiAlonePop', 'hawaii_perc'),    # hawaii
                        ('otherAlonePop', 'other_perc'),      # other
                        ('twoOrMorePop', 'two_plus_perc')]:   # two+
    mergedRacialCovidHealthDf[percCol] = (
        mergedRacialCovidHealthDf[popCol] / mergedRacialCovidHealthDf['totalPop']
    )

In [1]:
# (rows, columns) of the combined dataset.
# NOTE(review): the recorded output is a NameError, so this cell was presumably
# run before the cells that define the dataframe -- run those first.
mergedRacialCovidHealthDf.shape

NameError: name 'mergedRacialCovidHealthDf' is not defined

In [None]:
# maybe drop zip code 30334 b/c only pop of 1?
# keep every row EXCEPT zip code 30334 ('==' here would keep only that zip code)
mergedRacialCovidHealthDf = mergedRacialCovidHealthDf[mergedRacialCovidHealthDf['ZIP'] != 30334]

In [None]:
# NOTE(review): this cell held only the truncated name 'mer', which raises a
# NameError when run; completed here as a size check -- confirm the intent.
mergedRacialCovidHealthDf.shape

In [18]:
# preview the first five rows, now including the derived rate columns
mergedRacialCovidHealthDf.head(n=5)

Unnamed: 0,Zip Code,Confirmed Cases,ZIP,PlaceName,BINGE,CSMOKING,LPA,OBESITY,SLEEP,ARTHRITIS,CASTHMA,BPHIGH,CANCER,HIGHCHOL,KIDNEY,COPD,CHD,DIABETES,MHLTH,PHLTH,TEETHLOST,STROKE,ACCESS2,CHECKUP,DENTAL,BPMED,CHOLSCREEN,MAMMOUSE,PAPTEST,COLON_SCREEN,COREM,COREW,totalPop,whiteAlonePop,blackAlonePop,nativeAlonePop,asianAlonePop,hawaiiAlonePop,otherAlonePop,twoOrMorePop,COVID_perc,white_perc,black_perc,native_perc,asian_perc,hawaii_perc,other_perc,two_plus_perc
0,94130,39,94130,"San Francisco, CA",22.301356,20.097949,21.997914,24.9992,36.498853,11.000348,10.099235,20.300556,2.300382,20.901008,2.199896,4.399583,2.70007,6.299965,17.797705,11.598957,18.696592,1.89993,15.797879,60.200765,48.706189,50.102191,64.804451,81.700417,82.001043,52.004277,23.502608,20.403373,2880.0,1011.0,724.0,37.0,525.0,28.0,260.0,295.0,0.013542,0.351042,0.251389,0.012847,0.182292,0.009722,0.090278,0.102431
1,94158,102,94158,"San Francisco, CA",25.014369,10.196252,12.990629,15.407184,29.892816,9.902499,7.200625,16.398751,3.401249,21.500312,1.499688,2.199688,2.099063,4.597189,9.197813,6.198751,4.79469,1.199375,7.19344,61.999063,75.121241,55.389692,77.913432,81.707809,84.015618,70.110933,36.116556,34.31187,4792.0,2280.0,177.0,21.0,1939.0,16.0,112.0,247.0,0.021285,0.475793,0.036937,0.004382,0.404633,0.003339,0.023372,0.051544
2,94107,380,94107,"San Francisco, CA",24.22019,10.934772,14.71918,18.834588,29.150231,14.876086,7.75069,22.649484,5.041688,26.589586,2.273159,3.407267,3.750579,7.051803,9.699597,8.362983,6.652906,2.069219,7.614763,64.973699,73.817504,61.770194,81.482032,81.852477,86.003055,69.829734,35.832336,32.77794,26599.0,15753.0,1792.0,113.0,6447.0,154.0,1098.0,1242.0,0.014286,0.59224,0.067371,0.004248,0.242378,0.00579,0.04128,0.046693
3,94105,94,94105,"San Francisco, CA",25.8,9.2,12.0,16.2,29.3,11.0,7.1,17.2,3.8,22.7,1.6,2.3,2.2,4.5,8.7,6.2,4.8,1.2,6.1,62.5,76.6,56.7,80.1,82.2,85.0,71.5,38.3,36.4,5846.0,3313.0,131.0,18.0,2038.0,19.0,75.0,252.0,0.016079,0.566712,0.022408,0.003079,0.348614,0.00325,0.012829,0.043106
4,94134,961,94134,"San Francisco, CA",15.103789,13.926066,23.139504,17.927599,34.94597,17.884536,7.82218,29.640441,5.101614,32.729771,3.17272,4.583115,5.296648,12.699528,11.371218,12.030176,12.802422,3.172291,14.505529,69.509535,59.353555,72.347502,79.885493,77.377915,76.330589,58.628222,27.140427,25.893235,40798.0,6620.0,3903.0,198.0,23007.0,640.0,4761.0,1669.0,0.023555,0.162263,0.095666,0.004853,0.563925,0.015687,0.116697,0.040909


In [19]:
# Write the final combined dataset next to the input data for later use
# (index=False leaves the row index out of the file)
mergedOutputLoc = dataLoc + "mergedTotalDataset.csv"
mergedRacialCovidHealthDf.to_csv(path_or_buf=mergedOutputLoc, index=False)