In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Original Data retrieved from: https://www.kaggle.com/datasets/laurindogarcia/covid-19-race-gender-poverty-risk-us-county

# Notes on Data from Kaggle website:

## Data is drawn from:

### USA Facts/U.S CDC,
### SAIPE/U.S Census,
### Population Estimates/U.S Census,
### Policy Map/NY Times/2017 SMART-BRFSS, U.S CDC
### Links to sources are in the file description below.


## About this file
### This file is a consolidation of data from the following sources:

## US Coronavirus Cases (USA Facts/U.S CDC, 2020): timeseries from 22/01/2020 to 31/07/2020;
### https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/

## US Coronavirus Deaths (USA Facts/U.S. CDC, 2020): timeseries from 22/01/2020 to 31/07/2020;
### https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/

## State/County Poverty Universe Data, All ages (SAIPE, U.S Census, 2019);
### https://www.census.gov/data/datasets/time-series/demo/saipe/model-tables.html

## Annual County Resident Population Estimates by Age, Sex, Race, and Hispanic Origin: April 1, 2010 to July 1, 2019 (CC-EST2019-ALLDATA) (U.S Census, 2019);
### https://www.census.gov/data/tables/time-series/demo/popest/2010s-counties-detail.html

## Severe COVID-19 Health Risk Index by U.S County (Policy Map/NY Times/2017 SMART-BRFSS, U.S CDC, 2017)
### No working link is available for this source (the website is no longer online)

In [2]:
# Load the COVID dataset from CSV. The values in this file are log-transformed,
# so keep `cd_log` around if log-scale data is needed later.
cd_log = pd.read_csv(filepath_or_buffer="covid_data_log_200908.csv")
cd_log

Unnamed: 0,FIPS,stateFIPS,countyFIPS_2d,County,State,Cases,Deaths,Poverty,Population,W_Male,...,B_Male,B_Female,I_Male,I_Female,A_Male,A_Female,NH_Male,NH_Female,Risk_Index,Risk_Cat
0,1001,1,1,Autauga County,AL,10.590264,6.812345,10.916415,10.930765,10.651360,...,9.282754,9.414913,5.978886,6.100319,6.588926,6.778785,4.465908,4.317488,65.42,Above Average
1,1003,1,3,Baldwin County,AL,11.251171,6.864848,12.279579,12.315976,12.162852,...,9.895052,9.957739,7.908755,7.872455,7.756623,8.129764,5.537334,5.587249,68.39,Above Average
2,1005,1,5,Barbour County,AL,10.111517,5.043425,9.997843,10.113992,9.465448,...,9.452737,9.330787,5.652489,5.204007,4.844187,4.948760,4.276666,3.713572,97.09,High
3,1007,1,7,Bibb County,AL,9.520469,4.634729,9.914032,10.016548,9.777641,...,8.691315,8.221210,5.068904,5.017280,4.276666,4.290459,3.912023,2.772589,83.36,Above Average
4,1009,1,9,Blount County,AL,9.868430,4.406719,10.954973,10.965194,10.912649,...,7.068172,6.984716,6.383507,6.393591,5.446737,5.579730,4.624973,4.127134,81.75,Above Average
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,56,37,Sweetwater County,WY,8.903951,3.526361,10.650294,10.653558,10.629223,...,6.719013,6.461468,6.781058,6.685861,6.246107,6.428105,4.574711,4.595120,10.42,Low
3138,56039,56,39,Teton County,WY,9.534089,4.615121,10.038368,10.063223,10.057410,...,5.513429,5.198497,5.736572,5.605802,5.880533,6.350886,4.143135,3.761200,2.94,Very low
3139,56041,56,41,Uinta County,WY,9.183688,0.000000,9.910215,9.914724,9.888272,...,5.293305,5.225747,5.968708,5.996452,4.812184,5.192957,3.988984,3.784190,27.13,Below Average
3140,56043,56,43,Washakie County,WY,8.040447,5.673323,8.953511,8.962520,8.936167,...,4.382027,4.043051,5.129899,5.288267,4.189655,4.624973,2.564949,2.564949,32.76,Below Average


In [3]:
# Count the distinct county names (names can repeat across states, so this may
# be smaller than the number of rows).
cd_log["County"].nunique(dropna=False)

1878

In [4]:
# Check how many distinct FIPS codes there are; 3142 is the target because it
# matches the length of the census data used for the CRE dataset.
cd_log["FIPS"].nunique(dropna=False)

3142

In [5]:
# Inspect the column dtypes before transforming anything.
# Risk_Index is a float score whose derivation is unclear, so it is left
# untouched. The other float columns have all been log-transformed; converting
# them back to whole-number counts may help when aggregating with the CRE data.
cd_log.dtypes

FIPS               int64
stateFIPS          int64
countyFIPS_2d      int64
County            object
State             object
Cases            float64
Deaths           float64
Poverty          float64
Population       float64
W_Male           float64
W_Female         float64
B_Male           float64
B_Female         float64
I_Male           float64
I_Female         float64
A_Male           float64
A_Female         float64
NH_Male          float64
NH_Female        float64
Risk_Index       float64
Risk_Cat          object
dtype: object

In [6]:
# Undo the log transformation: exponentiate every column except the
# identifier, label, and Risk_Index columns listed in `ignore`.
ignore = ['FIPS', 'stateFIPS', 'countyFIPS_2d',
          'Risk_Cat', 'County', 'State', 'Risk_Index']

# Columns to back-transform, kept in their original order.
log_columns = [name for name in cd_log.columns if name not in ignore]

# Work on a copy so `cd_log` keeps the log-scale values.
cd_log2 = cd_log.copy()

# Vectorised np.exp over the selected columns. This replaces the previous
# per-column loop, which relied on DataFrame.iteritems() (removed in
# pandas 2.0) and on a drop() call whose result was discarded.
cd_log2[log_columns] = np.exp(cd_log[log_columns])

In [7]:
# Display the back-transformed frame to check that the exp() step produced
# count-scale values.
cd_log2

Unnamed: 0,FIPS,stateFIPS,countyFIPS_2d,County,State,Cases,Deaths,Poverty,Population,W_Male,...,B_Male,B_Female,I_Male,I_Female,A_Male,A_Female,NH_Male,NH_Female,Risk_Index,Risk_Cat
0,1001,1,1,Autauga County,AL,39746.0,909.0,55073.0,55869.0,42250.0,...,10751.0,12270.0,395.0,446.0,727.0,879.0,87.0,75.0,65.42,Above Average
1,1003,1,3,Baldwin County,AL,76970.0,958.0,215255.0,223234.0,191540.0,...,19832.0,21115.0,2721.0,2624.0,2337.0,3394.0,254.0,267.0,68.39,Above Average
2,1005,1,5,Barbour County,AL,24625.0,155.0,21979.0,24686.0,12906.0,...,12743.0,11280.0,285.0,182.0,127.0,141.0,72.0,41.0,97.09,High
3,1007,1,7,Bibb County,AL,13636.0,103.0,20212.0,22394.0,17635.0,...,5951.0,3719.0,159.0,151.0,72.0,73.0,50.0,16.0,83.36,Above Average
4,1009,1,9,Blount County,AL,19311.0,82.0,57238.0,57826.0,54866.0,...,1174.0,1080.0,592.0,598.0,232.0,265.0,102.0,62.0,81.75,Above Average
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,56,37,Sweetwater County,WY,7361.0,34.0,42205.0,42343.0,41325.0,...,828.0,640.0,881.0,801.0,516.0,619.0,97.0,99.0,10.42,Low
3138,56039,56,39,Teton County,WY,13823.0,101.0,22888.0,23464.0,23328.0,...,248.0,181.0,310.0,272.0,358.0,573.0,63.0,43.0,2.94,Very low
3139,56041,56,41,Uinta County,WY,9737.0,1.0,20135.0,20226.0,19698.0,...,199.0,186.0,391.0,402.0,123.0,180.0,54.0,44.0,27.13,Below Average
3140,56043,56,43,Washakie County,WY,3104.0,291.0,7735.0,7805.0,7602.0,...,80.0,57.0,169.0,198.0,66.0,102.0,13.0,13.0,32.76,Below Average


In [8]:
# Missing-value check: number of NaN entries in each column.
cd_log2.isnull().sum(axis=0)

FIPS             0
stateFIPS        0
countyFIPS_2d    0
County           0
State            0
Cases            0
Deaths           0
Poverty          0
Population       0
W_Male           0
W_Female         0
B_Male           0
B_Female         0
I_Male           0
I_Female         0
A_Male           0
A_Female         0
NH_Male          0
NH_Female        0
Risk_Index       0
Risk_Cat         0
dtype: int64

In [9]:
# Keep only the columns needed for the database build and the hypothesis work.
columns_of_interest = ['FIPS', 'County', 'Cases', 'Deaths',
                       'Risk_Index', 'Risk_Cat']
covid_data_OG = cd_log2[columns_of_interest]
covid_data_OG

Unnamed: 0,FIPS,County,Cases,Deaths,Risk_Index,Risk_Cat
0,1001,Autauga County,39746.0,909.0,65.42,Above Average
1,1003,Baldwin County,76970.0,958.0,68.39,Above Average
2,1005,Barbour County,24625.0,155.0,97.09,High
3,1007,Bibb County,13636.0,103.0,83.36,Above Average
4,1009,Blount County,19311.0,82.0,81.75,Above Average
...,...,...,...,...,...,...
3137,56037,Sweetwater County,7361.0,34.0,10.42,Low
3138,56039,Teton County,13823.0,101.0,2.94,Very low
3139,56041,Uinta County,9737.0,1.0,27.13,Below Average
3140,56043,Washakie County,3104.0,291.0,32.76,Below Average


In [10]:
# Lift the row limit so pandas renders every row of a displayed frame.
pd.options.display.max_rows = None

In [11]:
# Write the selected columns to CSV; the row index is written as the first
# (unnamed) column.
covid_data_OG.to_csv(
    path_or_buf='Covid_Data_Original_Processing_Complete.csv',
    index=True,
)