# How I created the CSV for the Paper

I used the automate notebook to generate `Paper_Data_avg_cases_with_New_York_City.csv`, then used this notebook to add the county population data to it.

# Add Population to CSV with New York City

According to the website below:
"In some instances, we report data from multiple counties or other non-county geographies as a single county. For instance, we report a single value for New York City, comprising the cases for New York, Kings, Queens, Bronx and Richmond Counties."

https://github.com/nytimes/covid-19-data

The CSV produced by this notebook therefore contains a single combined "New York City" entry in place of those five counties.

In [2]:
#below are the packages I will be using
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date

# Path Variables
The default paths below are relative and assume the directory layout of the GitHub `descon-uccs/pandemic-data` repository.

In [6]:
# All three locations live in the same input directory, so spell it out once
inputDirectory = '../Data/Jupyter Notebook Input/'

# CSV produced by the automate notebook (input to this notebook)
paperFilepath = inputDirectory + 'Paper_Data_avg_cases_with_New_York_City.csv'

# County population table
populationFilepath = inputDirectory + 'covid_county_population_usafacts.csv'

# Directory in which the final paper CSV is written
finalPaperFilePath = inputDirectory

In [4]:
# Load the averaged-cases CSV created by the automate notebook.
# The file is read from paperFilepath (set in the "Path Variables" cell),
# so that variable must point at the CSV's actual location.
dataset = pd.read_csv(filepath_or_buffer=paperFilepath)
dataset

Unnamed: 0,date,fips,state,county,cases,vaccinations,population
0,2020-03-24,1001,Alabama,Autauga,0.14,0,0
1,2020-03-25,1001,Alabama,Autauga,0.57,0,0
2,2020-03-26,1001,Alabama,Autauga,0.86,0,0
3,2020-03-27,1001,Alabama,Autauga,0.86,0,0
4,2020-03-28,1001,Alabama,Autauga,0.86,0,0
...,...,...,...,...,...,...,...
1185483,2021-04-11,56045,Wyoming,Weston,0.75,1379,0
1185484,2021-04-12,56045,Wyoming,Weston,0.86,1379,0
1185485,2021-04-13,56045,Wyoming,Weston,0.88,1380,0
1185486,2021-04-14,56045,Wyoming,Weston,1.00,1392,0


In [5]:
# Sanity check: print the earliest and latest date present in the case data
paper_dates = dataset["date"]
result1 = paper_dates.unique().tolist()
print(min(result1), max(result1))

2020-01-21 2021-04-15


## Add population data

In [9]:
# Read the county population table from the location given by populationFilepath
dataset3 = pd.read_csv(filepath_or_buffer=populationFilepath)
dataset3

Unnamed: 0,countyFIPS,County Name,State,population
0,0,Statewide Unallocated,AL,0
1,1001,Autauga County,AL,55869
2,1003,Baldwin County,AL,223234
3,1005,Barbour County,AL,24686
4,1007,Bibb County,AL,22394
...,...,...,...,...
3190,56037,Sweetwater County,WY,42343
3191,56039,Teton County,WY,23464
3192,56041,Uinta County,WY,20226
3193,56043,Washakie County,WY,7805


In [10]:
# Look for a population row with FIPS 36998 (the code later assigned to New York City)
dataset3.loc[dataset3['countyFIPS'].eq(36998)]

Unnamed: 0,countyFIPS,County Name,State,population


In [11]:
# Count the distinct FIPS codes that occur in the case data
test1 = dataset["fips"]
result1 = test1.unique().tolist()
len(result1)

3133

In [12]:
# Count the distinct FIPS codes that occur in the population table
test2 = dataset3["countyFIPS"]
result2 = test2.unique().tolist()
len(result2)

3145

In [13]:
# FIPS codes that have a population row but never occur in the case data.
# Their rows are dropped from the population table in the following cell.
population_only_fips = set(result2).difference(set(result1))
unwanted_fips = list(population_only_fips)
unwanted_fips

[0,
 36005,
 15005,
 2282,
 2060,
 36047,
 6000,
 36081,
 2164,
 36085,
 2105,
 36061,
 2270]

In [38]:
# Keep only the population rows whose FIPS is actually needed, then renumber the index
is_wanted_fips = ~dataset3['countyFIPS'].isin(unwanted_fips)
population_df = dataset3.loc[is_wanted_fips].reset_index(drop=True)
population_df

Unnamed: 0,countyFIPS,County Name,State,population
0,1001,Autauga County,AL,55869
1,1003,Baldwin County,AL,223234
2,1005,Barbour County,AL,24686
3,1007,Bibb County,AL,22394
4,1009,Blount County,AL,57826
...,...,...,...,...
3127,56037,Sweetwater County,WY,42343
3128,56039,Teton County,WY,23464
3129,56041,Uinta County,WY,20226
3130,56043,Washakie County,WY,7805


In [61]:
# Show the population rows of the five counties (Bronx, Kings, New York, Queens,
# Richmond) that the case data reports together as New York City.
# The FIPS list is defined here as well because this cell sits above the cell
# that builds the combined row; relying on that later definition makes a fresh
# "Restart Kernel -> Run All" fail with a NameError.
New_York_City_counties = [36005, 36047, 36061, 36081, 36085]
dataset3[dataset3['countyFIPS'].isin(New_York_City_counties)]

Unnamed: 0,countyFIPS,County Name,State,population
1865,36005,Bronx County,NY,1418207
1886,36047,Kings County,NY,2559903
1893,36061,New York County,NY,1628706
1903,36081,Queens County,NY,2253858
1905,36085,Richmond County,NY,476143


In [44]:
#below I set the population of New York City County to the sum of New York, Kings, Queens, Bronx and Richmond Counties
New_York_City_counties = [36005, 36047, 36061, 36081, 36085]
temp_NY_populations = dataset3[dataset3['countyFIPS'].isin(New_York_City_counties)]
New_York_City_population = temp_NY_populations['population'].sum()

# Row for the combined entry; 36998 is presumably the FIPS the case CSV uses for
# New York City (verify against the automate notebook).
temp_NY_df = {'countyFIPS': 36998, 'County Name': 'New York City', 'State': 'NY', 'population':New_York_City_population}

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any 36998 row left over from a previous run of this cell is dropped first, so
# re-running the cell does not add a duplicate New York City row.
population_df = pd.concat(
    [population_df[population_df['countyFIPS'] != 36998], pd.DataFrame([temp_NY_df])],
    ignore_index=True,
)
population_df

Unnamed: 0,countyFIPS,County Name,State,population
0,1001,Autauga County,AL,55869
1,1003,Baldwin County,AL,223234
2,1005,Barbour County,AL,24686
3,1007,Bibb County,AL,22394
4,1009,Blount County,AL,57826
...,...,...,...,...
3128,56039,Teton County,WY,23464
3129,56041,Uinta County,WY,20226
3130,56043,Washakie County,WY,7805
3131,56045,Weston County,WY,6927


In [45]:
# Pair every county FIPS with its population, in table order
fips_column = population_df['countyFIPS']
population_column = population_df['population']
fips_populations = list(zip(fips_column, population_column))
fips_populations

[(1001, 55869),
 (1003, 223234),
 (1005, 24686),
 (1007, 22394),
 (1009, 57826),
 (1011, 10101),
 (1013, 19448),
 (1015, 113605),
 (1017, 33254),
 (1019, 26196),
 (1021, 44428),
 (1023, 12589),
 (1025, 23622),
 (1027, 13235),
 (1029, 14910),
 (1031, 52342),
 (1033, 55241),
 (1035, 12067),
 (1037, 10663),
 (1039, 37049),
 (1041, 13772),
 (1043, 83768),
 (1045, 49172),
 (1047, 37196),
 (1049, 71513),
 (1051, 81209),
 (1053, 36633),
 (1055, 102268),
 (1057, 16302),
 (1059, 31362),
 (1061, 26271),
 (1063, 8111),
 (1065, 14651),
 (1067, 17205),
 (1069, 105882),
 (1071, 51626),
 (1073, 658573),
 (1075, 13805),
 (1077, 92729),
 (1079, 32924),
 (1081, 164542),
 (1083, 98915),
 (1085, 9726),
 (1087, 18068),
 (1089, 372909),
 (1091, 18863),
 (1093, 29709),
 (1095, 96774),
 (1097, 413210),
 (1099, 20733),
 (1101, 226486),
 (1103, 119679),
 (1105, 8923),
 (1107, 19930),
 (1109, 33114),
 (1111, 22722),
 (1113, 57961),
 (1115, 89512),
 (1117, 217702),
 (1119, 12427),
 (1121, 79978),
 (1123, 40367),


In [48]:
# Work on an independent copy: a plain `paper_df = dataset` only creates a second
# name for the same DataFrame, so filling in the population column below would
# also silently change `dataset` (and make its earlier displayed output stale).
paper_df = dataset.copy()
paper_df

Unnamed: 0,date,fips,state,county,cases,vaccinations,population
0,2020-03-24,1001,Alabama,Autauga,0.14,0,0
1,2020-03-25,1001,Alabama,Autauga,0.57,0,0
2,2020-03-26,1001,Alabama,Autauga,0.86,0,0
3,2020-03-27,1001,Alabama,Autauga,0.86,0,0
4,2020-03-28,1001,Alabama,Autauga,0.86,0,0
...,...,...,...,...,...,...,...
1185483,2021-04-11,56045,Wyoming,Weston,0.75,1379,0
1185484,2021-04-12,56045,Wyoming,Weston,0.86,1379,0
1185485,2021-04-13,56045,Wyoming,Weston,0.88,1380,0
1185486,2021-04-14,56045,Wyoming,Weston,1.00,1392,0


In [49]:
# Fill the population column from the (fips, population) pairs in one vectorized
# lookup. The previous per-county loop rescanned the whole fips column once for
# each of the 3133 counties, which is why it took a few minutes.
population_by_fips = dict(fips_populations)
existing_population = paper_df['population']
paper_df['population'] = (
    paper_df['fips']
    .map(population_by_fips)            # NaN where a fips has no population entry
    .fillna(existing_population)        # such rows keep their current value, as before
    .astype(existing_population.dtype)  # undo the float upcast that NaN may have caused
)

In [50]:
# Re-check the date range after adding populations; it should match the input's range
paper_dates2 = paper_df["date"]
result2 = paper_dates2.unique().tolist()
print(min(result2), max(result2))

2020-01-21 2021-04-15


In [55]:
# Re-check the number of distinct FIPS codes after adding populations
paper_fips = paper_df["fips"]
result3 = paper_fips.unique().tolist()
print(len(result3))

3133


In [56]:
paper_df

Unnamed: 0,date,fips,state,county,cases,vaccinations,population
0,2020-03-24,1001,Alabama,Autauga,0.14,0,55869
1,2020-03-25,1001,Alabama,Autauga,0.57,0,55869
2,2020-03-26,1001,Alabama,Autauga,0.86,0,55869
3,2020-03-27,1001,Alabama,Autauga,0.86,0,55869
4,2020-03-28,1001,Alabama,Autauga,0.86,0,55869
...,...,...,...,...,...,...,...
1185483,2021-04-11,56045,Wyoming,Weston,0.75,1379,6927
1185484,2021-04-12,56045,Wyoming,Weston,0.86,1379,6927
1185485,2021-04-13,56045,Wyoming,Weston,0.88,1380,6927
1185486,2021-04-14,56045,Wyoming,Weston,1.00,1392,6927


In [57]:
# Save the finished table, without the index column, under finalPaperFilePath
final_csv_path = finalPaperFilePath + 'Final_Paper_Data_avg_cases_with_New_York_City.csv'
paper_df.to_csv(path_or_buf=final_csv_path, index=False)