In [1]:
# pip install pandas
# pip install matplotlib

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np
import pandas as pd
import datetime

In [3]:
## Sources for covid-19 in case we lose them
# https://data.dhsgis.wi.gov/datasets/covid-19-historical-data-table/data?where=%20(GEO%20%3D%20%27County%27%20OR%20GEO%20%3D%20%27State%27)%20
# https://www.dhs.wisconsin.gov/publications/p02677.pdf

## Source for population by county
# https://www.wisconsin-demographics.com/counties_by_population

## Source for area of each county
# https://en.wikipedia.org/wiki/List_of_counties_in_Wisconsin

data = pd.read_csv(r'C:\Users\Peter\Desktop\SIIMHackathon\COVID-19_Historical_Data_Table_3.csv')
data.head()

Unnamed: 0,OBJECTID,GEOID,GEO,NAME,LoadDttm,NEGATIVE,POSITIVE,HOSP_YES,HOSP_NO,HOSP_UNK,...,DTH_E_NHSP,DTH_E_UNK,POS_HC_Y,POS_HC_N,POS_HC_UNK,DTH_NEW,POS_NEW,NEG_NEW,TEST_NEW,DATE
0,1,55,State,WI,2020/03/15 19:00:00+00,313.0,32,,,,...,,,,,,,,,,2020/03/15 14:00:00+00
1,2,55001,County,Adams,2020/03/15 19:00:00+00,,0,,,,...,,,,,,,,,,2020/03/15 14:00:00+00
2,3,55003,County,Ashland,2020/03/15 19:00:00+00,,0,,,,...,,,,,,,,,,2020/03/15 14:00:00+00
3,4,55005,County,Barron,2020/03/15 19:00:00+00,,0,,,,...,,,,,,,,,,2020/03/15 14:00:00+00
4,5,55007,County,Bayfield,2020/03/15 19:00:00+00,,0,,,,...,,,,,,,,,,2020/03/15 14:00:00+00


In [4]:
print(data.shape) # 7300 rows
print(data['NAME'].value_counts()) # 72 counties and whole state, so 100 data points

(7300, 104)
Dunn         100
Marquette    100
Waushara     100
Shawano      100
Sauk         100
            ... 
Dodge        100
Adams        100
Polk         100
Vernon       100
Milwaukee    100
Name: NAME, Length: 73, dtype: int64


In [5]:
dates = data['LoadDttm'].value_counts().index

dates[0].split()[0]
sorted_dates = [date.split()[0] for date in dates] # why are there times even
sorted_dates = sorted(sorted_dates, key=lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'))

print('First five : ' + str(sorted_dates[0:5])) # all that to find out there's one per day for a hundred days
print('Last five : ' + str(sorted_dates[-5:]))

First five : ['2020/03/15', '2020/03/16', '2020/03/17', '2020/03/18', '2020/03/19']
Last five : ['2020/06/18', '2020/06/19', '2020/06/20', '2020/06/21', '2020/06/22']


In [6]:
data = data[['NAME', 'LoadDttm',  'POSITIVE']]
data.head()

Unnamed: 0,NAME,LoadDttm,POSITIVE
0,WI,2020/03/15 19:00:00+00,32
1,Adams,2020/03/15 19:00:00+00,0
2,Ashland,2020/03/15 19:00:00+00,0
3,Barron,2020/03/15 19:00:00+00,0
4,Bayfield,2020/03/15 19:00:00+00,0


In [7]:
data2 = pd.read_csv(r'C:\Users\Peter\Desktop\SIIMHackathon\wisconsin_pop_by_county.txt')
print(data2.shape)
data2.head()

(72, 3)


Unnamed: 0,County,Population,Area(sqmi)
0,Milwaukee,954209,241.4
1,Dane,529843,1197.24
2,Waukesha,398879,549.57
3,Brown,259786,529.71
4,Racine,195398,332.5


In [8]:
data = data.drop(data[data['NAME'] == 'WI'].index)
data['Population'] = 0
data['Area'] = 0

for county in data2['County'].value_counts().index:
    data.loc[data['NAME'] == county, 'Population'] = data2[data2['County'] == county]['Population'].values[0]
    data.loc[data['NAME'] == county, 'Area'] = data2[data2['County'] == county]['Area(sqmi)'].values[0]

# Data is already sorted by date so we probably don't have to mess with it
# Split data by days
data_np = np.array_split(data.to_numpy(), 100)
data_np[0]

array([['Adams', '2020/03/15 19:00:00+00', 0, 20073, 645.65],
       ['Ashland', '2020/03/15 19:00:00+00', 0, 15712, 1045.04],
       ['Barron', '2020/03/15 19:00:00+00', 0, 45252, 862.71],
       ['Bayfield', '2020/03/15 19:00:00+00', 0, 14992, 1477.86],
       ['Brown', '2020/03/15 19:00:00+00', 0, 259786, 529.71],
       ['Buffalo', '2020/03/15 19:00:00+00', 0, 13167, 671.64],
       ['Burnett', '2020/03/15 19:00:00+00', 0, 15258, 821.85],
       ['Calumet', '2020/03/15 19:00:00+00', 0, 49807, 318.24],
       ['Chippewa', '2020/03/15 19:00:00+00', 0, 63635, 1008.37],
       ['Clark', '2020/03/15 19:00:00+00', 0, 34491, 1209.82],
       ['Columbia', '2020/03/15 19:00:00+00', 0, 56954, 765.53],
       ['Crawford', '2020/03/15 19:00:00+00', 0, 16288, 570.66],
       ['Dane', '2020/03/15 19:00:00+00', 6, 529843, 1197.24],
       ['Dodge', '2020/03/15 19:00:00+00', 0, 87776, 875.63],
       ['Door', '2020/03/15 19:00:00+00', 0, 27439, 481.98],
       ['Douglas', '2020/03/15 19:00:00+00',

In [9]:
# Need list of delta I and population densities

delta_I = [0] * (len(data_np) * len(data_np[0]))
pop_density = [0] * len(data_np[0])

for i in range(len(data_np[0])):
    pop_density[i] = data_np[0][i][3] / data_np[0][i][4]

for i in range(len(data_np) - 1):
    for j in range(len(data_np[0])):
        
        # County orders are not consistent for some reason, so we must suffer
        county = data_np[i][j][0]
        
        for k in range(len(data_np[0])):
            if data_np[i+1][k][0] == county:
                
                # 0s are for var and population density in next cell
                delta_I[(i * len(data_np[0]) + j)] = [county, data_np[i+1][k][2] - data_np[i][j][2], data_np[i+1][k][3],
                                                      data_np[i+1][k][4], 0, 0]
            
        '''if data_np[i+1][j][2] > 0 and data_np[i][j][2] > 0:
            delta_I[(i * len(data_np[0]) + j)] = data_np[i+1][j][2] - data_np[i][j][2]
            
        if delta_I[(i * len(data_np[0]) + j)] == -808:
            print('i : ' + str(i) + ' j : ' + str(j))
            print(data_np[i][j])
            print(data_np[i+1][j])'''

# Taylor did report a decrease in cumulative cases according to the data
delta_I[-144:-72]     

[['Iron', 0, 5715, 758.17, 0, 0],
 ['Florence', 0, 4337, 488.2, 0, 0],
 ['Forest', 0, 9018, 1014.07, 0, 0],
 ['Pepin', 0, 7262, 231.98, 0, 0],
 ['Rusk', 0, 14183, 913.59, 0, 0],
 ['Ashland', 0, 15712, 1045.04, 0, 0],
 ['Burnett', 0, 15258, 821.85, 0, 0],
 ['Price', 0, 13490, 1254.38, 0, 0],
 ['Taylor', -1, 20356, 974.88, 0, 0],
 ['Buffalo', 0, 13167, 671.64, 0, 0],
 ['Vilas', 0, 21593, 856.6, 0, 0],
 ['Marquette', 1, 15207, 455.6, 0, 0],
 ['Menominee', 0, 4579, 357.61, 0, 0],
 ['Bayfield', 0, 14992, 1477.86, 0, 0],
 ['Washburn', 0, 15689, 797.11, 0, 0],
 ['Lafayette', 2, 16735, 633.59, 0, 0],
 ['Langlade', 0, 19164, 870.64, 0, 0],
 ['Kewaunee', 0, 20360, 342.52, 0, 0],
 ['Adams', 1, 20073, 645.65, 0, 0],
 ['Richland', 0, 17539, 586.15, 0, 0],
 ['Lincoln', 0, 27848, 878.97, 0, 0],
 ['Green Lake', 0, 18757, 349.44, 0, 0],
 ['Sawyer', 0, 16370, 1257.31, 0, 0],
 ['Clark', 2, 34491, 1209.82, 0, 0],
 ['Iowa', 1, 23620, 762.58, 0, 0],
 ['Douglas', 0, 43402, 1304.14, 0, 0],
 ['Oneida', 0, 3534

In [10]:
# Stripped SIR model, var * population_density = dI/dt
# 100*72 data points to estimate var across different population densities

## This can probably be combined with the above one but there's no time

# last row is empty b/c delta I
for i in range(len(delta_I)-72):
    
    # Catch negative cases(?) and throw away 0s, data is daily changes so some counties may have skipped days
    # which messes with the way we're using the data, maybe fix later
    
    if delta_I[i][1] > 0:
        
        # Find matching County
        for j in range(len(data_np[0])):
            
            if delta_I[i][0] == data_np[0][j][0]:
                delta_I[i][4] = data_np[0][j][3] / data_np[0][j][4]
            
        delta_I[i][5] = delta_I[i][1] / delta_I[i][4]
                
delta_I[-144:-72]  

[['Iron', 0, 5715, 758.17, 0, 0],
 ['Florence', 0, 4337, 488.2, 0, 0],
 ['Forest', 0, 9018, 1014.07, 0, 0],
 ['Pepin', 0, 7262, 231.98, 0, 0],
 ['Rusk', 0, 14183, 913.59, 0, 0],
 ['Ashland', 0, 15712, 1045.04, 0, 0],
 ['Burnett', 0, 15258, 821.85, 0, 0],
 ['Price', 0, 13490, 1254.38, 0, 0],
 ['Taylor', -1, 20356, 974.88, 0, 0],
 ['Buffalo', 0, 13167, 671.64, 0, 0],
 ['Vilas', 0, 21593, 856.6, 0, 0],
 ['Marquette', 1, 15207, 455.6, 33.377963125548725, 0.029959886894193467],
 ['Menominee', 0, 4579, 357.61, 0, 0],
 ['Bayfield', 0, 14992, 1477.86, 0, 0],
 ['Washburn', 0, 15689, 797.11, 0, 0],
 ['Lafayette', 2, 16735, 633.59, 26.412980002840953, 0.075720346579026],
 ['Langlade', 0, 19164, 870.64, 0, 0],
 ['Kewaunee', 0, 20360, 342.52, 0, 0],
 ['Adams', 1, 20073, 645.65, 31.08959962828158, 0.03216509739451003],
 ['Richland', 0, 17539, 586.15, 0, 0],
 ['Lincoln', 0, 27848, 878.97, 0, 0],
 ['Green Lake', 0, 18757, 349.44, 0, 0],
 ['Sawyer', 0, 16370, 1257.31, 0, 0],
 ['Clark', 2, 34491, 1209.8

In [34]:
# Also can be combined with above loops

county_var_list = {}
for i in range(len(data_np[0])):
    # first value is sum, second is count
    county_var_list[data_np[0][i][0]] = [0, 0]

for i in range(len(delta_I)-72):
    
    if delta_I[i][5] != 0:
        county_var_list[delta_I[i][0]][0] += delta_I[i][5]
        county_var_list[delta_I[i][0]][1] += 1

county_var_list

{'Adams': [0.3538160713396103, 11],
 'Ashland': [0.1995366598778004, 3],
 'Barron': [0.6291308671439937, 19],
 'Bayfield': [0.2957297225186766, 3],
 'Brown': [5.262722048147323, 88],
 'Buffalo': [0.459084073820916, 9],
 'Burnett': [0.21545418796696814, 4],
 'Calumet': [0.5878306262171984, 49],
 'Chippewa': [1.1884615384615385, 47],
 'Clark': [2.209812994694269, 40],
 'Columbia': [0.9543250693542153, 41],
 'Crawford': [1.0861038801571705, 19],
 'Dane': [2.5624008621421814, 97],
 'Dodge': [4.389322821728037, 66],
 'Door': [0.7553168847261196, 24],
 'Douglas': [0.6310064052347821, 19],
 'Dunn': [0.5731336239831004, 23],
 'Eau Claire': [1.0159015836335206, 62],
 'Florence': [0.22513258012451, 2],
 'Fond du Lac': [2.0043175487465175, 76],
 'Forest': [4.160633178088269, 16],
 'Grant': [2.8545120398240322, 55],
 'Green': [1.2672743055555558, 43],
 'Green Lake': [0.4471162765900729, 20],
 'Iowa': [0.7748484335309059, 18],
 'Iron': [0.26532633420822394, 2],
 'Jackson': [1.44502096947235, 22],
 

In [36]:
for key, value in county_var_list.items():
    value.append(value[0] / value[1])

[0.3538160713396103, 11, 0.032165097394510025]
[0.1995366598778004, 3, 0.0665122199592668]
[0.6291308671439937, 19, 0.03311215090231546]
[0.2957297225186766, 3, 0.09857657417289219]
[5.262722048147323, 88, 0.05980365963803776]
[0.459084073820916, 9, 0.05100934153565733]
[0.21545418796696814, 4, 0.053863546991742034]
[0.5878306262171984, 49, 0.011996543392187722]
[1.1884615384615385, 47, 0.025286415711947625]
[2.209812994694269, 40, 0.05524532486735673]
[0.9543250693542153, 41, 0.02327622120376135]
[1.0861038801571705, 19, 0.05716336211353529]
[2.5624008621421814, 97, 0.02641650373342455]
[4.389322821728037, 66, 0.06650489123830358]
[0.7553168847261196, 24, 0.03147153686358831]
[0.6310064052347821, 19, 0.033210863433409586]
[0.5731336239831004, 23, 0.02491885321665654]
[1.0159015836335206, 62, 0.01638550941344388]
[0.22513258012451, 2, 0.112566290062255]
[2.0043175487465175, 76, 0.026372599325612072]
[4.160633178088269, 16, 0.26003957363051683]
[2.8545120398240322, 55, 0.051900218905891

In [42]:
# dI/dt = x * population_density
# dI/dt = x * (pop / area)
# delta(dI/dt) = x * (pop1 - pop2 / area) - trying to find change in dI/dt as a result of one person vaccinated

deltadelta_dict = {}

for key, value in county_var_list.items():
    
    for i in range(len(data_np[0])):
        if data_np[0][i][0] == key:
            population = data_np[0][i][2]
            area = data_np[0][i][3]
        
    deltadelta_I = value[2] * ((population - (population-1)) / area)
    
    deltadelta_dict[key] = deltadelta_I
    
# These are really small but they are also per day, so we need average infectious period to multiply

# units are change in infected per 1 person vaccinated per square mile per day
deltadelta_dict

{'Adams': 0.032165097394510025,
 'Ashland': 0.0665122199592668,
 'Barron': 0.03311215090231546,
 'Bayfield': 0.09857657417289219,
 'Brown': 0.05980365963803776,
 'Buffalo': 0.05100934153565733,
 'Burnett': 0.053863546991742034,
 'Calumet': 0.011996543392187722,
 'Chippewa': 0.025286415711947625,
 'Clark': 0.05524532486735673,
 'Columbia': 0.02327622120376135,
 'Crawford': 0.05716336211353529,
 'Dane': 0.02641650373342455,
 'Dodge': 0.06650489123830358,
 'Door': 0.03147153686358831,
 'Douglas': 0.033210863433409586,
 'Dunn': 0.02491885321665654,
 'Eau Claire': 0.01638550941344388,
 'Florence': 0.112566290062255,
 'Fond du Lac': 0.026372599325612072,
 'Forest': 0.26003957363051683,
 'Grant': 0.05190021890589149,
 'Green': 0.02947149547803618,
 'Green Lake': 0.022355813829503645,
 'Iowa': 0.04304713519616144,
 'Iron': 0.13266316710411197,
 'Jackson': 0.06568277133965227,
 'Jefferson': 0.017243841281646195,
 'Juneau': 0.034835383625421115,
 'Kenosha': 0.02416917833703846,
 'Kewaunee': 0.02

In [41]:
deltadelta_dict['Adams']

1.602406087506104e-06