In [133]:
import numpy as np
import pandas as pd 
import os, time
from datetime import datetime, timedelta

In [134]:
#load in Coronavirus by county data
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
df = df[df['state'] == 'Massachusetts']

In [135]:
df = df[df['cases']>0]
df = df.groupby(['fips', 'date']).sum().reset_index()
df = df.drop('deaths', axis =1)

In [136]:
#create separate dataframes for each county
fips = [25001, 25003, 25005, 25007, 25009, 25011, 25013, 25015, 25017, 25019, 25021, 25023, 25025, 25027]

dfs = {}
for code in fips:
    dfs['df_' + str(code)] = df[df['fips'] == code]    

In [137]:
#create new cases column, set NaN to # of cases and fill 0s back to 2020-02-21
for name, dfe in dfs.items():
    dfe.loc[:,'new cases'] = dfe.loc[:,'cases'].diff(1)
    dfe.loc[:,'new cases'] = dfe.loc[:,'new cases'].fillna(dfe.cases)
    

In [138]:
#set direction of corona flow   key --> listed counties
#ex. county 25001 will lead 25007, 25019
neighbors = {'df_25001': ['df_25007', 'df_25019'], 'df_25003': ['df_25011', 'df_25013', 'df_25015'], 
             'df_25005': [], 'df_25007': [], 'df_25009': [], 'df_25011': [], 'df_25013': ['df_25015'], 'df_25015': [], 
             'df_25017': ['df_25009', 'df_25021', 'df_25027'], 'df_25019': [], 'df_25021': ['df_25005', 'df_25023'], 
             'df_25023': ['df_25001'], 'df_25025': ['df_25009', 'df_25017', 'df_25021'], 'df_25027': ['df_25011', 'df_25013', 'df_25015']}

In [139]:
#get slope between two lists
def get_slope(X, Y):

    xbar = sum(X)/len(X)
    ybar = sum(Y)/len(Y)
    n = len(X) # or len(Y)

    numer = sum([xi*yi for xi,yi in zip(X, Y)]) - n * xbar * ybar
    denum = sum([xi**2 for xi in X]) - n * xbar**2

    b = numer / denum

    return b

In [140]:
#"shifts" data
def compare(df1, df2, leadtime):

    #sets initial date for df1
    datei = str(datetime.strptime((df2['date'].iloc[0]), '%Y-%m-%d') - timedelta(days=leadtime +1))
    
    #fills in data for before first case as 0
    while (str(df1['date'].iloc[0]) > '2020-02-21'):
        info = {'fips' : int(df1['fips'].iloc[0]), 'date' : ((datetime.strptime(str(df1['date'].iloc[0])[0:10], '%Y-%m-%d') - timedelta(days=1))), 'cases' : 0 , 'new cases' : 0}
        df1 = df1.append(info, ignore_index = True)
        df1 = df1.groupby(['date']).sum().reset_index()
    
    df1 = df1[df1['date'] >= datei]
    
    #sets end date for df1
    df1 = df1.drop(df1.tail(leadtime).index)
    
    #sets X and Y to df values
    X = df1.loc[:,'new cases']
    Y = df2.loc[:,'new cases']
    
    return get_slope(X, Y)

In [141]:
#changes leadtime and then selects slope closest to 1
def best_match(df_name):
    df1 = dfs[df_name]
    neigh_best = {}
    for neighbor in neighbors[df_name]:
        lst = []
        for i in range(10):
            lst.append(compare(dfs[df_name], dfs[neighbor], i))
        bestval = lst.index(min(lst, key=lambda x:abs(x-1)))
#uncomment this line to see the list of the slopes  
    #  print(neighbor, lst, bestval)
        neigh_best[neighbor] = bestval
    return(neigh_best)

In [142]:
#if you uncomment the print(neighbor, lst, bestval) line above you can see here that even the best slope is really bad...
best_match('df_25001')

{'df_25007': 0, 'df_25019': 0}

In [143]:
lead_time = {}

for name, dfe in dfs.items():
    lead_time[name] = best_match(name)
    
print(lead_time)

{'df_25001': {'df_25007': 0, 'df_25019': 0}, 'df_25003': {'df_25011': 2, 'df_25013': 0, 'df_25015': 7}, 'df_25005': {}, 'df_25007': {}, 'df_25009': {}, 'df_25011': {}, 'df_25013': {'df_25015': 8}, 'df_25015': {}, 'df_25017': {'df_25009': 6, 'df_25021': 6, 'df_25027': 6}, 'df_25019': {}, 'df_25021': {'df_25005': 0, 'df_25023': 0}, 'df_25023': {'df_25001': 6}, 'df_25025': {'df_25009': 7, 'df_25017': 1, 'df_25021': 7}, 'df_25027': {'df_25011': 5, 'df_25013': 5, 'df_25015': 5}}
