In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

In [2]:
# Load the merged table and keep only rows that carry both a population and
# an area value, then renumber the rows from zero.
DF = (
    pd.read_csv('data_cleaned/MERGED_PETER.csv')
    .dropna(subset=['Population', 'Area_(sqmi)'])
    .reset_index(drop=True)
)
# Cell output: the dtype of every column.
DF.dtypes

Unique_ID                               object
date                                    object
county                                  object
state                                   object
fips                                   float64
cases                                    int64
deaths                                   int64
Latitude                               float64
Longitude                              float64
Total_Hospital_Beds                     object
Total_ICU_Beds                          object
Available_Hospital_Beds                 object
Potentially_Available_Hospital_Beds     object
Available_ICU_Beds                     float64
Potentially_Available_ICU_Beds          object
Adult_Population                        object
Population_65+                          object
Area_(sqmi)                            float64
Population                             float64
dtype: object

In [3]:
# Every column from position 5 onward is turned into a float column.
# Values are stringified first so thousands separators (',') can be stripped.
columns_to_convert = DF.columns[5:]
for column_name in columns_to_convert :
    DF[column_name] = DF[column_name].map(lambda raw : float(str(raw).replace(',','')))
# Cell output: dtypes after the conversion.
DF.dtypes

Unique_ID                               object
date                                    object
county                                  object
state                                   object
fips                                   float64
cases                                  float64
deaths                                 float64
Latitude                               float64
Longitude                              float64
Total_Hospital_Beds                    float64
Total_ICU_Beds                         float64
Available_Hospital_Beds                float64
Potentially_Available_Hospital_Beds    float64
Available_ICU_Beds                     float64
Potentially_Available_ICU_Beds         float64
Adult_Population                       float64
Population_65+                         float64
Area_(sqmi)                            float64
Population                             float64
dtype: object

In [4]:
# IDs that have a non-null Total_Hospital_Beds value in at least one row.
hospital_ids = np.unique(DF.loc[~pd.isna(DF['Total_Hospital_Beds']),['Unique_ID']].values)

# One [Latitude, Longitude] pair per such ID, read from the first row of that
# ID in DF order.  drop_duplicates + isin gives the same result as walking the
# frame row by row and scanning a growing "seen" list for every row, without
# the quadratic cost.
_first_rows = DF.drop_duplicates(subset='Unique_ID', keep='first')
_hospital_rows = _first_rows[_first_rows['Unique_ID'].isin(hospital_ids)]

# item_seen is kept because the row-by-row version left it defined: the IDs in
# the same order as hospital_locations.
item_seen = _hospital_rows['Unique_ID'].tolist()
hospital_locations = _hospital_rows[['Latitude','Longitude']].values.tolist()


In [5]:
# Population divided by area (the area column is named in square miles).
DF['Pop_Density'] = DF['Population'].div(DF['Area_(sqmi)'])

In [6]:
# Minimum Pop_Density for an ID to count as a "big city".
BIG_CITY_MIN_DENSITY = 3000

# IDs with at least one row at or above the density cutoff.
big_city_ids = set(DF[DF['Pop_Density'] >= BIG_CITY_MIN_DENSITY]['Unique_ID'])

# One [Latitude, Longitude] pair per big-city ID, read from the first row of
# that ID in DF order.  drop_duplicates + isin gives the same result as walking
# the frame row by row and scanning a growing "seen" list, without the
# quadratic cost.
_first_rows = DF.drop_duplicates(subset='Unique_ID', keep='first')
_big_city_rows = _first_rows[_first_rows['Unique_ID'].isin(list(big_city_ids))]

# item_seen is kept because the row-by-row version left it defined: the IDs in
# the same order as big_city_locations.
item_seen = _big_city_rows['Unique_ID'].tolist()
big_city_locations = _big_city_rows[['Latitude','Longitude']].values.tolist()


In [7]:
def Proximity(location_x,location_y,city_loc,hospital_loc) :
    """Return the distance to the closest city and to the closest hospital.

    Distances are plain Euclidean distances in the coordinate space of the
    inputs (sqrt(dx**2 + dy**2)); no unit conversion is applied.

    Parameters
    ----------
    location_x, location_y : coordinates of the query point.
    city_loc : iterable of 2-item sequences, compared as (x, y) against the query point.
    hospital_loc : iterable of 2-item sequences, same layout as city_loc.

    Returns
    -------
    (min_distance_city, min_distance_hosp). Each value is 1e10 when the
    corresponding collection is empty or holds no point closer than 1e10.
    """
    far_away = 1e10

    def closest(points) :
        # The sentinel goes first; min() replaces its running best only on a
        # strict "<", so ties and non-comparable values (NaN) keep the earlier
        # candidate.
        candidates = [far_away]
        candidates.extend(
            np.sqrt((pt[0] - location_x)**2 + (pt[1] - location_y)**2)
            for pt in points
        )
        return min(candidates)

    return closest(city_loc),closest(hospital_loc)

In [8]:
# For every row, the distance from its coordinates to the closest big city
# and to the closest hospital location.
nearest_big_city,nearest_hospital = [],[]
for row_label,lat,lon in DF[['Latitude','Longitude']].itertuples() :
    # Progress trace: print the row label once every 500 rows.
    if row_label%500 == 0 :
        print(row_label,end=',')
    city_dist,hosp_dist = Proximity(lat,lon,big_city_locations,hospital_locations)
    nearest_big_city.append(city_dist)
    nearest_hospital.append(hosp_dist)

DF['Nearest_Hospital'] = nearest_hospital
DF['Nearest_BigCity'] = nearest_big_city

0,500,1000,1500,2000,2500,3000,3500,4000,4500,5000,5500,6000,6500,7000,7500,8000,8500,9000,9500,10000,10500,11000,11500,12000,12500,13000,13500,14000,14500,15000,15500,16000,16500,17000,17500,18000,18500,19000,19500,20000,20500,21000,21500,22000,22500,23000,23500,24000,24500,25000,25500,26000,26500,

In [10]:
# Earliest date string seen for each Unique_ID (keys in first-appearance
# order).  A single groupby replaces one full-frame boolean scan per ID;
# min() on the date strings picks the same value as sorted(...)[0].
min_date = DF.groupby('Unique_ID', sort=False)['date'].min().to_dict()

In [16]:
# For every row, the calendar day preceding its date, as a 'YYYY-MM-DD' string.
# The format must use %m (month).  %M means *minute*: with it every date is
# parsed as January, so the result is wrong at month starts
# (e.g. '2020-03-01' came out as '2019-03-31' instead of '2020-02-29').
DATE_FORMAT = '%Y-%m-%d'
ONE_DAY = datetime.timedelta(days=1)

day_before = [
    (datetime.datetime.strptime(date,DATE_FORMAT) - ONE_DAY).strftime(DATE_FORMAT)
    for date in DF['date']
]
    

In [18]:
# Attach the previous-day date strings as a column, one per row.
DF = DF.assign(day_before_index=day_before)

In [19]:
def New_Cases(col) :
    """Turn the running total in DF[col] into a per-day increment.

    Reads the notebook-level DF (columns 'Unique_ID', 'date',
    'day_before_index' and `col`) and the notebook-level min_date mapping
    (Unique_ID -> first date of that ID).

    Parameters
    ----------
    col : name of the column in DF that holds the running total.

    Returns
    -------
    list with one entry per row of DF, in row order:
      * the row's own value when the row is the first date of its Unique_ID;
      * value minus the value of the same Unique_ID on 'day_before_index';
      * np.nan when no row exists for the same Unique_ID on that previous day.

    Raises
    ------
    KeyError if a required column, or a Unique_ID in min_date, is missing.
    Such errors are no longer swallowed and turned into NaN.
    """
    # (Unique_ID, date) -> value.  setdefault keeps the first row for a given
    # key, matching a "take the first matching row" lookup, and avoids
    # re-scanning the whole frame for every row.
    value_on = {}
    for uid,day,value in zip(DF['Unique_ID'],DF['date'],DF[col]) :
        value_on.setdefault((uid,day),value)

    new_cases_per_day = []
    rows = zip(DF['Unique_ID'],DF['date'],DF['day_before_index'],DF[col])
    for uid,day,previous_day,value in rows :
        if day == min_date[uid] :
            # First observation of this ID: the whole total counts as new.
            to_add = value
        elif (uid,previous_day) in value_on :
            to_add = value - value_on[(uid,previous_day)]
        else :
            # Gap in the series: the previous day was not recorded.
            to_add = np.nan
        new_cases_per_day.append(to_add)
    return new_cases_per_day
        
        
    
# Per-day increments derived from the two running-total columns.
for increment_col,total_col in (('New_Cases','cases'),('New_Deaths','deaths')) :
    DF[increment_col] = New_Cases(total_col)

In [21]:
# Write the enriched table to disk without the row index.
distance_csv_path = 'data_cleaned/Distance_Calculations.csv'
DF.to_csv(distance_csv_path,index=False)

In [23]:
## Load Steve's lagged dataframe.
## From this cell on, DF refers to this table rather than the one built above.

lagged_csv_path = 'output/MASTER_filtered_withlag.csv'
DF = pd.read_csv(lagged_csv_path)

In [25]:
# Preview the first five rows of the lagged table.
DF.head(n=5)

Unnamed: 0,Unique_ID,date,county,state,fips,cases,deaths,Latitude,Longitude,Total_Hospital_Beds,...,Area_.sqmi.,Population,Pop_Density,Nearest_Hospital,Nearest_BigCity,day_before_index,New_Cases,New_Deaths,day_before_cases,day_before_deaths
0,Snohomish_Washington,2020-01-21,Snohomish,Washington,53061.0,1,0,48.04616,-121.71707,,...,2090.0,822083,393.341148,1.866215,10.319262,2020-01-20,1.0,0.0,,
1,Snohomish_Washington,2020-01-22,Snohomish,Washington,53061.0,1,0,48.04616,-121.71707,,...,2090.0,822083,393.341148,1.866215,10.319262,2020-01-21,0.0,0.0,1.0,0.0
2,Snohomish_Washington,2020-01-23,Snohomish,Washington,53061.0,1,0,48.04616,-121.71707,,...,2090.0,822083,393.341148,1.866215,10.319262,2020-01-22,0.0,0.0,0.0,0.0
3,Cook_Illinois,2020-01-24,Cook,Illinois,17031.0,1,0,41.841448,-87.816588,,...,946.0,5150233,5444.22093,1.173137,0.0,2020-01-23,1.0,0.0,,
4,Snohomish_Washington,2020-01-24,Snohomish,Washington,53061.0,1,0,48.04616,-121.71707,,...,2090.0,822083,393.341148,1.866215,10.319262,2020-01-23,0.0,0.0,0.0,0.0


In [27]:
# Per-capita versions of the daily increment and of the two previous-day
# columns: each is divided by the row's Population.
per_capita_sources = (
    ('cases_normalized','New_Cases'),
    ('day_before_cases_normalized','day_before_cases'),
    ('day_before_deaths_normalized','day_before_deaths'),
)
for normalized_col,source_col in per_capita_sources :
    DF[normalized_col] = DF[source_col].div(DF['Population'])

# Rows without a previous-day case value cannot be used; drop them and
# renumber the remaining rows from zero.
DF = DF.dropna(subset=['day_before_cases_normalized']).reset_index(drop=True)

In [28]:
# Not used in this cell; left defined in case another cell refers to it.
ind_holdout = []

unique_ids = DF['Unique_ID'].unique()

# For every Unique_ID, the last two date strings in sorted order (fewer if the
# ID has fewer rows).  One groupby pass replaces a full-frame boolean scan per
# ID; sort=False keeps the keys in first-appearance order, like unique().
holdouts = {
    county: sorted(dates)[-2:]
    for county,dates in DF.groupby('Unique_ID', sort=False)['date']
}

# Index labels of the rows whose date is one of the held-out dates of their ID.
holdout_indices = [
    i
    for i,unique_id,date in zip(DF.index,DF['Unique_ID'],DF['date'])
    if date in holdouts[unique_id]
]
        
    

In [29]:
# Pull the held-out rows (holdout_indices is used positionally), then
# renumber them from zero.
held_out_rows = DF.iloc[holdout_indices]
Holdout_Set = held_out_rows.reset_index(drop=True)

In [30]:
# Everything that is not held out becomes training data.  setdiff1d returns
# the sorted, de-duplicated index values that are absent from holdout_indices.
all_row_labels = np.asarray(DF.index)
indices_train = np.setdiff1d(all_row_labels,holdout_indices)

training_rows = DF.iloc[indices_train]
Training_Set = training_rows.reset_index(drop=True)

In [31]:
# Columns removed from the feature matrices ('cases' is listed twice; the
# duplicate has no effect on the drop).
# NOTE(review): 'New_Cases' and 'Population' are not dropped, while the target
# 'cases_normalized' is New_Cases/Population, which looks like target leakage.
# 'Unique_ID' and 'Unnamed: 0' also stay in X.  Confirm whether a later step
# removes them.
cols_drop = [
    'date','county','state','fips','Latitude','Longitude',
    'Total_Hospital_Beds', 'Total_ICU_Beds',
    'Available_Hospital_Beds', 'Potentially_Available_Hospital_Beds',
    'Available_ICU_Beds', 'Potentially_Available_ICU_Beds',
    'Adult_Population', 'Population_65.','Area_.sqmi.',
    'day_before_index','cases','deaths',
    'day_before_cases', 'day_before_deaths','cases','cases_normalized',
]

# X = remaining columns, Y = per-capita new cases.
Holdout_Set_X = Holdout_Set.drop(columns=cols_drop)
Holdout_Set_Y = Holdout_Set['cases_normalized']
Training_Set_X = Training_Set.drop(columns=cols_drop)
Training_Set_Y = Training_Set['cases_normalized']

In [32]:

# Persist the four splits; the row index is written as the first column.
split_outputs = (
    (Holdout_Set_X,'output/Holdout_Set_X.csv'),
    (Holdout_Set_Y,'output/Holdout_Set_Y.csv'),
    (Training_Set_X,'output/Training_Set_X.csv'),
    (Training_Set_Y,'output/Training_Set_Y.csv'),
)
for split_data,csv_path in split_outputs :
    split_data.to_csv(csv_path,index=True)

# Now actually Model the Data

In [297]:
def Standardize_Data(Train_X,Test_X) :
    """Standardize every column using statistics of the training frame.

    Each column of Train_X is shifted by its training mean and divided by its
    training standard deviation (np.std); the same mean and standard deviation
    are then applied to the matching column of Test_X.

    The inputs are not modified: the function works on copies, so re-running
    the calling cell does not standardize already-standardized data.

    A column whose training standard deviation is exactly 0 is only centred
    (divided by 1), which avoids NaN/inf from a division by zero.

    Parameters
    ----------
    Train_X : DataFrame the statistics are computed from.
    Test_X : DataFrame holding at least the columns of Train_X.

    Returns
    -------
    (Train_X_standardized, Test_X_standardized, mean_vals, std_vals), where
    mean_vals and std_vals map column name -> training mean / training
    standard deviation (the real value, 0 included).

    Raises
    ------
    KeyError if Test_X lacks one of the columns of Train_X.
    """
    Train_X = Train_X.copy()
    Test_X = Test_X.copy()

    mean_vals = {}
    std_vals = {}

    for col in Train_X.columns :
        mean_vals[col] = np.mean(Train_X[col])
        std_vals[col] = np.std(Train_X[col])

        # Constant column: dividing by 0 would turn every value into NaN/inf.
        scale = 1.0 if std_vals[col] == 0 else std_vals[col]

        Train_X[col] = (Train_X[col] - mean_vals[col])/scale
        Test_X[col] = (Test_X[col] - mean_vals[col])/scale

    return Train_X,Test_X,mean_vals,std_vals

In [301]:
# Fixed seed so that Restart & Run All reproduces the same split.
SPLIT_SEED = 0

# Shuffled copy of the training-set index labels.
rand_inds = np.random.default_rng(SPLIT_SEED).permutation(Training_Set_X.index)

# Fraction of the shuffled rows used for fitting; the rest is for testing.
percent_training=.7

# Split point computed once so both slices always agree.
n_training = int(percent_training*len(rand_inds))
trainings_ind = rand_inds[:n_training]
testing_ind = rand_inds[n_training:]