In [1]:
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [2]:
%matplotlib inline
import numpy as np
import sklearn
import sklearn.datasets
import pandas as pd
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [3]:
import tensorflow
tensorflow.keras.__version__

'2.2.4-tf'

In [4]:
pd.options.display.max_rows = 300

In [5]:
pd.options.display.max_columns = 200

In [6]:
# Read in CSV files - covid file with all dates
covid = pd.read_csv('../COVID-19-Predictive-Modelling/assets/data/COVID County Data/covid_county_data_cleaned.csv')
social = pd.read_csv('../COVID-19-Predictive-Modelling/assets/data/SVI2018_US_COUNTY_2.csv')

In [7]:
covid.head()

Unnamed: 0.1,Unnamed: 0,name,date,fips,lat,long,confirmed,deaths,confirmed_diff,deaths_diff,last_update,state,state_abbr
0,0,Jefferson,3/22/20,1073,33.555547,-86.895063,71,0,0,0,3/22/20 23:45,Alabama,AL
1,1,Shelby,3/22/20,1117,33.268798,-86.662326,17,0,0,0,3/22/20 23:45,Alabama,AL
2,2,Lee,3/22/20,1081,32.601549,-85.351322,16,0,0,0,3/22/20 23:45,Alabama,AL
3,3,Madison,3/22/20,1089,34.763271,-86.550696,16,0,0,0,3/22/20 23:45,Alabama,AL
4,4,Tuscaloosa,3/22/20,1125,33.287261,-87.525568,7,0,0,0,3/22/20 23:45,Alabama,AL


In [8]:
# Drop Unnamed: 0 column
covid.drop('Unnamed: 0', axis=1, inplace=True)
covid.head()

Unnamed: 0,name,date,fips,lat,long,confirmed,deaths,confirmed_diff,deaths_diff,last_update,state,state_abbr
0,Jefferson,3/22/20,1073,33.555547,-86.895063,71,0,0,0,3/22/20 23:45,Alabama,AL
1,Shelby,3/22/20,1117,33.268798,-86.662326,17,0,0,0,3/22/20 23:45,Alabama,AL
2,Lee,3/22/20,1081,32.601549,-85.351322,16,0,0,0,3/22/20 23:45,Alabama,AL
3,Madison,3/22/20,1089,34.763271,-86.550696,16,0,0,0,3/22/20 23:45,Alabama,AL
4,Tuscaloosa,3/22/20,1125,33.287261,-87.525568,7,0,0,0,3/22/20 23:45,Alabama,AL


In [9]:
# For the SVI, -999 reflects missing values. Apparantly, only one county has -999 vaules in its dataset, so removing.
social =  social[(social.iloc[:, 1:] != -999).all(axis=1)]
social.head()

Unnamed: 0,ST,STATE,COUNTY,fips,AREA_SQMI,E_TOTPOP,E_HU,E_HH,E_POV,E_UNEMP,E_PCI,E_NOHSDP,E_AGE65,E_AGE17,E_DISABL,E_SNGPNT,E_MINRTY,E_LIMENG,E_MUNIT,E_MOBILE,E_CROWD,E_NOVEH,E_GROUPQ
1,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546
2,1,ALABAMA,Blount,1009,644.83046,57645,24222,20600,8220,909,22656,7861,10233,13468,8114,1437,7413,934,211,6108,339,856,543
3,1,ALABAMA,Butler,1013,776.838201,20025,10026,6708,4640,567,20430,2141,3806,4566,3492,704,9641,93,134,2625,119,520,322
4,1,ALABAMA,Calhoun,1015,605.867251,115098,53682,45033,20819,4628,24706,12620,19386,25196,23598,4701,31675,1076,1990,7904,772,2599,3112
5,1,ALABAMA,Chambers,1017,596.560643,33826,16981,13516,5531,773,22827,4383,6409,7006,5570,1307,14954,36,679,2378,404,989,512


In [10]:
# Merge covid and social on fips
merged = pd.merge(social, covid, on='fips')
merged.head()

Unnamed: 0,ST,STATE,COUNTY,fips,AREA_SQMI,E_TOTPOP,E_HU,E_HH,E_POV,E_UNEMP,E_PCI,E_NOHSDP,E_AGE65,E_AGE17,E_DISABL,E_SNGPNT,E_MINRTY,E_LIMENG,E_MUNIT,E_MOBILE,E_CROWD,E_NOVEH,E_GROUPQ,name,date,lat,long,confirmed,deaths,confirmed_diff,deaths_diff,last_update,state,state_abbr
0,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/22/20,32.539527,-86.644082,0,0,0,0,3/22/20 23:45,Alabama,AL
1,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/23/20,32.539527,-86.644082,0,0,0,0,3/23/20 23:19,Alabama,AL
2,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/24/20,32.539527,-86.644082,1,0,1,0,3/24/20 23:37,Alabama,AL
3,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/25/20,32.539527,-86.644082,4,0,3,0,3/25/20 23:33,Alabama,AL
4,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/26/20,32.539527,-86.644082,6,0,2,0,3/26/20 23:48,Alabama,AL


In [11]:
merged.dtypes

ST                  int64
STATE              object
COUNTY             object
fips                int64
AREA_SQMI         float64
E_TOTPOP            int64
E_HU                int64
E_HH                int64
E_POV               int64
E_UNEMP             int64
E_PCI               int64
E_NOHSDP            int64
E_AGE65             int64
E_AGE17             int64
E_DISABL            int64
E_SNGPNT            int64
E_MINRTY            int64
E_LIMENG            int64
E_MUNIT             int64
E_MOBILE            int64
E_CROWD             int64
E_NOVEH             int64
E_GROUPQ            int64
name               object
date               object
lat               float64
long              float64
confirmed           int64
deaths              int64
confirmed_diff      int64
deaths_diff         int64
last_update        object
state              object
state_abbr         object
dtype: object

In [12]:
merged.shape

(158646, 34)

In [13]:
# Create a dictionary with state lockdown dates and map to the merged DF. Note that 5 states (Arkansas, Iowa, Nebraska, North Dakota, and Wyoning) never locked down.
# NaN was entered for these states.
def set_value(row_number, assigned_value):
    return assigned_value[row_number]

In [14]:
ld_dictionary = {'ARKANSAS': 'NaN', 'IOWA': 'NaN', 'NEBRASKA': 'NaN', 'NORTH DAKOTA': 'NaN', 'WYOMING': 'NaN', 'ALABAMA': '4/30/20', 'ALASKA': '3/27/20', 'ARIZONA': '3/30/20', 'CALIFORNIA': '3/19/20', 'COLORADO': '3/25/20', 'CONNECTICUT': '3/20/20', 'DELAWARE': '3/22/20', 'FLORIDA': '4/1/20', 'GEORGIA': '4/2/20', 'HAWAII': '3/23/20', 'IDAHO': '3/25/20', 'ILLINOIS': '3/20/20', 'INDIANA': '3/23/20', 'KANSAS': '3/28/20', 'KENTUCKY': '3/22/20', 'LOUISIANA': '3/22/20', 'MAINE': '3/31/20', 'MARYLAND': '3/30/20', 'MASSACHUSETTS': '3/23/20', 'MICHIGAN': '3/23/20', 'MINNESOTA': '3/25/20', 'MISSISSIPPI': '4/1/20', 'MISSOURI': '4/3/20', 'MONTANA': '3/26/20', 'NEVADA': '3/31/2020', 'NEW HAMPSHIRE': '3/26/20', 'NEW JERSEY': '3/21/20', 'NEW MEXICO': '3/23/20', 'NEW YORK': '3/20/20', 'NORTH CAROLINA': '3/27/20', 'OHIO': '3/22/20', 'OKLAHOMA': '4/1/20', 'OREGON': '3/23/20', 'PENNSYLVANIA': '4/1/20', 'RHODE ISLAND': '3/30/20', 'SOUTH CAROLINA': '4/6/20', 'SOUTH DAKOTA': '3/23/20', 'TENNESSEE': '3/30/20', 'TEXAS': '3/31/20', 'UTAH': '3/27/20', 'VERMONT': '3/24/20', 'VIRGINIA': '3/30/20', 'WASHINGTON': '3/23/20', 'WEST VIRGINIA': '3/23/20', 'WISCONSIN': '3/24/20'}

In [15]:
merged['ld_date'] = merged['STATE'].apply(set_value, args = (ld_dictionary, ))

In [23]:
merged.head()

Unnamed: 0,ST,STATE,COUNTY,fips,AREA_SQMI,E_TOTPOP,E_HU,E_HH,E_POV,E_UNEMP,E_PCI,E_NOHSDP,E_AGE65,E_AGE17,E_DISABL,E_SNGPNT,E_MINRTY,E_LIMENG,E_MUNIT,E_MOBILE,E_CROWD,E_NOVEH,E_GROUPQ,name,date,lat,long,confirmed,deaths,confirmed_diff,deaths_diff,last_update,state,state_abbr,ld_date
0,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/22/20,32.539527,-86.644082,0,0,0,0,3/22/20 23:45,Alabama,AL,4/30/20
1,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/23/20,32.539527,-86.644082,0,0,0,0,3/23/20 23:19,Alabama,AL,4/30/20
2,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/24/20,32.539527,-86.644082,1,0,1,0,3/24/20 23:37,Alabama,AL,4/30/20
3,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/25/20,32.539527,-86.644082,4,0,3,0,3/25/20 23:33,Alabama,AL,4/30/20
4,1,ALABAMA,Autauga,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,10465,1586,13788,426,886,4279,299,1191,546,Autauga,3/26/20,32.539527,-86.644082,6,0,2,0,3/26/20 23:48,Alabama,AL,4/30/20


In [26]:
merged.shape

(158646, 35)

In [None]:
# Drop rows with NaN values


In [None]:
# Convert ld_date to datetime
merged

In [22]:
# Calculate plus 30 lockdown column
merged['ld_date_plus'] = merged['ld_date'] + pd.DateOffset(days=30)
merged.head()

TypeError: can only concatenate str (not "relativedelta") to str

In [18]:
# Create a calculated column that reflects the date 30 days post lockdown.


In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
# Not able to get SGD to work
from keras.optimizers import SGD

Using TensorFlow backend.


In [20]:
# Model 1: Linear regression using confirmed (summed count) from SVI
X = combined[['AREA_SQMI','E_HU', 'E_HH', 'E_POV', 'E_UNEMP', 'E_PCI', 'E_NOHSDP', 'E_AGE65', 'E_AGE17', 'E_DISABL', 'E_SNGPNT', 'E_MINRTY', 'E_LIMENG', 'E_MUNIT', 'E_MOBILE', 'E_CROWD', 'E_NOVEH', 'E_GROUPQ']]

NameError: name 'combined' is not defined

In [None]:
Y = combined['confirmed'].values.reshape(-1,1)

In [None]:
# Model 1 - Normalized predictors
X = merged[predictors].values
Y = merged[target_column].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=40)
print(X_train.shape); print(X_test.shape); print(Y_train.shape); print(Y_test.shape)

In [None]:
# Create an empty sequential model
model = Sequential()

In [None]:
model.add(Dense(6, input_dim=19, activation='relu'))
model.add(Dense(1))
opt = SGD(lr=0.1, momentum=0.9)

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam',
              loss='mean_squared_logarithmic_error',
              metrics=['accuracy'])
model.fit(
    X_train,
    Y_train,
    epochs=60,
    shuffle=True,
    verbose=2
)

In [None]:
history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=60, verbose=0)

In [None]:
model_loss, model_accuracy = model.evaluate(
    X_train, Y_train, verbose=0)
print(
    f"Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
model.save("covid_model_trained.h5")

In [None]:
from tensorflow.keras.models import load_model
covid_model = load_model("covid_model_trained.h5")

In [None]:
model_loss, model_accuracy = covid_model.evaluate(
    X_test, Y_test, verbose=0)
print(
    f"Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
pred_train= model.predict(X_train)
print(np.sqrt(mean_squared_error(Y_train,pred_train)))
pred= model.predict(X_test)
print(np.sqrt(mean_squared_error(Y_test,pred))) 

In [None]:
from matplotlib import pyplot
pyplot.title('Loss / Mean Squared Logarithmic Error')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [None]:
# Model 2
# Assign X (predictors) and Y (criterion)
X = merged.drop("confirmed", axis=1)
Y = merged["confirmed"].values.reshape(-1, 1)
print(X.shape, Y.shape)

In [None]:
# Split the data into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=1)
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
Y_scaler = StandardScaler().fit(Y_train)
Y_train_scaled = Y_scaler.transform(Y_train)
Y_test_scaled = Y_scaler.transform(Y_test)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
Y_train.shape

In [None]:
Y_test.shape

In [None]:
# define model
model = Sequential()
model.add(Dense(6, input_dim=19, activation='relu'))
model.add(Dense(1))
opt = SGD(lr=0.01, momentum=0.9)

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['accuracy'])
model.fit(
    X_train_scaled,
    Y_train,
    epochs=60,
    shuffle=True,
    verbose=2
)

In [None]:
history = model.fit(X_train_scaled, Y_train, validation_data=(X_test_scaled, Y_test), epochs=60, verbose=0)

In [None]:
model_loss, model_accuracy = model.evaluate(
    X_train_scaled, Y_train, verbose=0)
print(
    f"Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
model.save("covid_model_trained.h5")

In [None]:
from tensorflow.keras.models import load_model
covid_model = load_model("covid_model_trained.h5")

In [None]:
model_loss, model_accuracy = covid_model.evaluate(
    X_test_scaled, Y_test, verbose=0)
print(
    f"Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
from matplotlib import pyplot
pyplot.title('Loss / Mean Squared Logarithmic Error')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [None]:
# Model 3
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=1)
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# define model
model = Sequential()
model.add(Dense(12, input_dim=19, activation='relu', kernel_initializer='he_uniform'))
model.add(Dense(units = 6, activation = 'relu'))
model.add(Dense(3, activation='linear'))

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam',
              loss='mean_squared_logarithmic_error',
              metrics=['accuracy'])
model.fit(
    X_train_scaled,
    Y_train,
    epochs=60,
    shuffle=True,
    verbose=2
)

In [None]:
history = model.fit(X_train_scaled, Y_train, validation_data=(X_test_scaled, Y_test), epochs=60, verbose=0)

In [None]:
model_loss, model_accuracy = model.evaluate(
    X_train_scaled, Y_train, verbose=0)
print(
    f"Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
model.save("covid_model_trained.h5")

In [None]:
from tensorflow.keras.models import load_model
covid_model = load_model("covid_model_trained.h5")

In [None]:
model_loss, model_accuracy = covid_model.evaluate(
    X_test_scaled, Y_test, verbose=0)
print(
    f"Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
from matplotlib import pyplot
pyplot.title('Loss / Mean Squared Logarithmic Error')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [None]:
# Model 4
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=1)
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# define model
model = Sequential()
model.add(Dense(38, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))
model.add(Dense(38, kernel_initializer='normal',activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))

In [None]:
# Compile the network :
model.compile(loss='mean_absolute_error', optimizer='RMSprop', metrics=['mean_absolute_error'])
model.summary()

In [None]:
model.fit(
    X_train_scaled,
    Y_train,
    epochs=100,
    shuffle=True,
    verbose=2
)

In [None]:
history = model.fit(X_train_scaled, Y_train, validation_data=(X_test_scaled, Y_test), epochs=100, verbose=0)

In [None]:
model_loss, model_accuracy = model.evaluate(
    X_train_scaled, Y_train, verbose=0)
print(
    f"Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
model.save("covid_model_trained.h5")

In [None]:
from tensorflow.keras.models import load_model
covid_model = load_model("covid_model_trained.h5")

In [None]:
model_loss, model_accuracy = covid_model.evaluate(
    X_test_scaled, Y_test, verbose=0)
print(
    f"Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
from matplotlib import pyplot
pyplot.title('Loss / Mean Squared Logarithmic Error')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()