In [1]:
#import dependencies
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
import psycopg2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
#create engine and connect to AWS RDS
db_string = f"postgresql://postgres:{db_password}@capstone-db.cutxgn80t57o.us-west-1.rds.amazonaws.com"
engine = create_engine(db_string)
# read and check merged cases table
cases_df = pd.read_sql('cases_merged_full', con = engine)
cases_df

Unnamed: 0,FIPS,AREA_SQMI,E_TOTPOP,E_HU,E_HH,E_POV,E_UNEMP,E_PCI,E_NOHSDP,E_AGE65,...,Hopefulness,Income Per Capita,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,first_yr_cases
0,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,...,91.163142,26168.0,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,6589
1,1009,644.830460,57645,24222,20600,8220,909,22656,7861,10233,...,79.492703,21033.0,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,6444
2,1013,776.838201,20025,10026,6708,4640,567,20430,2141,3806,...,83.523765,19011.0,78.563680,76.109761,76.623924,69.058104,79.956648,67.920284,72.773953,2097
3,1015,605.867251,115098,53682,45033,20819,4628,24706,12620,19386,...,83.365608,22231.0,79.439032,79.955121,77.918741,54.063568,76.745724,67.456150,68.292794,14224
4,1017,596.560643,33826,16981,13516,5531,773,22827,4383,6409,...,85.371517,21532.0,76.995358,78.156771,75.891100,67.343775,79.128558,66.397785,69.554441,3488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053,48229,4570.523160,4098,1562,900,951,101,14190,1263,639,...,55.568966,14776.0,76.720396,79.603081,73.986415,70.917126,79.605796,75.878105,71.008448,512
3054,48131,1793.476183,11355,5592,3511,2751,482,17864,2386,2025,...,77.899678,19853.0,79.125428,78.895880,76.629575,60.576045,73.670302,64.571017,68.007770,1214
3055,48505,998.411980,14369,6388,4405,5609,621,17228,3226,1999,...,86.586509,16007.0,79.355639,79.572483,74.378252,77.443239,76.386871,74.001471,73.609838,1760
3056,48507,1297.406535,12131,4344,3509,4150,421,13350,2719,1665,...,88.785822,13393.0,78.392216,76.024682,75.848196,76.967659,77.303576,70.010162,71.121990,1844


In [3]:
#set index to FIPS
ccases_df = cases_df.set_index(cases_df['FIPS'])
cases_df= cases_df.drop(columns = ['FIPS'])

In [4]:
#create variable for case% for each counties population
cases_df['case_pct'] = cases_df['first_yr_cases']/cases_df['E_TOTPOP']*100
cases_df['case_pct'].head()

0    11.936594
1    11.178767
2    10.471910
3    12.358164
4    10.311595
Name: case_pct, dtype: float64

In [5]:
cases_df['case_pct'].describe()

count    3058.000000
mean        9.426600
std         3.045809
min         0.000000
25%         7.713422
50%         9.466675
75%        11.176131
max        38.010657
Name: case_pct, dtype: float64

In [6]:
# bin and cut the case_pct column into 2 classifications
q = cases_df['case_pct'].quantile(.8)
bins = [0, q , 40]
labels = ['low','high']
cases_df['case_class'] = pd.cut(cases_df['case_pct'], bins, labels = labels)
cases_df['case_class']

0       high
1        low
2        low
3       high
4        low
        ... 
3053    high
3054     low
3055    high
3056    high
3057     low
Name: case_class, Length: 3058, dtype: category
Categories (2, object): ['low' < 'high']

In [7]:
cases_df['case_class'].value_counts()

low     2422
high     612
Name: case_class, dtype: int64

In [8]:
#drop unneeded columns
cases_df = cases_df.drop('case_pct', axis =1)


In [9]:
#turn case % classifications into binary 
cases_df = pd.get_dummies(cases_df, columns = ['case_class'])
cases_df

Unnamed: 0,AREA_SQMI,E_TOTPOP,E_HU,E_HH,E_POV,E_UNEMP,E_PCI,E_NOHSDP,E_AGE65,E_AGE17,...,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,first_yr_cases,case_class_low,case_class_high
0,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,...,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,6589,0,1
1,644.830460,57645,24222,20600,8220,909,22656,7861,10233,13468,...,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,6444,1,0
2,776.838201,20025,10026,6708,4640,567,20430,2141,3806,4566,...,78.563680,76.109761,76.623924,69.058104,79.956648,67.920284,72.773953,2097,1,0
3,605.867251,115098,53682,45033,20819,4628,24706,12620,19386,25196,...,79.439032,79.955121,77.918741,54.063568,76.745724,67.456150,68.292794,14224,0,1
4,596.560643,33826,16981,13516,5531,773,22827,4383,6409,7006,...,76.995358,78.156771,75.891100,67.343775,79.128558,66.397785,69.554441,3488,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053,4570.523160,4098,1562,900,951,101,14190,1263,639,980,...,76.720396,79.603081,73.986415,70.917126,79.605796,75.878105,71.008448,512,0,1
3054,1793.476183,11355,5592,3511,2751,482,17864,2386,2025,2962,...,79.125428,78.895880,76.629575,60.576045,73.670302,64.571017,68.007770,1214,1,0
3055,998.411980,14369,6388,4405,5609,621,17228,3226,1999,4835,...,79.355639,79.572483,74.378252,77.443239,76.386871,74.001471,73.609838,1760,0,1
3056,1297.406535,12131,4344,3509,4150,421,13350,2719,1665,3583,...,78.392216,76.024682,75.848196,76.967659,77.303576,70.010162,71.121990,1844,0,1


In [10]:
cases_df['case_class_high'].value_counts()

0    2446
1     612
Name: case_class_high, dtype: int64

In [11]:
#seperate targets and features
## should i drop the number of cases?
X = cases_df.drop(columns = ['case_class_high','case_class_low','first_yr_cases']).values
y=cases_df['case_class_high'].values

In [12]:
#split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [13]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
len(X_train[0])

103

In [15]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 100
hidden_nodes_layer2 = 80
# hidden_nodes_layer1 = 100
# hidden_nodes_layer2 = 80
# hidden_nodes_layer3 = 80

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               10400     
                                                                 
 dense_1 (Dense)             (None, 80)                8080      
                                                                 
 dense_2 (Dense)             (None, 1)                 81        
                                                                 
Total params: 18,561
Trainable params: 18,561
Non-trainable params: 0
_________________________________________________________________


2022-08-14 11:59:11.860514: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [17]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [18]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

24/24 - 0s - loss: 2.8825 - accuracy: 0.7725 - 261ms/epoch - 11ms/step
Loss: 2.882539987564087, Accuracy: 0.772549033164978


In [19]:
# check confusion matrix
y_pred=nn.predict(X_test_scaled)
con_mat = tf.math.confusion_matrix(labels=y_test, predictions=y_pred)
con_mat



<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[602,   6],
       [154,   3]], dtype=int32)>

In [20]:
# nn.save('saved_models/aug_10_reduced_features_model.h5')