In [1]:
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from flask import Flask, jsonify
from sqlalchemy.engine import url
import json
from sqlalchemy import extract
from sqlalchemy.engine import make_url
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import text
import psycopg2

In [2]:
# opening configuration file, saving into a variable, then establishing that variable as env which specifies the data as development
# environment credentials


with open('sql/config.json') as datafile:
    data = json.load(datafile)

env = data['dev']


In [3]:
# Instantiating the environment column values as the column names to hide sensitive data

db = env['db']
user = env['user']
password = env['pass']
port = env['port']
host = env['host']



In [4]:
# Connection string

engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}')


In [5]:
# Reflecting an existing DB into a new model
Base = automap_base()

In [6]:
Base.prepare(autoload_with = engine)

In [7]:
# Seeing what tables there are

Base.classes.keys()

['races', 'cases', 'jhu_cre_cases_deaths', 'location']

In [8]:
# Saving the tables as references here if u want 

location = Base.classes.location

In [9]:
# Another table reference 

jhu_data = Base.classes.jhu_cre_cases_deaths

In [10]:
# Session object is the handler to the database, estb convo w db.
# Sessionmaker class creates a 'top level' session configuration that then can be used throughout the application without
# the need to repeat config arguments.'- Credit to rfkortekaas on Stack for explaining.

Session = sessionmaker(bind = engine)

session = Session()

In [11]:
# Test query to make sure database connection works

sql = session.query(jhu_data)

In [12]:
## An example of using sql commands:
sql = '''
    SELECT * FROM jhu_cre_cases_deaths;
'''

with engine.connect() as conn: 
    query = conn.execute(text(sql))
df2 = pd.DataFrame(query.fetchall())

In [13]:
df2

Unnamed: 0,fips,cases,deaths,lat,long,state,county,popuni,total_population,zero_rf,...,single_fathers_pop,plus_family_homes,highschool_grad,multilingual_5yrs_plus,full_time_workers,no_health_insur,internet_homes,no_vehicle,homeowner_vacancy,rental_vacancy
0,1001,19732,230,32.539527,-86.644082,Alabama,Autauga County,55688,55380,20573,...,723,111,49283,779,29291,3953,44884,2450,779,1726
1,1003,69641,724,30.727750,-87.722071,Alabama,Baldwin County,221898,212830,78622,...,2218,887,201483,3994,112058,19748,181512,7322,7322,96747
2,1005,7451,103,31.868263,-85.387129,Alabama,Barbour County,22023,25361,5024,...,220,132,16120,572,8170,2488,13323,2246,836,1629
3,1007,8067,109,32.996421,-87.125115,Alabama,Bibb County,20393,22493,6280,...,346,163,16130,265,8096,2182,14111,1264,305,1142
4,1009,18616,261,33.982109,-86.567906,Alabama,Blount County,57697,57681,18189,...,1038,115,46446,1961,27463,6231,42118,2365,403,4846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,12484,139,41.659439,-108.882788,Wyoming,Sweetwater County,41888,43521,16977,...,502,209,38872,1298,21739,4733,35185,712,879,12817
3138,56039,12123,16,43.935225,-110.589080,Wyoming,Teton County,23390,23280,7250,...,140,771,22267,1777,13028,2970,20372,561,467,5730
3139,56041,6378,43,41.287818,-110.547578,Wyoming,Uinta County,20183,20479,7744,...,322,141,18709,322,10091,2260,18063,686,100,2401
3140,56043,2749,50,43.904516,-107.680187,Wyoming,Washakie County,7738,8027,2601,...,108,77,6948,108,3869,1160,6051,379,15,1903


# Beginning ML portion of project

In [14]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [15]:
# Creating a list of columns so I can easily drop them during feature selection

df2.columns

Index(['fips', 'cases', 'deaths', 'lat', 'long', 'state', 'county', 'popuni',
       'total_population', 'zero_rf', 'one_two_rf', 'three_rf',
       'housing_units', 'hispanic_pop', 'white_pop', 'black_pop', 'native_pop',
       'asian_pop', 'pacific_islander_pop', 'other_race_pop',
       'bi_tri_racial_pop', 'male_pop', 'female_pop', 'veteran',
       'gini_ind_income', 'rural_pop', 'median_age_pop', 'elder_pop',
       'disability_pop', 'below_poverty_level', 'single_mothers_pop',
       'single_fathers_pop', 'plus_family_homes', 'highschool_grad',
       'multilingual_5yrs_plus', 'full_time_workers', 'no_health_insur',
       'internet_homes', 'no_vehicle', 'homeowner_vacancy', 'rental_vacancy'],
      dtype='object')

In [16]:
# Checking size/ shape of df

df2.shape

(3142, 41)

# pushing deaths as target


In [17]:
# Rechecking for null values
df2.isnull().sum()

fips                      0
cases                     0
deaths                    0
lat                       0
long                      0
state                     0
county                    0
popuni                    0
total_population          0
zero_rf                   0
one_two_rf                0
three_rf                  0
housing_units             0
hispanic_pop              0
white_pop                 0
black_pop                 0
native_pop                0
asian_pop                 0
pacific_islander_pop      0
other_race_pop            0
bi_tri_racial_pop         0
male_pop                  0
female_pop                0
veteran                   0
gini_ind_income           0
rural_pop                 0
median_age_pop            0
elder_pop                 0
disability_pop            0
below_poverty_level       0
single_mothers_pop        0
single_fathers_pop        0
plus_family_homes         0
highschool_grad           0
multilingual_5yrs_plus    0
full_time_workers   

In [18]:
# Checking for datatypes, confirming size of columns and that all column values are accounted for
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   fips                    3142 non-null   int64  
 1   cases                   3142 non-null   int64  
 2   deaths                  3142 non-null   int64  
 3   lat                     3142 non-null   float64
 4   long                    3142 non-null   float64
 5   state                   3142 non-null   object 
 6   county                  3142 non-null   object 
 7   popuni                  3142 non-null   int64  
 8   total_population        3142 non-null   int64  
 9   zero_rf                 3142 non-null   int64  
 10  one_two_rf              3142 non-null   int64  
 11  three_rf                3142 non-null   int64  
 12  housing_units           3142 non-null   int64  
 13  hispanic_pop            3142 non-null   int64  
 14  white_pop               3142 non-null   

- no missing values in dataset
- only integer and object values remaining (will drop lat and long columns before running classifier)


In [19]:
# Dropping lat and long columns (not necessary for this portion of the project)

dfd = df2.drop(['lat','long', 'fips'], axis = 1)

In [20]:
# Checking that cols were dropped
dfd.head()

Unnamed: 0,cases,deaths,state,county,popuni,total_population,zero_rf,one_two_rf,three_rf,housing_units,...,single_fathers_pop,plus_family_homes,highschool_grad,multilingual_5yrs_plus,full_time_workers,no_health_insur,internet_homes,no_vehicle,homeowner_vacancy,rental_vacancy
0,19732,230,Alabama,Autauga County,55688,55380,20573,22750,12365,23493,...,723,111,49283,779,29291,3953,44884,2450,779,1726
1,69641,724,Alabama,Baldwin County,221898,212830,78622,90552,52724,114164,...,2218,887,201483,3994,112058,19748,181512,7322,7322,96747
2,7451,103,Alabama,Barbour County,22023,25361,5024,9171,7828,12013,...,220,132,16120,572,8170,2488,13323,2246,836,1629
3,8067,109,Alabama,Bibb County,20393,22493,6280,8986,5127,9185,...,346,163,16130,265,8096,2182,14111,1264,305,1142
4,18616,261,Alabama,Blount County,57697,57681,18189,23950,15558,24323,...,1038,115,46446,1961,27463,6231,42118,2365,403,4846


In [21]:
# Creating empty column in preparation of for loop
dfd['death_cat'] = ""
dfd.head()

Unnamed: 0,cases,deaths,state,county,popuni,total_population,zero_rf,one_two_rf,three_rf,housing_units,...,plus_family_homes,highschool_grad,multilingual_5yrs_plus,full_time_workers,no_health_insur,internet_homes,no_vehicle,homeowner_vacancy,rental_vacancy,death_cat
0,19732,230,Alabama,Autauga County,55688,55380,20573,22750,12365,23493,...,111,49283,779,29291,3953,44884,2450,779,1726,
1,69641,724,Alabama,Baldwin County,221898,212830,78622,90552,52724,114164,...,887,201483,3994,112058,19748,181512,7322,7322,96747,
2,7451,103,Alabama,Barbour County,22023,25361,5024,9171,7828,12013,...,132,16120,572,8170,2488,13323,2246,836,1629,
3,8067,109,Alabama,Bibb County,20393,22493,6280,8986,5127,9185,...,163,16130,265,8096,2182,14111,1264,305,1142,
4,18616,261,Alabama,Blount County,57697,57681,18189,23950,15558,24323,...,115,46446,1961,27463,6231,42118,2365,403,4846,


In [22]:
dfd.deaths.describe()


count     3142.00000
mean       349.12317
std       1124.50972
min          0.00000
25%         47.00000
50%        110.00000
75%        261.00000
max      35250.00000
Name: deaths, dtype: float64

In [23]:
## using lower quartiles, mean, upper quartile 47, 110, 261 ##

death_catlow = 47
death_catmed = 110
death_catintermed = 261
death_cathigh = 369

count = 0


for (column, columnData) in dfd.iterrows():
    deaths = dfd['deaths'].values[count]
    dc = dfd['death_cat']
    if deaths <= death_catlow:
        dc.values[count] = 'low'
    elif deaths > death_catlow and deaths <= death_catmed:
        dc.values[count] = 'med'
    elif deaths > death_catmed and deaths <= death_catintermed:
        dc.values[count] = 'intermed'
    else:
        dc.values[count] = 'high'
        

    count = count + 1


In [24]:
# Assigning numerical values and storing in another column

dfd['State_Cat'] = LabelEncoder().fit_transform(dfd['state'])
dfd['County_Cat'] = LabelEncoder().fit_transform(dfd['county'])
dfd['Death_Cat'] = LabelEncoder().fit_transform(dfd['death_cat'])

In [25]:
# Dropping original obj value columns, and additional pop column not necessary for the 

cdrop = dfd[['state','county','death_cat', 'popuni']]
cdrop

dfdd = dfd.drop(cdrop, axis = 1)

In [26]:
# Dropping original str value columns and additional population column 

cdrop = dfd[['state','county','death_cat', 'popuni']]
cdrop

dfdd = dfd.drop(cdrop, axis = 1)

In [27]:
# Confirming everything is an integer now

dfdd.dtypes

cases                     int64
deaths                    int64
total_population          int64
zero_rf                   int64
one_two_rf                int64
three_rf                  int64
housing_units             int64
hispanic_pop              int64
white_pop                 int64
black_pop                 int64
native_pop                int64
asian_pop                 int64
pacific_islander_pop      int64
other_race_pop            int64
bi_tri_racial_pop         int64
male_pop                  int64
female_pop                int64
veteran                   int64
gini_ind_income           int64
rural_pop                 int64
median_age_pop            int64
elder_pop                 int64
disability_pop            int64
below_poverty_level       int64
single_mothers_pop        int64
single_fathers_pop        int64
plus_family_homes         int64
highschool_grad           int64
multilingual_5yrs_plus    int64
full_time_workers         int64
no_health_insur           int64
internet

In [28]:
dfdd.describe()

Unnamed: 0,cases,deaths,total_population,zero_rf,one_two_rf,three_rf,housing_units,hispanic_pop,white_pop,black_pop,...,multilingual_5yrs_plus,full_time_workers,no_health_insur,internet_homes,no_vehicle,homeowner_vacancy,rental_vacancy,State_Cat,County_Cat,Death_Cat
count,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,...,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0
mean,32138.84,349.12317,103341.1,35555.82,45117.53,22193.37,43739.33,18596.88,62411.13,12606.47,...,8687.606,52432.76,9114.868555,85186.1,8662.721,-180860.7,6116.568428,26.237428,938.991407,1.50191
std,110996.9,1124.50972,331170.1,102907.0,158383.7,73778.44,127931.7,125763.9,142795.0,53758.23,...,60571.88,169148.7,36212.828378,280530.9,50298.05,10228350.0,18053.825046,14.270066,526.566116,1.117071
min,0.0,0.0,66.0,20.0,39.0,27.0,66.0,0.0,19.0,0.0,...,0.0,62.0,0.0,57.0,0.0,-573333300.0,0.0,0.0,0.0,0.0
25%,3097.75,47.0,10952.0,3045.5,4517.25,2867.0,5505.0,341.25,7806.0,106.0,...,114.0,4905.25,937.25,7417.5,591.0,159.0,592.0,14.0,490.25,1.0
50%,7899.0,110.0,25739.5,7858.5,10743.5,6488.5,12496.5,1049.5,19781.0,777.0,...,419.5,12099.5,2296.5,18429.0,1514.0,450.5,1586.0,25.0,939.5,2.0
75%,21209.75,261.0,67866.0,22611.5,28592.25,15406.25,31481.0,5027.25,52619.0,5303.5,...,1932.25,32759.25,5784.75,52421.25,3988.75,1211.5,4444.75,40.0,1377.0,2.0
max,3691301.0,35250.0,10081570.0,2503900.0,5264628.0,2180574.0,3542800.0,4825314.0,2606664.0,1180274.0,...,2347988.0,4855161.0,955113.0,8387092.0,1421880.0,102188.0,398169.0,50.0,1876.0,3.0


In [29]:
dfdd

Unnamed: 0,cases,deaths,total_population,zero_rf,one_two_rf,three_rf,housing_units,hispanic_pop,white_pop,black_pop,...,multilingual_5yrs_plus,full_time_workers,no_health_insur,internet_homes,no_vehicle,homeowner_vacancy,rental_vacancy,State_Cat,County_Cat,Death_Cat
0,19732,230,55380,20573,22750,12365,23493,1559,41543,10580,...,779,29291,3953,44884,2450,779,1726,0,82,1
1,69641,724,212830,78622,90552,52724,114164,10207,184397,20414,...,3994,112058,19748,181512,7322,7322,96747,0,89,0
2,7451,103,25361,5024,9171,7828,12013,969,10086,10438,...,572,8170,2488,13323,2246,836,1629,0,100,3
3,8067,109,22493,6280,8986,5127,9185,530,15192,4506,...,265,8096,2182,14111,1264,305,1142,0,149,3
4,18616,261,57681,18189,23950,15558,24323,5365,50138,865,...,1961,27463,6231,42118,2365,403,4846,0,164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,12484,139,43521,16977,17781,7130,19771,6660,33342,460,...,1298,21739,4733,35185,712,879,12817,50,1635,1
3138,12123,16,23280,7250,11567,4573,13848,3508,19016,280,...,1777,13028,2970,20372,561,467,5730,50,1660,2
3139,6378,43,20479,7744,9346,3093,9041,1836,17660,20,...,322,10091,2260,18063,686,100,2401,50,1713,2
3140,2749,50,8027,2601,3215,1922,3860,1098,6337,0,...,108,3869,1160,6051,379,15,1903,50,1773,3


In [30]:
## Dropping homeowner vacancy as it has a suspicious outlier
dfdd = dfdd.drop(['homeowner_vacancy'], axis = 1)


## Outcomes

- Hispanic population has 88% correlation with deaths, 80% with zero risk factors, 92% with one to two risk factors and roughly 91% with three risk factors.

- White population has 85% correlation with deaths, a 97% correlation with zero risk factors, 88% with one to two risk factors, and 86% with three risk factors.

- Black population has a 77% correlation with deaths, a 76% correlation with zero risk factors, 75% with one to two risk factors, and an 83% with three risk factors.

- Asian population has a 76% correlation with deaths, a 78% percent correlation with zero risk factors, a 85% correlation with one to two risk factors, and a 80% correlation with three risk factors.

- The Bi and Tri racial populations have a 80% correlation with deaths, an 88% correlation with zero risk factors, an 89% correlation with one to two risk factors, and 83% correlation with three risk factors.

- Other Race population has a 80% correlation with the target, 73% correlation with zero risk factors, 77% with one to two risk factors and and and 78% with three risk factors.

- The native population has a 36% correlation with the target, 34% correlation with zero risk factors, 34% with one to two risk factors, and 33% with three risk factors.

- Pacific Islander population has a 26% correlation with the target, 34% correlation with zero risk factors, 36% with one to two risk factors, and 30% with three risk factors.

- **Zero Risk factors as a whole, has a 90% correlation with the target, One to Two risk factors has a 95% correlation with the target, and three risk factors has a 97% correlation with the target. 


In [31]:
from sklearn.decomposition import PCA
# normalizing data

In [32]:
# Creating features

X = dfdd.drop(['total_population','gini_ind_income','State_Cat','County_Cat', 'plus_family_homes','rural_pop',
              'rental_vacancy','Death_Cat'], axis = 1)
# X = dfdd.drop(['Death_Cat','gini_ind_income'], axis = 1)

y = dfdd['Death_Cat']

In [33]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size= .30, train_size = .70, random_state=49)

In [34]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit on the training set
scaler.fit(x_train)

In [35]:
# Apply to both the train set and the test set.
X_train = scaler.transform(x_train)

In [36]:
X_test = scaler.transform(x_test)

In [37]:
# Apply PCA

In [48]:
pca = PCA(n_components = 2)

In [49]:
# Fit on the train set only
pca.fit(X_train)

In [50]:
#Apply transformation on both train and test set
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [51]:
var = pca.explained_variance_ratio_
var.sum()

0.892884671701869

In [52]:
# Storing methods as variables for formatting

cr = classification_report
ar = accuracy_score
cm = confusion_matrix

In [53]:
from sklearn.linear_model import LogisticRegression

LRM = LogisticRegression()
LRM.fit(X_train,y_train)
predLRM = LRM.predict(X_test)

print(f"the accuracy score is:{ar(y_test,predLRM)}")

print(f"confusion matrix:\n{cm(y_test,predLRM)}")



the accuracy score is:0.7571580063626723
confusion matrix:
[[184  43   0   0]
 [ 14 166   1  54]
 [  2   4 218  31]
 [  1  28  51 146]]


In [54]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier()
RFC.fit(X_train, y_train)
predRFC = RFC.predict(X_test)


print(f"the accuracy score is:{ar(y_test, predRFC)}")

print(f"confusion_matrix:\n{cm(y_test, predRFC)}")



the accuracy score is:0.7751855779427359
confusion_matrix:
[[203  23   0   1]
 [ 17 181   1  36]
 [  2   5 194  54]
 [  3  44  26 153]]


In [55]:
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
DTpredict = DT.predict(X_test)

print(f"the accuracy score is:{ar(y_test, DTpredict)}")

print(f"confusion matrix:\n{cm(y_test, DTpredict)}")



the accuracy score is:0.7232237539766702
confusion matrix:
[[192  30   2   3]
 [ 25 156   3  51]
 [  2   7 193  53]
 [  3  51  31 141]]


In [56]:
from sklearn.neighbors import KNeighborsClassifier

neighborC = KNeighborsClassifier()
neighborC.fit(X_train, y_train)
predneighbx = neighborC.predict(X_test)

print(f"the accuracy score is:{ar(y_test, predneighbx)}")

print(f"confusion matrix:\n{cm(y_test, predneighbx)}")


the accuracy score is:0.7751855779427359
confusion matrix:
[[204  23   0   0]
 [ 21 184   0  30]
 [  2   7 192  54]
 [  5  38  32 151]]


In [57]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lm = LinearRegression()
lm.fit(X_train, y_train)
predictlm = lm.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, predictlm)
r2 = r2_score(y_test, predictlm)
print(f"mean squared error (MSE): {mse}")

print(f"R-squared (R2): {r2}")
lm.score(X_test, y_test)












mean squared error (MSE): 1.060813458618592
R-squared (R2): 0.12378872405194485


0.12378872405194485