In [114]:
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from flask import Flask, jsonify
from sqlalchemy.engine import url
import json
from sqlalchemy import extract
from sqlalchemy.engine import make_url
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import text
import psycopg2

In [115]:
with open('config_copy.json') as datafile:
    data = json.load(datafile)

env = data['dev']

In [116]:
db = env['db']
user = env['user']
password = env['pass']
port = env['port']
host = env['host']

In [117]:
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}')

In [118]:
Base = automap_base()

In [119]:
Base.prepare(autoload_with = engine)

In [120]:
Base.classes.keys()

['jhu_cre_cases_deaths', 'location']

In [121]:
location = Base.classes.location

In [122]:
jhu_data = Base.classes.jhu_cre_cases_deaths

In [123]:
Session = sessionmaker(bind = engine)

session = Session()

In [124]:
sql = session.query(jhu_data)

In [125]:
jhucredf = pd.DataFrame(engine.connect().execute(text(str(sql))))

In [126]:
jhucredf

Unnamed: 0,jhu_cre_cases_deaths_fips,jhu_cre_cases_deaths_cases,jhu_cre_cases_deaths_deaths,jhu_cre_cases_deaths_lat,jhu_cre_cases_deaths_long,jhu_cre_cases_deaths_state,jhu_cre_cases_deaths_county,jhu_cre_cases_deaths_popuni,jhu_cre_cases_deaths_total_population,jhu_cre_cases_deaths_zero_rf,...,jhu_cre_cases_deaths_single_fathers_pop,jhu_cre_cases_deaths_plus_family_homes,jhu_cre_cases_deaths_highschool_grad,jhu_cre_cases_deaths_multilingual_5yrs_plus,jhu_cre_cases_deaths_full_time_workers,jhu_cre_cases_deaths_no_health_insur,jhu_cre_cases_deaths_internet_homes,jhu_cre_cases_deaths_no_vehicle,jhu_cre_cases_deaths_homeowner_vacancy,jhu_cre_cases_deaths_rental_vacancy
0,1001,19732,230,32.539527,-86.644082,Alabama,Autauga County,55688,55380,20573,...,723,111,49283,779,29291,3953,44884,2450,779,1726
1,1003,69641,724,30.727750,-87.722071,Alabama,Baldwin County,221898,212830,78622,...,2218,887,201483,3994,112058,19748,181512,7322,7322,96747
2,1005,7451,103,31.868263,-85.387129,Alabama,Barbour County,22023,25361,5024,...,220,132,16120,572,8170,2488,13323,2246,836,1629
3,1007,8067,109,32.996421,-87.125115,Alabama,Bibb County,20393,22493,6280,...,346,163,16130,265,8096,2182,14111,1264,305,1142
4,1009,18616,261,33.982109,-86.567906,Alabama,Blount County,57697,57681,18189,...,1038,115,46446,1961,27463,6231,42118,2365,403,4846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,12484,139,41.659439,-108.882788,Wyoming,Sweetwater County,41888,43521,16977,...,502,209,38872,1298,21739,4733,35185,712,879,12817
3138,56039,12123,16,43.935225,-110.589080,Wyoming,Teton County,23390,23280,7250,...,140,771,22267,1777,13028,2970,20372,561,467,5730
3139,56041,6378,43,41.287818,-110.547578,Wyoming,Uinta County,20183,20479,7744,...,322,141,18709,322,10091,2260,18063,686,100,2401
3140,56043,2749,50,43.904516,-107.680187,Wyoming,Washakie County,7738,8027,2601,...,108,77,6948,108,3869,1160,6051,379,15,1903


In [127]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [128]:
jhucredf.columns

Index(['jhu_cre_cases_deaths_fips', 'jhu_cre_cases_deaths_cases',
       'jhu_cre_cases_deaths_deaths', 'jhu_cre_cases_deaths_lat',
       'jhu_cre_cases_deaths_long', 'jhu_cre_cases_deaths_state',
       'jhu_cre_cases_deaths_county', 'jhu_cre_cases_deaths_popuni',
       'jhu_cre_cases_deaths_total_population', 'jhu_cre_cases_deaths_zero_rf',
       'jhu_cre_cases_deaths_one_two_rf', 'jhu_cre_cases_deaths_three_rf',
       'jhu_cre_cases_deaths_housing_units',
       'jhu_cre_cases_deaths_hispanic_pop', 'jhu_cre_cases_deaths_white_pop',
       'jhu_cre_cases_deaths_black_pop', 'jhu_cre_cases_deaths_native_pop',
       'jhu_cre_cases_deaths_asian_pop',
       'jhu_cre_cases_deaths_pacific_islander_pop',
       'jhu_cre_cases_deaths_other_race_pop',
       'jhu_cre_cases_deaths_bi_tri_racial_pop',
       'jhu_cre_cases_deaths_male_pop', 'jhu_cre_cases_deaths_female_pop',
       'jhu_cre_cases_deaths_veteran', 'jhu_cre_cases_deaths_gini_ind_income',
       'jhu_cre_cases_deaths_rural_po

In [129]:
jhucredf.isnull().sum()

jhu_cre_cases_deaths_fips                      0
jhu_cre_cases_deaths_cases                     0
jhu_cre_cases_deaths_deaths                    0
jhu_cre_cases_deaths_lat                       0
jhu_cre_cases_deaths_long                      0
jhu_cre_cases_deaths_state                     0
jhu_cre_cases_deaths_county                    0
jhu_cre_cases_deaths_popuni                    0
jhu_cre_cases_deaths_total_population          0
jhu_cre_cases_deaths_zero_rf                   0
jhu_cre_cases_deaths_one_two_rf                0
jhu_cre_cases_deaths_three_rf                  0
jhu_cre_cases_deaths_housing_units             0
jhu_cre_cases_deaths_hispanic_pop              0
jhu_cre_cases_deaths_white_pop                 0
jhu_cre_cases_deaths_black_pop                 0
jhu_cre_cases_deaths_native_pop                0
jhu_cre_cases_deaths_asian_pop                 0
jhu_cre_cases_deaths_pacific_islander_pop      0
jhu_cre_cases_deaths_other_race_pop            0
jhu_cre_cases_deaths

In [130]:
jhucredf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 41 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   jhu_cre_cases_deaths_fips                    3142 non-null   int64  
 1   jhu_cre_cases_deaths_cases                   3142 non-null   int64  
 2   jhu_cre_cases_deaths_deaths                  3142 non-null   int64  
 3   jhu_cre_cases_deaths_lat                     3142 non-null   float64
 4   jhu_cre_cases_deaths_long                    3142 non-null   float64
 5   jhu_cre_cases_deaths_state                   3142 non-null   object 
 6   jhu_cre_cases_deaths_county                  3142 non-null   object 
 7   jhu_cre_cases_deaths_popuni                  3142 non-null   int64  
 8   jhu_cre_cases_deaths_total_population        3142 non-null   int64  
 9   jhu_cre_cases_deaths_zero_rf                 3142 non-null   int64  
 10  

In [131]:
jhu_cre = jhucredf.drop(["jhu_cre_cases_deaths_fips", "jhu_cre_cases_deaths_lat", "jhu_cre_cases_deaths_long", "jhu_cre_cases_deaths_state", "jhu_cre_cases_deaths_county" ], axis = 1)
jhu_cre.head()

Unnamed: 0,jhu_cre_cases_deaths_cases,jhu_cre_cases_deaths_deaths,jhu_cre_cases_deaths_popuni,jhu_cre_cases_deaths_total_population,jhu_cre_cases_deaths_zero_rf,jhu_cre_cases_deaths_one_two_rf,jhu_cre_cases_deaths_three_rf,jhu_cre_cases_deaths_housing_units,jhu_cre_cases_deaths_hispanic_pop,jhu_cre_cases_deaths_white_pop,...,jhu_cre_cases_deaths_single_fathers_pop,jhu_cre_cases_deaths_plus_family_homes,jhu_cre_cases_deaths_highschool_grad,jhu_cre_cases_deaths_multilingual_5yrs_plus,jhu_cre_cases_deaths_full_time_workers,jhu_cre_cases_deaths_no_health_insur,jhu_cre_cases_deaths_internet_homes,jhu_cre_cases_deaths_no_vehicle,jhu_cre_cases_deaths_homeowner_vacancy,jhu_cre_cases_deaths_rental_vacancy
0,19732,230,55688,55380,20573,22750,12365,23493,1559,41543,...,723,111,49283,779,29291,3953,44884,2450,779,1726
1,69641,724,221898,212830,78622,90552,52724,114164,10207,184397,...,2218,887,201483,3994,112058,19748,181512,7322,7322,96747
2,7451,103,22023,25361,5024,9171,7828,12013,969,10086,...,220,132,16120,572,8170,2488,13323,2246,836,1629
3,8067,109,20393,22493,6280,8986,5127,9185,530,15192,...,346,163,16130,265,8096,2182,14111,1264,305,1142
4,18616,261,57697,57681,18189,23950,15558,24323,5365,50138,...,1038,115,46446,1961,27463,6231,42118,2365,403,4846


In [132]:
# Creating empty column in preparation of for loop
jhu_cre['death_cat'] = ""
jhu_cre.head()

Unnamed: 0,jhu_cre_cases_deaths_cases,jhu_cre_cases_deaths_deaths,jhu_cre_cases_deaths_popuni,jhu_cre_cases_deaths_total_population,jhu_cre_cases_deaths_zero_rf,jhu_cre_cases_deaths_one_two_rf,jhu_cre_cases_deaths_three_rf,jhu_cre_cases_deaths_housing_units,jhu_cre_cases_deaths_hispanic_pop,jhu_cre_cases_deaths_white_pop,...,jhu_cre_cases_deaths_plus_family_homes,jhu_cre_cases_deaths_highschool_grad,jhu_cre_cases_deaths_multilingual_5yrs_plus,jhu_cre_cases_deaths_full_time_workers,jhu_cre_cases_deaths_no_health_insur,jhu_cre_cases_deaths_internet_homes,jhu_cre_cases_deaths_no_vehicle,jhu_cre_cases_deaths_homeowner_vacancy,jhu_cre_cases_deaths_rental_vacancy,death_cat
0,19732,230,55688,55380,20573,22750,12365,23493,1559,41543,...,111,49283,779,29291,3953,44884,2450,779,1726,
1,69641,724,221898,212830,78622,90552,52724,114164,10207,184397,...,887,201483,3994,112058,19748,181512,7322,7322,96747,
2,7451,103,22023,25361,5024,9171,7828,12013,969,10086,...,132,16120,572,8170,2488,13323,2246,836,1629,
3,8067,109,20393,22493,6280,8986,5127,9185,530,15192,...,163,16130,265,8096,2182,14111,1264,305,1142,
4,18616,261,57697,57681,18189,23950,15558,24323,5365,50138,...,115,46446,1961,27463,6231,42118,2365,403,4846,


In [133]:
jhu_cre.describe()

Unnamed: 0,jhu_cre_cases_deaths_cases,jhu_cre_cases_deaths_deaths,jhu_cre_cases_deaths_popuni,jhu_cre_cases_deaths_total_population,jhu_cre_cases_deaths_zero_rf,jhu_cre_cases_deaths_one_two_rf,jhu_cre_cases_deaths_three_rf,jhu_cre_cases_deaths_housing_units,jhu_cre_cases_deaths_hispanic_pop,jhu_cre_cases_deaths_white_pop,...,jhu_cre_cases_deaths_single_fathers_pop,jhu_cre_cases_deaths_plus_family_homes,jhu_cre_cases_deaths_highschool_grad,jhu_cre_cases_deaths_multilingual_5yrs_plus,jhu_cre_cases_deaths_full_time_workers,jhu_cre_cases_deaths_no_health_insur,jhu_cre_cases_deaths_internet_homes,jhu_cre_cases_deaths_no_vehicle,jhu_cre_cases_deaths_homeowner_vacancy,jhu_cre_cases_deaths_rental_vacancy
count,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,...,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0
mean,32138.84,349.12317,102866.7,103341.1,35555.82,45117.53,22193.37,43739.33,18596.88,62411.13,...,1312.223425,1155.366327,90462.02,8687.606,52432.76,9114.868555,85186.1,8662.721,-180860.7,6116.568428
std,110996.9,1124.50972,330322.4,331170.1,102907.0,158383.7,73778.44,127931.7,125763.9,142795.0,...,4140.653482,9782.395332,279780.6,60571.88,169148.7,36212.828378,280530.9,50298.05,10228350.0,18053.825046
min,0.0,0.0,86.0,66.0,20.0,39.0,27.0,66.0,0.0,19.0,...,0.0,0.0,77.0,0.0,62.0,0.0,57.0,0.0,-573333300.0,0.0
25%,3097.75,47.0,10709.25,10952.0,3045.5,4517.25,2867.0,5505.0,341.25,7806.0,...,119.0,32.25,9015.75,114.0,4905.25,937.25,7417.5,591.0,159.0,592.0
50%,7899.0,110.0,25195.5,25739.5,7858.5,10743.5,6488.5,12496.5,1049.5,19781.0,...,331.0,127.5,21460.0,419.5,12099.5,2296.5,18429.0,1514.0,450.5,1586.0
75%,21209.75,261.0,66950.0,67866.0,22611.5,28592.25,15406.25,31481.0,5027.25,52619.0,...,963.0,403.0,58649.75,1932.25,32759.25,5784.75,52421.25,3988.75,1211.5,4444.75
max,3691301.0,35250.0,9949102.0,10081570.0,2503900.0,5264628.0,2180574.0,3542800.0,4825314.0,2606664.0,...,119389.0,467607.0,7869739.0,2347988.0,4855161.0,955113.0,8387092.0,1421880.0,102188.0,398169.0


In [134]:
## using lower quartiles, mean, upper quartile 47, 110, 261 ##

death_catlow = 47
death_catmed = 110
death_catintermed = 261
death_cathigh = 369

count = 0


for (column, columnData) in jhu_cre.iterrows():
    deaths = jhu_cre['jhu_cre_cases_deaths_deaths'].values[count]
    dc = jhu_cre['death_cat']
    if deaths <= death_catlow:
        dc.values[count] = 'low'
    elif deaths > death_catlow and deaths <= death_catmed:
        dc.values[count] = 'med'
    elif deaths > death_catmed and deaths <= death_catintermed:
        dc.values[count] = 'intermed'
    else:
        dc.values[count] = 'high'
        

    count = count + 1

In [135]:
# Assigning numerical values and storing in another column

jhu_cre['death_Cat'] = LabelEncoder().fit_transform(jhu_cre['death_cat'])

In [136]:
jhu_cre.dtypes

jhu_cre_cases_deaths_cases                      int64
jhu_cre_cases_deaths_deaths                     int64
jhu_cre_cases_deaths_popuni                     int64
jhu_cre_cases_deaths_total_population           int64
jhu_cre_cases_deaths_zero_rf                    int64
jhu_cre_cases_deaths_one_two_rf                 int64
jhu_cre_cases_deaths_three_rf                   int64
jhu_cre_cases_deaths_housing_units              int64
jhu_cre_cases_deaths_hispanic_pop               int64
jhu_cre_cases_deaths_white_pop                  int64
jhu_cre_cases_deaths_black_pop                  int64
jhu_cre_cases_deaths_native_pop                 int64
jhu_cre_cases_deaths_asian_pop                  int64
jhu_cre_cases_deaths_pacific_islander_pop       int64
jhu_cre_cases_deaths_other_race_pop             int64
jhu_cre_cases_deaths_bi_tri_racial_pop          int64
jhu_cre_cases_deaths_male_pop                   int64
jhu_cre_cases_deaths_female_pop                 int64
jhu_cre_cases_deaths_veteran

In [155]:
y = jhu_cre["jhu_cre_cases_deaths_deaths"]
X = jhu_cre.drop(columns=["jhu_cre_cases_deaths_deaths", "death_cat", "jhu_cre_cases_deaths_homeowner_vacancy", "jhu_cre_cases_deaths_asian_pop"], axis=1)

In [156]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=50)

In [157]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [158]:
scaler.fit(X_train)

StandardScaler()

In [159]:
X_train = scaler.transform(X_train)

In [160]:
X_test = scaler.transform(X_test)

In [161]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)

In [162]:
pca.fit(X_train)

PCA(n_components=2)

In [163]:
#Apply transformation on both train and test set
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [164]:
var = pca.explained_variance_ratio_
var.sum()

0.8588569090241316

In [165]:
# Storing methods as variables for formatting
cr = classification_report
ar = accuracy_score
cm = confusion_matrix

In [166]:
from sklearn.linear_model import LogisticRegression

LRM = LogisticRegression()
LRM.fit(X_train,y_train)
predLRM = LRM.predict(X_test)

print(f"the accuracy score is:{ar(y_test,predLRM)}")

print(f"confusion matrix:\n{cm(y_test,predLRM)}")

the accuracy score is:0.01653944020356234
confusion matrix:
[[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]]


In [92]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=1000,
   random_state=1)

In [93]:
classifier.fit(X_train, y_train)

ValueError: could not convert string to float: 'high'

In [94]:
y_pred = classifier.predict(X_test)

In [95]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.013994910941475827
