In [66]:
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from flask import Flask, jsonify
from sqlalchemy.engine import url
import json
from sqlalchemy import extract
from sqlalchemy.engine import make_url
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import text
import psycopg2
from decimal import *
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA


In [2]:
# opening configuration file, saving into a variable, then establishing that variable as env which specifies the data as development
# environment credentials
with open('sql/config.json') as datafile:
    data = json.load(datafile)

env = data['dev']

In [3]:
# Instantiating the environment column values as the column names to hide sensitive data
db = env['db']
user = env['user']
password = env['pass']
port = env['port']
host = env['host']

In [4]:
# Connection string
engine = sqlalchemy.create_engine(
    f'postgresql://{user}:{password}@{host}:{port}/{db}')

In [5]:
# Reflecting an existing DB into a new model
Base = automap_base()

In [6]:
Base.prepare(autoload_with = engine)

In [7]:
# Seeing what tables there are
Base.classes.keys()

['races', 'cases', 'jhu_cre_cases_deaths', 'location']

In [8]:
# Saving the tables as references here if u want 
location = Base.classes.location

In [9]:
# Another table reference 
jhu_data = Base.classes.jhu_cre_cases_deaths

In [10]:
# Session object is the handler to the database, estb convo w db.
# Sessionmaker class creates a 'top level' session configuration that then can be used throughout the application without
# the need to repeat config arguments.'- Credit to rfkortekaas on Stack for explaining.
Session = sessionmaker(bind=engine)

session = Session()

In [11]:
# Test query to make sure database connection works
sql = session.query(jhu_data)

In [12]:
# An example of using sql commands:
sql = '''
    SELECT * FROM jhu_cre_cases_deaths;
'''

with engine.connect() as conn:
    query = conn.execute(text(sql))
df2 = pd.DataFrame(query.fetchall())

In [13]:
df2.head()

Unnamed: 0,fips,cases,deaths,lat,long,state,county,popuni,total_population,zero_rf,...,single_fathers_pop,plus_family_homes,highschool_grad,multilingual_5yrs_plus,full_time_workers,no_health_insur,internet_homes,no_vehicle,homeowner_vacancy,rental_vacancy
0,1001,19732,230,32.539527,-86.644082,Alabama,Autauga County,55688,55380,20573,...,723,111,49283,779,29291,3953,44884,2450,779,1726
1,1003,69641,724,30.727750,-87.722071,Alabama,Baldwin County,221898,212830,78622,...,2218,887,201483,3994,112058,19748,181512,7322,7322,96747
2,1005,7451,103,31.868263,-85.387129,Alabama,Barbour County,22023,25361,5024,...,220,132,16120,572,8170,2488,13323,2246,836,1629
3,1007,8067,109,32.996421,-87.125115,Alabama,Bibb County,20393,22493,6280,...,346,163,16130,265,8096,2182,14111,1264,305,1142
4,1009,18616,261,33.982109,-86.567906,Alabama,Blount County,57697,57681,18189,...,1038,115,46446,1961,27463,6231,42118,2365,403,4846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,12484,139,41.659439,-108.882788,Wyoming,Sweetwater County,41888,43521,16977,...,502,209,38872,1298,21739,4733,35185,712,879,12817
3138,56039,12123,16,43.935225,-110.589080,Wyoming,Teton County,23390,23280,7250,...,140,771,22267,1777,13028,2970,20372,561,467,5730
3139,56041,6378,43,41.287818,-110.547578,Wyoming,Uinta County,20183,20479,7744,...,322,141,18709,322,10091,2260,18063,686,100,2401
3140,56043,2749,50,43.904516,-107.680187,Wyoming,Washakie County,7738,8027,2601,...,108,77,6948,108,3869,1160,6051,379,15,1903


# Beginning ML portion of project

In [14]:
# Creating a list of columns so I can easily drop them during feature selection
df2.columns

Index(['fips', 'cases', 'deaths', 'lat', 'long', 'state', 'county', 'popuni',
       'total_population', 'zero_rf', 'one_two_rf', 'three_rf',
       'housing_units', 'hispanic_pop', 'white_pop', 'black_pop', 'native_pop',
       'asian_pop', 'pacific_islander_pop', 'other_race_pop',
       'bi_tri_racial_pop', 'male_pop', 'female_pop', 'veteran',
       'gini_ind_income', 'rural_pop', 'median_age_pop', 'elder_pop',
       'disability_pop', 'below_poverty_level', 'single_mothers_pop',
       'single_fathers_pop', 'plus_family_homes', 'highschool_grad',
       'multilingual_5yrs_plus', 'full_time_workers', 'no_health_insur',
       'internet_homes', 'no_vehicle', 'homeowner_vacancy', 'rental_vacancy'],
      dtype='object')

In [15]:
# Checking size/ shape of df
df2.shape

(3142, 41)

In [16]:
df2.skew()

fips                      -0.079662
cases                     16.046049
deaths                    14.937677
lat                        0.550291
long                      -1.425002
popuni                    13.465008
total_population          13.681844
zero_rf                    9.778660
one_two_rf                15.992050
three_rf                  13.883655
housing_units             11.936929
hispanic_pop              22.641934
white_pop                  7.353834
black_pop                 10.467775
native_pop                13.429057
asian_pop                 19.966019
pacific_islander_pop      33.364578
other_race_pop            17.496048
bi_tri_racial_pop         11.791882
male_pop                  13.556421
female_pop                13.375153
veteran                    8.146015
gini_ind_income           14.312186
rural_pop                  1.928548
median_age_pop            13.071178
elder_pop                 12.119054
disability_pop            11.662894
below_poverty_level       14

In [17]:
# Rechecking for null values
df2.isnull().sum()

fips                      0
cases                     0
deaths                    0
lat                       0
long                      0
state                     0
county                    0
popuni                    0
total_population          0
zero_rf                   0
one_two_rf                0
three_rf                  0
housing_units             0
hispanic_pop              0
white_pop                 0
black_pop                 0
native_pop                0
asian_pop                 0
pacific_islander_pop      0
other_race_pop            0
bi_tri_racial_pop         0
male_pop                  0
female_pop                0
veteran                   0
gini_ind_income           0
rural_pop                 0
median_age_pop            0
elder_pop                 0
disability_pop            0
below_poverty_level       0
single_mothers_pop        0
single_fathers_pop        0
plus_family_homes         0
highschool_grad           0
multilingual_5yrs_plus    0
full_time_workers   

In [80]:
# Checking for datatypes, confirming size of columns and that all column values are accounted for
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   fips                    3142 non-null   int64  
 1   cases                   3142 non-null   int64  
 2   deaths                  3142 non-null   int64  
 3   lat                     3142 non-null   float64
 4   long                    3142 non-null   float64
 5   state                   3142 non-null   object 
 6   county                  3142 non-null   object 
 7   popuni                  3142 non-null   int64  
 8   total_population        3142 non-null   int64  
 9   zero_rf                 3142 non-null   int64  
 10  one_two_rf              3142 non-null   int64  
 11  three_rf                3142 non-null   int64  
 12  housing_units           3142 non-null   int64  
 13  hispanic_pop            3142 non-null   int64  
 14  white_pop               3142 non-null   

In [78]:
# Median. If a variable has a mean greater than the median, it has right skew. If it has a mean less than the median, left skew.
df2.median().head()

fips      29176.000000
cases      7899.000000
deaths      110.000000
lat          38.378924
long        -90.400244
dtype: float64

In [20]:
# Dropping lat and long columns (not necessary for this portion of the project)
dfd = df2.drop(['lat','long', 'fips'], axis = 1)

In [21]:
# Creating empty column in preparation of for loop
dfd['death_cat'] = ""
dfd.head()

Unnamed: 0,cases,deaths,state,county,popuni,total_population,zero_rf,one_two_rf,three_rf,housing_units,...,plus_family_homes,highschool_grad,multilingual_5yrs_plus,full_time_workers,no_health_insur,internet_homes,no_vehicle,homeowner_vacancy,rental_vacancy,death_cat
0,19732,230,Alabama,Autauga County,55688,55380,20573,22750,12365,23493,...,111,49283,779,29291,3953,44884,2450,779,1726,
1,69641,724,Alabama,Baldwin County,221898,212830,78622,90552,52724,114164,...,887,201483,3994,112058,19748,181512,7322,7322,96747,
2,7451,103,Alabama,Barbour County,22023,25361,5024,9171,7828,12013,...,132,16120,572,8170,2488,13323,2246,836,1629,
3,8067,109,Alabama,Bibb County,20393,22493,6280,8986,5127,9185,...,163,16130,265,8096,2182,14111,1264,305,1142,
4,18616,261,Alabama,Blount County,57697,57681,18189,23950,15558,24323,...,115,46446,1961,27463,6231,42118,2365,403,4846,


In [96]:
## using lower quartiles, mean, upper quartile 47, 110, 261 ##

death_catlow = 47
death_catmed = 110
death_catintermed = 261
death_cathigh = 261

count = 0

for (column, columnData) in dfd.iterrows():
    deaths = dfd['deaths'].values[count]
    dc = dfd['death_cat']
    if deaths <= death_catlow:
        dc.values[count] = 'low'
    elif deaths > death_catlow and deaths <= death_catmed:
        dc.values[count] = 'med'
    elif deaths > death_catmed and deaths <= death_catintermed:
        dc.values[count] = 'intermed'
    else:
        dc.values[count] = 'high'

    count = count + 1

In [97]:
# addressing right skew values with log transformation. cannot perform on 'deaths' column as you return a NaN.
df_right_skew_adjusted = df2[['popuni', 'total_population',
                              'one_two_rf', 'three_rf']].apply(lambda x: np.log(x))

In [98]:
# Adjusting the right skewed data display so that it isn't a float
df_right_skew_adjusted = df_right_skew_adjusted.astype(int).round()

In [99]:
# Checking skew
df_right_skew_adjusted.skew()

popuni              0.294058
total_population    0.271641
one_two_rf          0.278327
three_rf            0.228708
dtype: float64

In [100]:
df_left_skew = df2.drop(['fips', 'popuni', 'total_population', 'one_two_rf', 'three_rf', 'lat', 'long',
                         'gini_ind_income', 'homeowner_vacancy', 'deaths', 'state', 'county'], axis=1)

In [101]:
# For loop squaring off all left skewed data
df_left_skew_adjusted = pd.DataFrame()

for (columnName, columnData) in df_left_skew.iteritems():

    df_left_skew_adjusted[columnName] = ""
    current_col = df_left_skew[columnName]
    col_list = current_col.to_list()

    sqrlst = []
    for x in col_list:
        square = x**2
        sqrlst.append(square)

    df_left_skew_adjusted[columnName] = sqrlst

In [102]:
df_left_skew_adjusted.dtypes

cases                     int64
zero_rf                   int64
housing_units             int64
hispanic_pop              int64
white_pop                 int64
black_pop                 int64
native_pop                int64
asian_pop                 int64
pacific_islander_pop      int64
other_race_pop            int64
bi_tri_racial_pop         int64
male_pop                  int64
female_pop                int64
veteran                   int64
rural_pop                 int64
median_age_pop            int64
elder_pop                 int64
disability_pop            int64
below_poverty_level       int64
single_mothers_pop        int64
single_fathers_pop        int64
plus_family_homes         int64
highschool_grad           int64
multilingual_5yrs_plus    int64
full_time_workers         int64
no_health_insur           int64
internet_homes            int64
no_vehicle                int64
rental_vacancy            int64
dtype: object

In [103]:
df_left_skew_adjusted.isnull().count().head()

cases            3142
zero_rf          3142
housing_units    3142
hispanic_pop     3142
white_pop        3142
dtype: int64

In [104]:
# beginning work around for pandas and pd.concat not liking my indexes (when running original
# concat with just a combination of the LSA and RSA dfs, was returning NAN values, so,
# this is a workaround.)
copy_ls = df_left_skew_adjusted

In [105]:
# Making copy of the rs data
copy_rs = df_right_skew_adjusted

In [106]:
# Pulling down deaths from above- this could not be transformed along with the right skewed data
# Returned a NAN. 
dths = dfd['deaths']

In [107]:
dths.reset_index(drop=True, inplace=True)
copy_ls.reset_index(drop=True, inplace=True)
copy_rs.reset_index(drop=True, inplace=True)

In [108]:
result = pd.concat([copy_rs, copy_ls, dths], axis=1)

In [109]:
result.head()

Unnamed: 0,popuni,total_population,one_two_rf,three_rf,cases,zero_rf,housing_units,hispanic_pop,white_pop,black_pop,...,single_fathers_pop,plus_family_homes,highschool_grad,multilingual_5yrs_plus,full_time_workers,no_health_insur,internet_homes,no_vehicle,rental_vacancy,deaths
0,10,10,10,9,389351824,423248329,551921049,2430481,1725820849,111936400,...,522729,12321,2428814089,606841,857962681,15626209,2014573456,6002500,2979076,230
1,12,12,11,10,4849868881,6181418884,13033418896,104182849,34002253609,416731396,...,4919524,786769,40595399289,15952036,12556995364,389983504,32946606144,53611684,9359982009,724
2,9,10,9,8,55517401,25240576,144312169,938961,101727396,108951844,...,48400,17424,259854400,327184,66748900,6190144,177502329,5044516,2653641,103
3,9,10,9,8,65076489,39438400,84364225,280900,230796864,20304036,...,119716,26569,260176900,70225,65545216,4761124,199120321,1597696,1304164,109
4,10,10,10,9,346555456,330839721,591608329,28783225,2513819044,748225,...,1077444,13225,2157230916,3845521,754216369,38825361,1773925924,5593225,23483716,261


In [110]:
result.isnull().count()

popuni                    3142
total_population          3142
one_two_rf                3142
three_rf                  3142
cases                     3142
zero_rf                   3142
housing_units             3142
hispanic_pop              3142
white_pop                 3142
black_pop                 3142
native_pop                3142
asian_pop                 3142
pacific_islander_pop      3142
other_race_pop            3142
bi_tri_racial_pop         3142
male_pop                  3142
female_pop                3142
veteran                   3142
rural_pop                 3142
median_age_pop            3142
elder_pop                 3142
disability_pop            3142
below_poverty_level       3142
single_mothers_pop        3142
single_fathers_pop        3142
plus_family_homes         3142
highschool_grad           3142
multilingual_5yrs_plus    3142
full_time_workers         3142
no_health_insur           3142
internet_homes            3142
no_vehicle                3142
rental_v

In [111]:
# Label encoding all categorical data
result['Zero_RF'] = LabelEncoder().fit_transform(result['zero_rf'])
result['One_Two_RF'] = LabelEncoder().fit_transform(result['one_two_rf'])
result['Three_RF'] = LabelEncoder().fit_transform(result['three_rf'])
result['State_Cat'] = LabelEncoder().fit_transform(dfd['state'])
result['County_Cat'] = LabelEncoder().fit_transform(dfd['county'])
result['Death_Cat'] = LabelEncoder().fit_transform(dfd['death_cat'])

In [112]:
result

Unnamed: 0,popuni,total_population,one_two_rf,three_rf,cases,zero_rf,housing_units,hispanic_pop,white_pop,black_pop,...,internet_homes,no_vehicle,rental_vacancy,deaths,Zero_RF,One_Two_RF,Three_RF,State_Cat,County_Cat,Death_Cat
0,10,10,10,9,389351824,423248329,551921049,2430481,1725820849,111936400,...,2014573456,6002500,2979076,230,2114,7,6,0,82,1
1,12,12,11,10,4849868881,6181418884,13033418896,104182849,34002253609,416731396,...,32946606144,53611684,9359982009,724,2642,8,7,0,89,0
2,9,10,9,8,55517401,25240576,144312169,938961,101727396,108951844,...,177502329,5044516,2653641,103,1039,6,5,0,100,3
3,9,10,9,8,65076489,39438400,84364225,280900,230796864,20304036,...,199120321,1597696,1304164,109,1224,6,5,0,149,3
4,10,10,10,9,346555456,330839721,591608329,28783225,2513819044,748225,...,1773925924,5593225,23483716,261,2038,7,6,0,164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,10,10,9,8,155850256,288218529,390892441,44355600,1111688964,211600,...,1237984225,506944,164275489,139,1992,6,5,50,1635,1
3138,10,10,9,8,146967129,52562500,191767104,12306064,361608256,78400,...,415018384,314721,32832900,16,1345,6,5,50,1660,2
3139,9,9,9,8,40678884,59969536,81739681,3370896,311875600,400,...,326271969,470596,5764801,43,1387,6,5,50,1713,2
3140,8,8,8,7,7557001,6765201,14899600,1205604,40157569,0,...,36614601,143641,3621409,50,576,5,4,50,1773,3


In [113]:
# Creating features
y = result['Death_Cat']

X = result.drop(['One_Two_RF','Three_RF', 'Death_Cat','total_population',
                 'single_fathers_pop', 'asian_pop','native_pop','County_Cat','State_Cat','Zero_RF'
                ], axis = 1)

In [114]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.20, train_size=.80,
                                                    random_state=42)

In [115]:
scaler = StandardScaler()

# Fit on the training set
scaler.fit(x_train)

In [116]:
# Apply to both the train set and the test set.
X_train = scaler.transform(x_train)

In [117]:
X_test = scaler.transform(x_test)

In [118]:
# Storing methods as variables for formatting
cr = classification_report
ar = accuracy_score
cm = confusion_matrix

In [119]:
# Using Unsup learning PCA method to ensure not overfitting
pca = PCA(.95)

In [120]:
pca.fit(X_train)

In [121]:
#Apply transformation on both train and test set
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [122]:
var = pca.explained_variance_ratio_
var.sum()

0.9641150518678108

In [123]:
RFC = RandomForestClassifier()
RFC.fit(X_train, y_train)
predRFC = RFC.predict(X_test)
print(f"the accuracy score is:{ar(y_test, predRFC)}")
print(f"confusion_matrix:\n{cm(y_test, predRFC)}")
print(f"classification report:\n{cr(y_test,predRFC)}")

the accuracy score is:0.9348171701112877
confusion_matrix:
[[143   5   0   0]
 [  9 149   0   5]
 [  0   1 156   6]
 [  0   7   8 140]]
classification report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       148
           1       0.92      0.91      0.92       163
           2       0.95      0.96      0.95       163
           3       0.93      0.90      0.92       155

    accuracy                           0.93       629
   macro avg       0.93      0.94      0.93       629
weighted avg       0.93      0.93      0.93       629



# For reference, this is the first iteration of the RFC before
# adding categories and before optimizing by reducing skew.
#### RFC's can handle both normally distributed data and data that has
#### an irregular distribution. By adjusting both skews and then choosing
#### variables that in intitial testing (used with parametric tests) showed 
#### a high magnitude of importance, we can produce this result with RFC
#### without overfitting, by using PCA.

In [50]:
RFC = RandomForestClassifier()
RFC.fit(X_train, y_train)
predRFC = RFC.predict(X_test)
print(f"the accuracy score is:{ar(y_test, predRFC)}")
print(f"confusion_matrix:\n{cm(y_test, predRFC)}")

the accuracy score is:0.016967126193001062
confusion_matrix:
[[4 0 1 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
