In [47]:
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from flask import Flask, jsonify
from sqlalchemy.engine import url
import json
from sqlalchemy import extract
from sqlalchemy.engine import make_url
import psycopg2
from sqlalchemy.sql import text
import os
data = os.path.join("JHU_Cases_Deaths_with_CRE_Data_2_0.csv")

df = pd.read_csv(data)

df.columns

Index(['fips', 'cases', 'deaths', 'lat', 'long', 'state', 'county', 'popuni',
       'total_population', 'zero_rf', 'one_two_rf', 'three_rf',
       'housing_units', 'hispanic_pop', 'white_pop', 'black_pop', 'native_pop',
       'asian_pop', 'pacific_islander_pop', 'other_race_pop',
       'bi_tri_racial_pop', 'male_pop', 'female_pop', 'veteran',
       'gini_ind_income', 'rural_pop', 'median_age_pop', 'elder_pop',
       'disability_pop', 'below_poverty_level', 'single_mothers_pop',
       'single_fathers_pop', 'plus_family_homes', 'highschool_grad',
       'multilingual_5yrs_plus', 'full_time_workers', 'no_health_insur',
       'internet_homes', 'no_vehicle', 'homeowner_vacancy', 'rental_vacancy'],
      dtype='object')

In [48]:
# Define the death thresholds
death_catlow = 47
death_catmed = 110
death_catintermed = 261
death_cathigh = 369


# Iterate over each row and assign a category based on the 'deaths' column
for (index, row) in df.iterrows():
    deaths = row['deaths']
    if deaths <= death_catlow:
        df.at[index, 'death_cat'] = 'low'
    elif deaths <= death_catmed:
        df.at[index, 'death_cat'] = 'med'
    elif deaths <= death_catintermed:
        df.at[index, 'death_cat'] = 'intermed'
    else:
        df.at[index, 'death_cat'] = 'high'

In [49]:
df.drop(columns=(["lat","long","state","county","fips",'popuni','total_population','cases', 'deaths']), inplace= True)

In [50]:
df.columns

Index(['zero_rf', 'one_two_rf', 'three_rf', 'housing_units', 'hispanic_pop',
       'white_pop', 'black_pop', 'native_pop', 'asian_pop',
       'pacific_islander_pop', 'other_race_pop', 'bi_tri_racial_pop',
       'male_pop', 'female_pop', 'veteran', 'gini_ind_income', 'rural_pop',
       'median_age_pop', 'elder_pop', 'disability_pop', 'below_poverty_level',
       'single_mothers_pop', 'single_fathers_pop', 'plus_family_homes',
       'highschool_grad', 'multilingual_5yrs_plus', 'full_time_workers',
       'no_health_insur', 'internet_homes', 'no_vehicle', 'homeowner_vacancy',
       'rental_vacancy', 'death_cat'],
      dtype='object')

In [51]:
import pandas as pd
import sklearn
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
#PCA (Principal Component Analysis)
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [52]:
from sklearn.preprocessing import LabelEncoder
df['death_cat'] = LabelEncoder().fit_transform(df['death_cat'])

In [53]:
df.head()

Unnamed: 0,zero_rf,one_two_rf,three_rf,housing_units,hispanic_pop,white_pop,black_pop,native_pop,asian_pop,pacific_islander_pop,...,plus_family_homes,highschool_grad,multilingual_5yrs_plus,full_time_workers,no_health_insur,internet_homes,no_vehicle,homeowner_vacancy,rental_vacancy,death_cat
0,20573,22750,12365,23493,1559,41543,10580,167,556,0,...,111,49283,779,29291,3953,44884,2450,779,1726,1
1,78622,90552,52724,114164,10207,184397,20414,1553,1997,0,...,887,201483,3994,112058,19748,181512,7322,7322,96747,0
2,5024,9171,7828,12013,969,10086,10438,66,110,0,...,132,16120,572,8170,2488,13323,2246,836,1629,3
3,6280,8986,5127,9185,530,15192,4506,20,20,0,...,163,16130,265,8096,2182,14111,1264,305,1142,3
4,18189,23950,15558,24323,5365,50138,865,57,230,0,...,115,46446,1961,27463,6231,42118,2365,403,4846,1


In [84]:

# Loop over all columns and run logistic regression
for column in columns:

    X = df[[column]]
    y = df['death_cat']

    # Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    #split the data into training

    X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=20, stratify= y, test_size=0.25)


    # Building logistic regression model on training data
    classifier = LogisticRegression(max_iter=5000).fit(X_train, y_train)
    death_pred = classifier.predict(X_test)

    print(f"Training Data Score: {classifier.score(X_train, y_train)}")
    
    
    print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
    
    # Accuracy statistics
    
    print(f"Column: {column}, Accuracy Score: {metrics.accuracy_score(y_test, death_pred)}")
    print("--------------------------------------------------------------------------------")

Training Data Score: 0.24915110356536502
Testing Data Score: 0.24936386768447838
Column: zero_rf, Accuracy Score: 0.24936386768447838
--------------------------------------------------------------------------------
Training Data Score: 0.24915110356536502
Testing Data Score: 0.24936386768447838
Column: one_two_rf, Accuracy Score: 0.24936386768447838
--------------------------------------------------------------------------------
Training Data Score: 0.7771646859083192
Testing Data Score: 0.8142493638676844
Column: three_rf, Accuracy Score: 0.8142493638676844
--------------------------------------------------------------------------------
Training Data Score: 0.24915110356536502
Testing Data Score: 0.24936386768447838
Column: housing_units, Accuracy Score: 0.24936386768447838
--------------------------------------------------------------------------------
Training Data Score: 0.45755517826825126
Testing Data Score: 0.4541984732824427
Column: hispanic_pop, Accuracy Score: 0.4541984732824