In [1]:
#import dependencies
import pandas as pd
import tensorflow as tf
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
#read in merged and cleaned data
df = pd.read_csv('../../data/merged_cleaned_data/deaths_merged_data_svi_val_cols.csv')
df

Unnamed: 0.1,Unnamed: 0,FIPS,AREA_SQMI,E_TOTPOP,E_HU,E_HH,E_POV,E_UNEMP,E_PCI,E_NOHSDP,...,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,dem_pct,BEDS,first_year_deaths
0,0,1001,594.443459,55200,23315,21115,8422,1065,29372,4204,...,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,27.018365,85.0,99
1,1,1009,644.830460,57645,24222,20600,8220,909,22656,7861,...,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,9.569378,40.0,131
2,2,1013,776.838201,20025,10026,6708,4640,567,20430,2141,...,78.563680,76.109761,76.623924,69.058104,79.956648,67.920284,72.773953,41.789629,94.0,66
3,3,1015,605.867251,115098,53682,45033,20819,4628,24706,12620,...,79.439032,79.955121,77.918741,54.063568,76.745724,67.456150,68.292794,29.845243,552.0,305
4,4,1017,596.560643,33826,16981,13516,5531,773,22827,4383,...,76.995358,78.156771,75.891100,67.343775,79.128558,66.397785,69.554441,41.644857,115.0,117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053,3053,48131,1793.476183,11355,5592,3511,2751,482,17864,2386,...,79.125428,78.895880,76.629575,60.576045,73.670302,64.571017,68.007770,50.959826,0.0,39
3054,3054,48505,998.411980,14369,6388,4405,5609,621,17228,3226,...,79.355639,79.572483,74.378252,77.443239,76.386871,74.001471,73.609838,47.134744,0.0,30
3055,3055,48507,1297.406535,12131,4344,3509,4150,421,13350,2719,...,78.392216,76.024682,75.848196,76.967659,77.303576,70.010162,71.121990,65.403060,0.0,40
3056,3056,48127,1328.884075,10663,4408,3309,3148,694,18121,2137,...,79.457964,79.141556,66.677924,73.140121,71.659882,68.065499,66.378831,61.756683,48.0,29


In [3]:
#set index to FIPS
df = df.set_index(df['FIPS'])
df= df.drop(columns = ['FIPS','Unnamed: 0'])

In [4]:
#create variable for case% for each counties population
df['case_pct'] = df['first_year_deaths']/df['E_TOTPOP']*100
df['case_pct'].head()

FIPS
1001    0.179348
1009    0.227253
1013    0.329588
1015    0.264992
1017    0.345888
Name: case_pct, dtype: float64

In [5]:
df['case_pct'].describe()

count    3058.000000
mean        0.187585
std         0.108421
min         0.000000
25%         0.112688
50%         0.173362
75%         0.242428
max         0.788566
Name: case_pct, dtype: float64

In [6]:
# bin and cut the case_pct column into 2 classifications
q = df['case_pct'].quantile(.8)
bins = [0, q , 40]
labels = ['low','high']
df['case_class'] = pd.cut(df['case_pct'], bins, labels = labels)
df['case_class']

FIPS
1001      low
1009      low
1013     high
1015     high
1017     high
         ... 
48131    high
48505     low
48507    high
48127    high
48247    high
Name: case_class, Length: 3058, dtype: category
Categories (2, object): ['low' < 'high']

In [7]:
df['case_class'].value_counts()

low     2388
high     612
Name: case_class, dtype: int64

In [8]:
#drop unneeded columns
df = df.drop('case_pct', axis =1)
df

Unnamed: 0_level_0,AREA_SQMI,E_TOTPOP,E_HU,E_HH,E_POV,E_UNEMP,E_PCI,E_NOHSDP,E_AGE65,E_AGE17,...,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,dem_pct,BEDS,first_year_deaths,case_class
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,...,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,27.018365,85.0,99,low
1009,644.830460,57645,24222,20600,8220,909,22656,7861,10233,13468,...,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,9.569378,40.0,131,low
1013,776.838201,20025,10026,6708,4640,567,20430,2141,3806,4566,...,76.109761,76.623924,69.058104,79.956648,67.920284,72.773953,41.789629,94.0,66,high
1015,605.867251,115098,53682,45033,20819,4628,24706,12620,19386,25196,...,79.955121,77.918741,54.063568,76.745724,67.456150,68.292794,29.845243,552.0,305,high
1017,596.560643,33826,16981,13516,5531,773,22827,4383,6409,7006,...,78.156771,75.891100,67.343775,79.128558,66.397785,69.554441,41.644857,115.0,117,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48131,1793.476183,11355,5592,3511,2751,482,17864,2386,2025,2962,...,78.895880,76.629575,60.576045,73.670302,64.571017,68.007770,50.959826,0.0,39,high
48505,998.411980,14369,6388,4405,5609,621,17228,3226,1999,4835,...,79.572483,74.378252,77.443239,76.386871,74.001471,73.609838,47.134744,0.0,30,low
48507,1297.406535,12131,4344,3509,4150,421,13350,2719,1665,3583,...,76.024682,75.848196,76.967659,77.303576,70.010162,71.121990,65.403060,0.0,40,high
48127,1328.884075,10663,4408,3309,3148,694,18121,2137,1734,3195,...,79.141556,66.677924,73.140121,71.659882,68.065499,66.378831,61.756683,48.0,29,high


In [9]:
#turn case % classifications into binary 
df = pd.get_dummies(df, columns = ['case_class'])
df

Unnamed: 0_level_0,AREA_SQMI,E_TOTPOP,E_HU,E_HH,E_POV,E_UNEMP,E_PCI,E_NOHSDP,E_AGE65,E_AGE17,...,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,dem_pct,BEDS,first_year_deaths,case_class_low,case_class_high
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,594.443459,55200,23315,21115,8422,1065,29372,4204,8050,13369,...,91.106719,53.333333,82.142857,70.000000,60.380952,27.018365,85.0,99,1,0
1009,644.830460,57645,24222,20600,8220,909,22656,7861,10233,13468,...,92.045455,57.603815,79.307632,64.953288,76.000000,9.569378,40.0,131,1,0
1013,776.838201,20025,10026,6708,4640,567,20430,2141,3806,4566,...,76.623924,69.058104,79.956648,67.920284,72.773953,41.789629,94.0,66,0,1
1015,605.867251,115098,53682,45033,20819,4628,24706,12620,19386,25196,...,77.918741,54.063568,76.745724,67.456150,68.292794,29.845243,552.0,305,0,1
1017,596.560643,33826,16981,13516,5531,773,22827,4383,6409,7006,...,75.891100,67.343775,79.128558,66.397785,69.554441,41.644857,115.0,117,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48131,1793.476183,11355,5592,3511,2751,482,17864,2386,2025,2962,...,76.629575,60.576045,73.670302,64.571017,68.007770,50.959826,0.0,39,0,1
48505,998.411980,14369,6388,4405,5609,621,17228,3226,1999,4835,...,74.378252,77.443239,76.386871,74.001471,73.609838,47.134744,0.0,30,1,0
48507,1297.406535,12131,4344,3509,4150,421,13350,2719,1665,3583,...,75.848196,76.967659,77.303576,70.010162,71.121990,65.403060,0.0,40,0,1
48127,1328.884075,10663,4408,3309,3148,694,18121,2137,1734,3195,...,66.677924,73.140121,71.659882,68.065499,66.378831,61.756683,48.0,29,0,1


In [10]:
X = df.drop(columns= ['case_class_low','case_class_high','first_year_deaths']).values
y= df['case_class_high'].values

In [11]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = .7, random_state=78)

In [12]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
rf_model = RandomForestClassifier(n_estimators=256, random_state=78) 

In [14]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [15]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [16]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [17]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,736,28
Actual 1,130,24


In [18]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
model = model.fit(X_train_scaled, y_train)

In [19]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)
predictions

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [20]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,609,155
Actual 1,92,62


In [21]:
# We can sort the features by their importance.
X = df.drop(['case_class_low','case_class_high'], axis=1).columns
importance_df = pd.DataFrame(sorted(zip(model.feature_importances_, X), reverse=True))
importance_df

Unnamed: 0,0,1
0,0.076132,E_MUNIT
1,0.058795,E_PCI
2,0.044517,Agreeableness
3,0.042486,Conflict Awareness
4,0.039525,dem_pct
5,0.039198,Conscientiousness
6,0.038495,Extraversion
7,0.037722,Collectivism
8,0.037545,Openness
9,0.037513,E_MINRTY


In [22]:
impact = importance_df.loc[importance_df[0]>0]
impact_col = impact[1].to_list()
impact

Unnamed: 0,0,1
0,0.076132,E_MUNIT
1,0.058795,E_PCI
2,0.044517,Agreeableness
3,0.042486,Conflict Awareness
4,0.039525,dem_pct
5,0.039198,Conscientiousness
6,0.038495,Extraversion
7,0.037722,Collectivism
8,0.037545,Openness
9,0.037513,E_MINRTY


In [23]:
impact_df = df[impact_col]
impact_df

Unnamed: 0_level_0,E_MUNIT,E_PCI,Agreeableness,Conflict Awareness,dem_pct,Conscientiousness,Extraversion,Collectivism,Openness,E_MINRTY,...,E_UNEMP,Employment Rate,E_TOTPOP,Belief In Science,E_POV,E_GROUPQ,E_AGE65,E_HH,E_DAYPOP,Work Ethic
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,886,29372,86.279655,63.444323,27.018365,84.113820,86.112526,57.142857,78.222354,13788,...,1065,94.7,55200,70.833333,8422,546,8050,21115,37301,60.380952
1009,211,22656,85.548096,63.136502,9.569378,83.165976,87.164757,80.000000,78.193105,7413,...,909,94.5,57645,33.333333,8220,543,10233,20600,40036,76.000000
1013,134,20430,88.377418,60.788532,41.789629,83.276291,88.738491,66.859553,76.109761,9641,...,567,92.9,20025,62.494369,4640,322,3806,6708,17280,72.773953
1015,1990,24706,85.039324,47.565763,29.845243,82.334057,87.397860,66.764546,79.955121,31675,...,4628,93.3,115098,63.193373,20819,3112,19386,45033,117894,68.292794
1017,679,22827,86.224666,65.485423,41.644857,83.807126,88.052287,69.716067,78.156771,14954,...,773,94.5,33826,62.067731,5531,512,6409,13516,27176,69.554441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48131,26,17864,87.364932,43.491311,50.959826,81.750271,83.085768,62.556323,78.895880,10327,...,482,88.9,11355,64.251351,2751,504,2025,3511,9592,68.007770
48505,40,17228,87.316650,46.302376,47.134744,76.614836,87.366021,69.883855,79.572483,13754,...,621,89.0,14369,68.400514,5609,35,1999,4405,10924,73.609838
48507,75,13350,85.386374,60.553983,65.403060,80.708935,89.988278,69.234296,76.024682,11529,...,421,85.9,12131,61.200477,4150,196,1665,3509,10672,71.121990
48127,166,18121,83.245971,65.650194,61.756683,84.727838,84.914011,68.726270,79.141556,9655,...,694,92.8,10663,66.223930,3148,124,1734,3309,8229,66.378831


In [24]:
X = impact_df.values
y= df['case_class_high'].values

In [25]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = .7, random_state=78)
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=128,
    learning_rate=learning_rate,
    max_features=5,
    max_depth=3,
    random_state=0)
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
           classifier.score(
           X_train_scaled,
           y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
           classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.849
Accuracy score (validation): 0.832
Learning rate:  0.1
Accuracy score (training): 0.886
Accuracy score (validation): 0.833
Learning rate:  0.25
Accuracy score (training): 0.959
Accuracy score (validation): 0.810
Learning rate:  0.5
Accuracy score (training): 0.993
Accuracy score (validation): 0.805
Learning rate:  0.75
Accuracy score (training): 1.000
Accuracy score (validation): 0.793
Learning rate:  1
Accuracy score (training): 1.000
Accuracy score (validation): 0.808


In [27]:
classifier = GradientBoostingClassifier(n_estimators=200,
   learning_rate=0.1, max_features='auto', max_depth=10, random_state=0)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [28]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8322440087145969


In [29]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,721,43
Actual 1,111,43


In [30]:
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.87      0.94      0.90       764
           1       0.50      0.28      0.36       154

    accuracy                           0.83       918
   macro avg       0.68      0.61      0.63       918
weighted avg       0.81      0.83      0.81       918

