In [1]:
#import dependencies
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
import psycopg2
from imblearn.combine import SMOTEENN
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
deaths_df = pd.read_csv('../../../data/cleaned_data/cre_svi_death_pct.csv', index_col = 'FIPS')
deaths_df = deaths_df.rename(columns = {'3/31/21':'first_yr_deaths'})
deaths_df

Unnamed: 0_level_0,POPUNI,PRED0_PE,PRED12_PE,PRED3_PE,first_yr_deaths,BEDS,dem_pct,Agreeableness,Belief In Science,Collectivism,...,Hopefulness,Income Per Capita,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,AREA_SQMI
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,55688,36.94,40.85,22.20,99,85.0,27.018365,86.279655,70.833333,57.142857,...,91.163142,26168.0,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,594.443459
1003,221898,35.43,40.81,23.76,301,332.0,22.409030,85.603337,63.268161,67.948815,...,82.484017,28069.0,77.232120,80.086368,71.771566,67.272980,75.586018,66.983549,70.972246,1589.793007
1005,22023,22.81,41.64,35.54,55,74.0,45.788173,87.711609,63.046939,70.099756,...,61.927181,17249.0,80.375206,78.783778,73.657368,76.066481,78.753019,65.170377,68.704105,885.001636
1007,20393,30.79,44.06,25.14,58,35.0,20.698280,84.830261,63.262028,72.034797,...,85.258871,18988.0,80.813736,77.837027,69.974652,75.136154,76.929754,69.859503,67.931677,622.461089
1009,57697,31.53,41.51,26.97,131,40.0,9.569378,85.548096,33.333333,80.000000,...,79.492703,21033.0,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,644.830460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,41888,40.53,42.45,17.02,37,115.0,22.894957,83.811791,68.303853,68.673956,...,82.403142,30945.0,79.384759,79.347081,68.147062,73.938691,76.390464,67.420658,70.956334,10426.975725
56039,23390,31.00,49.45,19.55,9,48.0,66.599040,82.886955,73.489916,63.115088,...,84.036899,46499.0,71.547359,80.522872,65.399695,79.598153,79.698193,70.877600,70.938645,3996.844622
56041,20183,38.37,46.31,15.32,12,42.0,16.819960,84.272810,67.029022,67.552392,...,84.089095,25636.0,78.771570,77.859042,67.603416,69.705859,73.332067,67.404487,69.299391,2081.719807
56043,7738,33.61,41.55,24.84,26,18.0,16.145833,80.773973,68.663949,66.701109,...,87.485019,26325.0,76.249370,77.658224,67.412774,82.820701,78.925326,74.628788,70.050103,2238.672972


In [3]:
# drop counties with 0 cases 
## these seem to be errors- mostly in Utah, some counties with large populations
zeros = deaths_df.loc[deaths_df['first_yr_deaths']==0]
deaths_df = deaths_df.drop(index = zeros.index)
deaths_df

Unnamed: 0_level_0,POPUNI,PRED0_PE,PRED12_PE,PRED3_PE,first_yr_deaths,BEDS,dem_pct,Agreeableness,Belief In Science,Collectivism,...,Hopefulness,Income Per Capita,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,AREA_SQMI
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,55688,36.94,40.85,22.20,99,85.0,27.018365,86.279655,70.833333,57.142857,...,91.163142,26168.0,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,594.443459
1003,221898,35.43,40.81,23.76,301,332.0,22.409030,85.603337,63.268161,67.948815,...,82.484017,28069.0,77.232120,80.086368,71.771566,67.272980,75.586018,66.983549,70.972246,1589.793007
1005,22023,22.81,41.64,35.54,55,74.0,45.788173,87.711609,63.046939,70.099756,...,61.927181,17249.0,80.375206,78.783778,73.657368,76.066481,78.753019,65.170377,68.704105,885.001636
1007,20393,30.79,44.06,25.14,58,35.0,20.698280,84.830261,63.262028,72.034797,...,85.258871,18988.0,80.813736,77.837027,69.974652,75.136154,76.929754,69.859503,67.931677,622.461089
1009,57697,31.53,41.51,26.97,131,40.0,9.569378,85.548096,33.333333,80.000000,...,79.492703,21033.0,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,644.830460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,41888,40.53,42.45,17.02,37,115.0,22.894957,83.811791,68.303853,68.673956,...,82.403142,30945.0,79.384759,79.347081,68.147062,73.938691,76.390464,67.420658,70.956334,10426.975725
56039,23390,31.00,49.45,19.55,9,48.0,66.599040,82.886955,73.489916,63.115088,...,84.036899,46499.0,71.547359,80.522872,65.399695,79.598153,79.698193,70.877600,70.938645,3996.844622
56041,20183,38.37,46.31,15.32,12,42.0,16.819960,84.272810,67.029022,67.552392,...,84.089095,25636.0,78.771570,77.859042,67.603416,69.705859,73.332067,67.404487,69.299391,2081.719807
56043,7738,33.61,41.55,24.84,26,18.0,16.145833,80.773973,68.663949,66.701109,...,87.485019,26325.0,76.249370,77.658224,67.412774,82.820701,78.925326,74.628788,70.050103,2238.672972


In [4]:
# #set index to FIPS
# deaths_df = deaths_df.set_index(deaths_df['FIPS'])
# deaths_df= deaths_df.drop(columns = ['FIPS'])

In [5]:
deaths_df['pop_density'] = deaths_df['POPUNI']/deaths_df['AREA_SQMI']

In [6]:
#create variable for case% for each counties population
deaths_df['death_pct'] = deaths_df['first_yr_deaths']/deaths_df['POPUNI']*100
deaths_df['death_pct'].head()

FIPS
1001    0.177776
1003    0.135648
1005    0.249739
1007    0.284411
1009    0.227048
Name: death_pct, dtype: float64

In [7]:
deaths_df['death_pct'].describe()

count    3000.000000
mean        0.197046
std         0.113159
min         0.001390
25%         0.117239
50%         0.178936
75%         0.252381
max         0.865801
Name: death_pct, dtype: float64

In [8]:
# bin and cut the case_pct column into 2 classifications
q = deaths_df['death_pct'].quantile(.9)
bins = [0, q , 40]
labels = ['low','high']
deaths_df['death_class'] = pd.cut(deaths_df['death_pct'], bins, labels = labels)
deaths_df['death_class']

FIPS
1001     low
1003     low
1005     low
1007     low
1009     low
        ... 
56037    low
56039    low
56041    low
56043    low
56045    low
Name: death_class, Length: 3000, dtype: category
Categories (2, object): ['low' < 'high']

In [9]:
deaths_df['death_class'].value_counts()

low     2700
high     300
Name: death_class, dtype: int64

In [10]:
#drop unneeded columns
deaths_df = deaths_df.drop('death_pct', axis =1)
deaths_df

Unnamed: 0_level_0,POPUNI,PRED0_PE,PRED12_PE,PRED3_PE,first_yr_deaths,BEDS,dem_pct,Agreeableness,Belief In Science,Collectivism,...,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,AREA_SQMI,pop_density,death_class
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,55688,36.94,40.85,22.20,99,85.0,27.018365,86.279655,70.833333,57.142857,...,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,594.443459,93.680903,low
1003,221898,35.43,40.81,23.76,301,332.0,22.409030,85.603337,63.268161,67.948815,...,77.232120,80.086368,71.771566,67.272980,75.586018,66.983549,70.972246,1589.793007,139.576661,low
1005,22023,22.81,41.64,35.54,55,74.0,45.788173,87.711609,63.046939,70.099756,...,80.375206,78.783778,73.657368,76.066481,78.753019,65.170377,68.704105,885.001636,24.884700,low
1007,20393,30.79,44.06,25.14,58,35.0,20.698280,84.830261,63.262028,72.034797,...,80.813736,77.837027,69.974652,75.136154,76.929754,69.859503,67.931677,622.461089,32.761887,low
1009,57697,31.53,41.51,26.97,131,40.0,9.569378,85.548096,33.333333,80.000000,...,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,644.830460,89.476232,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,41888,40.53,42.45,17.02,37,115.0,22.894957,83.811791,68.303853,68.673956,...,79.384759,79.347081,68.147062,73.938691,76.390464,67.420658,70.956334,10426.975725,4.017272,low
56039,23390,31.00,49.45,19.55,9,48.0,66.599040,82.886955,73.489916,63.115088,...,71.547359,80.522872,65.399695,79.598153,79.698193,70.877600,70.938645,3996.844622,5.852116,low
56041,20183,38.37,46.31,15.32,12,42.0,16.819960,84.272810,67.029022,67.552392,...,78.771570,77.859042,67.603416,69.705859,73.332067,67.404487,69.299391,2081.719807,9.695349,low
56043,7738,33.61,41.55,24.84,26,18.0,16.145833,80.773973,68.663949,66.701109,...,76.249370,77.658224,67.412774,82.820701,78.925326,74.628788,70.050103,2238.672972,3.456512,low


In [11]:
#turn case % classifications into binary 
deaths_df = pd.get_dummies(deaths_df, columns = ['death_class'])
deaths_df

Unnamed: 0_level_0,POPUNI,PRED0_PE,PRED12_PE,PRED3_PE,first_yr_deaths,BEDS,dem_pct,Agreeableness,Belief In Science,Collectivism,...,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,AREA_SQMI,pop_density,death_class_low,death_class_high
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,55688,36.94,40.85,22.20,99,85.0,27.018365,86.279655,70.833333,57.142857,...,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,594.443459,93.680903,1,0
1003,221898,35.43,40.81,23.76,301,332.0,22.409030,85.603337,63.268161,67.948815,...,80.086368,71.771566,67.272980,75.586018,66.983549,70.972246,1589.793007,139.576661,1,0
1005,22023,22.81,41.64,35.54,55,74.0,45.788173,87.711609,63.046939,70.099756,...,78.783778,73.657368,76.066481,78.753019,65.170377,68.704105,885.001636,24.884700,1,0
1007,20393,30.79,44.06,25.14,58,35.0,20.698280,84.830261,63.262028,72.034797,...,77.837027,69.974652,75.136154,76.929754,69.859503,67.931677,622.461089,32.761887,1,0
1009,57697,31.53,41.51,26.97,131,40.0,9.569378,85.548096,33.333333,80.000000,...,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,644.830460,89.476232,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,41888,40.53,42.45,17.02,37,115.0,22.894957,83.811791,68.303853,68.673956,...,79.347081,68.147062,73.938691,76.390464,67.420658,70.956334,10426.975725,4.017272,1,0
56039,23390,31.00,49.45,19.55,9,48.0,66.599040,82.886955,73.489916,63.115088,...,80.522872,65.399695,79.598153,79.698193,70.877600,70.938645,3996.844622,5.852116,1,0
56041,20183,38.37,46.31,15.32,12,42.0,16.819960,84.272810,67.029022,67.552392,...,77.859042,67.603416,69.705859,73.332067,67.404487,69.299391,2081.719807,9.695349,1,0
56043,7738,33.61,41.55,24.84,26,18.0,16.145833,80.773973,68.663949,66.701109,...,77.658224,67.412774,82.820701,78.925326,74.628788,70.050103,2238.672972,3.456512,1,0


In [12]:
X_df = deaths_df.drop(columns= ['death_class_low','death_class_high','first_yr_deaths','POPUNI','AREA_SQMI'])
X_df

Unnamed: 0_level_0,PRED0_PE,PRED12_PE,PRED3_PE,BEDS,dem_pct,Agreeableness,Belief In Science,Collectivism,Conflict Awareness,Conscientiousness,...,Hopefulness,Income Per Capita,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic,pop_density
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,36.94,40.85,22.20,85.0,27.018365,86.279655,70.833333,57.142857,63.444323,84.113820,...,91.163142,26168.0,77.925476,78.222354,91.106719,53.333333,82.142857,70.000000,60.380952,93.680903
1003,35.43,40.81,23.76,332.0,22.409030,85.603337,63.268161,67.948815,63.751017,83.108459,...,82.484017,28069.0,77.232120,80.086368,71.771566,67.272980,75.586018,66.983549,70.972246,139.576661
1005,22.81,41.64,35.54,74.0,45.788173,87.711609,63.046939,70.099756,51.165707,83.933892,...,61.927181,17249.0,80.375206,78.783778,73.657368,76.066481,78.753019,65.170377,68.704105,24.884700
1007,30.79,44.06,25.14,35.0,20.698280,84.830261,63.262028,72.034797,61.796095,82.099210,...,85.258871,18988.0,80.813736,77.837027,69.974652,75.136154,76.929754,69.859503,67.931677,32.761887
1009,31.53,41.51,26.97,40.0,9.569378,85.548096,33.333333,80.000000,63.136502,83.165976,...,79.492703,21033.0,78.764620,78.193105,92.045455,57.603815,79.307632,64.953288,76.000000,89.476232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,40.53,42.45,17.02,115.0,22.894957,83.811791,68.303853,68.673956,62.498854,84.792307,...,82.403142,30945.0,79.384759,79.347081,68.147062,73.938691,76.390464,67.420658,70.956334,4.017272
56039,31.00,49.45,19.55,48.0,66.599040,82.886955,73.489916,63.115088,61.166618,80.059909,...,84.036899,46499.0,71.547359,80.522872,65.399695,79.598153,79.698193,70.877600,70.938645,5.852116
56041,38.37,46.31,15.32,42.0,16.819960,84.272810,67.029022,67.552392,60.175516,84.634029,...,84.089095,25636.0,78.771570,77.859042,67.603416,69.705859,73.332067,67.404487,69.299391,9.695349
56043,33.61,41.55,24.84,18.0,16.145833,80.773973,68.663949,66.701109,55.520155,82.170372,...,87.485019,26325.0,76.249370,77.658224,67.412774,82.820701,78.925326,74.628788,70.050103,3.456512


In [13]:
X = X_df.values
y= deaths_df['death_class_high']

In [14]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = .7, random_state=78)

In [15]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
model = model.fit(X_train, y_train)

In [17]:
# Making predictions using the testing data.
predictions = model.predict(X_test)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,

In [18]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,736,81
Actual 1,60,23


## SMOTEENN sampling


In [19]:
# Creating the decision tree classifier instance.
model2 = tree.DecisionTreeClassifier()


In [20]:
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [21]:
model2 = model2.fit(X_resampled, y_resampled)

In [22]:
y_pred = model2.predict(X_test)


cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,659,158
Actual 1,2,81


In [23]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.8912565807907271

In [24]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.81      0.98      0.89      0.89      0.77       817
          1       0.34      0.98      0.81      0.50      0.89      0.80        83

avg / total       0.94      0.82      0.96      0.86      0.89      0.78       900



In [25]:
# We can sort the features by their importance.
X = deaths_df.drop(['death_class_low','death_class_high','first_yr_deaths'], axis=1).columns
importance_df = pd.DataFrame(sorted(zip(model.feature_importances_, X), reverse=True))
importance_df

Unnamed: 0,0,1
0,0.107097,POPUNI
1,0.074647,Work Ethic
2,0.06278,Entrepreneurship
3,0.061025,Empathy
4,0.060992,Collectivism
5,0.060227,Hopefulness
6,0.053639,Employment Rate
7,0.052251,Extraversion
8,0.050738,dem_pct
9,0.041179,PRED0_PE


In [26]:
impact = importance_df.loc[importance_df[0]>0]
impact_col = impact[1].to_list()
impact

Unnamed: 0,0,1
0,0.107097,POPUNI
1,0.074647,Work Ethic
2,0.06278,Entrepreneurship
3,0.061025,Empathy
4,0.060992,Collectivism
5,0.060227,Hopefulness
6,0.053639,Employment Rate
7,0.052251,Extraversion
8,0.050738,dem_pct
9,0.041179,PRED0_PE


In [27]:
impact_df = deaths_df[impact_col]
impact_df.columns.to_list()

['POPUNI',
 'Work Ethic',
 'Entrepreneurship',
 'Empathy',
 'Collectivism',
 'Hopefulness',
 'Employment Rate',
 'Extraversion',
 'dem_pct',
 'PRED0_PE',
 'PRED12_PE',
 'Tolerance',
 'Openness',
 'Belief In Science',
 'Neuroticism',
 'Risk Taking',
 'PRED3_PE',
 'Agreeableness',
 'Income Per Capita',
 'BEDS',
 'Conscientiousness',
 'Conflict Awareness',
 'Selflessness',
 'Gender Equality',
 'Religiosity']

## undersampling

In [28]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)


Counter({0: 217, 1: 217})

In [29]:
model3 = tree.DecisionTreeClassifier()
model3 = model3.fit(X_resampled, y_resampled)


In [30]:
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,736,81
Actual 1,60,23


In [31]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5889826134402973

In [32]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.90      0.28      0.91      0.50      0.27       817
          1       0.22      0.28      0.90      0.25      0.50      0.23        83

avg / total       0.86      0.84      0.33      0.85      0.50      0.26       900



## Oversampling


In [33]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

In [34]:
model4= tree.DecisionTreeClassifier()
model4 = model4.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,736,81
Actual 1,60,23


In [35]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.90      0.28      0.91      0.50      0.27       817
          1       0.22      0.28      0.90      0.25      0.50      0.23        83

avg / total       0.86      0.84      0.33      0.85      0.50      0.26       900

