In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from sklearn import preprocessing
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Loading clustered dataset
file = "Resources/MLM/clustered_data.csv"
clustered_df = pd.read_csv(file, index_col=0)
clustered_df.head(10)

Unnamed: 0_level_0,abortion_status,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,no_doctor_visits,mammogram,no_provider,pap_smear,...,abortions_residence_state,no_services,few_services,restricted_services,full_service,chc_per_capita,PC 1,PC 2,PC 3,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,Abortion Ban In Effect,17,165700,1462900,109.285826,4767100,0.12,0.74,0.13,0.74,...,9060,8,44,5,0,0.356611,-3.389517,1.468611,1.450318,0
Alaska,Abortion Available,27,29200,207200,84.486218,701700,0.12,0.63,0.23,0.65,...,1320,3,7,0,4,3.847798,-2.985732,-0.654379,1.727523,0
Arizona,Status of pre-Roe ban unclear,23,309000,2132900,87.07292,7098000,0.11,0.68,0.19,0.69,...,13820,19,28,7,0,0.324035,-1.623217,1.289916,-0.185999,0
Arkansas,Abortion Ban In Effect,12,83800,865900,124.819154,2922500,0.12,0.7,0.13,0.68,...,4510,9,31,3,0,0.410607,-3.225696,0.563117,1.934296,0
California,Abortion Available,175,1034300,11885100,32.123048,38642700,0.09,0.68,0.19,0.74,...,152400,51,97,0,124,0.452867,10.762057,8.750072,-1.291324,2
Colorado,Abortion Available,19,204300,1758600,43.906723,5611800,0.11,0.67,0.17,0.69,...,11830,17,34,0,15,0.338572,-0.276557,-0.145824,-1.260018,1
Connecticut,Abortion Available,16,58100,1038000,53.795577,3453300,0.07,0.78,0.1,0.8,...,11460,12,8,0,18,0.463325,3.535804,-3.156709,-0.614361,4
Delaware,Abortion Available,3,24800,295800,48.113934,940300,0.09,0.74,0.12,0.75,...,2870,10,0,0,2,0.319047,2.044858,-3.345393,1.189281,4
District of Columbia,Abortion Available,8,7600,251600,78.882128,671300,0.06,0.74,0.15,0.8,...,5010,2,0,0,2,1.191718,2.150158,-3.521315,1.427751,4
Florida,"Abortion available, pre-viability gestational ...",47,1011800,6381600,67.248213,20992000,0.14,0.74,0.2,0.73,...,73830,22,129,0,48,0.223895,1.098599,6.575453,-1.398009,3


In [3]:
clustered_df.dtypes

abortion_status                               object
total_community_health_centers                 int64
uninsured                                      int64
total_insured                                  int64
maternal_mortality                           float64
population                                     int64
no_doctor_visits                             float64
mammogram                                    float64
no_provider                                  float64
pap_smear                                    float64
prescription_contraception                    object
otc_methods                                   object
male_sterilization                            object
female_sterilization                          object
cost_sharing                                  object
teen_births                                  float64
poverty_under_200                            float64
percent_of_all_us_abortions                  float64
percent_residents_traveling_outside_state     

In [4]:
numerical_df = pd.get_dummies(data = clustered_df.drop(columns=['PC 1','PC 2','PC 3']),
                              columns=['abortion_status','prescription_contraception',
                              'otc_methods','male_sterilization','female_sterilization',
                              'cost_sharing'])
numerical_df

Unnamed: 0_level_0,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,no_doctor_visits,mammogram,no_provider,pap_smear,teen_births,...,prescription_contraception_No,prescription_contraception_Yes,otc_methods_No,otc_methods_Yes,male_sterilization_No,male_sterilization_Yes,female_sterilization_No,female_sterilization_Yes,cost_sharing_No,cost_sharing_Yes
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,17,165700,1462900,109.285826,4767100,0.12,0.74,0.13,0.74,24.8,...,1,0,1,0,1,0,1,0,1,0
Alaska,27,29200,207200,84.486218,701700,0.12,0.63,0.23,0.65,17.7,...,1,0,1,0,1,0,1,0,1,0
Arizona,23,309000,2132900,87.07292,7098000,0.11,0.68,0.19,0.69,16.6,...,0,1,1,0,1,0,1,0,1,0
Arkansas,12,83800,865900,124.819154,2922500,0.12,0.7,0.13,0.68,27.8,...,0,1,1,0,1,0,1,0,1,0
California,175,1034300,11885100,32.123048,38642700,0.09,0.68,0.19,0.74,11.0,...,0,1,0,1,1,0,0,1,0,1
Colorado,19,204300,1758600,43.906723,5611800,0.11,0.67,0.17,0.69,12.5,...,0,1,1,0,1,0,1,0,1,0
Connecticut,16,58100,1038000,53.795577,3453300,0.07,0.78,0.1,0.8,7.6,...,0,1,0,1,1,0,0,1,0,1
Delaware,3,24800,295800,48.113934,940300,0.09,0.74,0.12,0.75,14.6,...,0,1,0,1,1,0,0,1,0,1
District of Columbia,8,7600,251600,78.882128,671300,0.06,0.74,0.15,0.8,15.6,...,0,1,0,1,1,0,0,1,0,1
Florida,47,1011800,6381600,67.248213,20992000,0.14,0.74,0.2,0.73,15.2,...,1,0,1,0,1,0,1,0,1,0


In [5]:
numerical_df.dtypes

total_community_health_centers                                                                        int64
uninsured                                                                                             int64
total_insured                                                                                         int64
maternal_mortality                                                                                  float64
population                                                                                            int64
no_doctor_visits                                                                                    float64
mammogram                                                                                           float64
no_provider                                                                                         float64
pap_smear                                                                                           float64
teen_births                 

In [24]:
training_df = numerical_df.loc[numerical_df['Class']==4]
training_df.head()

Unnamed: 0_level_0,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,no_doctor_visits,mammogram,no_provider,pap_smear,teen_births,...,prescription_contraception_No,prescription_contraception_Yes,otc_methods_No,otc_methods_Yes,male_sterilization_No,male_sterilization_Yes,female_sterilization_No,female_sterilization_Yes,cost_sharing_No,cost_sharing_Yes
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Connecticut,16,58100,1038000,53.795577,3453300,0.07,0.78,0.1,0.8,7.6,...,0,1,0,1,1,0,0,1,0,1
Delaware,3,24800,295800,48.113934,940300,0.09,0.74,0.12,0.75,14.6,...,0,1,0,1,1,0,0,1,0,1
District of Columbia,8,7600,251600,78.882128,671300,0.06,0.74,0.15,0.8,15.6,...,0,1,0,1,1,0,0,1,0,1
Illinois,45,328700,3731700,40.510735,12362300,0.1,0.74,0.15,0.73,13.6,...,0,1,0,1,0,1,0,1,0,1
Maine,18,24400,398100,25.998787,1302300,0.08,0.77,0.08,0.73,10.6,...,0,1,1,0,1,0,0,1,0,1


In [25]:
testing_df = numerical_df.loc[numerical_df['Class']!=4]
testing_df

Unnamed: 0_level_0,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,no_doctor_visits,mammogram,no_provider,pap_smear,teen_births,...,prescription_contraception_No,prescription_contraception_Yes,otc_methods_No,otc_methods_Yes,male_sterilization_No,male_sterilization_Yes,female_sterilization_No,female_sterilization_Yes,cost_sharing_No,cost_sharing_Yes
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,17,165700,1462900,109.285826,4767100,0.12,0.74,0.13,0.74,24.8,...,1,0,1,0,1,0,1,0,1,0
Alaska,27,29200,207200,84.486218,701700,0.12,0.63,0.23,0.65,17.7,...,1,0,1,0,1,0,1,0,1,0
Arizona,23,309000,2132900,87.07292,7098000,0.11,0.68,0.19,0.69,16.6,...,0,1,1,0,1,0,1,0,1,0
Arkansas,12,83800,865900,124.819154,2922500,0.12,0.7,0.13,0.68,27.8,...,0,1,1,0,1,0,1,0,1,0
California,175,1034300,11885100,32.123048,38642700,0.09,0.68,0.19,0.74,11.0,...,0,1,0,1,1,0,0,1,0,1
Colorado,19,204300,1758600,43.906723,5611800,0.11,0.67,0.17,0.69,12.5,...,0,1,1,0,1,0,1,0,1,0
Florida,47,1011800,6381600,67.248213,20992000,0.14,0.74,0.2,0.73,15.2,...,1,0,1,0,1,0,1,0,1,0
Georgia,35,586000,3356300,88.182701,10294000,0.16,0.71,0.17,0.73,18.2,...,0,1,1,0,1,0,1,0,1,0
Hawaii,14,18900,379800,44.345898,1343000,0.06,0.78,0.1,0.73,13.0,...,0,1,1,0,1,0,1,0,1,0
Idaho,14,78900,535000,51.084382,1750900,0.1,0.65,0.15,0.64,14.6,...,1,0,1,0,1,0,1,0,1,0


In [26]:
training_df['chc_per_capita'].describe()

count    14.000000
mean      0.651802
std       0.490090
min       0.264053
25%       0.318000
50%       0.413667
75%       0.769478
max       1.839157
Name: chc_per_capita, dtype: float64

In [16]:
X_train = training_df.drop(columns='chc_per_capita')
y_train = training_df[['chc_per_capita']]

y_train['chc_per_capita'].value_counts()

0.463325    1
0.319047    1
1.191718    1
0.364010    1
1.382170    1
0.289337    1
0.556324    1
0.264053    1
0.264386    1
0.783776    1
0.726586    1
1.839157    1
0.317650    1
0.363690    1
Name: chc_per_capita, dtype: int64

In [17]:
X_test = testing_df.drop(columns='chc_per_capita')
y_test = testing_df[['chc_per_capita']]

y_test['chc_per_capita'].value_counts()

0.356611    1
1.346931    1
0.761035    1
0.333187    1
0.384627    1
0.547196    1
0.449280    1
0.548747    1
0.339323    1
0.786550    1
0.461458    1
0.469870    1
0.436688    1
0.254541    1
0.412162    1
1.610121    1
0.281834    1
0.373612    1
0.471388    1
3.847798    1
0.697569    1
0.324035    1
0.410607    1
0.452867    1
0.338572    1
0.223895    1
0.340004    1
1.042442    1
0.799589    1
0.414632    1
0.457486    1
0.675868    1
0.578797    1
0.800249    1
0.399361    1
0.290128    1
1.066288    1
Name: chc_per_capita, dtype: int64

## Creating a StandardScaler instance

In [11]:
scaler = StandardScaler()

# Fitting the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Random Forest Classifier

In [18]:
## Creating a RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
rf_model.fit(X_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


ValueError: Unknown label type: 'continuous'

In [14]:
y_train.shape

(14, 1)

In [122]:
predictions = rf_model.predict(X_test)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.02702702702702703

In [123]:
predictions
predictions.ravel()

array([ 8,  8,  8,  8, 45, 17, 45, 26,  8, 16,  8, 16, 16,  8,  8, 26, 17,
        8,  8, 18, 16, 18, 45, 26,  8, 26,  8, 26,  8,  8,  8,  8, 26,  8,
        8, 17, 11])

In [110]:
compare_df = testing_df['total_community_health_centers'].to_frame()
compare_df['predictions'] = predictions
compare_df

Unnamed: 0_level_0,total_community_health_centers,predictions
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,17,8
Alaska,27,8
Arizona,23,8
Arkansas,12,8
California,175,45
Colorado,19,17
Florida,47,45
Georgia,35,26
Hawaii,14,8
Idaho,14,16


compare_

In [69]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)
cm

array([[0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0,

In [57]:
# Printing the imbalanced classification report
print(classification_report_imbalanced(y_test,y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          3       0.00      0.00      0.70      0.00      0.00      0.00       0.0
          4       0.00      0.00      1.00      0.00      0.00      0.00       2.0
          6       0.00      0.00      1.00      0.00      0.00      0.00       1.0
          7       0.00      0.00      1.00      0.00      0.00      0.00       1.0
          8       0.00      0.00      0.97      0.00      0.00      0.00       1.0
         10       0.00      0.00      1.00      0.00      0.00      0.00       1.0
         11       0.00      0.00      0.89      0.00      0.00      0.00       0.0
         12       0.00      0.00      1.00      0.00      0.00      0.00       1.0
         13       0.00      0.00      1.00      0.00      0.00      0.00       1.0
         14       0.00      0.00      1.00      0.00      0.00      0.00       4.0
         16       0.00      0.00      0.97      0.00      0.00      0.00       2.0
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
# Listing the features sorted in descending order by feature importance.
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.07897695962424295, 'mammogram'),
 (0.06674860712986377, 'abortions_occurring_state'),
 (0.06405163516440797, 'teen_births'),
 (0.06356857936798511, 'few_services'),
 (0.0622149576077038, 'pap_smear'),
 (0.059073684996580944, 'maternal_mortality'),
 (0.05852606247748114, 'abortions_residence_state'),
 (0.0573461544713971, 'full_service'),
 (0.05656256474876965, 'uninsured'),
 (0.05517026762536534, 'no_services'),
 (0.04787732414797696, 'percent_residents_traveling_outside_state'),
 (0.047371256704399824, 'no_provider'),
 (0.044968407958396704, 'population'),
 (0.04471632837723396, 'total_insured'),
 (0.041383131822156576, 'percent_of_all_us_abortions'),
 (0.04083251147038448, 'no_doctor_visits'),
 (0.02401995459034747, 'cost_sharing_Yes'),
 (0.020796898696867117, 'female_sterilization_No'),
 (0.01762172947751528, 'female_sterilization_Yes'),
 (0.0, 'restricted_services'),
 (0.0, 'prescription_contraception_Yes'),
 (0.0, 'otc_methods_Yes'),
 (0.0, 'otc_methods_No'),
 (0.0, 'male_ster