In [36]:
# Initial imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

## Preprocessing the Data for PCA

In [37]:
# Loading final_reproductive_table.csv dataset
file = "Resources/Clean_data/final_reproductive_table2copy.csv"
df = pd.read_csv(file, index_col=0)
df.head(10)

Unnamed: 0_level_0,abortion_status,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,land_area_sqmi,no_doctor_visits,mammogram,no_provider,...,teen_births,poverty_under_200,percent_of_all_us_abortions,percent_residents_traveling_outside_state,abortions_occurring_state,abortions_residence_state,no_services,few_services,restricted_services,full_service
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,Abortion Ban In Effect,17,165700,1462900,109.285826,4767100,50645,0.12,0.74,0.13,...,24.8,0.332,0.6,47,5700,9060,8,44,5,0
Alaska,Abortion Available,27,29200,207200,84.486218,701700,570641,0.12,0.63,0.23,...,17.7,0.285,0.1,7,1240,1320,3,7,0,4
Arizona,Status of pre-Roe ban unclear,23,309000,2132900,87.07292,7098000,113594,0.11,0.68,0.19,...,16.6,0.298,1.4,6,13320,13820,19,28,7,0
Arkansas,Abortion Ban In Effect,12,83800,865900,124.819154,2922500,52035,0.12,0.7,0.13,...,27.8,0.334,0.3,37,3250,4510,9,31,3,0
California,Abortion Available,175,1034300,11885100,32.123048,38642700,155779,0.09,0.68,0.19,...,11.0,0.272,16.6,0,154060,152400,51,97,0,124
Colorado,Abortion Available,19,204300,1758600,43.906723,5611800,103642,0.11,0.67,0.17,...,12.5,0.219,1.4,1,13420,11830,17,34,0,15
Connecticut,Abortion Available,16,58100,1038000,53.795577,3453300,4842,0.07,0.78,0.1,...,7.6,0.228,1.2,6,11170,11460,12,8,0,18
Delaware,Abortion Available,3,24800,295800,48.113934,940300,1949,0.09,0.74,0.12,...,14.6,0.292,0.2,44,1830,2870,10,0,0,2
District of Columbia,Abortion Available,8,7600,251600,78.882128,671300,61,0.06,0.74,0.15,...,15.6,0.271,1.0,45,9410,5010,2,0,0,2
Florida,"Abortion available, pre-viability gestational ...",47,1011800,6381600,67.248213,20992000,53625,0.14,0.74,0.2,...,15.2,0.325,8.3,1,77400,73830,22,129,0,48


In [38]:
df.dtypes

abortion_status                               object
total_community_health_centers                 int64
uninsured                                      int64
total_insured                                  int64
maternal_mortality                           float64
population                                     int64
land_area_sqmi                                 int64
no_doctor_visits                             float64
mammogram                                    float64
no_provider                                  float64
pap_smear                                    float64
prescription_contraception                    object
otc_methods                                   object
male_sterilization                            object
female_sterilization                          object
cost_sharing                                  object
teen_births                                  float64
poverty_under_200                            float64
percent_of_all_us_abortions                  f

In [39]:
# Using get_dummies() to create variables for text features.
X = pd.get_dummies(data=df, 
                   columns=['abortion_status','prescription_contraception','otc_methods',
                            'male_sterilization','female_sterilization','cost_sharing'])
X

Unnamed: 0_level_0,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,land_area_sqmi,no_doctor_visits,mammogram,no_provider,pap_smear,...,prescription_contraception_No,prescription_contraception_Yes,otc_methods_No,otc_methods_Yes,male_sterilization_No,male_sterilization_Yes,female_sterilization_No,female_sterilization_Yes,cost_sharing_No,cost_sharing_Yes
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,17,165700,1462900,109.285826,4767100,50645,0.12,0.74,0.13,0.74,...,1,0,1,0,1,0,1,0,1,0
Alaska,27,29200,207200,84.486218,701700,570641,0.12,0.63,0.23,0.65,...,1,0,1,0,1,0,1,0,1,0
Arizona,23,309000,2132900,87.07292,7098000,113594,0.11,0.68,0.19,0.69,...,0,1,1,0,1,0,1,0,1,0
Arkansas,12,83800,865900,124.819154,2922500,52035,0.12,0.7,0.13,0.68,...,0,1,1,0,1,0,1,0,1,0
California,175,1034300,11885100,32.123048,38642700,155779,0.09,0.68,0.19,0.74,...,0,1,0,1,1,0,0,1,0,1
Colorado,19,204300,1758600,43.906723,5611800,103642,0.11,0.67,0.17,0.69,...,0,1,1,0,1,0,1,0,1,0
Connecticut,16,58100,1038000,53.795577,3453300,4842,0.07,0.78,0.1,0.8,...,0,1,0,1,1,0,0,1,0,1
Delaware,3,24800,295800,48.113934,940300,1949,0.09,0.74,0.12,0.75,...,0,1,0,1,1,0,0,1,0,1
District of Columbia,8,7600,251600,78.882128,671300,61,0.06,0.74,0.15,0.8,...,0,1,0,1,1,0,0,1,0,1
Florida,47,1011800,6381600,67.248213,20992000,53625,0.14,0.74,0.2,0.73,...,1,0,1,0,1,0,1,0,1,0


## Clustering Using K-Means

In [40]:
# Creating an elbow curve to find the best value for K.
inertia = []
k = list(range(1,11))
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(X)
    inertia.append(km.inertia_)

elbow_data = {'k':k, 'inertia':inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x='k',y='inertia',title='Elbow Curve',xticks=k)

In [41]:
# Running K-Means with k=4
# Initialize the K-Means model.
model = KMeans(n_clusters=4, random_state=0)

# Fitting the model
model.fit(X)

# Predicting clusters
predictions = model.predict(X)

predictions

array([0, 0, 2, 0, 3, 2, 0, 0, 0, 1, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2,
       2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 1, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 1,
       0, 0, 2, 2, 0, 2, 0], dtype=int32)

In [42]:
#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
df['Class'] = model.labels_
df.head(10)

Unnamed: 0_level_0,abortion_status,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,land_area_sqmi,no_doctor_visits,mammogram,no_provider,...,poverty_under_200,percent_of_all_us_abortions,percent_residents_traveling_outside_state,abortions_occurring_state,abortions_residence_state,no_services,few_services,restricted_services,full_service,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,Abortion Ban In Effect,17,165700,1462900,109.285826,4767100,50645,0.12,0.74,0.13,...,0.332,0.6,47,5700,9060,8,44,5,0,0
Alaska,Abortion Available,27,29200,207200,84.486218,701700,570641,0.12,0.63,0.23,...,0.285,0.1,7,1240,1320,3,7,0,4,0
Arizona,Status of pre-Roe ban unclear,23,309000,2132900,87.07292,7098000,113594,0.11,0.68,0.19,...,0.298,1.4,6,13320,13820,19,28,7,0,2
Arkansas,Abortion Ban In Effect,12,83800,865900,124.819154,2922500,52035,0.12,0.7,0.13,...,0.334,0.3,37,3250,4510,9,31,3,0,0
California,Abortion Available,175,1034300,11885100,32.123048,38642700,155779,0.09,0.68,0.19,...,0.272,16.6,0,154060,152400,51,97,0,124,3
Colorado,Abortion Available,19,204300,1758600,43.906723,5611800,103642,0.11,0.67,0.17,...,0.219,1.4,1,13420,11830,17,34,0,15,2
Connecticut,Abortion Available,16,58100,1038000,53.795577,3453300,4842,0.07,0.78,0.1,...,0.228,1.2,6,11170,11460,12,8,0,18,0
Delaware,Abortion Available,3,24800,295800,48.113934,940300,1949,0.09,0.74,0.12,...,0.292,0.2,44,1830,2870,10,0,0,2,0
District of Columbia,Abortion Available,8,7600,251600,78.882128,671300,61,0.06,0.74,0.15,...,0.271,1.0,45,9410,5010,2,0,0,2,0
Florida,"Abortion available, pre-viability gestational ...",47,1011800,6381600,67.248213,20992000,53625,0.14,0.74,0.2,...,0.325,8.3,1,77400,73830,22,129,0,48,1


## Visualizing Results

In [44]:
# Creating a 3D-Scatter the clusters
fig = px.scatter_3d(
    df,
    x='total_community_health_centers',
    y='population',
    z='land_area_sqmi',
    color='Class',
    symbol='Class',
    hover_name=df.index,
    hover_data=['abortion_status'],
    width=800
)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [45]:
# Saving clustered data
file_path = "Resources/MLM/clustered_data_noPCA.csv"
clustered_df.to_csv(file_path, index=True)