In [41]:
# Initial imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

## Preprocessing the Data for PCA

In [42]:
# Load the cleaned reproductive-health table (final_reproductive_table2copy.csv).
# The first CSV column is used as the row index.
file = "Resources/Clean_data/final_reproductive_table2copy.csv"
df = pd.read_csv(filepath_or_buffer=file, index_col=0)
df.head(n=10)

Unnamed: 0_level_0,abortion_status,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,land_area_sqmi,no_doctor_visits,mammogram,no_provider,...,teen_births,poverty_under_200,percent_of_all_us_abortions,percent_residents_traveling_outside_state,abortions_occurring_state,abortions_residence_state,no_services,few_services,restricted_services,full_service
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,Abortion Ban In Effect,17,165700,1462900,109.285826,4767100,50645,0.12,0.74,0.13,...,24.8,0.332,0.6,47,5700,9060,8,44,5,0
Alaska,Abortion Available,27,29200,207200,84.486218,701700,570641,0.12,0.63,0.23,...,17.7,0.285,0.1,7,1240,1320,3,7,0,4
Arizona,Status of pre-Roe ban unclear,23,309000,2132900,87.07292,7098000,113594,0.11,0.68,0.19,...,16.6,0.298,1.4,6,13320,13820,19,28,7,0
Arkansas,Abortion Ban In Effect,12,83800,865900,124.819154,2922500,52035,0.12,0.7,0.13,...,27.8,0.334,0.3,37,3250,4510,9,31,3,0
California,Abortion Available,175,1034300,11885100,32.123048,38642700,155779,0.09,0.68,0.19,...,11.0,0.272,16.6,0,154060,152400,51,97,0,124
Colorado,Abortion Available,19,204300,1758600,43.906723,5611800,103642,0.11,0.67,0.17,...,12.5,0.219,1.4,1,13420,11830,17,34,0,15
Connecticut,Abortion Available,16,58100,1038000,53.795577,3453300,4842,0.07,0.78,0.1,...,7.6,0.228,1.2,6,11170,11460,12,8,0,18
Delaware,Abortion Available,3,24800,295800,48.113934,940300,1949,0.09,0.74,0.12,...,14.6,0.292,0.2,44,1830,2870,10,0,0,2
District of Columbia,Abortion Available,8,7600,251600,78.882128,671300,61,0.06,0.74,0.15,...,15.6,0.271,1.0,45,9410,5010,2,0,0,2
Florida,"Abortion available, pre-viability gestational ...",47,1011800,6381600,67.248213,20992000,53625,0.14,0.74,0.2,...,15.2,0.325,8.3,1,77400,73830,22,129,0,48


In [43]:
# Inspect the column dtypes: object (text) columns must be encoded before scaling.
df.dtypes

abortion_status                               object
total_community_health_centers                 int64
uninsured                                      int64
total_insured                                  int64
maternal_mortality                           float64
population                                     int64
land_area_sqmi                                 int64
no_doctor_visits                             float64
mammogram                                    float64
no_provider                                  float64
pap_smear                                    float64
prescription_contraception                    object
otc_methods                                   object
male_sterilization                            object
female_sterilization                          object
cost_sharing                                  object
teen_births                                  float64
poverty_under_200                            float64
percent_of_all_us_abortions                  f

In [44]:
# One-hot encode the text columns with get_dummies() so every feature is numeric.
# All remaining columns of df are passed through unchanged.
text_columns = [
    'abortion_status',
    'prescription_contraception',
    'otc_methods',
    'male_sterilization',
    'female_sterilization',
    'cost_sharing',
]
X = pd.get_dummies(df, columns=text_columns)
X

Unnamed: 0_level_0,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,land_area_sqmi,no_doctor_visits,mammogram,no_provider,pap_smear,...,prescription_contraception_No,prescription_contraception_Yes,otc_methods_No,otc_methods_Yes,male_sterilization_No,male_sterilization_Yes,female_sterilization_No,female_sterilization_Yes,cost_sharing_No,cost_sharing_Yes
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,17,165700,1462900,109.285826,4767100,50645,0.12,0.74,0.13,0.74,...,1,0,1,0,1,0,1,0,1,0
Alaska,27,29200,207200,84.486218,701700,570641,0.12,0.63,0.23,0.65,...,1,0,1,0,1,0,1,0,1,0
Arizona,23,309000,2132900,87.07292,7098000,113594,0.11,0.68,0.19,0.69,...,0,1,1,0,1,0,1,0,1,0
Arkansas,12,83800,865900,124.819154,2922500,52035,0.12,0.7,0.13,0.68,...,0,1,1,0,1,0,1,0,1,0
California,175,1034300,11885100,32.123048,38642700,155779,0.09,0.68,0.19,0.74,...,0,1,0,1,1,0,0,1,0,1
Colorado,19,204300,1758600,43.906723,5611800,103642,0.11,0.67,0.17,0.69,...,0,1,1,0,1,0,1,0,1,0
Connecticut,16,58100,1038000,53.795577,3453300,4842,0.07,0.78,0.1,0.8,...,0,1,0,1,1,0,0,1,0,1
Delaware,3,24800,295800,48.113934,940300,1949,0.09,0.74,0.12,0.75,...,0,1,0,1,1,0,0,1,0,1
District of Columbia,8,7600,251600,78.882128,671300,61,0.06,0.74,0.15,0.8,...,0,1,0,1,1,0,0,1,0,1
Florida,47,1011800,6381600,67.248213,20992000,53625,0.14,0.74,0.2,0.73,...,1,0,1,0,1,0,1,0,1,0


In [45]:
# Standardize every feature with StandardScaler() so that columns measured on
# very different scales contribute comparably to the PCA that follows.
# Note: despite its name, df_scaled is a NumPy array, not a DataFrame.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(X)
print(df_scaled[:5])

[[-0.36337399 -0.11297454 -0.20915061  1.84709378 -0.20974035 -0.21973692
   0.87098834  0.45649092 -0.24045745  0.58938342  1.64884298  1.07643255
  -0.43671026  1.1718366  -0.43618519 -0.32971869 -0.2841427   0.14891548
   1.08797581 -0.53267653 -0.29172998 -1.0198039   2.1602469  -0.32969024
  -0.25       -0.20203051 -0.20203051  1.19522861 -1.19522861  0.58489765
  -0.58489765  0.43133109 -0.43133109  0.64549722 -0.64549722  0.70710678
  -0.70710678]
 [ 0.02677493 -0.53959359 -0.78173133  0.84158606 -0.78097787  5.92073157
   0.87098834 -2.1566867   2.1178752  -1.68395262  0.36982404  0.20679695
  -0.59810318 -0.54304623 -0.59133905 -0.6083059  -0.68225473 -0.84402982
  -0.45332325 -0.33723407 -0.29172998  0.98058068 -0.46291005 -0.32969024
  -0.25       -0.20203051 -0.20203051  1.19522861 -1.19522861  0.58489765
  -0.58489765  0.43133109 -0.43133109  0.64549722 -0.64549722  0.70710678
  -0.70710678]
 [-0.12928464  0.33489734  0.09635952  0.94646469  0.1177791   0.52360793
   0.467

## Reducing Data Dimensions Using PCA

In [46]:
# Reduce the standardized features to three principal components.
n_principal_components = 3

# Initialize the PCA model and project the scaled data onto its components.
pca = PCA(n_components=n_principal_components)
pcs = pca.fit_transform(df_scaled)

# Show the (rows, components) shape, then the projected coordinates.
print(pcs.shape)
pcs

(51, 3)


array([[-3.44160948,  1.22650263,  0.81285213],
       [-2.94478457,  0.67918514,  3.15515167],
       [-1.71354991,  1.25049228,  0.17048061],
       [-3.25221357,  0.37015088,  1.47372023],
       [10.48085819,  9.23439935, -1.18671755],
       [-0.31905815, -0.12129215, -0.65647553],
       [ 3.6354646 , -3.19474118, -0.86182275],
       [ 2.13597285, -3.43056052,  0.97137514],
       [ 2.32386133, -3.4516617 ,  0.93529853],
       [ 0.90052016,  6.48492161, -1.5305639 ],
       [-0.67268219,  3.14271601, -0.01468805],
       [-0.61327757, -2.57013823, -2.43662344],
       [-3.07977125, -0.58411928,  0.41790305],
       [ 5.76575847,  0.18471733,  1.25965957],
       [-2.33740013,  1.10301819, -1.14745662],
       [-1.16737794, -1.3048153 , -2.6864662 ],
       [-2.46250412,  0.15093752, -0.94398003],
       [-3.36289014,  0.95984907,  0.57361672],
       [-3.00130527,  0.75253531,  0.05819525],
       [ 1.82694717, -3.26904965, -1.081919  ],
       [ 3.61018901, -2.03119665,  0.196

In [47]:
# Wrap the principal components in a DataFrame that reuses df's index,
# so each row of components keeps the label of the row it was computed from.
pc_labels = ['PC 1', 'PC 2', 'PC 3']
pcs_df = pd.DataFrame(pcs, index=df.index, columns=pc_labels)
pcs_df

Unnamed: 0_level_0,PC 1,PC 2,PC 3
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,-3.441609,1.226503,0.812852
Alaska,-2.944785,0.679185,3.155152
Arizona,-1.71355,1.250492,0.170481
Arkansas,-3.252214,0.370151,1.47372
California,10.480858,9.234399,-1.186718
Colorado,-0.319058,-0.121292,-0.656476
Connecticut,3.635465,-3.194741,-0.861823
Delaware,2.135973,-3.430561,0.971375
District of Columbia,2.323861,-3.451662,0.935299
Florida,0.90052,6.484922,-1.530564


## Clustering Using K-Means

In [8]:
# Creating an elbow curve to find the best value for K.
# K-Means is fitted on the principal components for k = 1..10 and the inertia
# of each fit is recorded.
inertia = []
k = list(range(1,11))
for i in k:
    # n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10
    # to 'auto', and 1.2/1.3 emit a FutureWarning when it is left unset.
    # Pinning it keeps the curve reproducible across library versions.
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Plot inertia against k; the "elbow" is where adding clusters stops paying off.
elbow_data = {'k':k, 'inertia':inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x='k',y='inertia',title='Elbow Curve',xticks=k)

In [9]:
# Running K-Means with k=5
# Initialize the K-Means model.
# n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10 to
# 'auto', and 1.2/1.3 emit a FutureWarning when it is left unset. Pinning it
# keeps the cluster assignments reproducible across library versions.
model = KMeans(n_clusters=5, n_init=10, random_state=0)

# Fitting the model on the three principal components
model.fit(pcs_df)

# Predicting clusters for the same rows the model was fitted on
predictions = model.predict(pcs_df)

predictions

array([0, 0, 0, 0, 2, 1, 4, 4, 4, 3, 0, 1, 0, 4, 0, 1, 0, 0, 0, 4, 4, 4,
       1, 1, 0, 0, 0, 0, 4, 1, 4, 4, 2, 1, 0, 0, 0, 4, 1, 1, 0, 0, 0, 3,
       0, 4, 4, 4, 0, 1, 0], dtype=int32)

In [10]:
# Build a new DataFrame that places the principal components next to the original features.
# df and pcs_df are concatenated column-wise, keeping only the index labels present in both.
frames = [df, pcs_df]
clustered_df = pd.concat(objs=frames, join='inner', axis=1)
clustered_df.head(n=10)

Unnamed: 0_level_0,abortion_status,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,land_area_sqmi,no_doctor_visits,mammogram,no_provider,...,percent_residents_traveling_outside_state,abortions_occurring_state,abortions_residence_state,no_services,few_services,restricted_services,full_service,PC 1,PC 2,PC 3
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,Abortion Ban In Effect,17,165700,1462900,109.285826,4767100,50645,0.12,0.74,0.13,...,47,5700,9060,8,44,5,0,-3.441609,1.226503,0.812852
Alaska,Abortion Available,27,29200,207200,84.486218,701700,570641,0.12,0.63,0.23,...,7,1240,1320,3,7,0,4,-2.944785,0.679185,3.155152
Arizona,Status of pre-Roe ban unclear,23,309000,2132900,87.07292,7098000,113594,0.11,0.68,0.19,...,6,13320,13820,19,28,7,0,-1.71355,1.250492,0.170481
Arkansas,Abortion Ban In Effect,12,83800,865900,124.819154,2922500,52035,0.12,0.7,0.13,...,37,3250,4510,9,31,3,0,-3.252214,0.370151,1.47372
California,Abortion Available,175,1034300,11885100,32.123048,38642700,155779,0.09,0.68,0.19,...,0,154060,152400,51,97,0,124,10.480858,9.234399,-1.186718
Colorado,Abortion Available,19,204300,1758600,43.906723,5611800,103642,0.11,0.67,0.17,...,1,13420,11830,17,34,0,15,-0.319058,-0.121292,-0.656476
Connecticut,Abortion Available,16,58100,1038000,53.795577,3453300,4842,0.07,0.78,0.1,...,6,11170,11460,12,8,0,18,3.635465,-3.194741,-0.861823
Delaware,Abortion Available,3,24800,295800,48.113934,940300,1949,0.09,0.74,0.12,...,44,1830,2870,10,0,0,2,2.135973,-3.430561,0.971375
District of Columbia,Abortion Available,8,7600,251600,78.882128,671300,61,0.06,0.74,0.15,...,45,9410,5010,2,0,0,2,2.323861,-3.451662,0.935299
Florida,"Abortion available, pre-viability gestational ...",47,1011800,6381600,67.248213,20992000,53625,0.14,0.74,0.2,...,1,77400,73830,22,129,0,48,0.90052,6.484922,-1.530564


In [11]:
# Add a "Class" column to clustered_df holding the cluster label that the
# fitted K-Means model assigned to each row.
clustered_df = clustered_df.assign(Class=model.labels_)
clustered_df.head(n=10)

Unnamed: 0_level_0,abortion_status,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,land_area_sqmi,no_doctor_visits,mammogram,no_provider,...,abortions_occurring_state,abortions_residence_state,no_services,few_services,restricted_services,full_service,PC 1,PC 2,PC 3,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,Abortion Ban In Effect,17,165700,1462900,109.285826,4767100,50645,0.12,0.74,0.13,...,5700,9060,8,44,5,0,-3.441609,1.226503,0.812852,0
Alaska,Abortion Available,27,29200,207200,84.486218,701700,570641,0.12,0.63,0.23,...,1240,1320,3,7,0,4,-2.944785,0.679185,3.155152,0
Arizona,Status of pre-Roe ban unclear,23,309000,2132900,87.07292,7098000,113594,0.11,0.68,0.19,...,13320,13820,19,28,7,0,-1.71355,1.250492,0.170481,0
Arkansas,Abortion Ban In Effect,12,83800,865900,124.819154,2922500,52035,0.12,0.7,0.13,...,3250,4510,9,31,3,0,-3.252214,0.370151,1.47372,0
California,Abortion Available,175,1034300,11885100,32.123048,38642700,155779,0.09,0.68,0.19,...,154060,152400,51,97,0,124,10.480858,9.234399,-1.186718,2
Colorado,Abortion Available,19,204300,1758600,43.906723,5611800,103642,0.11,0.67,0.17,...,13420,11830,17,34,0,15,-0.319058,-0.121292,-0.656476,1
Connecticut,Abortion Available,16,58100,1038000,53.795577,3453300,4842,0.07,0.78,0.1,...,11170,11460,12,8,0,18,3.635465,-3.194741,-0.861823,4
Delaware,Abortion Available,3,24800,295800,48.113934,940300,1949,0.09,0.74,0.12,...,1830,2870,10,0,0,2,2.135973,-3.430561,0.971375,4
District of Columbia,Abortion Available,8,7600,251600,78.882128,671300,61,0.06,0.74,0.15,...,9410,5010,2,0,0,2,2.323861,-3.451662,0.935299,4
Florida,"Abortion available, pre-viability gestational ...",47,1011800,6381600,67.248213,20992000,53625,0.14,0.74,0.2,...,77400,73830,22,129,0,48,0.90052,6.484922,-1.530564,3


In [12]:
# Print the shape of clustered_df as (rows, columns), then preview its first 10 rows.
print(clustered_df.shape)
clustered_df.head(10)

(51, 30)


Unnamed: 0_level_0,abortion_status,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,land_area_sqmi,no_doctor_visits,mammogram,no_provider,...,abortions_occurring_state,abortions_residence_state,no_services,few_services,restricted_services,full_service,PC 1,PC 2,PC 3,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,Abortion Ban In Effect,17,165700,1462900,109.285826,4767100,50645,0.12,0.74,0.13,...,5700,9060,8,44,5,0,-3.441609,1.226503,0.812852,0
Alaska,Abortion Available,27,29200,207200,84.486218,701700,570641,0.12,0.63,0.23,...,1240,1320,3,7,0,4,-2.944785,0.679185,3.155152,0
Arizona,Status of pre-Roe ban unclear,23,309000,2132900,87.07292,7098000,113594,0.11,0.68,0.19,...,13320,13820,19,28,7,0,-1.71355,1.250492,0.170481,0
Arkansas,Abortion Ban In Effect,12,83800,865900,124.819154,2922500,52035,0.12,0.7,0.13,...,3250,4510,9,31,3,0,-3.252214,0.370151,1.47372,0
California,Abortion Available,175,1034300,11885100,32.123048,38642700,155779,0.09,0.68,0.19,...,154060,152400,51,97,0,124,10.480858,9.234399,-1.186718,2
Colorado,Abortion Available,19,204300,1758600,43.906723,5611800,103642,0.11,0.67,0.17,...,13420,11830,17,34,0,15,-0.319058,-0.121292,-0.656476,1
Connecticut,Abortion Available,16,58100,1038000,53.795577,3453300,4842,0.07,0.78,0.1,...,11170,11460,12,8,0,18,3.635465,-3.194741,-0.861823,4
Delaware,Abortion Available,3,24800,295800,48.113934,940300,1949,0.09,0.74,0.12,...,1830,2870,10,0,0,2,2.135973,-3.430561,0.971375,4
District of Columbia,Abortion Available,8,7600,251600,78.882128,671300,61,0.06,0.74,0.15,...,9410,5010,2,0,0,2,2.323861,-3.451662,0.935299,4
Florida,"Abortion available, pre-viability gestational ...",47,1011800,6381600,67.248213,20992000,53625,0.14,0.74,0.2,...,77400,73830,22,129,0,48,0.90052,6.484922,-1.530564,3


## Visualizing Results

In [28]:
# Creating a 3D-Scatter with the PCA data and the clusters.
# Plotly Express maps a numeric `color` column to a continuous colour scale
# (with a colorbar). The cluster labels are categories, so they are cast to
# strings on a plotting copy to get one discrete colour per cluster.
# clustered_df itself is left untouched and keeps its integer labels.
plot_df = clustered_df.assign(Class=clustered_df['Class'].astype(str))

# Show the legend in numeric cluster order rather than order of first appearance.
class_order = [str(label) for label in sorted(clustered_df['Class'].unique())]

fig = px.scatter_3d(
    plot_df,
    x='PC 1',
    y='PC 2',
    z='PC 3',
    color='Class',
    symbol='Class',
    category_orders={'Class': class_order},
    # Take hover names from the plotted frame so they always match its rows.
    hover_name=plot_df.index,
    hover_data=['abortion_status'],
    width=800
)
# Anchor the legend in the top-left corner of the figure.
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

NameError: name 'clustered_df' is not defined

In [14]:
# Saving clustered data
file_path = "Resources/MLM/clustered_data.csv"
# to_csv does not create missing folders, so make sure the output directory
# exists first (no-op when it is already there).
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
clustered_df.to_csv(file_path, index=True)