In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
import plotly.figure_factory as ff

In [5]:
# Read the county dataset, using the CSV's first column as the row index,
# then preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="county_data_mlseg2.csv",
    index_col=0,
)
df.head()

Unnamed: 0,county,state,lat,long,TotalPop,total_votes20,votes20_Donald_Trump,votes20_Joe_Biden,percentage20_Donald_Trump,percentage20_Joe_Biden,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,Abbeville,SC,34.223334,-82.461707,24788.0,12433.0,8215.0,4101.0,0.661,0.33,...,1.8,1.8,6.5,25.8,9505.0,78.8,13.3,7.8,0.1,9.4
1,Acadia,LA,30.295065,-92.414197,62607.0,28425.0,22596.0,5443.0,0.795,0.191,...,1.6,2.2,2.5,27.6,24982.0,80.0,12.1,7.6,0.3,8.9
2,Accomack,VA,37.767072,-75.632346,32840.0,16938.0,9172.0,7578.0,0.542,0.447,...,2.6,1.8,4.5,22.0,13837.0,74.6,18.1,7.1,0.2,5.4
3,Ada,ID,43.452658,-116.241552,435117.0,259389.0,130699.0,120539.0,0.504,0.465,...,1.5,2.8,6.9,20.4,214984.0,78.3,15.0,6.6,0.1,4.3
4,Adair,IA,41.330756,-94.471059,7192.0,4183.0,2917.0,1197.0,0.697,0.286,...,2.8,0.4,6.2,22.3,3680.0,73.8,15.3,10.4,0.5,3.0


In [8]:
# Remove the county/state identifiers and the latitude/longitude columns,
# then list the columns that are left.
model_df = df.drop(columns=["county", "state", "lat", "long"])
model_df.columns

Index(['TotalPop', 'total_votes20', 'votes20_Donald_Trump',
       'votes20_Joe_Biden', 'percentage20_Donald_Trump',
       'percentage20_Joe_Biden', 'cases', 'deaths', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'Income',
       'IncomePerCap', 'Poverty', 'Professional', 'Service', 'Office',
       'Construction', 'Production', 'Drive', 'Carpool', 'Transit', 'Walk',
       'OtherTransp', 'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork',
       'PublicWork', 'SelfEmployed', 'FamilyWork', 'Unemployment'],
      dtype='object')

In [18]:
# Keep only the case count plus the income, poverty and employment columns.
econ_df = df.loc[
    :,
    ["cases", "Income", "IncomePerCap", "Poverty", "Employed", "Unemployment"],
]
econ_df.head()

Unnamed: 0,cases,Income,IncomePerCap,Poverty,Employed,Unemployment
0,805.0,35254.0,19234.0,22.7,9505.0,9.4
1,3182.0,40492.0,21591.0,21.5,24982.0,8.9
2,1227.0,42260.0,24266.0,19.8,13837.0,5.4
3,17451.0,60151.0,31642.0,11.8,214984.0,4.3
4,222.0,49477.0,28861.0,9.5,3680.0,3.0


In [19]:
# Standardise each selected column with a freshly fitted StandardScaler and
# print the first five scaled rows.
df_scaled = StandardScaler().fit(econ_df).transform(econ_df)
print(df_scaled[:5])

[[-0.19385961 -1.10658899 -1.09008882  1.02520046 -0.24181958  1.03164915]
 [ 0.02598209 -0.69958581 -0.70506226  0.84141598 -0.14440173  0.86415151]
 [-0.15483008 -0.56220863 -0.26808896  0.58105463 -0.2145524  -0.30833193]
 [ 1.34567975  0.82795821  0.93681385 -0.64417527  1.05153978 -0.67682673]
 [-0.24777956 -0.00143315  0.48252498 -0.99642887 -0.27848424 -1.11232058]]


In [20]:
# PCA configured to keep two components.
pca = PCA(
    n_components=2,
)

In [21]:
# Fit the PCA on the scaled array and project it in one step;
# show the first five projected rows.
pca_df = pca.fit_transform(X=df_scaled)
pca_df[0:5]

array([[-2.1095937 ,  0.3797518 ],
       [-1.48454207,  0.44965227],
       [-0.67641125, -0.14022215],
       [ 1.98466264,  1.124142  ],
       [ 1.03436683, -0.91813689]])

In [27]:
# Wrap the PCA output array in a DataFrame with one named column per component.
df_econ_pca = pd.DataFrame(pca_df, columns=["principal component 1", "principal component 2"])
df_econ_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.109594,0.379752
1,-1.484542,0.449652
2,-0.676411,-0.140222
3,1.984663,1.124142
4,1.034367,-0.918137


In [28]:
# Create the dendrogram from the two principal-component columns.
# `df_econ_pca` is the PCA frame built above; the previously referenced
# `df_county_pca` is not defined anywhere in this notebook and fails with a
# NameError on a fresh kernel.
# The component columns are selected explicitly so the input stays the same
# if this cell is re-run after the "class" label column has been added.
fig = ff.create_dendrogram(
    df_econ_pca[["principal component 1", "principal component 2"]],
    color_threshold=0,
)
fig.update_layout(width=800, height=500)
fig.show()

In [33]:
# Agglomerative clustering into 10 clusters, fitted on the two
# principal-component columns of `df_econ_pca` (the frame built above;
# `df_county_pca` is not defined in this notebook).
# Selecting the component columns explicitly keeps the "class" label column
# out of the features when this cell is re-run after labels were attached.
agg = AgglomerativeClustering(n_clusters=10)
model = agg.fit(df_econ_pca[["principal component 1", "principal component 2"]])

In [34]:
# Store the fitted model's cluster label for each row in a "class" column
# and preview the result.
df_econ_pca.loc[:, "class"] = model.labels_
df_econ_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.109594,0.379752,5
1,-1.484542,0.449652,5
2,-0.676411,-0.140222,1
3,1.984663,1.124142,3
4,1.034367,-0.918137,4


In [35]:
# Scatter plot of the two principal components, grouped by cluster label,
# with the label also shown on hover.
df_econ_pca.hvplot.scatter(
    by="class",
    hover_cols=["class"],
    x="principal component 1",
    y="principal component 2",
)