In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Clustering senators

In [55]:
def kmeans(X,k,max_iterations=1000, return_inertia=False):
    from sklearn.metrics import pairwise_distances
    # initialize iteration counter
    it = 0
    repeat = True
    
    # number of datapoints
    m = X.shape[0] 
    
    # initialization
    means = X[np.random.choice(m,k,replace=False)] 
    dist = pairwise_distances(X,means)
    clusters = np.argmin(dist, axis=1)
 
    
    while repeat and it<max_iterations:
        
        # update means; # if a cluster has no data points associated with it, replace it with a random data point
        means = np.array([np.mean(X[clusters==i], axis=0) 
                          if np.sum(clusters==i)!=0
                          else  X[np.random.randint(m)]
                          for i in range(k)])
        
        # update clusters
        dist = pairwise_distances(X,means)
        new_clusters = np.argmin(dist, axis=1)
        
        # check if the new clusters are equal to the previous clusters
        if np.sum(clusters!=new_clusters)==0: 
            repeat = False
        clusters = new_clusters
                
        it += 1 # increment iteration counter by 1
    
    
    results = {'clusters' : clusters, 'means' : means}
    # compute inertia
    if return_inertia:
        results['inertia'] = np.sum([np.sum((X[clusters==i]-means[i])**2) for i in range(k)])/m
    
    return results

I use the voting history from the 114th Congress to split Senators into clusters.

In [56]:
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/114_congress.csv'
data = pd.read_csv(url)
data.head()

Unnamed: 0,name,party,state,00001,00004,00005,00006,00007,00008,00009,00010,00020,00026,00032,00038,00039,00044,00047
0,Alexander,R,TN,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Ayotte,R,NH,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,Baldwin,D,WI,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
3,Barrasso,R,WY,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,Bennet,D,CO,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


The data contains all the votes from the 114th Senate. Each row contains the votes of an individual senator. Votes are coded as 0 for “No”, 1 for “Yes”, and 0.5 for “Abstain”.

In [57]:
# feature matrix X
X = data.drop(['name','party','state'],axis=1).to_numpy()
# vector with party affiliation
party = data['party'].to_numpy()
# senators names
names = data['name'].to_numpy()

**Part 1:** Use kmeans to split the senators into two clusters. 
Do the two clusters correspond to the two parties?

In [58]:
k=2
results = kmeans(X,k,return_inertia=False)

clusters = results['clusters']
means = results['means']


In [59]:
Dems=party[clusters==1]
Dem_name=names[clusters==1]
Dems


array(['R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R'], dtype=object)

In [60]:
reps=party[clusters==0]
rep_name=names[clusters==0]
reps

array(['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'I', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'I', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D'], dtype=object)

**Part 2:** Find the senators that are in the cluster associated with the opposite party

In [61]:
rep_name[np.where(reps!='R')]

array(['Baldwin', 'Bennet', 'Blumenthal', 'Booker', 'Boxer', 'Brown',
       'Cantwell', 'Cardin', 'Carper', 'Casey', 'Coons', 'Donnelly',
       'Durbin', 'Feinstein', 'Franken', 'Gillibrand', 'Heinrich',
       'Hirono', 'Kaine', 'King', 'Klobuchar', 'Leahy', 'Markey',
       'McCaskill', 'Menendez', 'Merkley', 'Mikulski', 'Murphy', 'Murray',
       'Nelson', 'Peters', 'Reed', 'Reid', 'Sanders', 'Schatz', 'Schumer',
       'Shaheen', 'Stabenow', 'Tester', 'Udall', 'Warner', 'Warren',
       'Whitehouse', 'Wyden'], dtype=object)

In [62]:
Dem_name[np.where(Dems!='D')]

array(['Alexander', 'Ayotte', 'Barrasso', 'Blunt', 'Boozman', 'Burr',
       'Capito', 'Cassidy', 'Coats', 'Cochran', 'Collins', 'Corker',
       'Cornyn', 'Cotton', 'Crapo', 'Cruz', 'Daines', 'Enzi', 'Ernst',
       'Fischer', 'Flake', 'Gardner', 'Graham', 'Grassley', 'Hatch',
       'Heller', 'Hoeven', 'Inhofe', 'Isakson', 'Johnson', 'Kirk',
       'Lankford', 'Lee', 'McCain', 'McConnell', 'Moran', 'Murkowski',
       'Paul', 'Perdue', 'Portman', 'Risch', 'Roberts', 'Rounds', 'Rubio',
       'Sasse', 'Scott', 'Sessions', 'Shelby', 'Sullivan', 'Thune',
       'Tillis', 'Toomey', 'Vitter', 'Wicker'], dtype=object)

**Part 3:** More clusters could show wings of each party, or cross-party groups. Use kmeans to split the senators into 5 clusters, and analyze the two democratic factions

In [63]:
k=5
results = kmeans(X,k,return_inertia=False)

clusters = results['clusters']
means = results['means']

In [64]:
clusters

array([2, 0, 3, 1, 0, 3, 1, 3, 1, 3, 3, 1, 3, 1, 3, 0, 0, 1, 1, 1, 0, 3,
       1, 1, 1, 1, 2, 1, 0, 3, 1, 1, 3, 1, 1, 3, 2, 3, 1, 1, 1, 0, 0, 2,
       3, 1, 1, 1, 1, 3, 0, 2, 0, 2, 3, 2, 2, 3, 1, 0, 1, 3, 3, 3, 1, 1,
       3, 3, 3, 2, 1, 0, 1, 3, 4, 1, 1, 1, 2, 3, 2, 3, 3, 1, 1, 3, 1, 0,
       1, 0, 1, 1, 1, 3, 1, 3, 3, 3, 1, 3], dtype=int64)

In [65]:
range(k-1)

range(0, 4)

In [82]:
for i in range (k):
    faction=party[clusters==i]
    name=names[clusters==i]
    factions=np.hstack((faction,name,i))
    print ("faction " + str(i+1) + ' is ')
    print (factions)

faction 1 is 
['R' 'D' 'D' 'D' 'R' 'D' 'D' 'D' 'I' 'D' 'D' 'D' 'D' 'D' 'Ayotte' 'Bennet'
 'Carper' 'Casey' 'Collins' 'Donnelly' 'Heinrich' 'Heitkamp' 'King'
 'Klobuchar' 'McCaskill' 'Peters' 'Stabenow' 'Tester' 0]
faction 2 is 
['R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'Barrasso' 'Blunt' 'Boozman' 'Burr' 'Capito'
 'Cassidy' 'Coats' 'Cochran' 'Corker' 'Cornyn' 'Cotton' 'Crapo' 'Daines'
 'Enzi' 'Ernst' 'Fischer' 'Flake' 'Graham' 'Grassley' 'Hatch' 'Hoeven'
 'Inhofe' 'Isakson' 'Johnson' 'McCain' 'McConnell' 'Moran' 'Murkowski'
 'Perdue' 'Portman' 'Risch' 'Roberts' 'Rounds' 'Scott' 'Sessions' 'Shelby'
 'Sullivan' 'Thune' 'Tillis' 'Toomey' 'Vitter' 'Wicker' 1]
faction 3 is 
['R' 'R' 'R' 'R' 'R' 'R' 'R' 'D' 'R' 'R' 'R' 'Alexander' 'Cruz' 'Gardner'
 'Heller' 'Kirk' 'Lankford' 'Lee' 'Manchin' 'Paul' 'Rubio' 'Sasse' 2]
faction 4 is 
['D' 'D' 'D' 'D' 'D' 'D' 'D' 'D' 'D

In [77]:
factions

array(['D', 'Reid', 4], dtype=object)