In [97]:
import csv
import glob

from typing import NamedTuple, DefaultDict, Dict, List, Tuple
from collections import defaultdict, Counter
from pprint import pprint

In [73]:
# Number of distinct senators expected in the loaded vote data; the notebook
# asserts this total after building the vote-history reverse mapping.
NUM_SENATORS = 100

In [53]:
class Senator(NamedTuple):
    """One senator, identified by the name, party and state CSV columns."""
    name: str
    party: str
    state: str


# A single vote encoded as an integer (see the vote_value mapping).
VoteValue = int
# All encoded votes recorded for one senator, frozen into a hashable tuple.
VoteHistory = Tuple[VoteValue, ...]

In [44]:
# Numeric encoding for each vote string read from the CSV files.
vote_value = {
    'Yea': 1,
    'Nay': -1,
    'Not Voting': 0,
}  # type: Dict[str, VoteValue]

In [42]:
# Per-senator running list of encoded votes; the loading loop appends one
# entry for each CSV row that names that senator.
accumulated_record = defaultdict(list) # type: DefaultDict[Senator, List[VoteValue]]

# Load votes

In [48]:
# Rebuild the tally from scratch so that re-running this cell does not append
# every vote a second time to the lists already held in accumulated_record.
accumulated_record.clear()

# glob.glob() returns matches in arbitrary order; sorting keeps the order of
# the entries in each senator's vote list the same across runs and machines.
for filename in sorted(glob.glob('congress_data/*.csv')):
    # newline='' is what the csv module asks for when it is given a file object.
    with open(filename, encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        vote_topic = next(reader)  # first row of the file; not used below
        headers = next(reader)     # second row of the file; not used below
        # Each remaining row must have exactly six columns; only state, vote,
        # name and party are used.
        for person, state, district, vote, name, party in reader:
            senator = Senator(name, party, state)
            # A vote string missing from vote_value raises KeyError here.
            accumulated_record[senator].append(vote_value[vote])

# Transform the accumulated record into a plain dict that maps each senator to a tuple of votes

In [54]:
# Freeze every senator's mutable vote list into a tuple so that a vote history
# is hashable and can later serve as a dict key / set member.
record = {
    member: tuple(ballots)
    for member, ballots in accumulated_record.items()
}  # type: Dict[Senator, VoteHistory]

# Use k-means to locate the cluster centroids, then assign each senator's vote history to the nearest centroid

In [56]:
from kmeans import k_means, assign_data

In [57]:
# Ask k_means for k=3 centroids computed from the senators' vote histories.
# NOTE(review): k_means comes from the local kmeans module, which is not shown
# here; if it picks its starting centroids at random, the clusters can change
# between runs unless a seed is fixed — confirm.
centroids = k_means(record.values(), k=3)

In [65]:
# Group the vote histories by centroid. The display loop further down calls
# .values() on this result and looks each contained item up in
# votes_to_senators, so it is used as a mapping whose values are collections
# of vote histories.
clustered_votes = assign_data(centroids, record.values())

# Build a reverse mapping from a vote history to a list of senators who voted that way

In [75]:
# Vote history -> every senator who has exactly that history. Several senators
# can share one history, hence a list per key.
votes_to_senators = defaultdict(list)  # type: DefaultDict[VoteHistory, List[Senator]]

In [91]:
# Start from an empty mapping so that re-running this cell does not append
# each senator a second time (which would break the NUM_SENATORS assertion).
votes_to_senators.clear()

# Invert record: file every senator under his or her vote history.
for senator, votehistory in record.items():
    votes_to_senators[votehistory].append(senator)

In [92]:
# Sanity check: the senator lists stored under all vote histories must add up
# to the expected head count.
assert sum(map(len, votes_to_senators.values())) == NUM_SENATORS

# Display the clusters and the members (senators) of each cluster

In [100]:
# Print each cluster's members followed by a per-party head count.
for cluster_number, histories in enumerate(clustered_votes.values(), start=1):
    print(f'----- Voting Cluster #{cluster_number} -----')
    # set() collapses identical vote histories so that senators who share a
    # history are looked up, and therefore printed, only once.
    members = [
        senator
        for history in set(histories)
        for senator in votes_to_senators[history]
    ]
    for senator in members:
        print(senator)
    party_totals = Counter(senator.party for senator in members)
    print(party_totals)

----- Voting Cluster #1 -----
Senator(name='Sen. Cory Gardner [R]', party='Republican', state='CO')
Senator(name='Sen. Timothy Kaine [D]', party='Democrat', state='VA')
Senator(name='Sen. Robert “Bob” Casey Jr. [D]', party='Democrat', state='PA')
Senator(name='Sen. Thomas Carper [D]', party='Democrat', state='DE')
Senator(name='Sen. Alan “Al” Franken [D]', party='Democrat', state='MN')
Senator(name='Sen. Mark Warner [D]', party='Democrat', state='VA')
Senator(name='Sen. Daniel Coats [R]', party='Republican', state='IN')
Senator(name='Sen. Mark Kirk [R]', party='Republican', state='IL')
Senator(name='Sen. Orrin Hatch [R]', party='Republican', state='UT')
Senator(name='Sen. Richard Burr [R]', party='Republican', state='NC')
Senator(name='Sen. John “Johnny” Isakson [R]', party='Republican', state='GA')
Senator(name='Sen. Richard Blumenthal [D]', party='Democrat', state='CT')
Senator(name='Sen. Angus King [I]', party='Independent', state='ME')
Senator(name='Sen. Bob Corker [R]', party='Rep