# 2.8 Tutorial: Computing Similarity

In [1]:
import itertools
import operator
import sys

In [2]:
# Squares of 0..4, built in a single comprehension.
s = [k ** 2 for k in range(5)]
s

[0, 1, 4, 9, 16]

In [3]:
# Upper bound used by the pair-enumeration cells that follow
# (n is reassigned later in the notebook).
n = 5

In [4]:
# Enumerate every index pair (i, j) with 0 <= i < j <= n using explicit loops.
# Elements are added in place; rebuilding the set with union() on every
# iteration would copy the whole set each time.
pairs = set()
for i in range(n):
    for j in range(i + 1, n + 1):
        pairs.add((i, j))
pairs

{(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (2, 3),
 (2, 4),
 (2, 5),
 (3, 4),
 (3, 5),
 (4, 5)}

In [5]:
# Same pairs (i, j), 0 <= i < j <= n, expressed as a single set comprehension.
pairs = {
    (low, high)
    for low in range(n)
    for high in range(low + 1, n + 1)
}
pairs

{(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (2, 3),
 (2, 4),
 (2, 5),
 (3, 4),
 (3, 5),
 (4, 5)}

In [6]:
# Build nameDict: first '|'-separated field of each line in cm.txt -> second
# field, keeping only rows whose second field is non-empty.
# Presumably field 0 is the committee ID and field 1 its name -- confirm
# against the FEC committee-master file layout.
path = 'cm.txt'
nameDict = {}
with open(path) as f:
    for line in f:
        data = line.split("|")
        # Skip blank or malformed lines that have no second field instead of
        # failing with an IndexError.
        if len(data) > 1 and data[1] != '':
            nameDict[data[0]] = data[1]

In [7]:
# Collect the keys of nameDict as a set and report how many there are.
committees = set(nameDict)
print('Number of committees = ', len(nameDict))

Number of committees =  14905


In [8]:
# Peek at five committee IDs; `committees` is a set, so which five appear
# (and in what order) is arbitrary and may differ between runs.
list(committees)[:5]

['C00526707', 'C00116145', 'C00504670', 'C00534974', 'C00517920']

Download the data set from the link below and extract it; the next cell reads `itoth.txt` from the working directory.

https://www.fec.gov/files/bulk-downloads/2014/oth14.zip

In [9]:
# Build contributorDict: field 0 of each '|'-separated line in itoth.txt ->
# set of distinct field-7 values seen with it.
# NOTE(review): later cells look the keys up in nameDict and treat each value
# set as the "contributors" to that key, so `contributor` here appears to hold
# a committee ID rather than a contributor -- confirm against the FEC layout.
path = 'itoth.txt'
contributorDict = {}
with open(path) as f:
    for line in f:
        data = line.split("|")
        if len(data) <= 7:
            # Blank or truncated line: there is no field 7 to record.
            continue
        contributor = data[0]
        # Add in place; re-creating the set with union() for every row copies
        # it each time.
        contributorDict.setdefault(contributor, set()).add(data[7])

In [10]:
# Count the keys and report how many unordered pairs they form (n choose 2).
n = len(contributorDict)
nPairs = n * (n - 1) / 2  # true division: the count prints as a float
print('N pairs = ', nPairs)

N pairs =  24147775.0


In [11]:
# Keep only the keys whose set has more than 500 members. The keys to drop are
# collected first so the dict is not modified while it is being iterated.
smallKeys = [key for key, members in contributorDict.items() if len(members) <= 500]
for key in smallKeys:
    del contributorDict[key]

# Recompute the number of unordered pairs among the keys that remain.
n = len(contributorDict)
print('N pairs = ', n * (n - 1) / 2)

N pairs =  1596.0


In [None]:
# List each remaining key's name together with the size of its set.
for key, members in contributorDict.items():
    print(nameDict[key], len(members))

In [12]:
# Freeze the remaining keys into a list so they can be paired up.
contributors = list(contributorDict)

In [13]:
# Every unordered pair of entries from `contributors`, in the same order the
# nested index loops would produce. Deriving the pairs from the list itself
# removes the dependence on the notebook-global `n`, which is reused for
# different purposes in earlier cells and can be stale.
pairs = list(itertools.combinations(contributors, 2))

In [14]:
# For every pair of keys, compare their sets and record three measures under
# the pair's names: Jaccard similarity, |A & B| / |B| and |A & B| / |A|.
# "/" is true division here, so this relies on Python 3.
simDict = {}
for commA, commB in pairs:
    setA, labelA = contributorDict[commA], nameDict[commA]
    setB, labelB = contributorDict[commB], nameDict[commB]
    shared = len(setA & setB)
    simDict[(labelA, labelB)] = (
        shared / len(setA | setB),  # Jaccard index
        shared / len(setB),         # share of B's members that are also in A
        shared / len(setA),         # share of A's members that are also in B
    )

In [15]:
# Order the (name pair, measures) items from most to least similar; the
# measures tuples compare element-wise, so the Jaccard value is the primary key.
sortedList = sorted(simDict.items(), key=lambda item: item[1], reverse=True)

In [16]:
# Print every pair whose Jaccard similarity exceeds 0.5, most similar first.
# The loop variable is named `namePair` so that it does not overwrite the
# notebook-level `committees` set created earlier.
for namePair, simMeasures in sortedList:
    nameA, nameB = namePair
    jAB, pAB, pBA = simMeasures
    if jAB > .5:
        print(round(jAB, 3), round(pAB, 3), round(pBA, 3), nameA + ' | ' + nameB)

0.596 0.809 0.693 COMCAST CORPORATION & NBCUNIVERSAL POLITICAL ACTION COMMITTEE - FEDERAL | VERIZON COMMUNICATIONS INC. GOOD GOVERNMENT CLUB (VERIZON PAC)
0.582 0.713 0.76 GENERAL ELECTRIC COMPANY POLITICAL ACTION COMMITTEE (GEPAC) | COMCAST CORPORATION & NBCUNIVERSAL POLITICAL ACTION COMMITTEE - FEDERAL
0.57 0.651 0.821 NATIONAL BEER WHOLESALERS ASSOCIATION POLITICAL ACTION COMMITTEE | NATIONAL ASSOCIATION OF REALTORS POLITICAL ACTION COMMITTEE
0.558 0.788 0.656 AT&T INC. FEDERAL POLITICAL ACTION COMMITTEE (AT&T FEDERAL PAC) | VERIZON COMMUNICATIONS INC. GOOD GOVERNMENT CLUB (VERIZON PAC)
0.558 0.75 0.685 GENERAL ELECTRIC COMPANY POLITICAL ACTION COMMITTEE (GEPAC) | VERIZON COMMUNICATIONS INC. GOOD GOVERNMENT CLUB (VERIZON PAC)
0.537 0.659 0.743 LOCKHEED MARTIN CORPORATION EMPLOYEES' POLITICAL ACTION COMMITTEE | AT&T INC. FEDERAL POLITICAL ACTION COMMITTEE (AT&T FEDERAL PAC)
0.531 0.639 0.759 EMPLOYEES OF NORTHROP GRUMMAN CORPORATION PAC | GENERAL ELECTRIC COMPANY POLITICAL ACTION COM