# Similarity measure - MPs' names

Use a Dice-coefficient n-gram snippet from Stack Overflow to measure how similar MPs' names are to one another.


In [9]:
# Imports

import sys, operator
import pandas as pd

In [10]:
# Small hand-made sample of labels used to try out the similarity functions

sample = [
    "NBA Basketball",
    "Basketball NBA",
    "Basketball",
    "Baseball",
]

sample

['NBA Basketball', 'Basketball NBA', 'Basketball', 'Baseball']

In [11]:
# Functions

def tokenize(s, glen):
    """Return the set of distinct substrings of ``s`` that are ``glen`` characters long.

    Every contiguous window of ``glen`` characters is collected; repeated
    windows collapse because the result is a set. When ``s`` is shorter than
    ``glen`` there are no windows and the result is an empty set.
    """
    window_count = len(s) - glen + 1
    return {s[start:start + glen] for start in range(window_count)}

def dice_grams(g1, g2):
    """Return the Dice coefficient of two sets: ``2 * |g1 & g2| / (|g1| + |g2|)``.

    Parameters
    ----------
    g1, g2 : set
        The two sets to compare.

    Returns
    -------
    float
        A value between 0.0 (no shared members) and 1.0 (identical sets).
        When both sets are empty the ratio is undefined, so 0.0 is returned
        instead of raising ``ZeroDivisionError``.
    """
    total = len(g1) + len(g2)
    if total == 0:
        # Nothing to compare on either side: report "no similarity" rather than divide by zero
        return 0.0
    return (2.0 * len(g1 & g2)) / total

def dice(n, s1, s2):
    """Score the similarity of two strings from their ``n``-character substrings.

    Each string is split with ``tokenize`` using length ``n`` and the two
    resulting sets are handed to ``dice_grams``, whose result is returned.
    """
    first_grams = tokenize(s1, n)
    second_grams = tokenize(s2, n)
    return dice_grams(first_grams, second_grams)

def term_similarity(sample, gram_len=4):
    """Score every unordered pair of strings in ``sample`` for similarity.

    Parameters
    ----------
    sample : sequence of str
        Strings to compare; must support ``len()`` and integer indexing.
    gram_len : int, optional
        Substring length passed to ``dice`` for each comparison. Defaults to
        4, the value that used to be hard-coded inside this function.

    Returns
    -------
    pandas.DataFrame
        One row per pair ``(sample[i], sample[j])`` with ``i < j``, in the
        columns ``Label1``, ``Label2`` and ``Score``.
    """
    count = len(sample)
    # Only i < j is visited, so no string is scored against itself and no pair appears twice
    scores = [
        [sample[i], sample[j], dice(gram_len, sample[i], sample[j])]
        for i in range(count)
        for j in range(i + 1, count)
    ]
    return pd.DataFrame(scores, columns=["Label1", "Label2", "Score"])

                    
# Score every pair of labels in the test sample and display the resulting table
sim_scores = term_similarity(sample)
sim_scores

Unnamed: 0,Label1,Label2,Score
0,NBA Basketball,Basketball NBA,0.636364
1,NBA Basketball,Basketball,0.777778
2,NBA Basketball,Baseball,0.125
3,Basketball NBA,Basketball,0.777778
4,Basketball NBA,Baseball,0.125
5,Basketball,Baseball,0.166667


In [12]:
# Run on MPs names

# Read in the lookup
import csv

# NOTE: machine-specific absolute path - point this at your local copy of mps.csv
MPS_CSV_PATH = 'c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\mps.csv'

# newline='' leaves newline handling to the csv module, as the csv documentation recommends
with open(MPS_CSV_PATH, 'r', newline='') as f:
    reader = csv.reader(f)
    mp_list = list(reader)


# Split imported lookup into MP name and constituency

# Skip the first row (presumably the header) and any blank lines,
# which csv.reader yields as empty lists and which would otherwise raise IndexError
mp_rows = [row for row in mp_list[1:] if row]

# Columns 1 and 2 are joined with a space to form the full name
mp_names = [row[1] + ' ' + row[2] for row in mp_rows]
# Column 4 is kept as the constituency
mp_consts = [row[4] for row in mp_rows]

mp_names

['Diane Abbott',
 'Debbie Abrahams',
 'Nigel Adams',
 'Adam Afriyie',
 'Tasmina Ahmed-Sheikh',
 'Peter Aldous',
 'Heidi Alexander',
 'Rushanara Ali',
 'Lucy Allan',
 'Graham Allen',
 'Heidi Allen',
 'Rosena Allin-Khan',
 'David Amess',
 'David Anderson',
 'Stuart Andrew',
 'Caroline Ansell',
 'Edward Argar',
 'Richard Arkless',
 'Jon Ashworth',
 'Victoria Atkins',
 'Ian Austin',
 'Richard Bacon',
 'Adrian Bailey',
 'Steven Baker',
 'Harriett Baldwin',
 'Stephen Barclay',
 'Hannah Bardell',
 'John Baron',
 'Kevin Barron',
 'Gavin Barwell',
 'Guto Bebb',
 'Margaret Beckett',
 'Henry Bellingham',
 'Hilary Benn',
 'Richard Benyon',
 'John Bercow',
 'Paul Beresford',
 'Luciana Berger',
 'Jake Berry',
 'James Berry',
 'Clive Betts',
 'Andrew Bingham',
 'Mhairi Black',
 'Ian Blackford',
 'Bob Blackman',
 'Kirsty Blackman',
 'Roberta Blackman-Woods',
 'Nicola Blackwood',
 'Tom Blenkinsop',
 'Paul Blomfield',
 'Crispin Blunt',
 'Nicholas Boles',
 'Peter Bone',
 'Victoria Borwick',
 'Phil Boswel

In [13]:
# Run similarity measure on MPs' names
# Every pair of names is scored once, so n names produce n*(n-1)/2 rows

sim_scores = term_similarity(mp_names)
sim_scores

Unnamed: 0,Label1,Label2,Score
0,Diane Abbott,Debbie Abrahams,0.095238
1,Diane Abbott,Nigel Adams,0.000000
2,Diane Abbott,Adam Afriyie,0.000000
3,Diane Abbott,Tasmina Ahmed-Sheikh,0.000000
4,Diane Abbott,Peter Aldous,0.000000
5,Diane Abbott,Heidi Alexander,0.000000
6,Diane Abbott,Rushanara Ali,0.000000
7,Diane Abbott,Lucy Allan,0.000000
8,Diane Abbott,Graham Allen,0.000000
9,Diane Abbott,Heidi Allen,0.000000


In [14]:
# Keep only the pairs scoring above 0.5 and list the most similar first
sim_scores[sim_scores['Score'].gt(0.5)].sort_values(by='Score', ascending=False)


Unnamed: 0,Label1,Label2,Score
80995,David Davies,David Davis,0.800000
194523,Owen Paterson,Steven Paterson,0.727273
45825,Richard Burden,Richard Burgon,0.727273
180386,Stewart McDonald,Stuart McDonald,0.720000
77406,Alex Cunningham,Jim Cunningham,0.695652
186840,Carol Monaghan,Paul Monaghan,0.666667
67965,Julie Cooper,Rosie Cooper,0.666667
169521,Ivan Lewis,Julian Lewis,0.625000
155665,Alan Johnson,Jo Johnson,0.625000
41786,Alan Brown,Lyn Brown,0.615385


In [26]:
# Look up the score for one specific pair of names.
# Both conditions are combined into a single mask so the filter is applied in one
# step, rather than chaining two .loc calls where the second mask is built from the
# unfiltered frame and must be re-aligned to the already-filtered rows.
sim_scores.loc[(sim_scores['Label1'] == 'Michael Fallon') & (sim_scores['Label2'] == 'Tim Farron')]

Unnamed: 0,Label1,Label2,Score
106992,Michael Fallon,Tim Farron,0.0
