# Adjacencies between entities

Calculate the adjacency (co-occurrence) matrix for entities in the Hansard example (originally 4 documents, extended here to 14)

In [1]:
# Imports

# Stanford entity extraction full run - code taken from Four_entity_process_stanford_NER_v1_0.ipynb

# Stanford imports
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer


# Other imports
import os
import nltk
from collections import defaultdict


# NLTK imports
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

import tkinter

import pandas as pd
import numpy as np

from collections import Counter
import operator

import math

In [4]:



### Define paths ###

# Root folder of the Stanford CoreNLP distribution
main_path = os.path.join("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\StanfordNLP\\",
                         "stanford-corenlp-full-2016-10-31\\")

# Locations (relative to main_path) of the Stanford NLP .jar files
# (the parser jars "parser\\stanford-parser.jar" and
#  "parser\\stanford-parser-3.6.0-models.jar" are deliberately left out)
jar_locations = ("stanford-corenlp-3.7.0",
                 "ner\\stanford-ner.jar",
                 "postagger\\stanford-postagger.jar")
pathlist = [os.path.join(main_path, jar) for jar in jar_locations]

# Locations (relative to main_path) of the Stanford models
model_locations = ("postagger\\models", "ner\\classifiers")
mpath = [os.path.join(main_path, folder) for folder in model_locations]

# Path to java.exe
javapath = "C:\\Program Files\\Java\\jre1.8.0_121\\bin\\java.exe"

# Publish the paths through environment variables (as instructed by NLTK)
os.environ.update({
    'CLASSPATH': os.pathsep.join(pathlist),
    'STANFORD_MODELS': os.pathsep.join(mpath),
    'JAVAHOME': javapath,
})



# Define function to tag NER sentence with BIO tags
def stanfordNE2BIO(tagged_sent):
    """Convert (token, tag) pairs into BIO-tagged pairs.

    Tokens tagged "O" keep that tag. Any other tag becomes "I-<tag>" when the
    previous token carried the same tag, and "B-<tag>" otherwise (start of a
    sentence, after an "O" token, or directly after a differently tagged entity).

    Parameters:
        tagged_sent: iterable of (token, tag) pairs.

    Returns:
        A list of (token, bio_tag) pairs in the same order.
    """
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O":
            bio_tag = tag                # outside any entity
        elif tag == prev_tag:
            bio_tag = "I-" + tag         # continues the current entity
        else:
            bio_tag = "B-" + tag         # new entity (after "O" or a different tag)
        bio_tagged_sent.append((token, bio_tag))
        prev_tag = tag
    return bio_tagged_sent



### Main function ###

def stanford_ner_full_run(file_in, file_out_suffix):
    """Extract named entities from a text file with the Stanford NER tagger.

    The file is tokenized with StanfordTokenizer, tagged with the
    'english.conll.4class.distsim.crf.ser.gz' classifier, converted to BIO
    tags, and adjacent tokens of one entity are joined into a single string.

    Parameters:
        file_in: path of the UTF-8 text file to process.
        file_out_suffix: accepted for call compatibility; not used here.

    Returns:
        A list of (entity_string, entity_label) tuples in document order,
        with duplicates retained. Empty if tagging produced no tokens.
    """
    # Read in file (the context manager closes the handle promptly)
    with open(file_in, 'r', encoding='utf8') as infile:
        sample = infile.read()

    # Tokenize sentence with stanford NLP
    tkn_sent = StanfordTokenizer().tokenize(sample)

    # Named entity tagging with stanford NLP
    # (alternative classifier: 'english.muc.7class.distsim.crf.ser.gz')
    tag_sent = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz').tag(tkn_sent)

    # Apply BIO tags to the tagged sentence
    bio_tagged_sent = stanfordNE2BIO(tag_sent)

    # Nothing was tagged: unpacking zip(*[]) below would raise, so stop here
    if not bio_tagged_sent:
        return []

    # Collate BIO parts of entities together
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = list(zip(sent_tokens, sent_pos_tags, sent_ne_tags))
    ne_tree = conlltags2tree(sent_conlltags)

    # Get entities from the tree: only the Tree children are entity chunks (NE != "O")
    ne_in_sent = []
    for subtree in ne_tree:
        if isinstance(subtree, Tree):
            ne_label = subtree.label()
            ne_string = " ".join(token for token, pos in subtree.leaves())
            ne_in_sent.append((ne_string, ne_label))

    # Output entities (with duplicates retained)
    return ne_in_sent



In [3]:
# Run for the 14 files separately

# Folder holding the Hansard debate transcripts
hansard_dir = 'c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\'

ents_mn = stanford_ner_full_run(hansard_dir + 'Motor Neurone Disease-Gordon Aikman 2017-02-20.txt', 'motor_neurone')
ents_tr = stanford_ner_full_run(hansard_dir + 'President Trump- State Visit 2017-02-20.txt', 'trump')
ents_hi = stanford_ner_full_run(hansard_dir + 'High Speed Rail (London - West Midlands) Bill 2017-02-20.txt', 'highspeedrail')
ents_va = stanford_ner_full_run(hansard_dir + 'Vauxhall-Opel- Proposed Takeover 2017-02-20.txt', 'vauxhallopel')

# New ones
ents_af = stanford_ner_full_run(hansard_dir + 'Armed Forces- Historical Cases 2017-02-23.txt', 'armedforces')
ents_ag = stanford_ner_full_run(hansard_dir + 'Aster Group Housing Association 2017-02-22.txt', 'astergroup')
ents_hs2 = stanford_ner_full_run(hansard_dir + 'High Speed 2 (Newton) 2017-02-23.txt', 'hs2')
ents_hiv = stanford_ner_full_run(hansard_dir + 'HIV Awareness- PSHE Lessons 2017-02-24.txt', 'hiv')
ents_jam = stanford_ner_full_run(hansard_dir + 'Jamal al-Harith 2017-02-23.txt', 'jamalalharith')
ents_lv = stanford_ner_full_run(hansard_dir + 'Lee Valley Regional Park (Amendment) 2017-02-22.txt', 'leevalley')
ents_lgf = stanford_ner_full_run(hansard_dir + 'Local Government Finance 2017-02-22.txt', 'localgovfin')
ents_pg = stanford_ner_full_run(hansard_dir + 'Police Grant 2017-02-22.txt', 'policegrant')
ents_ssp = stanford_ner_full_run(hansard_dir + 'Social Security and Pensions 2017-02-21.txt', 'sspensions')
ents_uc = stanford_ner_full_run(hansard_dir + 'Unaccompanied Children  (Greece and Italy) 2017-02-23.txt', 'unacchild')


In [4]:
def reduced_set(list_in):
    """Return the distinct PERSON / LOCATION / ORGANIZATION entries of list_in.

    Parameters:
        list_in: iterable of rows whose second element (row[1]) is the entity label.

    Returns:
        A set of the rows whose label is one of the three kept types.
    """
    kept_labels = ('PERSON', 'LOCATION', 'ORGANIZATION')
    return {row for row in list_in if row[1] in kept_labels}


# Old ones
# Each reduced_* is the de-duplicated set of PERSON / LOCATION / ORGANIZATION
# entities for one debate, built from the matching ents_* list
reduced_tr = reduced_set(ents_tr)
reduced_mn = reduced_set(ents_mn)
reduced_hi = reduced_set(ents_hi)
reduced_va = reduced_set(ents_va)


# New ones
reduced_af = reduced_set(ents_af)
reduced_ag = reduced_set(ents_ag)
reduced_hs2 = reduced_set(ents_hs2)
reduced_hiv = reduced_set(ents_hiv)
reduced_jam = reduced_set(ents_jam)
reduced_lv = reduced_set(ents_lv)
reduced_lgf = reduced_set(ents_lgf)
reduced_pg = reduced_set(ents_pg)
reduced_ssp = reduced_set(ents_ssp)
reduced_uc = reduced_set(ents_uc)


# Check one output
reduced_uc

{('Aamir', 'ORGANIZATION'),
 ('Afghanistan', 'LOCATION'),
 ('Africa', 'LOCATION'),
 ('Alan Kurdi', 'PERSON'),
 ('Aleppo', 'LOCATION'),
 ('Alex Salmond', 'PERSON'),
 ('Alf', 'PERSON'),
 ('Alf Dubs', 'PERSON'),
 ('Alison McGovern', 'PERSON'),
 ('Amnesty International', 'ORGANIZATION'),
 ('Anne McLaughlin', 'PERSON'),
 ('Anthony Rowlands', 'PERSON'),
 ('Anthony Steen', 'PERSON'),
 ('Argyll', 'ORGANIZATION'),
 ('Arthur Helton', 'PERSON'),
 ('Athens', 'LOCATION'),
 ('Azraq', 'LOCATION'),
 ('Backbench Business Committee', 'ORGANIZATION'),
 ('Balkans', 'LOCATION'),
 ('Belgium', 'LOCATION'),
 ('Birmingham', 'LOCATION'),
 ('Birmingham City Council', 'ORGANIZATION'),
 ('Bradford West', 'ORGANIZATION'),
 ('Braintree', 'LOCATION'),
 ("Brendan O'Hara", 'PERSON'),
 ('Bristol City Council', 'ORGANIZATION'),
 ('Britain', 'LOCATION'),
 ('Bulgaria', 'LOCATION'),
 ('Burrowes', 'PERSON'),
 ('Calais', 'LOCATION'),
 ('Cambridgeshire', 'LOCATION'),
 ('Camden', 'LOCATION'),
 ('Canada', 'LOCATION'),
 ('Carolin

In [8]:
# Rack together and count probabilities

# Pool the per-document entity sets into one list; because every document
# contributes a set, an entity appears once per document that contains it
doc_entity_sets = (reduced_tr, reduced_mn, reduced_hi, reduced_va, reduced_af,
                   reduced_ag, reduced_hs2, reduced_hiv, reduced_jam, reduced_lv,
                   reduced_lgf, reduced_pg, reduced_ssp, reduced_uc)
reduced_all = [entity_pair for doc_set in doc_entity_sets for entity_pair in doc_set]

# Number of documents each entity occurs in, highest count first
counts = Counter(reduced_all)
counts_sorted = counts.most_common()

counts_sorted

[(('House', 'ORGANIZATION'), 14),
 (('Government', 'ORGANIZATION'), 14),
 (('United Kingdom', 'LOCATION'), 9),
 (('UK', 'ORGANIZATION'), 9),
 (('London', 'LOCATION'), 9),
 (('Britain', 'LOCATION'), 9),
 (('England', 'LOCATION'), 9),
 (('Parliament', 'ORGANIZATION'), 8),
 (('Department', 'ORGANIZATION'), 8),
 (('Secretary of State', 'ORGANIZATION'), 8),
 (('UK Government', 'ORGANIZATION'), 7),
 (('Birmingham', 'LOCATION'), 7),
 (('Gentleman', 'PERSON'), 7),
 (('Scotland', 'LOCATION'), 7),
 (('Wales', 'LOCATION'), 6),
 (('Europe', 'LOCATION'), 6),
 (('Select Committee', 'ORGANIZATION'), 6),
 (('Mr Speaker', 'PERSON'), 5),
 (('Coventry South', 'ORGANIZATION'), 5),
 (('Kingston', 'LOCATION'), 5),
 (('Parliament', 'LOCATION'), 5),
 (('Jim Cunningham', 'PERSON'), 5),
 (('United States', 'LOCATION'), 5),
 (('Madam Deputy Speaker', 'ORGANIZATION'), 5),
 (('Labour Government', 'ORGANIZATION'), 5),
 (('Committee', 'ORGANIZATION'), 5),
 (('Northern Ireland', 'LOCATION'), 5),
 (("Her Majesty 's Go

In [103]:
# Check for occurrences of Trump entities
# (prints every (name, label) pair in reduced_uc whose name is exactly 'Trump')


for item in list(reduced_uc):
    if item[0] == 'Trump':
        print(item)
        

('Trump', 'ORGANIZATION')
('Trump', 'PERSON')


In [9]:
# Convert to Dataframe
# Number of debate documents processed above (one per ents_* variable);
# used as the denominator for the probabilities
num_docs=14

# Each element of counts_sorted is ((entity, label), count)
counts_df = pd.DataFrame(counts_sorted)
counts_df.columns = ['Entity', 'Count']
# Fraction of documents containing the entity (counts come from per-document sets)
counts_df['Prob'] = counts_df['Count'] / num_docs
counts_df

Unnamed: 0,Entity,Count,Prob
0,"(House, ORGANIZATION)",14,1.000000
1,"(Government, ORGANIZATION)",14,1.000000
2,"(United Kingdom, LOCATION)",9,0.642857
3,"(UK, ORGANIZATION)",9,0.642857
4,"(London, LOCATION)",9,0.642857
5,"(Britain, LOCATION)",9,0.642857
6,"(England, LOCATION)",9,0.642857
7,"(Parliament, ORGANIZATION)",8,0.571429
8,"(Department, ORGANIZATION)",8,0.571429
9,"(Secretary of State, ORGANIZATION)",8,0.571429


In [10]:
# Using ('Trump', 'PERSON') as an example:
#   - walk through the entity set of each document
#   - whenever the target entity is present, add 1 for every element of that set
#     (the target itself included)
#   - the totals are the co-occurrence counts with the target
# Dividing by the number of docs (len(set_list)) later gives Prob(ent1, ent2)

entity = ('Trump', 'PERSON')

set_list = [reduced_tr, reduced_mn, reduced_hi, reduced_va, reduced_af, reduced_ag, reduced_hs2,
            reduced_hiv, reduced_jam, reduced_lv, reduced_lgf, reduced_pg, reduced_ssp, reduced_uc]
num_docs = len(set_list)

# Tally directly into a Counter, one update per document that mentions the target
assoc_counts = Counter()
for doc_entities in set_list:
    if entity in doc_entities:
        assoc_counts.update(doc_entities)

list(assoc_counts.items())

[(('Mall', 'ORGANIZATION'), 1),
 (('Edward Leigh That', 'PERSON'), 1),
 (('Tiananmen Square', 'LOCATION'), 1),
 (('Randy Krone', 'PERSON'), 1),
 (('Hugh MacDiarmid', 'PERSON'), 1),
 (('US Secretary of Homeland Security', 'ORGANIZATION'), 1),
 (('House of Commons', 'ORGANIZATION'), 2),
 (('America', 'LOCATION'), 1),
 (('Administration', 'ORGANIZATION'), 1),
 (('Ribble Valley', 'ORGANIZATION'), 1),
 (('Mr Speaker', 'PERSON'), 3),
 (('Dallas', 'LOCATION'), 1),
 (('Seema Malhotra', 'PERSON'), 1),
 (('Melania Trump', 'PERSON'), 1),
 (('Penarth', 'LOCATION'), 1),
 (('Geneva', 'LOCATION'), 1),
 (('Jacob Rees-Mogg', 'PERSON'), 1),
 (('Dawn Butler', 'PERSON'), 1),
 (('Aung San Suu Kyi', 'LOCATION'), 1),
 (('House', 'ORGANIZATION'), 4),
 (('Hillary Clinton', 'PERSON'), 1),
 (('Florida', 'LOCATION'), 1),
 (('Nigel Evans', 'PERSON'), 1),
 (('Julian Lewis', 'PERSON'), 1),
 (('UK Government', 'ORGANIZATION'), 4),
 (('Omaha', 'LOCATION'), 1),
 (('Obama', 'PERSON'), 2),
 (('Caroline Lucas', 'PERSON'),

In [11]:
# Convert to dataframe

# One row per (entity, label) pair with its count from assoc_counts
assoc_df = pd.DataFrame(list(assoc_counts.items()))

assoc_df.columns = ['Entity', 'Assoc_Count']

# Co-occurrence count divided by the number of documents
assoc_df['Assoc_Prob'] = assoc_df['Assoc_Count'] / num_docs

assoc_df

Unnamed: 0,Entity,Assoc_Count,Assoc_Prob
0,"(Mall, ORGANIZATION)",1,0.071429
1,"(Edward Leigh That, PERSON)",1,0.071429
2,"(Tiananmen Square, LOCATION)",1,0.071429
3,"(Randy Krone, PERSON)",1,0.071429
4,"(Hugh MacDiarmid, PERSON)",1,0.071429
5,"(US Secretary of Homeland Security, ORGANIZATION)",1,0.071429
6,"(House of Commons, ORGANIZATION)",2,0.142857
7,"(America, LOCATION)",1,0.071429
8,"(Administration, ORGANIZATION)",1,0.071429
9,"(Ribble Valley, ORGANIZATION)",1,0.071429


In [12]:
# Merge together
# Left join: keep every associated entity and attach its overall Count / Prob

merged_df = pd.merge(assoc_df, counts_df, left_on='Entity', right_on='Entity', how='left')

# Ratio of the co-occurrence probability to the entity's own document probability,
# i.e. the share of the entity's documents that also contain the target entity
merged_df['Assoc_measure'] = merged_df['Assoc_Prob'] / merged_df['Prob']

merged_df

Unnamed: 0,Entity,Assoc_Count,Assoc_Prob,Count,Prob,Assoc_measure
0,"(Mall, ORGANIZATION)",1,0.071429,1,0.071429,1.000000
1,"(Edward Leigh That, PERSON)",1,0.071429,1,0.071429,1.000000
2,"(Tiananmen Square, LOCATION)",1,0.071429,1,0.071429,1.000000
3,"(Randy Krone, PERSON)",1,0.071429,1,0.071429,1.000000
4,"(Hugh MacDiarmid, PERSON)",1,0.071429,1,0.071429,1.000000
5,"(US Secretary of Homeland Security, ORGANIZATION)",1,0.071429,1,0.071429,1.000000
6,"(House of Commons, ORGANIZATION)",2,0.142857,2,0.142857,1.000000
7,"(America, LOCATION)",1,0.071429,1,0.071429,1.000000
8,"(Administration, ORGANIZATION)",1,0.071429,1,0.071429,1.000000
9,"(Ribble Valley, ORGANIZATION)",1,0.071429,1,0.071429,1.000000


In [13]:
# Sort by association measure, highest first
# (display only: the result is not assigned, so merged_df keeps its order)

merged_df.sort_values('Assoc_measure', ascending=False)

Unnamed: 0,Entity,Assoc_Count,Assoc_Prob,Count,Prob,Assoc_measure
0,"(Mall, ORGANIZATION)",1,0.071429,1,0.071429,1.000000
458,"(Mark Burns-Williamson, PERSON)",1,0.071429,1,0.071429,1.000000
450,"(Her Majesty 's Inspectorate of Constabulary, ...",1,0.071429,1,0.071429,1.000000
451,"(Rishi Sunak, PERSON)",1,0.071429,1,0.071429,1.000000
453,"(Kathryn Holloway, PERSON)",1,0.071429,1,0.071429,1.000000
454,"(Richard Drax, PERSON)",1,0.071429,1,0.071429,1.000000
455,"(Clive Grunshaw, PERSON)",1,0.071429,1,0.071429,1.000000
456,"(Lisa, PERSON)",1,0.071429,1,0.071429,1.000000
457,"(Brandon Lewis, PERSON)",1,0.071429,1,0.071429,1.000000
459,"(Coventry North West, ORGANIZATION)",1,0.071429,1,0.071429,1.000000


In [14]:
# Look up a specific value
# The Entity column holds tuples, so the match is made on the tuple's string form

merged_df[(merged_df['Entity'].astype(str)=='(\'Trump\', \'ORGANIZATION\')')]

Unnamed: 0,Entity,Assoc_Count,Assoc_Prob,Count,Prob,Assoc_measure
187,"(Trump, ORGANIZATION)",2,0.142857,2,0.142857,1.0


# Matrix method - test data

In [56]:
# Test

# Minimal alternative sample:
# test_list1=['a','b', 'c']
# test_list2=['a', 'c']
# test_list3=['a', 'b']

test_list1=['Cabinet', 'T. May', 'P. Hammond', 'A. Rudd', 'B. Johnson', 'D. Davis']
test_list2=['Commons', 'T. May', 'P. Hammond', 'A. Rudd', 'B. Johnson', 'D. Davis', 'J. Corbyn', 'T. Farron', 'N. Sturgeon']
test_list3=['No.10', 'T. May']
test_list4=['No.11', 'P. Hammond']

sample_lists=[test_list1, test_list2, test_list3, test_list4]

# Make vocabulary (sorted, de-duplicated)
vocab_list = sorted(set(test_list1 + test_list2 + test_list3 + test_list4))

# Position of every vocabulary entry, so the counting loop does not have to
# search vocab_list with .index() for every pair
vocab_index = {entry: position for position, entry in enumerate(vocab_list)}

# Get size of vocabulary (for matrix size)
vocab_size = len(vocab_list)

num_docs = len(sample_lists)

# Square matrix of zeros: one row and one column per vocabulary entry
matrix = [[0] * vocab_size for _ in range(vocab_size)]

# Count co-occurrences: each unordered pair within a list is visited once and
# written symmetrically; the diagonal counts the lists containing the entry
for list_item in sample_lists:
    for n, item1 in enumerate(list_item):
        row = vocab_index[item1]
        for item2 in list_item[n:]:
            col = vocab_index[item2]
            matrix[row][col] += 1
            if item1 != item2:
                matrix[col][row] += 1


In [57]:
# Show in a data frame

matrix_df = pd.DataFrame(matrix)

# Label the count columns with the vocabulary entries
# NOTE(review): vocab_list is wrapped in an outer list here; how pandas builds
# the column index from that is version-dependent - confirm on upgrade
matrix_df.columns = [vocab_list]
# Add the names as a column, then rotate that last column to the front
matrix_df['Name'] = vocab_list
matrix_cols = matrix_df.columns.tolist()
matrix_cols = matrix_cols[-1:] + matrix_cols[:-1]
matrix_df = matrix_df[matrix_cols]

matrix_df

Unnamed: 0,Name,A. Rudd,B. Johnson,Cabinet,Commons,D. Davis,J. Corbyn,N. Sturgeon,No.10,No.11,P. Hammond,T. Farron,T. May
0,A. Rudd,2,2,1,1,2,1,1,0,0,2,1,2
1,B. Johnson,2,2,1,1,2,1,1,0,0,2,1,2
2,Cabinet,1,1,1,0,1,0,0,0,0,1,0,1
3,Commons,1,1,0,1,1,1,1,0,0,1,1,1
4,D. Davis,2,2,1,1,2,1,1,0,0,2,1,2
5,J. Corbyn,1,1,0,1,1,1,1,0,0,1,1,1
6,N. Sturgeon,1,1,0,1,1,1,1,0,0,1,1,1
7,No.10,0,0,0,0,0,0,0,1,0,0,0,1
8,No.11,0,0,0,0,0,0,0,0,1,1,0,0
9,P. Hammond,2,2,1,1,2,1,1,0,1,3,1,2


In [58]:
# Calc row and column max's
# The first column is 'Name', so the counts are every column from position 1 on.
# (.iloc replaces the .ix indexer, which was removed in pandas 1.0.)
counts_only_df = matrix_df.iloc[:, 1:]
max_over_cols_df = counts_only_df.max(axis=1)
max_over_rows_df = counts_only_df.max(axis=0)


# Turn into probabilities: P(a, b) / (P(a) * P(b)), with the row / column maximum
# of the count matrix standing in for each entry's own document count
assoc_df = (counts_only_df.divide(num_docs)
            .divide(max_over_cols_df/num_docs, axis=0)
            .divide(max_over_rows_df/num_docs, axis=1))

# Log to base 2 of the probability ratio; pairs that never co-occur become -inf
assoc_df = np.log2(assoc_df.astype('float64'))

# Blank out the -inf cells. The float value is matched directly, because a
# string "-inf" only matches when pandas coerces it to a float.
assoc_df = assoc_df.replace(to_replace=-np.inf, value="")

assoc_df





Unnamed: 0,A. Rudd,B. Johnson,Cabinet,Commons,D. Davis,J. Corbyn,N. Sturgeon,No.10,No.11,P. Hammond,T. Farron,T. May
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,0.415037,1.0,0.415037
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,0.415037,1.0,0.415037
2,1.0,1.0,2.0,,1.0,,,,,0.415037,,0.415037
3,1.0,1.0,,2.0,1.0,2.0,2.0,,,0.415037,2.0,0.415037
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,0.415037,1.0,0.415037
5,1.0,1.0,,2.0,1.0,2.0,2.0,,,0.415037,2.0,0.415037
6,1.0,1.0,,2.0,1.0,2.0,2.0,,,0.415037,2.0,0.415037
7,,,,,,,,2.0,,,,0.415037
8,,,,,,,,,2.0,0.415037,,
9,0.415037,0.415037,0.415037,0.415037,0.415037,0.415037,0.415037,,0.415037,0.415037,0.415037,-0.169925


In [59]:
# Tag back on name column

# Copy the Name column across from matrix_df, then rotate it to the front
assoc_df['Name'] = matrix_df['Name']
assoc_cols = assoc_df.columns.tolist()
assoc_cols = assoc_cols[-1:] + assoc_cols[:-1]
assoc_df = assoc_df[assoc_cols]

assoc_df

Unnamed: 0,Name,A. Rudd,B. Johnson,Cabinet,Commons,D. Davis,J. Corbyn,N. Sturgeon,No.10,No.11,P. Hammond,T. Farron,T. May
0,A. Rudd,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,0.415037,1.0,0.415037
1,B. Johnson,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,0.415037,1.0,0.415037
2,Cabinet,1.0,1.0,2.0,,1.0,,,,,0.415037,,0.415037
3,Commons,1.0,1.0,,2.0,1.0,2.0,2.0,,,0.415037,2.0,0.415037
4,D. Davis,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,0.415037,1.0,0.415037
5,J. Corbyn,1.0,1.0,,2.0,1.0,2.0,2.0,,,0.415037,2.0,0.415037
6,N. Sturgeon,1.0,1.0,,2.0,1.0,2.0,2.0,,,0.415037,2.0,0.415037
7,No.10,,,,,,,,2.0,,,,0.415037
8,No.11,,,,,,,,,2.0,0.415037,,
9,P. Hammond,0.415037,0.415037,0.415037,0.415037,0.415037,0.415037,0.415037,,0.415037,0.415037,0.415037,-0.169925


In [60]:
# Export to csv
# (written to the current working directory; no index argument is given, so the pandas default applies)

assoc_df.to_csv('adjacencies_between_entities_test_example.csv')

# Matrix method - Hansard data

In [16]:
# Dataset - reduced_all

# Make vocabulary (sorted, de-duplicated (entity, label) pairs)
vocab_list = sorted(set(reduced_all))

# Position of every vocabulary entry, so the counting loop does not have to
# search vocab_list with .index() for every pair
vocab_index = {entry: position for position, entry in enumerate(vocab_list)}

# Get size of vocabulary (for matrix size)
vocab_size = len(vocab_list)

sample_lists = [list(reduced_hi), list(reduced_mn), list(reduced_tr), list(reduced_va), list(reduced_af),
                list(reduced_ag), list(reduced_hs2), list(reduced_hiv), list(reduced_jam), list(reduced_lv),
                list(reduced_lgf), list(reduced_pg), list(reduced_ssp), list(reduced_uc)]

# Get the number of documents in the sample
num_docs = len(sample_lists)

# Construct a 2d matrix of zeros, one row and one column per vocabulary entry
matrix = [[0] * vocab_size for _ in range(vocab_size)]

# Count co-occurrences of pairs of entities: each unordered pair within a
# document is visited once and written symmetrically; the diagonal counts the
# documents containing the entity
for list_item in sample_lists:
    for n, item1 in enumerate(list_item):
        row = vocab_index[item1]
        for item2 in list_item[n:]:
            col = vocab_index[item2]
            matrix[row][col] += 1
            if item1 != item2:
                matrix[col][row] += 1


In [26]:
# Convert to DataFrame
matrix_df = pd.DataFrame(matrix)

# Pull name field to start of dataframe
# NOTE(review): vocab_list holds (entity, label) tuples and is wrapped in an outer
# list; the recorded output shows two header levels - confirm on pandas upgrade
matrix_df.columns = [vocab_list]
matrix_df['Name'] = vocab_list
matrix_cols = matrix_df.columns.tolist()
matrix_cols = matrix_cols[-1:] + matrix_cols[:-1]
matrix_df = matrix_df[matrix_cols]


# Calc row and column max's
# The first column is the name, so the counts are every column from position 1 on.
# (.iloc replaces the .ix indexer, which was removed in pandas 1.0.)
counts_only_df = matrix_df.iloc[:, 1:]
max_over_cols_df = counts_only_df.max(axis=1)
max_over_rows_df = counts_only_df.max(axis=0)


# Turn into probabilities: P(a, b) / (P(a) * P(b)), with the row / column maximum
# of the count matrix standing in for each entity's own document count
assoc_df = (counts_only_df.divide(num_docs)
            .divide(max_over_cols_df/num_docs, axis=0)
            .divide(max_over_rows_df/num_docs, axis=1))

# Apply transform from Stanford (log to base 2 of the probability ratio);
# pairs that never co-occur become -inf
assoc_df = np.log2(assoc_df.astype('float64'))


# Add the Name column and move it to the start of the dataframe
assoc_df[('Name', '')] = matrix_df['Name']
assoc_cols = assoc_df.columns.tolist()
assoc_cols = assoc_cols[-1:] + assoc_cols[:-1]
assoc_df = assoc_df[assoc_cols]


# Show result, blanking out the -inf cells (display only; assoc_df keeps its
# numeric values). The float value is matched directly, because a string "-inf"
# only matches when pandas coerces it to a float.
assoc_df.replace(to_replace=-np.inf, value="")



Unnamed: 0_level_0,Name,A&E,AAC,ALS Association,Aamir,Aberdeen,Aberdeen South,Acquire Land Compulsorily 18:01:00 The Parliamentary Under-Secretary of State for Transport,Adam Holloway,Administration,...,the Health Select Committee,the Justice,the Local Government Association,the London Finance Commission,the Ministry of Defence,the Motor Neurone Disease Association,the Netherlands,the Northern Ireland Office,the Transport Secretary,the University of Oxford
Unnamed: 0_level_1,Unnamed: 1_level_1,ORGANIZATION,ORGANIZATION,ORGANIZATION,ORGANIZATION,LOCATION,ORGANIZATION,ORGANIZATION,PERSON,ORGANIZATION,...,ORGANIZATION,ORGANIZATION,ORGANIZATION,ORGANIZATION,ORGANIZATION,ORGANIZATION,LOCATION,ORGANIZATION,ORGANIZATION,ORGANIZATION
0,"(A&E, ORGANIZATION)",2.80735,,,,,,,,,...,2.80735,,,2.80735,,,,,,
1,"(AAC, ORGANIZATION)",,3.80735,3.80735,,,,,,,...,,,,,,3.80735,,,,3.80735
2,"(ALS Association, ORGANIZATION)",,3.80735,3.80735,,,,,,,...,,,,,,3.80735,,,,3.80735
3,"(Aamir, ORGANIZATION)",,,,3.80735,,,,,,...,,,3.80735,,,,,,,
4,"(Aberdeen, LOCATION)",,,,,3.80735,,,,,...,,,,,,,,,,
5,"(Aberdeen South, ORGANIZATION)",,,,,,3.80735,,,,...,,,,,,,,,,
6,(Acquire Land Compulsorily 18:01:00 The Parlia...,,,,,,,3.80735,,,...,,,,,,,,,3.80735,
7,"(Adam Holloway, PERSON)",,,,,,,,3.80735,3.80735,...,,,,,,,3.80735,,,
8,"(Administration, ORGANIZATION)",,,,,,,,3.80735,3.80735,...,,,,,,,3.80735,,,
9,"(Afghanistan, LOCATION)",,,,2.80735,,,,,,...,,,2.80735,,2.80735,,,2.80735,,


In [44]:
# Show an example for Trump

# Keep the name column plus every column whose entity name is 'Trump'
trump_df = assoc_df[['Name', 'Trump']]

# Highest ('Trump', 'PERSON') score first; -inf cells are blanked for display.
# The float value is matched directly rather than the string "-inf".
trump_df.sort_values(('Trump', 'PERSON'), ascending=False).replace(to_replace=-np.inf, value="")

Unnamed: 0_level_0,Name,Trump,Trump,Trump
Unnamed: 0_level_1,Unnamed: 1_level_1,LOCATION,ORGANIZATION,PERSON
747,"(Lebanon, LOCATION)",,2.80735,1.80735
404,"(Eisenhower, PERSON)",3.80735,2.80735,1.80735
961,"(Newport West, ORGANIZATION)",3.80735,2.80735,1.80735
960,"(Newport West, LOCATION)",3.80735,2.80735,1.80735
959,"(Newington, LOCATION)",,1.80735,1.80735
958,"(Newcastle, ORGANIZATION)",,2.80735,1.80735
411,"(Emperor Hirohito, PERSON)",3.80735,2.80735,1.80735
412,"(Enfield, ORGANIZATION)",,2.80735,1.80735
954,"(New Hampshire, LOCATION)",3.80735,2.80735,1.80735
946,"(Nazis, ORGANIZATION)",3.80735,2.80735,1.80735


In [47]:
# Find entities with a strong association to Trump - n.b. too many to show
# (the note originally said "based on 4 debates"; sample_lists now holds 14 documents)
# Counts the rows whose ('Trump', 'PERSON') score exceeds 1.8

len(trump_df.loc[trump_df[('Trump', 'PERSON')] > 1.8])

561

In [48]:
# Show an example for Motor Neurone Disease

# Keep the name column plus every column whose entity name is 'MND'
mnd_df = assoc_df[['Name', 'MND']]

# Highest ('MND', 'ORGANIZATION') score first; -inf cells are blanked for display.
# The float value is matched directly rather than the string "-inf".
mnd_df.sort_values(('MND', 'ORGANIZATION'), ascending=False).replace(to_replace=-np.inf, value="")

Unnamed: 0_level_0,Name,MND
Unnamed: 0_level_1,Unnamed: 1_level_1,ORGANIZATION
1494,"(the University of Oxford, ORGANIZATION)",3.80735
1158,"(Rutland, LOCATION)",3.80735
458,"(Galloway, PERSON)",3.80735
446,"(Foundation of Hearts, ORGANIZATION)",3.80735
445,"(Fort William, LOCATION)",3.80735
441,"(First Minister of Scotland, ORGANIZATION)",3.80735
1078,"(Prague, LOCATION)",3.80735
420,"(Euan MacDonald, PERSON)",3.80735
1115,"(Richard Arkless, PERSON)",3.80735
395,"(Edinburgh University, ORGANIZATION)",3.80735


In [50]:
# Find entities with a strong association to MND - n.b. too many to show
# (the note originally said "based on 4 debates"; sample_lists now holds 14 documents)
# Counts the rows whose ('MND', 'ORGANIZATION') score exceeds 3.8

len(mnd_df.loc[mnd_df[('MND', 'ORGANIZATION')] > 3.8])

65

In [39]:
# Find the maximum value across the whole dataframe
# (position 0 is the name column, so it is skipped; .iloc replaces the
#  .ix indexer, which was removed in pandas 1.0)

assoc_df.iloc[:, 1:].values.max()

3.8073549220576042

In [70]:
# Count the number of strongly related entities (score > 2), for each entity in the sample
# (position 0 is the name column, so it is skipped; .iloc replaces the removed .ix
#  indexer, and the comparison is applied to the whole frame at once)

count_assoc_ents = (assoc_df.iloc[:, 1:] > 2).sum()


In [80]:
# Show the strong-association counts for four example entity names
count_assoc_ents[['Trump', 'MND', 'Vauxhall', 'High Speed Rail']]

Trump            LOCATION        279
                 ORGANIZATION    384
                 PERSON            0
MND              ORGANIZATION     83
Vauxhall         ORGANIZATION     95
High Speed Rail  ORGANIZATION    156
dtype: int64

In [83]:
# Export to CSV

# Blank out the -inf scores (pairs that never co-occur) before writing. The float
# value is matched directly, because a string "-inf" only matches when pandas
# coerces it to a float.
assoc_df.replace(to_replace=-np.inf, value="").to_csv('adjacent_entities_hansard_14_sample.csv', index=False)

count_assoc_ents.to_csv('adjacent_entity_counts_hansard_14_sample.csv')



# Export to Excel - n.b. can only export 256 columns this way

# assoc_df.replace(to_replace=-np.inf, value="").to_excel('adjacent_entities_hansard_14_sample.xls')


# Scrap code

In [35]:
# Total number of cells in assoc_df (rows x columns)
assoc_df.size

2236520

In [36]:
# Number of rows in assoc_df
assoc_df.shape[0]

1495

In [37]:
# Number of columns in assoc_df
assoc_df.shape[1]

1496

In [84]:
# Raw co-occurrence counts for the 'Trump' columns, alongside the name column
matrix_df[['Name', 'Trump']]

Unnamed: 0_level_0,Name,Trump,Trump,Trump
Unnamed: 0_level_1,Unnamed: 1_level_1,LOCATION,ORGANIZATION,PERSON
0,"(A&E, ORGANIZATION)",0,0,1
1,"(AAC, ORGANIZATION)",0,0,0
2,"(ALS Association, ORGANIZATION)",0,0,0
3,"(Aamir, ORGANIZATION)",0,1,1
4,"(Aberdeen, LOCATION)",0,0,0
5,"(Aberdeen South, ORGANIZATION)",0,0,0
6,(Acquire Land Compulsorily 18:01:00 The Parlia...,0,0,0
7,"(Adam Holloway, PERSON)",1,1,1
8,"(Administration, ORGANIZATION)",1,1,1
9,"(Afghanistan, LOCATION)",0,1,1


In [94]:
# List the entities held in reduced_uc
list(reduced_uc)

[('Great Britain', 'LOCATION'),
 ('Kilsyth', 'PERSON'),
 ('Hollande', 'PERSON'),
 ('Nazi Germany', 'LOCATION'),
 ('Dublin', 'LOCATION'),
 ('Peter Bone', 'PERSON'),
 ('Hackney North', 'ORGANIZATION'),
 ('Mr Speaker', 'PERSON'),
 ('Nicky Morgan', 'ORGANIZATION'),
 ("Children 's Commissioners", 'ORGANIZATION'),
 ('Independent Anti-Slavery Commissioner', 'ORGANIZATION'),
 ('Eritrea', 'LOCATION'),
 ('House', 'ORGANIZATION'),
 ('Braintree', 'LOCATION'),
 ('Rutherglen', 'ORGANIZATION'),
 ('Alison McGovern', 'PERSON'),
 ('UNICEF', 'ORGANIZATION'),
 ('David Burrowes', 'PERSON'),
 ('UK Government', 'ORGANIZATION'),
 ('Hannah Bardell', 'PERSON'),
 ('Wales', 'LOCATION'),
 ('Argyll', 'ORGANIZATION'),
 ('Arthur Helton', 'PERSON'),
 ("Her Majesty 's Government", 'ORGANIZATION'),
 ('International Organisation for Migration', 'ORGANIZATION'),
 ('Darfur', 'LOCATION'),
 ('EU', 'ORGANIZATION'),
 ('Charlie Elphicke', 'PERSON'),
 ('Birmingham', 'LOCATION'),
 ('Stoke Newington', 'ORGANIZATION'),
 ('Southgate

In [3]:
# Re-run the tagger on the Motor Neurone Disease debate to inspect the raw entity list (duplicates retained)
# NOTE(review): the recorded output contains DATE / TIME labels, so this was presumably run with the 7-class model - confirm
stanford_ner_full_run('c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\Motor Neurone Disease-Gordon Aikman 2017-02-20.txt','motor_neurone')


[('Neurone DiseaseGordon Aikman Motion', 'ORGANIZATION'),
 ('House', 'ORGANIZATION'),
 ('Chris Heaton-Harris', 'PERSON'),
 ('Ian Murray', 'PERSON'),
 ('Edinburgh South', 'ORGANIZATION'),
 ('Gordon Aikman', 'PERSON'),
 ('Gordon Aikman', 'PERSON'),
 ('February', 'DATE'),
 ('House', 'ORGANIZATION'),
 ('Gordon Aikman', 'PERSON'),
 ('MND', 'ORGANIZATION'),
 ('South Leicestershire', 'LOCATION'),
 ('House', 'ORGANIZATION'),
 ('MND', 'ORGANIZATION'),
 ('United Kingdom', 'LOCATION'),
 ('Rutland', 'LOCATION'),
 ('Ruth Morrison', 'PERSON'),
 ('Ian Murray', 'PERSON'),
 ('MND', 'ORGANIZATION'),
 ('Scotland', 'LOCATION'),
 ('House', 'ORGANIZATION'),
 ('Gordon Aikman', 'PERSON'),
 ('MND', 'ORGANIZATION'),
 ('Gordon', 'PERSON'),
 ('Joe', 'PERSON'),
 ('this evening', 'TIME'),
 ('Nancy', 'PERSON'),
 ('Lorraine', 'PERSON'),
 ('Murray', 'PERSON'),
 ('Gordon', 'PERSON'),
 ('University of Edinburgh', 'ORGANIZATION'),
 ('Scotland', 'LOCATION'),
 ('Scotland', 'LOCATION'),
 ('UK', 'ORGANIZATION'),
 ('2014', 'D

In [5]:
# Same debate again, to inspect the raw entity list (duplicates retained)
# NOTE(review): the recorded output contains MISC labels, so this was presumably run with the 4-class model - confirm
stanford_ner_full_run('c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\Motor Neurone Disease-Gordon Aikman 2017-02-20.txt','motor_neurone')


[('Neurone DiseaseGordon Aikman Motion', 'MISC'),
 ('House', 'ORGANIZATION'),
 ('Chris Heaton-Harris', 'PERSON'),
 ('Ian Murray', 'PERSON'),
 ('Edinburgh South', 'LOCATION'),
 ('Lab', 'LOCATION'),
 ('Madam', 'PERSON'),
 ('Adjournment', 'MISC'),
 ('Gordon Aikman', 'PERSON'),
 ('Gordon Aikman', 'PERSON'),
 ('House', 'ORGANIZATION'),
 ('Gordon Aikman', 'PERSON'),
 ('MND', 'ORGANIZATION'),
 ('Alberto Costa', 'PERSON'),
 ('South Leicestershire', 'ORGANIZATION'),
 ('Con', 'MISC'),
 ('Floor of the House', 'ORGANIZATION'),
 ('MND', 'ORGANIZATION'),
 ('United Kingdom', 'LOCATION'),
 ('Leicestershire', 'ORGANIZATION'),
 ('Rutland', 'LOCATION'),
 ('Ruth Morrison', 'PERSON'),
 ('hon. Gentleman', 'MISC'),
 ('Ian Murray', 'PERSON'),
 ('hon. Gentleman', 'MISC'),
 ('MND', 'ORGANIZATION'),
 ('MND Scotland', 'ORGANIZATION'),
 ('House', 'ORGANIZATION'),
 ('Gordon Aikman', 'PERSON'),
 ('MND', 'ORGANIZATION'),
 ('Gordon', 'PERSON'),
 ('Gordon', 'PERSON'),
 ('Joe', 'PERSON'),
 ('Nancy', 'PERSON'),
 ('Lorrai