# Named Entity Recognition on Hansard sample data

Outputs:
- title of document / topic of the debate (title_trimmed)
- list of entities by type (list_ents_swap)
- relationships between entities mentioned in the same sentence (relations, also in dataframe format in relations_df)
- keywords (word_freq_df)

In [37]:
### Imports

import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import re
from collections import Counter
import itertools


### Function to get the entities from the tree

def get_ents(ne_tree):
    """Collect the named entities found in one chunked sentence.

    Args:
        ne_tree: Iterable over the chunker output for a single sentence.
            Items are either plain ``(token, pos)`` leaves or
            ``nltk.tree.Tree`` subtrees carrying an entity label.

    Returns:
        A list of ``(entity_string, entity_label)`` tuples, one per labelled
        subtree, in the order the subtrees occur in ``ne_tree``.
    """
    ne_in_sent = []
    for subtree in ne_tree:
        # Only labelled subtrees are named-entity chunks; bare (token, pos)
        # leaves are ordinary words. isinstance also accepts Tree subclasses,
        # which an exact type comparison would silently skip.
        if not isinstance(subtree, nltk.tree.Tree):
            continue
        ne_string = " ".join(token for token, _pos in subtree.leaves())
        ne_in_sent.append((ne_string, subtree.label()))
    return ne_in_sent



### Main function

def _read_text(path):
    """Return the full contents of the text file at *path*."""
    # The context manager closes the handle promptly instead of leaking it.
    with open(path, 'r') as handle:
        return handle.read()


def _sentence_relations(ents):
    """Pair up distinct entities that are mentioned in the same sentence.

    Args:
        ents: One list of entity tuples per sentence.

    Returns:
        A list of unique ``(subject, object, 1)`` tuples in first-seen order.
        Both directions of each pair are included.
    """
    # Every same-sentence relationship scores 1 for now; this may later become
    # 2 for strong and 1 for weak relationships.
    relations = []
    seen = set()  # O(1) duplicate check instead of scanning the result list
    for row in ents:
        for subject, obj in itertools.permutations(row, 2):
            # A sentence can mention the same entity twice; skip self-pairs.
            if subject == obj or (subject, obj) in seen:
                continue
            seen.add((subject, obj))
            relations.append((subject, obj, 1))
    return relations


def _word_counts(text, stop_words):
    """Count lower-cased, whitespace-separated words of *text*, minus stop words."""
    # str.split() never yields empty strings, so leading or trailing
    # whitespace in the file cannot be counted as a bogus '' "word".
    return Counter(
        word for word in text.lower().split() if word not in stop_words
    )


def _counter_to_df(counts):
    """Convert a word Counter into a DataFrame with word, count and freq columns."""
    data_out = pd.DataFrame(list(counts.items()), columns=['word', 'count'])
    # Relative frequency of each word within its own document.
    data_out['freq'] = data_out['count'] / data_out['count'].sum()
    return data_out


def NER_run(file_in, corpus_file='Hansard Written Statements 24 Mar 2016.txt'):
    """Run entity, relationship and keyword extraction on one text file.

    Everything is reported with ``print``; the function returns ``None``.
    Printed sections, in order:

    * the first line of the file with stop words removed (title_trimmed),
    * the sorted unique ``(label, entity)`` pairs (list_ents_swap),
    * a DataFrame of same-sentence entity pairs (relations_df),
    * the ten words with the highest sample-to-corpus frequency ratio.

    Args:
        file_in: Path of the text file to analyse.
        corpus_file: Path of the reference text whose word frequencies the
            sample is compared against when scoring keywords.
    """
    sample = _read_text(file_in)

    # Build the stop-word set once; it is reused for the title and keywords.
    stop_words = set(stopwords.words('english'))

    ### Title (first line of the document) with stop words removed
    # The comparison is case-sensitive: tokens are not lower-cased here.
    title = sample.split('\n', 1)[0]
    title_trimmed = ' '.join(
        word for word in nltk.word_tokenize(title) if word not in stop_words
    )

    print('\n Title (title_trimmed): \n')
    print(title_trimmed)

    ### Extract all named individuals (will include MPs and others)

    # Sentence split -> word tokenize -> POS tag -> named-entity chunk.
    chunked = [
        nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary=False)
        for sent in nltk.sent_tokenize(sample)
    ]

    # One list of (entity, label) tuples per sentence.
    ents = [get_ents(tree) for tree in chunked]

    # Flatten, de-duplicate, and reverse each tuple so the entity type comes
    # first and sorting groups the entities by type.
    list_ents_swap = sorted(
        item[::-1] for item in set(itertools.chain.from_iterable(ents))
    )

    print('\n List of unique entities (list_ents_swap): \n')
    for item in list_ents_swap:
        print(item)

    ### Relationships

    relations = _sentence_relations(ents)
    relations_df = pd.DataFrame(relations, columns=["Subject", "Object", "Strength"])

    print('\n Relationships (relations, also in DataFrame format in relations_df): \n')
    print(relations_df)

    ### Keywords

    # Reference documents that act as the background corpus.
    corpus = _read_text(corpus_file)

    corpus_counts_df = _counter_to_df(_word_counts(corpus, stop_words))
    sample_counts_df = _counter_to_df(_word_counts(sample, stop_words))

    # Drop single-occurrence words, then attach the corpus statistics. After
    # the merge the *_x columns describe the sample and *_y the corpus.
    repeated_words = sample_counts_df.loc[sample_counts_df['count'] > 1]
    word_freq_df = pd.merge(repeated_words, corpus_counts_df, on='word', how='left')

    # Boost words not seen in the corpus: give them a count of 1 and the
    # smallest corpus frequency observed among the matched words. Assigning
    # the result back avoids chained in-place fillna on a column selection.
    word_freq_df['count_y'] = word_freq_df['count_y'].fillna(1)
    word_freq_df['freq_y'] = word_freq_df['freq_y'].fillna(word_freq_df['freq_y'].min())

    word_freq_df['freq_score'] = word_freq_df['freq_x'] / word_freq_df['freq_y']
    word_freq_df['count_ratio'] = word_freq_df['count_x'] / word_freq_df['count_y']

    # Sort and take the top 10 words.
    word_freq_df = word_freq_df.sort_values(by="freq_score", ascending=False)
    print('\n Top 10 keywords: \n')
    print(word_freq_df['word'].head(10).to_string(index=False))
    
    

In [38]:
# Call the main function on the 'brexit.txt' sample; NER_run prints its
# results (title, entities, relationships, keywords) rather than returning them

NER_run('brexit.txt')


 Title (title_trimmed): 

David Davis appointed Secretary State Brexit

 List of unique entities (list_ents_swap): 

('GPE', 'Brexit')
('GPE', 'Britain')
('GPE', 'British')
('GPE', 'EU')
('GPE', 'Europe')
('GPE', 'Leave')
('ORGANIZATION', 'BBC')
('ORGANIZATION', 'EU')
('ORGANIZATION', 'Leave')
('ORGANIZATION', 'Three Brexiteers')
('ORGANIZATION', 'UK')
('PERSON', 'Boris Johnson')
('PERSON', 'Brexit')
('PERSON', 'Brexit Theresa May')
('PERSON', 'Chris Morris')
('PERSON', 'David')
('PERSON', 'David Davis')
('PERSON', 'Davis')
('PERSON', 'Liam Fox')
('PERSON', 'Mrs May')
('PERSON', 'Theresa May')

 Relationships (relations, also in DataFrame format in relations_df): 

                         Subject                            Object  Strength
0                (David, PERSON)                   (Davis, PERSON)         1
1                (David, PERSON)      (Brexit Theresa May, PERSON)         1
2                (David, PERSON)                      (Leave, GPE)         1
3                

In [39]:
# Run the same analysis on the 'blair.txt' sample; results are printed
NER_run('blair.txt')


 Title (title_trimmed): 

The SNP 's motion said former Labour prime minister given Parliament correct information dealings US President George W Bush .

 List of unique entities (list_ents_swap): 

('GPE', 'Iraq')
('GPE', 'Labour')
('ORGANIZATION', 'Commons')
('ORGANIZATION', 'House')
('ORGANIZATION', 'Justice Committee')
('ORGANIZATION', 'MPs')
('ORGANIZATION', 'Parliament')
('ORGANIZATION', 'SNP')
('ORGANIZATION', 'UK')
('ORGANIZATION', 'US')
('PERSON', 'Alex Salmond')
('PERSON', 'Blair')
('PERSON', 'Fabian Hamilton')
('PERSON', 'George W Bush')
('PERSON', 'Labour')

 Relationships (relations, also in DataFrame format in relations_df): 

                              Subject                             Object  \
0                 (SNP, ORGANIZATION)                      (Labour, GPE)   
1                 (SNP, ORGANIZATION)         (Parliament, ORGANIZATION)   
2                 (SNP, ORGANIZATION)                 (US, ORGANIZATION)   
3                 (SNP, ORGANIZATION)         

In [41]:
# Run the same analysis on the State Visit debate sample; results are printed
NER_run('President Trump- State Visit 2017-02-20.txt')


 Title (title_trimmed): 

President Trump : State Visit

 List of unique entities (list_ents_swap): 

('FACILITY', 'Kremlin')
('FACILITY', 'Trump Administration')
('FACILITY', 'Trump Big Brother')
('FACILITY', 'Trump Tower')
('FACILITY', 'White House')
('FACILITY', 'White House— Alex Salmond')
('GPE', 'Acton')
('GPE', 'Ainsty')
('GPE', 'America')
('GPE', 'American')
('GPE', 'Americans')
('GPE', 'Americas')
('GPE', 'America—the')
('GPE', 'Australia')
('GPE', 'Ayrshire')
('GPE', 'Belgium')
('GPE', 'Birmingham')
('GPE', 'Bow')
('GPE', 'Brexit')
('GPE', 'Brighton')
('GPE', 'Britain')
('GPE', 'British')
('GPE', 'British-American')
('GPE', 'Brussels')
('GPE', 'Burmese')
('GPE', 'California')
('GPE', 'Cambridge')
('GPE', 'Chelmsford')
('GPE', 'China')
('GPE', 'Chinese')
('GPE', 'Colombia')
('GPE', 'Cowdenbeath')
('GPE', 'Crown')
('GPE', 'Culzean')
('GPE', 'Daesh')
('GPE', 'Dallas')
('GPE', 'Dark')
('GPE', 'Defence')
('GPE', 'Dewsbury')
('GPE', 'Diplomacy')
('GPE', 'EU')
('GPE', 'Edinburgh')
