## nlp_module
Here we look at the nlp module and see how we can use the NER tagger

### Setup

In [1]:
import sys
from itertools import groupby

sys.path.append('../modules/')
import nlp
from webpage import *

In [9]:
# test_url and Webpage are not defined in this notebook; presumably they come
# from the star import of `webpage` in the setup cell -- TODO confirm.
url = test_url['wiki']
w = Webpage(url)
# NOTE(review): load() appears to be what populates w.text; verify against webpage.Webpage.
w.load()
# Plain text of the page; every later cell works on this string.
text = w.text

### contents of the module

In [3]:
# Single-argument print(...) prints the same on Python 2 and is also valid Python 3.
print(dir(nlp))

['StanfordNERTagger', 'WordNetLemmatizer', '__builtins__', '__doc__', '__file__', '__name__', '__package__', 'extract_entity_names', 'java_path', 'ner_tag', 'ner_tagger_jar', 'ngram_count', 'ngrams', 'nltk', 'os', 'pos_tag', 'pre_process', 're', 'sent_chunker', 'sent_tokenize', 'sorted_ngram_count', 'st', 'stopwords', 'string', 'tagger_path', 'wnl', 'word_tokenize']


## NER Tagger

In [10]:
# Preview the first 500 characters of the page text.
# Single-argument print(...) prints the same on Python 2 and is also valid Python 3.
print(text[:500])

The College of Computing at the Georgia Institute of Technology has roots stretching back to an Information Science degree established in 1964.[1] In 1988, Georgia Tech president John Patrick Crecine elevated the School of Information and Computer Science to become the College of Computing, making Georgia Tech the second university to do so, after Carnegie Mellon University created their School of Computer Science.[1]

Beyond using contemporary computer technology, the College of Computing also 


In [11]:
# Run the NER tagger over the page text. Later cells index each element of the
# result as (token, tag) and treat the tag 'O' as "not a named entity".
text_ner_tags = nlp.ner_tag(text)

In [12]:
# Frequency of each individual (token, tag) pair, skipping everything tagged 'O'.
# Tokens are still un-merged here, so multi-word names show up word by word.
nlp.sorted_ngram_count(x for x in text_ner_tags if x[1] != 'O')

[(((u'of', u'ORGANIZATION'),), 21),
 (((u'Georgia', u'ORGANIZATION'),), 13),
 (((u'Tech', u'ORGANIZATION'),), 13),
 (((u'College', u'ORGANIZATION'),), 12),
 (((u'Computing', u'ORGANIZATION'),), 10),
 (((u'Science', u'ORGANIZATION'),), 8),
 (((u'School', u'ORGANIZATION'),), 8),
 (((u'Computer', u'ORGANIZATION'),), 6),
 (((u'Information', u'ORGANIZATION'),), 4),
 (((u'The', u'ORGANIZATION'),), 4),
 (((u'and', u'ORGANIZATION'),), 4),
 (((u'Slamecka', u'PERSON'),), 3),
 (((u'DeMillo', u'PERSON'),), 2),
 (((u'Patrick', u'PERSON'),), 2),
 (((u'U.S.', u'LOCATION'),), 2),
 (((u'John', u'PERSON'),), 2),
 (((u'University', u'ORGANIZATION'),), 2),
 (((u"'s", u'ORGANIZATION'),), 2),
 (((u'Miller', u'PERSON'),), 2),
 (((u'Crecine', u'PERSON'),), 2),
 (((u'Computational', u'ORGANIZATION'),), 2),
 (((u'Klaus', u'PERSON'),), 2),
 (((u'Engineering', u'ORGANIZATION'),), 2),
 (((u'Emory', u'ORGANIZATION'),), 1),
 (((u'Vladimir', u'PERSON'),), 1),
 (((u'Institute', u'ORGANIZATION'),), 1),
 (((u'Vernon', u

## Merging NER tags
If two adjacent tokens carry the same tag, they are probably parts of the same entity.

The following function takes the list of (token, tag) tuples and merges each run of adjacent tokens that share the same tag into a single token (tokens tagged 'O' are left as they are).

```
Create a new array -> merged_ner_tags
Create an empty temp array -> temp_array
initialize prev_tag = 'O'
for each (token, tag) in the input array:
    if tag is 'O' or tag != prev_tag:
        if temp_array is not empty:
            merge the tokens in temp_array into one (token, tag) pair
            append that pair to merged_ner_tags and empty temp_array
    if tag is 'O':
        append (token, tag) to merged_ner_tags unchanged
    else:
        add (token, tag) to temp_array
    prev_tag = tag
after the loop, merge whatever is left in temp_array and append it to merged_ner_tags
```

In [13]:
def merge_same_ner_tags(temp_array):
    """Collapse a run of (token, tag) pairs into a single (token, tag) pair.

    The tokens are joined with single spaces, in input order. Every pair must
    carry the same tag.

    input : sequence of (token, tag) pairs
    output: (merged_token, tag) tuple

    Raises ValueError when the pairs do not share exactly one tag; this also
    covers an empty input, which has no tag at all.
    """
    words = [pair[0] for pair in temp_array]
    joined = ' '.join(words)

    distinct_tags = set(pair[1] for pair in temp_array)
    if len(distinct_tags) != 1:
        raise ValueError('Trying to merge tokens with different NER tags %s ' % str(temp_array))

    # Exactly one element is left in the set, so unpack it directly.
    (only_tag,) = distinct_tags
    return joined, only_tag

def merge_ner_tags(text_ner_tags):
    """Combine adjacent tokens that share a non-'O' NER tag into one token.

    input : iterable of (token, tag) pairs, in text order
    output: new list of (token, tag) pairs in which
              * every pair tagged 'O' is passed through unchanged, and
              * every maximal run of consecutive pairs with the same non-'O'
                tag is replaced by one pair whose token is the run's tokens
                joined with single spaces.

    Two runs with different non-'O' tags that touch each other are kept as
    two separate entries. An empty input gives an empty list.
    """
    merged_ner_tags = []

    # groupby yields one group per maximal run of equal consecutive tags, so
    # every pair in a group has the same tag by construction.
    for tag, run in groupby(text_ner_tags, key=lambda token_tag: token_tag[1]):
        if tag == 'O':
            # Non-entities are never merged; keep the original pairs as they are.
            merged_ner_tags.extend(run)
        else:
            merged_token = ' '.join(token_tag[0] for token_tag in run)
            merged_ner_tags.append((merged_token, tag))

    return merged_ner_tags


In [14]:
# Show the merged sequence: multi-word entities now appear as a single (token, tag) pair.
merge_ner_tags(text_ner_tags)

[(u'The', u'O'),
 (u'College', u'O'),
 (u'of', u'O'),
 (u'Computing', u'O'),
 (u'at', u'O'),
 (u'the', u'O'),
 (u'Georgia Institute of Technology', u'ORGANIZATION'),
 (u'has', u'O'),
 (u'roots', u'O'),
 (u'stretching', u'O'),
 (u'back', u'O'),
 (u'to', u'O'),
 (u'an', u'O'),
 (u'Information', u'O'),
 (u'Science', u'O'),
 (u'degree', u'O'),
 (u'established', u'O'),
 (u'in', u'O'),
 (u'1964', u'O'),
 (u'.', u'O'),
 (u'[', u'O'),
 (u'1', u'O'),
 (u']', u'O'),
 (u'In', u'O'),
 (u'1988', u'O'),
 (u',', u'O'),
 (u'Georgia Tech', u'ORGANIZATION'),
 (u'president', u'O'),
 (u'John Patrick Crecine', u'PERSON'),
 (u'elevated', u'O'),
 (u'the', u'O'),
 (u'School of Information and Computer Science', u'ORGANIZATION'),
 (u'to', u'O'),
 (u'become', u'O'),
 (u'the', u'O'),
 (u'College', u'O'),
 (u'of', u'O'),
 (u'Computing', u'O'),
 (u',', u'O'),
 (u'making', u'O'),
 (u'Georgia Tech', u'ORGANIZATION'),
 (u'the', u'O'),
 (u'second', u'O'),
 (u'university', u'O'),
 (u'to', u'O'),
 (u'do', u'O'),
 (u'so'

In [15]:
# Keep only the named entities: drop every merged pair whose tag is 'O'.
tagged_entities = [token_tag for token_tag in merge_ner_tags(text_ner_tags) if token_tag[1] != 'O']

In [16]:
# Group the entity strings by their NER tag (no merging of similar names happens here yet).
tag_collected = {tag:[x[0] for x in tagged_entities if x[1] == tag] for tag in {x[1] for x in tagged_entities}}
# Dict with keys as tags and values as all the entities with that tag
# (duplicates are kept, in the order they occur in tagged_entities).
tag_collected

{u'LOCATION': [u'U.S.', u'U.S.', u'United States', u'US'],
 u'ORGANIZATION': [u'Georgia Institute of Technology',
  u'Georgia Tech',
  u'School of Information and Computer Science',
  u'Georgia Tech',
  u'Carnegie Mellon University',
  u'School of Computer Science',
  u'College of Computing',
  u'Georgia Tech',
  u'Computational Media',
  u"Georgia Tech 's School of Literature",
  u'Ivan Allen College of Liberal Arts',
  u'The College of Computing',
  u'College of Computing',
  u"Georgia Tech 's College of Computing",
  u'Georgia Tech',
  u'School of Information Science',
  u'Georgia Tech',
  u'Georgia Tech',
  u'School of Information and Computer Science',
  u'Emory University',
  u'Computer Science',
  u'IBM',
  u'School of Information and Computer Science',
  u'College of Computing',
  u'Georgia Tech',
  u'Tech',
  u'Georgia Tech',
  u'School of Computer Science',
  u'SCS',
  u'College of Computing Dean',
  u'Georgia Tech',
  u'School of Interactive Computing',
  u'Computational Sci

In [84]:
# For every token see if it is a part of any other token if yes replace it by the bigger token
# NOTE(review): `tag` is not read by reduce_tag_tokens; the later cells rebind it in their own for loops.
tag = 'PERSON'
def reduce_tag_tokens(tokens):
    """ Look at the tokens with the same NER tag and merge them if they are 'similar'

    Current definition of similarity
        Compare each token with every other token; if compare_tokens says the
        first one is a variant of the other (e.g. it is contained in a longer
        token), they are probably referring to the same thing, so replace the
        first one by the other one.

    Candidates are tried longest first, so a token is replaced by the longest
    candidate that matches. A token with no matching candidate is kept as is.

    input : list of tokens which have the same NER tag
    Output: list of reduced tokens, same length and order as the input

    TODO : Add Abbreviation to similarity thing

    """
    # Materialise once so the input can be iterated repeatedly.
    tokens = list(tokens)
    # The candidate order does not depend on the current token, so sort only once.
    longest_first = sorted(tokens, key=len, reverse=True)

    new_tags = []
    for token in tokens:
        for other_token in longest_first:
            if compare_tokens(token, other_token):
                new_tags.append(other_token)
                break
        else:
            # No candidate matched: keep the token unchanged.
            new_tags.append(token)

    return new_tags

def compare_tokens(token,other_token):
    """ Decide whether `token` should be treated as a variant of `other_token`.

    Returns True as soon as one of these hand-written rules matches:
        1. `token` is a strict substring of `other_token`, and `other_token`
           contains neither "'s" nor "of".
        2. `other_token` contains "The" and `token` still occurs in it after
           every "The" has been replaced by a space.
        3. `token` is one of the abbreviations get_abbr builds for `other_token`.

    All the containment checks are plain substring checks, not word-based,
    so e.g. "of" also matches inside a longer word.
    """
    # Rule 1: shorter name contained in a longer, "simple" name.
    if (token in other_token
            and len(token) < len(other_token)
            and "'s" not in other_token
            and "of" not in other_token):
        return True

    # Rule 2: same name once the article is taken out of the candidate.
    if "The" in other_token and token in other_token.replace("The",' '):
        return True

    # Rule 3: token is an abbreviation of the candidate.
    return token in get_abbr(other_token)

"'s" and "of" create problems when "Gerogia Tech 's School of Literature"  merges with "Georgia Tech" => Avoid them
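But avoiding them creates another problem: a name that legitimately contains "of" can no longer absorb its shorter variants through the substring rule. For example, "College of Computing" is not merged into "The College of Computing" by that rule, which is why `compare_tokens` has a separate rule for names containing "The".

So we have hard-coded a set of rules.
    They could probably all be replaced later by a mapping dictionary that maps entities to their known aliases.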

In [72]:
def get_abbr(st):
    """ Generate the abbreviations of the passed string by a simple rule and return them

    input : string
    output: set of possible abbreviations (empty set if none can be built)

    Current method of abbr:
        Remove every '&' character and split the string into words
        Drop the filler words 'The' and 'of' (whole words only, case-sensitive)
        Pick up the first letter of every remaining word
        Capitalize it
            1. One abbr would have it as U.S.
            2. Another abbr would be as US

    Thus you cannot abbreviate single word strings

    TODO : Use some library to do this
    """
    # Filter the filler words as whole words: a plain str.replace would also
    # cut them out of longer words ('Office' -> 'fice', 'Theory' -> 'ory') and
    # so produce the wrong initial.
    words = [word for word in st.replace('&', '').split()
             if word not in ('The', 'of')]
    if len(words) < 2:
        # Return a real (empty) set, as documented.
        return set()

    first_letters = [word[0].capitalize() for word in words]

    abbr_1 = '.'.join(first_letters) + '.'
    abbr_2 = ''.join(first_letters)

    return {abbr_1, abbr_2}
get_abbr('National Securoty Agency')

{'N.S.A.', 'NSA'}

In [78]:
# One line per NER tag: the entity list after replacing each name by its longer variant.
# Single-argument print(...) prints the same on Python 2 and is also valid Python 3.
for tag in tag_collected:
    print(reduce_tag_tokens(tag_collected[tag]))

[u'Georgia Institute of Technology', u"Georgia Tech 's School of Literature", u'School of Information and Computer Science', u"Georgia Tech 's School of Literature", u'Carnegie Mellon University', u'School of Computer Science', u"Georgia Tech 's College of Computing", u"Georgia Tech 's School of Literature", u'Computational Media', u"Georgia Tech 's School of Literature", u'Ivan Allen College of Liberal Arts', u'The College of Computing', u"Georgia Tech 's College of Computing", u"Georgia Tech 's College of Computing", u"Georgia Tech 's School of Literature", u'School of Information Science', u"Georgia Tech 's School of Literature", u"Georgia Tech 's School of Literature", u'School of Information and Computer Science', u'Emory University', u'School of Information and Computer Science', u'IBM', u'School of Information and Computer Science', u"Georgia Tech 's College of Computing", u"Georgia Tech 's School of Literature", u"Georgia Tech 's School of Literature", u"Georgia Tech 's School 

In [88]:
# For every NER tag, print the tag followed by the frequency of each reduced entity name.
# Single-argument print(...) prints the same on Python 2 and is also valid Python 3.
for tag in tag_collected:
    print(tag)
    reduced_counts = nlp.sorted_ngram_count(reduce_tag_tokens(tag_collected[tag]))
    print('\n'.join(str(x) for x in reduced_counts))
    print('\n\n')

ORGANIZATION
((u'Georgia Tech',), 11)
((u'The College of Computing',), 7)
((u'School of Computer Science',), 3)
((u'School of Information and Computer Science',), 3)
((u'Computational Science & Engineering',), 2)
((u"Georgia Tech 's School of Literature",), 1)
((u'Computer Science',), 1)
((u"Georgia Tech 's College of Computing",), 1)
((u'Georgia Institute of Technology',), 1)
((u'College of Engineering and College of Sciences',), 1)
((u'Ivan Allen College of Liberal Arts',), 1)
((u'IBM',), 1)
((u'DARPA',), 1)
((u'College of Computing Dean',), 1)
((u'School of Interactive Computing',), 1)
((u'Carnegie Mellon University',), 1)
((u'Emory University',), 1)
((u'Computational Media',), 1)
((u'School of Information Science',), 1)



LOCATION
((u'United States',), 4)



PERSON
((u'Vladimir Slamecka',), 3)
((u'Chris Klaus',), 2)
((u'Richard DeMillo',), 2)
((u'Ray Miller',), 2)
((u'John Patrick Crecine',), 2)
((u'Vernon Crawford',), 1)
((u'Zvi Galil',), 1)
((u'Peter A. Freeman',), 1)
((u'Willia

In [33]:
# The page whose text was loaded into `text` in the setup section and analysed above.
url

'https://en.wikipedia.org/wiki/Georgia_Institute_of_Technology_College_of_Computing'