##Practice with NLTK Chunking Code

The CONLL Corpus is tagged with NP, VP, and PP chunks and has a built in  training set. 

In [1]:
import nltk

def train_conll_chunker(chunk_type='NP'):
    chunk_trees = nltk.corpus.conll2000.chunked_sents('train.txt', chunk_type)
    return (chunk_trees)
    

In [2]:
# take a look at the output of the NP chunker
np_trees = train_conll_chunker('NP')
print(np_trees[3])


(S
  (NP This/DT)
  has/VBZ
  increased/VBN
  (NP the/DT risk/NN)
  of/IN
  (NP the/DT government/NN)
  being/VBG
  forced/VBN
  to/TO
  increase/VB
  (NP base/NN rates/NNS)
  to/TO
  (NP 16/CD %/NN)
  from/IN
  (NP their/PRP$ current/JJ 15/CD %/NN level/NN)
  to/TO
  defend/VB
  (NP the/DT pound/NN)
  ,/,
  (NP economists/NNS)
  and/CC
  (NP foreign/JJ exchange/NN market/NN analysts/NNS)
  say/VBP
  ./.)


In [3]:
# take a look at the output of the NP chunker

vp_trees = train_conll_chunker('VP')
print(vp_trees[3])

(S
  This/DT
  (VP has/VBZ increased/VBN)
  the/DT
  risk/NN
  of/IN
  the/DT
  government/NN
  (VP being/VBG forced/VBN to/TO increase/VB)
  base/NN
  rates/NNS
  to/TO
  16/CD
  %/NN
  from/IN
  their/PRP$
  current/JJ
  15/CD
  %/NN
  level/NN
  (VP to/TO defend/VB)
  the/DT
  pound/NN
  ,/,
  economists/NNS
  and/CC
  foreign/JJ
  exchange/NN
  market/NN
  analysts/NNS
  (VP say/VBP)
  ./.)


Demonstrate tracing by testing two chunkers with different rule ordering.

In [4]:
# NP chunker which puts NPs with adjectives first in the rule ordering
cp1 = nltk.RegexpParser(r'''
NP: {<DT><JJ.*><NN.*>} #Chunk det+adj+noun
    {<DT|NN.*>+}      #Chunk sequences of DT and noun (any number of determiners)
    ''')

# NP chunker which puts NPs with adjectives second in the rule ordering
cp2 = nltk.RegexpParser(r'''
NP:   {<DT|NN.*>+}      #Chunk sequences of DT and noun
     {<DT><JJ.*><NN.*>} #Chunk det+adj+noun
     ''')

Make sample sentence

In [5]:
tagged_tokens = nltk.pos_tag(nltk.word_tokenize("The enchantress clutched the beautiful hair"))
tagged_tokens

[('The', 'DT'),
 ('enchantress', 'NN'),
 ('clutched', 'VBD'),
 ('the', 'DT'),
 ('beautiful', 'JJ'),
 ('hair', 'NN')]

Compare the output of cp1 and cp2, and watch the trace as it happens.

In [1]:
print(cp1.parse(tagged_tokens, trace=1))

NameError: name 'cp1' is not defined

In [7]:
print(cp2.parse(tagged_tokens, trace=1))

# Input:
 <DT>  <NN>  <VBD>  <DT>  <JJ>  <NN> 
# Chunk sequences of DT and noun:
{<DT>  <NN>} <VBD> {<DT>} <JJ> {<NN>}
# Chunk det+adj+noun:
{<DT>  <NN>} <VBD> {<DT>} <JJ> {<NN>}
(S
  (NP The/DT enchantress/NN)
  clutched/VBD
  (NP the/DT)
  beautiful/JJ
  (NP hair/NN))


###Why did these two outputs differ?###

#Collocations Practice#

### Use NLTK bigram and trigram functions to create collocations and compute interesting ones using PMI ###

In [7]:
import nltk
from nltk.collocations import *
import string, random

text = "pubmed_depression.txt"
#take sept 7 hw, load in your text, split by spaces

# Analyze my own text
bigram_measures = nltk.collocations.BigramAssocMeasures()

finder = BigramCollocationFinder.from_words(text)


In [8]:
finder.ngram_fd.most_common(10)   #going straight into FD, look at most common

[(('u', 'b'), 1),
 (('.', 't'), 1),
 (('t', 'x'), 1),
 (('p', 'u'), 1),
 (('p', 'r'), 1),
 (('e', 'p'), 1),
 (('i', 'o'), 1),
 (('x', 't'), 1),
 (('m', 'e'), 1),
 (('e', 's'), 1)]

In [10]:
# Remove tokens that start with punctuation
finder.apply_word_filter(lambda w: w[0] in string.punctuation)

finder.ngram_fd.most_common(10) #dont have punc, but a lot of stop words

[(('in', 'the'), 250),
 (('of', 'the'), 233),
 (('to', 'the'), 135),
 (('on', 'the'), 133),
 (('at', 'the'), 114),
 (('and', 'the'), 111),
 (('to', 'be'), 111),
 (('it', 'was'), 104),
 (('was', 'a'), 97),
 (('he', 'had'), 91)]

In [11]:
# Remove stopwords 
stop_words = nltk.corpus.stopwords.words('english')
finder.apply_word_filter(lambda w: w.lower() in stop_words)

finder.ngram_fd.most_common(10) 

[(('could', 'see'), 15),
 (('old', 'man'), 14),
 (('New', 'York'), 13),
 (('Mike', 'Deegan'), 12),
 (('Old', 'Man'), 12),
 (("I've", 'got'), 11),
 (('young', 'men'), 9),
 (('Poor', 'John'), 9),
 (("didn't", 'know'), 9),
 (('looked', 'like'), 9)]

In [12]:
# Apply pmi measure to rank the remaining bigrams
finder.nbest(bigram_measures.pmi, 10)


[('A.M.', 'starring'),
 ('A40-AjK', 'Mercedes'),
 ('Air', 'Force'),
 ('Akita', 'prefectures'),
 ('Appian', 'Way'),
 ('Arc', 'de'),
 ('Armed', 'Forces'),
 ('Ash', 'Road'),
 ('Auto', 'Company'),
 ("Best's", 'Liliputian')]

In [13]:
# Those results aren't great.  To fix PMI, make sure bigrams occur at least 2 times
print(finder.ngram_fd.N())
finder.apply_freq_filter(2)
print(finder.ngram_fd.N())

9476
1626


In [14]:
# apply pmi measure to rank the remaining bigrams
finder.nbest(bigram_measures.pmi, 10)


[('Bobbsey', 'Twins'),
 ('Bon', 'jour'),
 ('Crazy', 'Horse'),
 ('Jour', 'et'),
 ('Mt.', 'Pleasant'),
 ('Signor', 'Raymond'),
 ('Toodle', 'Williams'),
 ('Unit', 'Number'),
 ('V-shaped', 'inlet'),
 ('Young', "Christians'")]

### Do the same thing for news text ###

In [15]:
# Do the same thing for news text
finder = BigramCollocationFinder.from_words(
    nltk.corpus.brown.words(categories="news"))
stop_words = nltk.corpus.stopwords.words('english')

finder.apply_word_filter(lambda w: w[0] in string.punctuation)
finder.apply_word_filter(lambda w: w.lower() in stop_words)
finder.nbest(bigram_measures.chi_sq, 10)

[('1,257,700', 'non-farm'),
 ('100-yard', 'dash'),
 ('1044', 'Chestnut'),
 ('11-7', 'collapse'),
 ('1200', 'Larimer'),
 ('13-5', 'barrage'),
 ('165-unit', 'Harbor'),
 ('1671', 'Nakoma'),
 ('182', 'scholastics'),
 ('2-and-2', 'pitches')]

### Let's do trigram collocations ###

In [16]:
def trigram_collocations(words, num):
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(words)
    finder.apply_word_filter(lambda w: w[0] in string.punctuation)
    stop_words = nltk.corpus.stopwords.words('english')
    finder.apply_word_filter(lambda w: w.lower() in stop_words)
    finder.apply_freq_filter(2)
    return(finder.nbest(bigram_measures.pmi, num))

print("\nTrigram collocations for romance: ")
print(trigram_collocations(nltk.corpus.brown.words(categories="romance"), 20))

print("\nTrigram collocations for news: ")
print(trigram_collocations(nltk.corpus.brown.words(categories="news"), 20))



Trigram collocations for romance: 
[('Jour', 'et', 'Nuit'), ('Chief', 'Crazy', 'Horse'), ('Young', "Christians'", 'League'), ('hundred', 'dollar', 'bill'), ('wet', 'graham', 'crackers'), ('Evadna', 'Mae', 'Evans'), ('tightly', 'curled', 'paot'), ('Dr.', 'Fortman', 'says'), ('Mrs.', 'Gertrude', 'Parker'), ('New', 'York', 'harbor'), ('Cousin', 'Alexander', 'Carraway'), ('Frankie', 'Ricco', 'sat'), ('Poor', 'Cousin', 'Elec'), ('inning', 'Mike', 'Deegan'), ('Miss', 'Theresa', 'Stubblefield'), ('Old', 'Mr.', 'Thom'), ('told', 'Miss', 'Groggins'), ("isn't", 'strong', 'enough'), ('said', 'Samuel', 'Burns'), ('Gratt', 'Shafer', 'would')]

Trigram collocations for news: 
[('Ku', 'Klux', 'Klan'), ('Pinar', 'Del', 'Rio'), ('Rural', 'Roads', 'Authority'), ('Post', 'Office', 'Box'), ('Diversified', 'Growth', 'Stock'), ("Patrick's", 'Day', 'Purse'), ('Notre', 'Dame', 'Chapter'), ('Growth', 'Stock', 'Fund'), ('electronic', 'data', 'processing'), ('esprit', 'de', 'corps'), ('La', 'Dolce', 'Vita'), ('