# Imports

In [1]:
import itertools
import unicodedata

In [2]:
import nltk

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Constants

# Grammar-Based Feature Extraction

## Context-Free Grammars

In [4]:
# Toy context-free grammar in NLTK's CFG string notation.
# A sentence (S) is a proper noun (NNP) followed by a verb phrase (VP);
# the VP is a verb plus a prepositional phrase (PP), and the PP is a
# preposition plus a determiner/noun pair (NP). Quoted strings are the
# terminal words; `|` separates alternative expansions.
GRAMMAR = """
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen' | 'George'
    V -> 'looks' | 'burns'
    P -> 'in' | 'for'
    DT -> 'the'
    N -> 'castle' | 'ocean'
"""

In [5]:
# Build an nltk CFG object from the grammar string held in GRAMMAR.
cfg = nltk.CFG.fromstring(GRAMMAR)

In [6]:
# Printing the grammar shows its start symbol and one production per line,
# with each `|` alternative expanded into its own production.
print(cfg)

Grammar with 13 productions (start state = S)
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen'
    NNP -> 'George'
    V -> 'looks'
    V -> 'burns'
    P -> 'in'
    P -> 'for'
    DT -> 'the'
    N -> 'castle'
    N -> 'ocean'


In [7]:
# The grammar's start symbol.
cfg.start()

S

In [8]:
# All productions of the grammar as a list.
cfg.productions()

[S -> NNP VP,
 VP -> V PP,
 PP -> P NP,
 NP -> DT N,
 NNP -> 'Gwen',
 NNP -> 'George',
 V -> 'looks',
 V -> 'burns',
 P -> 'in',
 P -> 'for',
 DT -> 'the',
 N -> 'castle',
 N -> 'ocean']

## Syntactic Parsers

In [9]:
from nltk.chunk.regexp import RegexpParser

In [10]:
# Chunk pattern over part-of-speech tags defining a "KT" chunk:
# an optional prefix of (adjectives, one or more nouns, a preposition)
# followed by optional adjectives and one or more nouns.
# NOTE: this rebinds GRAMMAR, which an earlier cell used for the CFG string;
# re-running the CFG cells after this one will no longer see the CFG grammar.
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
# Regular-expression chunk parser built from the pattern above.
chunker = RegexpParser(GRAMMAR)

In [11]:
# Example sentence used to demonstrate the chunker.
sent = """
    Dusty Baker proposed a simple solution to the Washington Nationals early-season bullpen 
    troubles Monday afternoon and it had nothing to do with his maligned group of relievers.
"""
# Split the raw text into word tokens.
tokens = nltk.word_tokenize(sent)
# Attach a part-of-speech tag to each token, giving (token, tag) pairs.
tagged = nltk.pos_tag(tokens)
# Group the tagged tokens into KT chunks; the result is a parse tree.
chunked = chunker.parse(tagged)

In [12]:
# Show the chunk tree: KT subtrees hold the matched phrases, while tokens
# outside any chunk appear directly under the root S node.
print(chunked)

(S
  (KT Dusty/NNP Baker/NNP)
  proposed/VBD
  a/DT
  (KT simple/JJ solution/NN)
  to/TO
  the/DT
  (KT Washington/NNP Nationals/NNP)
  (KT
    early-season/JJ
    bullpen/NN
    troubles/NNS
    Monday/NNP
    afternoon/NN)
  and/CC
  it/PRP
  had/VBD
  (KT nothing/NN)
  to/TO
  do/VB
  with/IN
  his/PRP$
  maligned/VBN
  (KT group/NN of/IN relievers/NNS)
  ./.)


## Extracting Keyphrases

In [13]:
# Default keyphrase chunk pattern (same pattern as used for `chunker` in the
# Syntactic Parsers section); KeyphraseExtractor uses it as its default grammar.
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
# Adjective and noun part-of-speech tags.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
GOODTAGS = frozenset(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])

In [14]:
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Transformer that extracts keyphrases from part-of-speech tagged
    documents using a regular-expression chunk grammar.

    Each document is iterated as paragraphs, each paragraph as sentences,
    and each sentence as a sequence of (token, tag) tuples.

    Parameters
    ----------
    grammar : str, default=GRAMMAR
        Chunk grammar handed to ``nltk.RegexpParser``.
    """
    def __init__(self, grammar=GRAMMAR):
        # Store the argument itself rather than the module-level constant,
        # otherwise a caller-supplied grammar would be silently ignored.
        self.grammar = grammar
        self.chunker = nltk.RegexpParser(self.grammar)

    def normalize(self, sentence):
        """
        Lowercases the tokens of a tagged sentence and drops every token
        that consists solely of punctuation characters.

        Parameters
        ----------
        sentence : iterable of (token, tag) tuples

        Returns
        -------
        list of (token, tag) tuples with lowercased tokens; tags are
        left unchanged.
        """
        return [
            (token.lower(), tag)
            for (token, tag) in sentence
            # Unicode categories starting with 'P' are punctuation; a token
            # is dropped only when *all* of its characters are punctuation,
            # so hyphenated words such as 'early-season' are kept.
            if not all(unicodedata.category(char).startswith('P')
                       for char in token)
        ]

    def extract_keyphrases(self, document):
        """
        Yields the keyphrases found in a single document.

        Every sentence is normalized, parsed with the chunker, and the
        resulting tree is flattened into (word, pos, chunk-tag) triples.
        Consecutive triples that belong to a chunk are joined with spaces
        into one phrase.

        Parameters
        ----------
        document : iterable of paragraphs, each an iterable of sentences,
            each a sequence of (token, tag) tuples

        Yields
        ------
        str
            One lowercased keyphrase at a time, in document order.
        """
        for paragraph in document:
            for sentence in paragraph:
                sentence = self.normalize(sentence)
                # Skip sentences that were nothing but punctuation.
                if not sentence:
                    continue
                chunks = nltk.tree2conlltags(
                    self.chunker.parse(sentence)
                )
                # Group on "inside a chunk" vs. "outside" ('O'). Because the
                # key only distinguishes chunked from unchunked tokens, two
                # chunks that are directly adjacent are merged into a single
                # phrase.
                keyphrases = [
                    ' '.join(word for word, pos, chunk in group)
                    for in_chunk, group in itertools.groupby(
                        chunks, lambda term: term[-1] != 'O'
                    ) if in_chunk
                ]
                yield from keyphrases

    def fit(self, documents, y=None):
        """
        No-op; the extractor has nothing to learn. Returns self.
        """
        return self

    def transform(self, documents):
        """
        Lazily yields, for each document, the list of its keyphrases.
        """
        for document in documents:
            yield list(self.extract_keyphrases(document))

In [15]:
# Extractor instance created with the default grammar argument.
keyphrase_extractor = KeyphraseExtractor()

In [16]:
# The extractor iterates documents -> paragraphs -> sentences, so the single
# tagged sentence is wrapped three levels deep: one document holding one
# paragraph holding one sentence. list() materializes the generator-based
# transform output.
list(keyphrase_extractor.fit_transform([[[tagged]]]))

[['dusty baker',
  'simple solution',
  'washington nationals early-season bullpen troubles monday afternoon',
  'nothing',
  'group of relievers']]

In [None]:
# from reader import PickledCorpusReader

# corpus = PickledCorpusReader('../corpus')
# docs = corpus.docs()

# phrase_extractor = KeyphraseExtractor()
# keyphrases = list(phrase_extractor.fit_transform(docs))
# print(keyphrases[0])

## Extracting Entities

# n-Gram Feature Extraction

## An n-Gram-Aware CorpusReader

## Choosing the Right n-Gram Window

## Significant Collocations

# n-Gram Language Models

## Frequency and Conditional Frequency

## Estimating Maximum Likelihood

## Unknown Words: Back-off and Smoothing

## Language Generation