# Imports

In [1]:
import time
import json
import pickle
import pathlib
import itertools
import unicodedata

In [2]:
import nltk

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Constants

In [4]:
# Project root; the leading '~' is expanded to the current user's home directory.
PROJECT_DIR = pathlib.Path('~/project').expanduser()
# Data directory inside the project root.
DATA_DIR = PROJECT_DIR / 'data'
# Corpus directory used as the ``root`` of the corpus readers in this notebook.
CORPUS_ROOT = DATA_DIR / 'sample'

# PickledCorpusReader

In [5]:
# File id patterns of the form "<directory>/<hex name>.<extension>".
# The directory part may contain lowercase letters, underscores and
# whitespace; the leading (?!\.) rejects ids that begin with a dot.
DOC_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.json'
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
# Captures the leading directory component of a file id; used below as the
# default ``cat_pattern``, i.e. the directory name is the category.
CAT_PATTERN = r'([a-z_\s]+)/.*'

In [6]:
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

class PickledCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Categorized corpus reader for documents stored as pickle files.

    The accessor methods unpack each unpickled document as a list of
    paragraphs, each paragraph as a list of sentences, and each sentence
    as a list of (token, tag) tuples.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # CategorizedCorpusReader receives the dict itself (not **kwargs) and
        # consumes the cat_* entries; whatever is left (e.g. ``encoding``) is
        # forwarded to CorpusReader, as the docstring promises. This mirrors
        # nltk's own CategorizedPlaintextCorpusReader.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. This primarily bubbles up to
        the high level ``docs`` method, but is implemented here similar to
        the nltk ``CategorizedPlaintextCorpusReader``.

        Raises ``ValueError`` if both ``fileids`` and ``categories`` are given.
        Returns ``fileids`` unchanged (possibly ``None``) when no categories
        are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def feeds(self):
        """
        Returns the parsed contents of the ``feeds.json`` file located in
        the corpus root.
        """
        stream = self.open('feeds.json')
        try:
            return json.load(stream)
        finally:
            # Always release the file handle, even if the JSON is malformed.
            stream.close()

    def docs(self, fileids=None, categories=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. Similar to the BaleenCorpusReader, this uses a generator
        to achieve memory safe iteration.
        """
        # Resolve the fileids and the categories
        fileids = self._resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        # NOTE: unpickling executes arbitrary code; only point this reader at
        # corpora produced by a trusted preprocessing step.
        for path, _enc, _fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as f:
                yield pickle.load(f)

    def paras(self, fileids=None, categories=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for doc in self.docs(fileids, categories):
            yield from doc

    def sents(self, fileids=None, categories=None):
        """
        Returns a generator of sentences where each sentence is a list of
        (token, tag) tuples.
        """
        for paragraph in self.paras(fileids, categories):
            yield from paragraph

    def tagged_words(self, fileids=None, categories=None):
        """
        Returns a generator of (token, tag) tuples.
        """
        for sent in self.sents(fileids, categories):
            for token, tag in sent:
                yield token, tag

    def words(self, fileids=None, categories=None):
        """
        Returns a generator of tokens.
        """
        for sent in self.sents(fileids, categories):
            for token, _tag in sent:
                yield token

    def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.

        Keys: ``files``, ``topics``, ``paras``, ``sents``, ``words``,
        ``vocab`` (distinct tokens), ``lexdiv`` (words / vocab), ``ppdoc``
        (paragraphs / files), ``sppar`` (sentences / paragraphs) and ``secs``
        (elapsed wall-clock seconds). A ratio whose denominator is zero is
        reported as 0.0 instead of raising ``ZeroDivisionError``.
        """
        started = time.time()

        def ratio(numerator, denominator):
            # An empty selection has no meaningful ratio; report 0.0.
            return numerator / denominator if denominator else 0.0

        # Structures to perform counting.
        counts = nltk.FreqDist()
        tokens = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in para:
                counts['sents'] += 1

                for word, _tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        resolved = self._resolve(fileids, categories)
        if isinstance(resolved, str):
            # A single fileid given as a bare string is one file, not
            # len(string) files.
            resolved = [resolved]
        n_fileids = len(resolved or self.fileids())
        n_topics = len(self.categories(resolved))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': ratio(counts['words'], len(tokens)),
            'ppdoc':  ratio(counts['paras'], n_fileids),
            'sppar':  ratio(counts['sents'], counts['paras']),
            'secs':   time.time() - started,
        }

In [7]:
from collections import Counter

# Tally how often each token occurs across the whole sample corpus.
corpus = PickledCorpusReader(root=CORPUS_ROOT.as_posix())
words = Counter(corpus.words())

# Distinct tokens versus total tokens seen.
vocabulary_size = len(words)
word_count = sum(words.values())
print(f"{vocabulary_size:,} vocabulary {word_count:,} word count")

58,748 vocabulary 1,624,862 word count


# Grammar-Based Feature Extraction

## Context-Free Grammars

In [8]:
# Toy context-free grammar in nltk's text format: one rule per line,
# quoted strings are terminal words and `|` separates alternatives.
GRAMMAR = """
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen' | 'George'
    V -> 'looks' | 'burns'
    P -> 'in' | 'for'
    DT -> 'the'
    N -> 'castle' | 'ocean'
"""

In [9]:
# Parse the grammar text into an nltk CFG object.
cfg = nltk.CFG.fromstring(GRAMMAR)

In [10]:
# Printing the grammar lists each `|` alternative as its own production.
print(cfg)

Grammar with 13 productions (start state = S)
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen'
    NNP -> 'George'
    V -> 'looks'
    V -> 'burns'
    P -> 'in'
    P -> 'for'
    DT -> 'the'
    N -> 'castle'
    N -> 'ocean'


In [11]:
# Start symbol of the grammar (S for the rules above).
cfg.start()

S

In [12]:
# All productions as a list of nltk production objects.
cfg.productions()

[S -> NNP VP,
 VP -> V PP,
 PP -> P NP,
 NP -> DT N,
 NNP -> 'Gwen',
 NNP -> 'George',
 V -> 'looks',
 V -> 'burns',
 P -> 'in',
 P -> 'for',
 DT -> 'the',
 N -> 'castle',
 N -> 'ocean']

## Syntactic Parsers

In [13]:
from nltk.chunk.regexp import RegexpParser

In [14]:
# Chunk grammar over part-of-speech tags. A "KT" chunk is an optional
# "<JJ>* <NN.*>+ <IN>" prefix followed by "<JJ>* <NN.*>+", e.g.
# "group of relievers" or "simple solution".
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
# Regular-expression chunker built from that grammar.
chunker = RegexpParser(GRAMMAR)

In [15]:
# Example sentence used to demonstrate the chunker.
sent = """
    Dusty Baker proposed a simple solution to the Washington Nationals early-season bullpen 
    troubles Monday afternoon and it had nothing to do with his maligned group of relievers.
"""
# Tokenize, part-of-speech tag, then chunk the tagged tokens into a tree.
tokens = nltk.word_tokenize(sent)
tagged = nltk.pos_tag(tokens)
chunked = chunker.parse(tagged)

In [16]:
# Show the parse tree; KT subtrees are the chunks matched by the grammar.
print(chunked)

(S
  (KT Dusty/NNP Baker/NNP)
  proposed/VBD
  a/DT
  (KT simple/JJ solution/NN)
  to/TO
  the/DT
  (KT Washington/NNP Nationals/NNP)
  (KT
    early-season/JJ
    bullpen/NN
    troubles/NNS
    Monday/NNP
    afternoon/NN)
  and/CC
  it/PRP
  had/VBD
  (KT nothing/NN)
  to/TO
  do/VB
  with/IN
  his/PRP$
  maligned/VBN
  (KT group/NN of/IN relievers/NNS)
  ./.)


## Extracting Keyphrases

In [17]:
# Default chunk grammar for KeyphraseExtractor (same KT pattern as above).
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
# Adjective and noun part-of-speech tags.
# NOTE(review): not referenced by any code visible in this notebook section.
GOODTAGS = frozenset(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])

In [18]:
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.

    ``grammar`` is an nltk regular-expression chunk grammar; every chunk it
    matches in a sentence becomes a candidate keyphrase.
    """
    def __init__(self, grammar=GRAMMAR):
        # Use the grammar that was passed in, not the module-level default,
        # so a custom grammar is actually honoured.
        self.grammar = grammar
        self.chunker = nltk.RegexpParser(self.grammar)

    def normalize(self, sentence):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.

        A token is dropped only when every one of its characters is in a
        Unicode punctuation category, so tokens such as "early-season"
        are kept.
        """
        return [(token.lower(), tag)
                for (token, tag) in sentence
                if not all(unicodedata.category(char).startswith('P')
                           for char in token)]

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for paragraph in document:
            for sentence in paragraph:
                sentence = self.normalize(sentence)
                if not sentence:
                    # Nothing left after stripping punctuation.
                    continue
                # Flatten the parse tree to (word, pos, chunk-tag) triples.
                chunks = nltk.tree2conlltags(
                    self.chunker.parse(sentence)
                )
                # Each maximal run of tokens whose chunk tag is not 'O'
                # becomes one phrase. NOTE(review): two chunks that touch
                # with no 'O' token between them end up in the same run and
                # are therefore joined into a single phrase.
                for in_chunk, group in itertools.groupby(
                    chunks, lambda term: term[-1] != 'O'
                ):
                    if in_chunk:
                        yield ' '.join(word for word, pos, chunk in group)

    def fit(self, documents, y=None):
        """No fitting is required; returns self."""
        return self

    def transform(self, documents):
        """
        Yields, for each document, the list of keyphrases extracted from it.
        """
        for document in documents:
            yield list(self.extract_keyphrases(document))

In [19]:
# Stream the pickled documents from the sample corpus.
corpus = PickledCorpusReader(root=CORPUS_ROOT.as_posix())
docs = corpus.docs()

# transform() is a generator, so wrap it in list() to materialize the
# per-document keyphrase lists, then show those of the first document.
keyphrase_extractor = KeyphraseExtractor()
keyphrases = list(keyphrase_extractor.fit_transform(docs))
print(keyphrases[0])

['lonely city', 'heart piercing wisdom', 'loneliness', 'laing', 'everyone', 'feast later', 'point', 'own hermetic existence in new york', 'danger', 'thankfully', 'lonely city', 'cry for connection', 'overcrowded overstimulated world', 'blueprint of urban loneliness', 'emotion', 'calls', 'city', 'npr jason heller', 'olivia laing', 'lonely city', 'exploration of loneliness', 'others experiences in new york city', 'rumpus', 'review', 'lonely city', 'related posts']


## Extracting Entities

In [20]:
# Named-entity chunk labels kept by EntityExtractor by default.
GOODLABELS = frozenset(['PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'])

In [21]:
class EntityExtractor(BaseEstimator, TransformerMixin):
    """
    Transformer that pulls named entities out of pos-tagged documents,
    keeping only entities whose chunk label is in ``labels``.
    """

    def __init__(self, labels=GOODLABELS, **kwargs):
        self.labels = labels

    def get_entities(self, document):
        """
        Returns the lowercased, space-joined entities found in a document
        (a list of paragraphs, each a list of tagged sentences).
        """
        found = []
        for paragraph in document:
            for sentence in paragraph:
                for node in nltk.ne_chunk(sentence):
                    # Plain (token, tag) tuples have no label; only subtrees do.
                    if not hasattr(node, 'label'):
                        continue
                    if node.label() not in self.labels:
                        continue
                    found.append(' '.join(word.lower() for word, tag in node))
        return found

    def fit(self, documents, labels=None):
        """No fitting is required; returns self."""
        return self

    def transform(self, documents):
        """Yields the list of entities for each document in turn."""
        yield from map(self.get_entities, documents)

In [22]:
# Stream the pickled documents from the sample corpus.
corpus = PickledCorpusReader(root=CORPUS_ROOT.as_posix())
docs = corpus.docs()

# transform() is a generator, so wrap it in list() to materialize the
# per-document entity lists, then show those of the first document.
entity_extractor = EntityExtractor()
entities = list(entity_extractor.fit_transform(docs))
print(entities[0])

['lonely city', 'loneliness', 'laing', 'new york', 'lonely city', 'npr', 'jason heller', 'olivia laing', 'lonely city', 'new york city', 'rumpus', 'lonely city', 'related']


# n-Gram Feature Extraction

## An n-Gram-Aware CorpusReader

## Choosing the Right n-Gram Window

## Significant Collocations

# n-Gram Language Models

## Frequency and Conditional Frequency

## Estimating Maximum Likelihood

## Unknown Words: Back-off and Smoothing

## Language Generation