In [None]:
"""
A few examples of Python NLTK for Natural Language Processing
# Source: https://github.com/DistrictDataLabs/intro-to-nltk/blob/master/NLTK.ipynb
"""
import nltk

# The NLTK corpora used below must be downloaded before running the remaining
# cells (e.g. with nltk.download()); they are already present on my machine.

In [None]:
# Load _Moby Dick_ and show 10 concordance lines for the word "monstrous".
# Per the NLTK documentation, `width` is the total width of each displayed
# line (keyword plus the context around it), not the context per side.
moby_words = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
moby = nltk.text.Text(moby_words)
moby.concordance("monstrous", width=55, lines=10)

In [None]:
# Show words that appear in contexts similar to those of "monstrous".
# Text.similar() prints its result itself and returns None, so it is called
# directly; wrapping the call in `print` would emit a stray "None" line.
moby.similar("monstrous")

# Blank line separating the two result lists.
print("")

# Then load _Sense and Sensibility_ and do the same
austen = nltk.text.Text(nltk.corpus.gutenberg.words('austen-sense.txt'))
austen.similar("monstrous")

In [None]:
# Load the Reuters news corpus and report some summary statistics.
reuters = nltk.corpus.reuters
counts  = nltk.FreqDist(reuters.words())  # word type -> number of occurrences
vocab   = len(counts)                     # number of distinct word types
words   = counts.N()                      # total number of tokens
lexdiv  = float(words) / float(vocab)     # lexical diversity = tokens / types

print("Corpus has %i types and %i tokens for a lexical diversity of %0.3f" % (vocab, words, lexdiv))
print("")
print("Most Frequent Type: " + str(counts.max()))
print("")

# Format the 40 most frequent types as "token (count)". A generator
# expression is used so that no loop variable is left behind in the kernel
# namespace (the previous loop variable was named `type`, which shadowed the
# builtin for every later cell).
most_common = ", ".join(
    "%s (%d)" % (token, freq) for token, freq in counts.most_common(40)
)
print("40 Most Common Types:")
print(most_common)
print("")

# Hapax legomena are types that occur exactly once; show the first ten.
hapaxes = ", ".join(counts.hapaxes()[0:10])
print("Top 10 Hapax Legomena: " + hapaxes)
print("")

# Relative frequency of a single type, expressed as a percentage.
percent = format(counts.freq('stipulate') * 100, '.12f')
print('% of Corpus Occupied by "stipulate": ' + percent)

In [None]:
# Draw the (non-cumulative) frequency curve of the most frequent word types.
top_n_types = 200
counts.plot(top_n_types, cumulative=False)

In [None]:
# Split a sample news passage into sentences and print each one followed by
# a blank line.
text = u"Medical personnel returning to New York and New Jersey from the Ebola-riddled countries in West Africa will be automatically quarantined if they had direct contact with an infected person, officials announced Friday. New York Gov. Andrew Cuomo (D) and New Jersey Gov. Chris Christie (R) announced the decision at a joint news conference Friday at 7 World Trade Center. “We have to do more,” Cuomo said. “It’s too serious of a situation to leave it to the honor system of compliance.” They said that public-health officials at John F. Kennedy and Newark Liberty international airports, where enhanced screening for Ebola is taking place, would make the determination on who would be quarantined. Anyone who had direct contact with an Ebola patient in Liberia, Sierra Leone or Guinea will be quarantined. In addition, anyone who traveled there but had no such contact would be actively monitored and possibly quarantined, authorities said. This news came a day after a doctor who had treated Ebola patients in Guinea was diagnosed in Manhattan, becoming the fourth person diagnosed with the virus in the United States and the first outside of Dallas. And the decision came not long after a health-care worker who had treated Ebola patients arrived at Newark, one of five airports where people traveling from West Africa to the United States are encountering the stricter screening rules."

sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    print(sentence)
    print("")

In [None]:
# Part-of-Speech (POS) tagging: tokenize each sentence into words, tag them,
# and print one "word: TAG" line per token.
for sent in nltk.sent_tokenize(text):
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
    for word, tag in tagged_tokens:
        # Encode to UTF-8 bytes so non-ASCII tokens (e.g. curly quotes) print.
        print(word.encode('utf-8') + ": " + tag)

In [56]:
"""
Demonstrate Zipf's Law on the Brown Corpus.
#Source: https://finnaarupnielsen.wordpress.com/2013/10/22/zipf-plot-for-word-counts-in-brown-corpus/
"""
from __future__ import division
from itertools import *
from pylab import *
from nltk.corpus import brown
from string import lower
from collections import Counter

# The data: lower-cased token counts from the Brown corpus.
# NOTE: `counts` rebinds the name used for the Reuters FreqDist in an earlier
# cell, and `text` below is the plotting function brought in by
# `from pylab import *`, not the `text` string of the sentence-tokenizing
# cell, so those earlier cells cannot be re-run after this one without
# re-running the cells that define those names.
tokens_with_count = Counter(imap(lower, brown.words()))
# values() and keys() of an unmodified mapping iterate in the same order, so
# counts[i] is the number of occurrences of tokens[i].
counts = array(tokens_with_count.values())
tokens = tokens_with_count.keys()

# A Zipf plot: absolute frequency against frequency rank, both on log axes.
ranks = arange(1, len(counts)+1)
indices = argsort(-counts)          # positions of the counts, largest first
frequencies = counts[indices]
loglog(ranks, frequencies, marker=".")
title("Zipf plot for Brown corpus tokens")
xlabel("Frequency rank of token")
ylabel("Absolute frequency of token")
grid(True)

# Annotate about 20 points spaced evenly along the logarithmic rank axis.
# The last logspace sample is ~len(counts) and may truncate to exactly
# len(counts), which is one past the final valid index, so the positions are
# clipped to the valid range. The first few samples all truncate to 0, so
# duplicates are dropped to avoid drawing the same label repeatedly.
label_positions = logspace(-0.5, log10(len(counts)), 20).astype(int)
label_positions = unique(clip(label_positions, 0, len(counts) - 1))
for n in label_positions:
    text(ranks[n], frequencies[n], " " + tokens[indices[n]],
         verticalalignment="bottom",
         horizontalalignment="left")

show()