In [1]:
# Natural language toolkit is an entry point for all contemporary NL tools
# pip install nltk
import nltk

In [2]:
# before using - download the data http://www.nltk.org/data.html
# whole package is ~3G
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# text tokenization and tagging
text = "This is a sentence that we will use to test the magic tool"
# requires tokenizers/punkt/english.pickle of nltk_data
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens, tagset='universal')
print("default:", tagged, "\n")

# use this corpus to indentfy parts of speech
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
# unigram tagger does not consider any context, that's why it can make mistakes for words like "TEST"
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
bigram_tagger = nltk.BigramTagger(brown_tagged_sents)

tagged_u = unigram_tagger.tag(tokens)
print("unigram:", tagged_u, "\n")
tagged_b = bigram_tagger.tag(tokens)
print("bigram:", tagged_b, "\n")
# notice, that 'test' in bigrams detected correctly!

# For meanings and more information see http://www.nltk.org/book/ch05.html "2.3 A Universal Part-of-Speech Tagset"

default: [('This', 'DET'), ('is', 'VERB'), ('a', 'DET'), ('sentence', 'NOUN'), ('that', 'ADP'), ('we', 'PRON'), ('will', 'VERB'), ('use', 'VERB'), ('to', 'PRT'), ('test', 'VERB'), ('the', 'DET'), ('magic', 'NOUN'), ('tool', 'NOUN')] 

unigram: [('This', 'DT'), ('is', 'BEZ'), ('a', 'AT'), ('sentence', 'NN'), ('that', 'CS'), ('we', 'PPSS'), ('will', 'MD'), ('use', 'VB'), ('to', 'TO'), ('test', 'NN'), ('the', 'AT'), ('magic', 'JJ'), ('tool', 'NN')] 

bigram: [('This', 'DT'), ('is', 'BEZ'), ('a', 'AT'), ('sentence', 'NN'), ('that', 'CS'), ('we', 'PPSS'), ('will', 'MD'), ('use', 'VB'), ('to', 'TO'), ('test', 'VB'), ('the', 'AT'), ('magic', None), ('tool', None)] 



In [8]:
# grammar - https://stackoverflow.com/questions/6115677/english-grammar-for-parsing-in-nltk
# you can specify your own simple grammar

tokens = "I shot an elephant in my pajamas".split()
my_grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
""")

# there are a lot of parsing algorithms: see http://www.nltk.org/book/ch08.html
# Each parser need a grammar as input
parser = nltk.ChartParser(my_grammar)
trees = parser.parse(tokens)

for tree in trees:
    print("Possible sentence structure:\n", tree)
    tree.draw()

Possible sentence structure:
 (S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
Possible sentence structure:
 (S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [5]:
# this is more complex grammar, but it only works with predefined sentences. 
# You can extend this grammar with lexicon
better_grammar = nltk.data.load('file:D:/nltk_data/grammars/large_grammars/atis.cfg')
parser = nltk.ChartParser(better_grammar)
# trees = parser.parse(tokens) - this will fail, because grammar is not full

In [6]:
# pip install git+https://github.com/emilmont/pyStatParser
# but there are tools, that must not fail
from stat_parser import Parser
parser = Parser()
trees = parser.parse("Hey, Siri, call my mom")
print(trees)
trees.draw()

(NP+VP
  (NP (NN hey))
  (, ,)
  (NP (NNP Siri) (, ,) (VP (VB call) (NP (PRP$ my) (NN mom)))))


In [7]:
# nltk also provides a lot of interesting tools for natural text processing
# http://www.nltk.org/book/ch05.html
# E.g. context similarity in a corpus
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
# this will print an answer
text.similar('woman')

man time day year car moment world house family child country boy
state job place way war girl work word
