#### NLTK stands for Natural Language Toolkit

https://www.nltk.org/book/ NLTK Book

In [1]:
import nltk

### Tokenizing and tagging some text

In [2]:
text= "This text is going to be tokenized and tagged by Ria Lele."

In [3]:
tokens=nltk.word_tokenize(text)

In [4]:
tokens

['This',
 'text',
 'is',
 'going',
 'to',
 'be',
 'tokenized',
 'and',
 'tagged',
 'by',
 'Ria',
 'Lele',
 '.']

In [5]:
tagged=nltk.pos_tag(tokens)

In [6]:
tagged

[('This', 'DT'),
 ('text', 'NN'),
 ('is', 'VBZ'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('tokenized', 'VBN'),
 ('and', 'CC'),
 ('tagged', 'VBN'),
 ('by', 'IN'),
 ('Ria', 'NNP'),
 ('Lele', 'NNP'),
 ('.', '.')]

### Identifying the named entities

In [7]:
entities = nltk.chunk.ne_chunk(tagged)

In [8]:
print(entities)

(S
  This/DT
  text/NN
  is/VBZ
  going/VBG
  to/TO
  be/VB
  tokenized/VBN
  and/CC
  tagged/VBN
  by/IN
  (PERSON Ria/NNP Lele/NNP)
  ./.)


Identified the Name Entity 'Ria Lele' as a Person correctly

### Displaying a parse tree

In [9]:
from nltk.corpus import treebank

In [10]:
t= treebank.parsed_sents('wsj_0001.mrg')[0]
t.draw()

This displays a tree as a seperate tab on the device

### Investigating the corpora provided by NLTK

In [11]:
import os
import nltk.corpus

In [12]:
print(os.listdir(nltk.data.find("corpora")))

['abc', 'abc.zip', 'alpino', 'alpino.zip', 'biocreative_ppi', 'biocreative_ppi.zip', 'brown', 'brown.zip', 'brown_tei', 'brown_tei.zip', 'cess_cat', 'cess_cat.zip', 'cess_esp', 'cess_esp.zip', 'chat80', 'chat80.zip', 'city_database', 'city_database.zip', 'cmudict', 'cmudict.zip', 'comparative_sentences', 'comparative_sentences.zip', 'comtrans.zip', 'conll2000', 'conll2000.zip', 'conll2002', 'conll2002.zip', 'conll2007.zip', 'crubadan', 'crubadan.zip', 'dependency_treebank', 'dependency_treebank.zip', 'dolch', 'dolch.zip', 'europarl_raw', 'europarl_raw.zip', 'extended_omw', 'extended_omw.zip', 'floresta', 'floresta.zip', 'framenet_v15', 'framenet_v15.zip', 'framenet_v17', 'framenet_v17.zip', 'gazetteers', 'gazetteers.zip', 'genesis', 'genesis.zip', 'gutenberg', 'gutenberg.zip', 'ieer', 'ieer.zip', 'inaugural', 'inaugural.zip', 'indian', 'indian.zip', 'jeita.zip', 'kimmo', 'kimmo.zip', 'knbc.zip', 'lin_thesaurus', 'lin_thesaurus.zip', 'machado.zip', 'mac_morpho', 'mac_morpho.zip', 'masc_

In [13]:
from nltk.corpus import brown
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [14]:
hamlet= nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')

In [15]:
hamlet

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', ...]

In [16]:
for word in hamlet[:500]:
    print(word, sep=' ',end=' ')

[ The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco two Centinels . Barnardo . Who ' s there ? Fran . Nay answer me : Stand & vnfold your selfe Bar . Long liue the King Fran . Barnardo ? Bar . He Fran . You come most carefully vpon your houre Bar . ' Tis now strook twelue , get thee to bed Francisco Fran . For this releefe much thankes : ' Tis bitter cold , And I am sicke at heart Barn . Haue you had quiet Guard ? Fran . Not a Mouse stirring Barn . Well , goodnight . If you do meet Horatio and Marcellus , the Riuals of my Watch , bid them make hast . Enter Horatio and Marcellus . Fran . I thinke I heare them . Stand : who ' s there ? Hor . Friends to this ground Mar . And Leige - men to the Dane Fran . Giue you good night Mar . O farwel honest Soldier , who hath relieu ' d you ? Fra . Barnardo ha ' s my place : giue you goodnight . Exit Fran . Mar . Holla Barnardo Bar . Say , what is Horatio there ? Hor . A peece of him Bar 

In [17]:
from nltk.tokenize import word_tokenize

In [18]:
text_t= word_tokenize(text)

In [19]:
print(text_t)

['This', 'text', 'is', 'going', 'to', 'be', 'tokenized', 'and', 'tagged', 'by', 'Ria', 'Lele', '.']


#### Frequency Distinct gives the word count

In [20]:
from nltk.probability import FreqDist
fdist= FreqDist()

In [21]:
for word in text_t:
    fdist[word.lower()]+=1
fdist

FreqDist({'this': 1, 'text': 1, 'is': 1, 'going': 1, 'to': 1, 'be': 1, 'tokenized': 1, 'and': 1, 'tagged': 1, 'by': 1, ...})

In [22]:
fdist['text']

1

In [23]:
fdist_top10=fdist.most_common(10)
fdist_top10

[('this', 1),
 ('text', 1),
 ('is', 1),
 ('going', 1),
 ('to', 1),
 ('be', 1),
 ('tokenized', 1),
 ('and', 1),
 ('tagged', 1),
 ('by', 1)]

#### In this example all the words have the same frequncy. This could be helpful in other cases.

#### Blankline Tokenize

In [24]:
from nltk.tokenize import blankline_tokenize

In [25]:
text_b=blankline_tokenize(text)

In [26]:
len(text_b)

1

#### This gives the number of paragraphs seperated by a new line. Here its 1. 

text_b[n-1] will give you the nth paragraph which is seperated by a new line. 

### Bigrams, Trigrams and Ngrams

In [27]:
from nltk.util import bigrams, trigrams, ngrams

In [28]:
text_bigrams= list(nltk.bigrams(text_t))
text_bigrams

[('This', 'text'),
 ('text', 'is'),
 ('is', 'going'),
 ('going', 'to'),
 ('to', 'be'),
 ('be', 'tokenized'),
 ('tokenized', 'and'),
 ('and', 'tagged'),
 ('tagged', 'by'),
 ('by', 'Ria'),
 ('Ria', 'Lele'),
 ('Lele', '.')]

In [29]:
text_trigrams= list(nltk.trigrams(text_t))
text_trigrams

[('This', 'text', 'is'),
 ('text', 'is', 'going'),
 ('is', 'going', 'to'),
 ('going', 'to', 'be'),
 ('to', 'be', 'tokenized'),
 ('be', 'tokenized', 'and'),
 ('tokenized', 'and', 'tagged'),
 ('and', 'tagged', 'by'),
 ('tagged', 'by', 'Ria'),
 ('by', 'Ria', 'Lele'),
 ('Ria', 'Lele', '.')]

In [30]:
text_ngrams= list(nltk.ngrams(text_t, 4))
text_ngrams

[('This', 'text', 'is', 'going'),
 ('text', 'is', 'going', 'to'),
 ('is', 'going', 'to', 'be'),
 ('going', 'to', 'be', 'tokenized'),
 ('to', 'be', 'tokenized', 'and'),
 ('be', 'tokenized', 'and', 'tagged'),
 ('tokenized', 'and', 'tagged', 'by'),
 ('and', 'tagged', 'by', 'Ria'),
 ('tagged', 'by', 'Ria', 'Lele'),
 ('by', 'Ria', 'Lele', '.')]

### Stemming (uses stem of the word)

Porter Stemmer

In [31]:
word_to_stem=['give','giving','given','gave']

In [32]:
from nltk.stem import PorterStemmer
pst=PorterStemmer()

In [33]:
pst.stem("having")

'have'

In [34]:
for word in word_to_stem:
    print(word+":"+pst.stem(word))

give:give
giving:give
given:given
gave:gave


Lancaster Stemmer

In [35]:
from nltk.stem import LancasterStemmer
lcs=LancasterStemmer()

In [36]:
lcs.stem('having')

'hav'

In [37]:
for word in word_to_stem:
    print(word+":"+lcs.stem(word))

give:giv
giving:giv
given:giv
gave:gav


Note the difference between the outputs of both stemmers

Snowball Stemmer

In [38]:
from nltk.stem import SnowballStemmer
sbst= SnowballStemmer('english')

In [39]:
sbst.stem('having')

'have'

In [40]:
for word in word_to_stem:
    print(word+":"+sbst.stem(word))

give:give
giving:give
given:given
gave:gave


### Lemmatization (uses context in which word is used)

Groups together different inflected forms of a word called lemma

In [42]:
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer
word_lem=WordNetLemmatizer()

In [43]:
word_lem.lemmatize('corpora')

'corpus'

In [45]:
for word in word_to_stem:
    print(word+":"+word_lem.lemmatize(word))

give:give
giving:giving
given:given
gave:gave


No POS Tags have been assigned so it has assumed all words as nouns

### Stopwords

In [46]:
from nltk.corpus import stopwords

In [47]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

RE Module

#### Removing Punctuations

In [60]:
import re
punctuation=re.compile(r'[-.?!,:;()|0-9]')

In [61]:
post_punc=[]
for word in text_t:
    word=punctuation.sub("",word)
    if len(word)>0:
        post_punc.append(word)

In [62]:
post_punc

['This',
 'text',
 'is',
 'going',
 'to',
 'be',
 'tokenized',
 'and',
 'tagged',
 'by',
 'Ria',
 'Lele']

### POS- Parts of Speech

In [63]:
for token in text_t:
    print(nltk.pos_tag([token]))

[('This', 'DT')]
[('text', 'NN')]
[('is', 'VBZ')]
[('going', 'VBG')]
[('to', 'TO')]
[('be', 'VB')]
[('tokenized', 'VBN')]
[('and', 'CC')]
[('tagged', 'VBN')]
[('by', 'IN')]
[('Ria', 'NN')]
[('Lele', 'NNP')]
[('.', '.')]


DT- Determiner

### Named Entity Recognition (NER)

Can be a movie, organization, person, monetary value, location, quantity,etc.

In [64]:
from nltk import ne_chunk

In [65]:
text_tagged=nltk.pos_tag(text_t)

In [67]:
ne_ner=ne_chunk(text_tagged)

In [68]:
print(ne_ner)

(S
  This/DT
  text/NN
  is/VBZ
  going/VBG
  to/TO
  be/VB
  tokenized/VBN
  and/CC
  tagged/VBN
  by/IN
  (PERSON Ria/NNP Lele/NNP)
  ./.)


### Syntax Tree

Tree representation of the syntactic structure of sentences or strings

### Chunking- Picking up pieces of information and grouping them into bigger pieces

In [69]:
grammar_np=r"NP: {<DT>?<JJ>*<NN>}"

In [71]:
chunk_parser= nltk.RegexpParser(grammar_np)

In [72]:
chunk_result= chunk_parser.parse(text_tagged)
chunk_result

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [Tree('NP', [('This', 'DT'), ('text', 'NN')]), ('is', 'VBZ'), ('going', 'VBG'), ('to', 'TO'), ('be', 'VB'), ('tokenized', 'VBN'), ('and', 'CC'), ('tagged', 'VBN'), ('by', 'IN'), ('Ria', 'NNP'), ('Lele', 'NNP'), ('.', '.')])