###  Tokenization

In [1]:
import nltk

In [2]:
txt = 'SHR is my favorite team in IPL'
tokens = nltk.word_tokenize(txt)
tokens

['SHR', 'is', 'my', 'favorite', 'team', 'in', 'IPL']

In [3]:
len(tokens)

7

In [4]:
txt2 = 'SHR is my favorite team in IPL. i love playing cricket'
tokens2 = nltk.sent_tokenize(txt2)
tokens2

['SHR is my favorite team in IPL.', 'i love playing cricket']

In [5]:
len(tokens2)

2

In [6]:
#tokenize our sentence tokens.
for item in tokens2:
    print(nltk.word_tokenize(item))

['SHR', 'is', 'my', 'favorite', 'team', 'in', 'IPL', '.']
['i', 'love', 'playing', 'cricket']


###  Normalization

In [7]:
#Removing punctuation.
w1 = nltk.corpus.gutenberg.words("melville-moby_dick.txt")

In [8]:
w1_10 = w1[:10]
w1_10

['[',
 'Moby',
 'Dick',
 'by',
 'Herman',
 'Melville',
 '1851',
 ']',
 'ETYMOLOGY',
 '.']

In [9]:
# Making everything lower case.
for word in w1_10:
    print(word.lower())

[
moby
dick
by
herman
melville
1851
]
etymology
.


In [10]:
norm = [word.lower() for word in w1_10 if word.isalpha()]
norm

['moby', 'dick', 'by', 'herman', 'melville', 'etymology']

### Stemmers

###### Stemmers help further normalize text when we run into words that might be plural. There are many different kinds of stemmers  available

In [11]:
porter = nltk.PorterStemmer()

In [12]:
my_words = ["cat","cats","lie","lying","run","running","city","cities","month","monthly","woman","women"]

In [13]:
for word in my_words:
    print(porter.stem(word))

cat
cat
lie
lie
run
run
citi
citi
month
monthli
woman
women


In [14]:
lancaster = nltk.LancasterStemmer()

In [15]:
for word in my_words:
    print(lancaster.stem(word))

cat
cat
lie
lying
run
run
city
city
mon
month
wom
wom


In [16]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
wnlem = nltk.WordNetLemmatizer()    

In [18]:
for word in my_words:
    print(wnlem.lemmatize(word))

cat
cat
lie
lying
run
running
city
city
month
monthly
woman
woman


### Part of Speech Tagging

###### A part of speech tagger will identify the part of speech for a sequence of words

In [19]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [20]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [21]:
text = 'i Take PATH train to go to work in Manhatten.'

In [22]:
tokens =  nltk.word_tokenize(text)

In [23]:
tokens

['i',
 'Take',
 'PATH',
 'train',
 'to',
 'go',
 'to',
 'work',
 'in',
 'Manhatten',
 '.']

In [24]:
nltk.pos_tag(tokens)

[('i', 'NNS'),
 ('Take', 'VBP'),
 ('PATH', 'NNP'),
 ('train', 'NN'),
 ('to', 'TO'),
 ('go', 'VB'),
 ('to', 'TO'),
 ('work', 'VB'),
 ('in', 'IN'),
 ('Manhatten', 'NNP'),
 ('.', '.')]

In [25]:
nltk.pos_tag(nltk.word_tokenize('i love NYC'))

[('i', 'NN'), ('love', 'VBP'), ('NYC', 'NN')]

In [26]:
#Create a list of all nouns
md = nltk.corpus.gutenberg.words("melville-moby_dick.txt")
md_norm = [word.lower() for word in md if word.isalpha()]
md_tag = nltk.pos_tag(md_norm, tagset='universal')

In [27]:
md_tag[:10]

[('moby', 'NOUN'),
 ('dick', 'NOUN'),
 ('by', 'ADP'),
 ('herman', 'NOUN'),
 ('melville', 'NOUN'),
 ('etymology', 'NOUN'),
 ('supplied', 'VERB'),
 ('by', 'ADP'),
 ('a', 'DET'),
 ('late', 'ADJ')]

In [28]:
md_nouns = [word[0] for word in md_tag if word[1] == "NOUN" ]

In [29]:
nouns_fd = nltk.FreqDist(md_nouns)

In [30]:
nouns_fd.most_common()[:10]

[('i', 1182),
 ('whale', 909),
 ('s', 774),
 ('man', 527),
 ('ship', 498),
 ('sea', 435),
 ('head', 337),
 ('time', 334),
 ('boat', 332),
 ('ahab', 278)]

### Multiple Parts of Speech

###### Words can be tagged with a different part of speech based on usage.

In [31]:
alice      = nltk.corpus.gutenberg.words("carroll-alice.txt")
alice_norm = [word.lower() for word in alice if word.isalpha()]
alice_tags = nltk.pos_tag(alice_norm, tagset='universal')
alice_cfd   = nltk.ConditionalFreqDist(alice_tags)

In [32]:
alice_cfd['answer']

FreqDist({'NOUN': 5, 'VERB': 3, 'ADP': 1})

In [33]:
alice_cfd['book']

FreqDist({'NOUN': 11})

In [34]:
alice_cfd['over']

FreqDist({'ADP': 31, 'PRT': 5, 'ADV': 4})

### Choices

###### Finding all the cases in a given text where there was a choice between two options, "NOUN 'or' NOUN"

In [35]:
stories = nltk.corpus.gutenberg.words("bryant-stories.txt")
tags = nltk.pos_tag(stories, tagset='universal')

In [36]:
tags[:10]

[('[', 'NOUN'),
 ('Stories', 'NOUN'),
 ('to', 'PRT'),
 ('Tell', 'VERB'),
 ('to', 'PRT'),
 ('Children', 'NOUN'),
 ('by', 'ADP'),
 ('Sara', 'NOUN'),
 ('Cone', 'NOUN'),
 ('Bryant', 'NOUN')]

In [37]:
for ((word1, tag1), (word2, tag2), (word3, tag3)) in nltk.trigrams(tags):
    if tag1 == 'NOUN' and word2 == 'or' and tag3 =='NOUN':
        print(word1 + " " + word2 + " " + word3)    

ship or part
food or water
queens or princesses
rank or wealth


### Chunking

###### Through chunking, we can prevent two word entities from being split.

In [38]:
sentence = "I will go to the coffee shop in New York after I get off the jet plane."
sent_tag = nltk.pos_tag(nltk.word_tokenize(sentence))

In [39]:
sent_tag

[('I', 'PRP'),
 ('will', 'MD'),
 ('go', 'VB'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('coffee', 'NN'),
 ('shop', 'NN'),
 ('in', 'IN'),
 ('New', 'NNP'),
 ('York', 'NNP'),
 ('after', 'IN'),
 ('I', 'PRP'),
 ('get', 'VBP'),
 ('off', 'IN'),
 ('the', 'DT'),
 ('jet', 'NN'),
 ('plane', 'NN'),
 ('.', '.')]

In [40]:
sequence =  '''
            CHUNK: {<NNP>+}
                   {<NN>+}
            '''

In [41]:
NPChunker = nltk.RegexpParser(sequence)

In [42]:
result = NPChunker.parse(sent_tag)

In [43]:
print(result)

(S
  I/PRP
  will/MD
  go/VB
  to/TO
  the/DT
  (CHUNK coffee/NN shop/NN)
  in/IN
  (CHUNK New/NNP York/NNP)
  after/IN
  I/PRP
  get/VBP
  off/IN
  the/DT
  (CHUNK jet/NN plane/NN)
  ./.)


### Named Entity Recognition

In [44]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [45]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\750010524\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [46]:
text = open('example.txt').read()

In [47]:
text

'World War II (WWII or WW2), also known as the Second World War, was a global war that lasted from 1939 to 1945, though related conflicts began earlier. It involved the vast majority of the world\'s nationsâ€”including all of the great powersâ€”eventually forming two opposing military alliances: the Allies and the Axis. It was the most widespread war in history, and directly involved more than 100 million people from over 30 countries. In a state of "total war", the major participants threw their entire economic, industrial, and scientific capabilities behind the war effort, erasing the distinction between civilian and military resources. Marked by mass deaths of civilians, including the Holocaust (in which approximately 11 million people were killed) and the strategic bombing of industrial and population centres (in which approximately one million were killed, and which included the atomic bombings of Hiroshima and Nagasaki), it resulted in an estimated 50 million to 85 million fatali

In [48]:
text_tag = nltk.pos_tag(nltk.word_tokenize(text))

In [49]:
text_ch = nltk.ne_chunk(text_tag)

In [50]:
for chunk in text_ch:
    if hasattr(chunk, 'label'):
        print(chunk.label(), ' '.join(c[0] for c in chunk.leaves()))  

ORGANIZATION WWII
ORGANIZATION WW2
ORGANIZATION Second
ORGANIZATION Axis
ORGANIZATION Hiroshima
GPE Nagasaki
ORGANIZATION Empire of Japan
GPE Asia
ORGANIZATION Pacific
ORGANIZATION Republic
GPE China
GPE Poland
GPE Germany
GPE Germany
GPE France
ORGANIZATION United Kingdom
GPE Germany
GPE Europe
ORGANIZATION Axis
GPE Italy
GPE Japan
GPE Germany
GPE Soviet Union
GPE European
GPE Poland
GPE Finland
GPE Romania
GPE Baltic
ORGANIZATION United Kingdom
GPE British
ORGANIZATION European Axis
GPE North Africa
ORGANIZATION Horn
GPE Africa
GPE Britain
GPE Blitz
ORGANIZATION Atlantic
ORGANIZATION European Axis
GPE Soviet Union
ORGANIZATION Axis
GPE Japan
GPE United States
GPE European
ORGANIZATION Pacific Ocean
LOCATION Western Pacific
ORGANIZATION Axis
PERSON Japan
GPE Midway
GPE Hawaii
GPE Germany
GPE North Africa
FACILITY Stalingrad
GPE Soviet Union
GPE German
LOCATION Eastern Front
GPE Italy
GPE Italian
GPE Allied
ORGANIZATION Pacific
ORGANIZATION Axis
LOCATION Western
GPE France
GPE Soviet U