In [1]:
import nltk

In [2]:
import nltk.corpus
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Divyank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Tokenization
from nltk.tokenize import sent_tokenize
text="Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard"
tokenized_text = sent_tokenize(text)


In [4]:
print(tokenized_text)

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and city is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard"]


In [5]:
# break paragraphs to words
from nltk.tokenize import word_tokenize
tokenized_word=word_tokenize(text)
print(tokenized_word)

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']


In [6]:
from nltk.probability import FreqDist
fdist = FreqDist(tokenized_word)
print(fdist)

<FreqDist with 25 samples and 30 outcomes>


In [7]:
fdist.most_common(2)

[('is', 3), (',', 2)]

In [8]:
# Frequency Distribution Plot
import matplotlib.pyplot as plt
fdist.plot()
plt.show()

<Figure size 640x480 with 1 Axes>

In [9]:
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
print(len(stop_words),stop_words)

179 {'other', "hadn't", 'mustn', 'ain', 'the', 't', "isn't", 'am', 'did', 'all', 'nor', 'his', 'own', 'only', 'by', 'do', 'shan', 'once', 'most', 'doesn', 'to', "it's", "weren't", 'be', 'i', 'before', 'but', 'this', "doesn't", 'weren', 'he', 'off', 'no', 'not', 'as', 'with', 'your', 'through', 'just', 'at', 'for', 'below', 'is', 'yours', "didn't", 'hadn', 'himself', 'their', 'when', 'aren', 'our', 'ourselves', "that'll", "you'll", 'itself', 'why', 'how', 're', "mightn't", 'don', 'you', 'while', 'are', "haven't", 'being', 'have', 'during', 'same', 'y', 'having', 'against', 'ma', 'them', "wouldn't", 'above', "shouldn't", 'and', 'because', 'hasn', 'under', 'between', "you're", 'further', 'who', 'needn', 'doing', 'again', 'if', 'myself', 'wasn', 'd', "should've", 'wouldn', 'yourself', 'some', 'few', "mustn't", 'didn', 'that', 'theirs', 'here', 'were', 'been', 'down', 'after', 'shouldn', "won't", 'what', 'had', 'now', 'can', 'of', 'any', 'she', "she's", 'where', 'me', 'each', 'was', 'my', '

In [10]:
# removing stopwords
filtered_sent=[]
for w in tokenized_word:
    if w not in stop_words:
        filtered_sent.append(w)
print("Original Sentence:",tokenized_word)
print("Filterd Sentence:",filtered_sent)

Original Sentence: ['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']
Filterd Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


In [11]:
# stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

stemmed_words=[]
for w in filtered_sent:
    stemmed_words.append(ps.stem(w))

print("Filtered Sentence:",filtered_sent)
print("Stemmed Sentence:",stemmed_words)

Filtered Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']
Stemmed Sentence: ['hello', 'mr.', 'smith', ',', 'today', '?', 'the', 'weather', 'great', ',', 'citi', 'awesom', '.', 'the', 'sky', 'pinkish-blu', '.', 'you', "n't", 'eat', 'cardboard']


In [12]:
# lemmatization
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()

word = "flying"
print("Lemmatized Word:",lem.lemmatize(word,"v"))
print("Stemmed Word:",stem.stem(word))

Lemmatized Word: fly
Stemmed Word: fli


In [13]:
# parts of speech tagging
peace = "What do you mean, 'I dont believe in God'? I talk to him everyday."
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Divyank\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [14]:
peace_tokenize = word_tokenize(peace)
for i in peace_tokenize:
    print(nltk.pos_tag([i]))

[('What', 'WP')]
[('do', 'VB')]
[('you', 'PRP')]
[('mean', 'NN')]
[(',', ',')]
[("'", "''")]
[('I', 'PRP')]
[('dont', 'NN')]
[('believe', 'VB')]
[('in', 'IN')]
[('God', 'NNP')]
[("'", "''")]
[('?', '.')]
[('I', 'PRP')]
[('talk', 'NN')]
[('to', 'TO')]
[('him', 'PRP')]
[('everyday', 'NN')]
[('.', '.')]


In [15]:
sent = "Albert Einstein was born in Ulm, Germany in 1879."
tokens=nltk.word_tokenize(sent)
print(tokens)

['Albert', 'Einstein', 'was', 'born', 'in', 'Ulm', ',', 'Germany', 'in', '1879', '.']


In [16]:
nltk.pos_tag(tokens)

[('Albert', 'NNP'),
 ('Einstein', 'NNP'),
 ('was', 'VBD'),
 ('born', 'VBN'),
 ('in', 'IN'),
 ('Ulm', 'NNP'),
 (',', ','),
 ('Germany', 'NNP'),
 ('in', 'IN'),
 ('1879', 'CD'),
 ('.', '.')]

In [17]:
# n grams
import re
from nltk.util import ngrams

s = "Natural-language processing (NLP) is an area of computer science " \
    "and artificial intelligence concerned with the interactions " \
    "between computers and human (natural) languages."

s = s.lower()
#s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
tokens = [token for token in s.split(" ") if token != ""]
output = list(ngrams(tokens, 5))

In [18]:
print('s',s)
print(s.split(" "))
print('tokens',tokens)

s natural-language processing (nlp) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages.
['natural-language', 'processing', '(nlp)', 'is', 'an', 'area', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(natural)', 'languages.']
tokens ['natural-language', 'processing', '(nlp)', 'is', 'an', 'area', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(natural)', 'languages.']


In [19]:
output

[('natural-language', 'processing', '(nlp)', 'is', 'an'),
 ('processing', '(nlp)', 'is', 'an', 'area'),
 ('(nlp)', 'is', 'an', 'area', 'of'),
 ('is', 'an', 'area', 'of', 'computer'),
 ('an', 'area', 'of', 'computer', 'science'),
 ('area', 'of', 'computer', 'science', 'and'),
 ('of', 'computer', 'science', 'and', 'artificial'),
 ('computer', 'science', 'and', 'artificial', 'intelligence'),
 ('science', 'and', 'artificial', 'intelligence', 'concerned'),
 ('and', 'artificial', 'intelligence', 'concerned', 'with'),
 ('artificial', 'intelligence', 'concerned', 'with', 'the'),
 ('intelligence', 'concerned', 'with', 'the', 'interactions'),
 ('concerned', 'with', 'the', 'interactions', 'between'),
 ('with', 'the', 'interactions', 'between', 'computers'),
 ('the', 'interactions', 'between', 'computers', 'and'),
 ('interactions', 'between', 'computers', 'and', 'human'),
 ('between', 'computers', 'and', 'human', '(natural)'),
 ('computers', 'and', 'human', '(natural)', 'languages.')]

In [20]:
# named entity recognition
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [21]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [22]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    print('Word Tokenize:',sent)
    sent = nltk.pos_tag(sent)
    return sent

In [23]:
import nltk
sent = preprocess(ex)
sent

Word Tokenize: ['European', 'authorities', 'fined', 'Google', 'a', 'record', '$', '5.1', 'billion', 'on', 'Wednesday', 'for', 'abusing', 'its', 'power', 'in', 'the', 'mobile', 'phone', 'market', 'and', 'ordered', 'the', 'company', 'to', 'alter', 'its', 'practices']


[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [24]:
# noun phrase chunking
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [25]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs) #The output can be read as a tree or a hierarchy with S as the first level, denoting sentence.

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [26]:
from nltk.chunk import conlltags2tree, tree2conlltags #CoNLL, the Conference on Natural Language Learning, is SIGNLL's yearly meeting.
from pprint import pprint
iob_tagged = tree2conlltags(cs) #Convert a tree to the CoNLL IOB tag format.
pprint(iob_tagged) # input output beggining tag

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [27]:
from nltk import ne_chunk
ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)#Geo-Political Entity

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)
