In [1]:
#Implementing different NLP Tasks such as tokenization,stemmming, lemmatisation, removal of stopwords, 
#pos tagging etc will be implemented


In [2]:
#tokenisation: process of getting sentences out of list of sentences or document, or getting words out of sentences

samp_text="Hey How are you? How's NLP going? Isn't it cool to extract patterns from text!"

from nltk.tokenize import sent_tokenize, word_tokenize

sent_tokenize(samp_text)#extracting sentences from list of sentences

['Hey How are you?',
 "How's NLP going?",
 "Isn't it cool to extract patterns from text!"]

In [3]:
word_tokenize(samp_text)#extracting sentences from list of sentences

['Hey',
 'How',
 'are',
 'you',
 '?',
 'How',
 "'s",
 'NLP',
 'going',
 '?',
 'Is',
 "n't",
 'it',
 'cool',
 'to',
 'extract',
 'patterns',
 'from',
 'text',
 '!']

In [4]:
#stemming-one of the normalisation techniques, which reduces the words to their base forms by removing influctions

from nltk.stem import PorterStemmer #porter stemmer is frequently used

stemmer=PorterStemmer() #creating an object of that porter stemmer class

stemmer.stem('playing')
stemmer.stem('played')
stemmer.stem('decreases')#stems to a word "decreas" which is not in the dictionary at all


'decreas'

In [5]:
#as stemming sometimes gives words that are not present in dictionary, we have to use lemmatization

from nltk.stem import WordNetLemmatizer
lemm=WordNetLemmatizer()
lemm.lemmatize('increases') #now the root word is much cleaner when compared to stemming

'increase'

In [6]:
lemm.lemmatize('running') #converted to running, because part of speech tag is not provided

'running'

In [13]:
lemm.lemmatize('running',pos='v')#outputs run
#import nltk
#nltk.download('averaged_perceptron_tagger') if pos_tags doesnt work, execute this
from nltk import pos_tag
pos_tag(word_tokenize(samp_text))

[('Hey', 'NNP'),
 ('How', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('?', '.'),
 ('How', 'WRB'),
 ("'s", 'POS'),
 ('NLP', 'NNP'),
 ('going', 'VBG'),
 ('?', '.'),
 ('Is', 'VBZ'),
 ("n't", 'RB'),
 ('it', 'PRP'),
 ('cool', 'VB'),
 ('to', 'TO'),
 ('extract', 'VB'),
 ('patterns', 'NNS'),
 ('from', 'IN'),
 ('text', 'NN'),
 ('!', '.')]

In [17]:
#nltk has connection with wordnet, which is comprehensive vocabulary of all possible words
#so you can get synonyms and anotonyms
#pstag=pos_tag(word_tokenize(samp_text))
#lemm.lemmatize(samp_text,pos=pstag)

from nltk.corpus import wordnet
wordnet.synsets('good')

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

In [22]:
#creating n-grams
from nltk import ngrams
ngrams(word_tokenize(samp_text),3)#output is generator..so we should use for loop to see the output

for grm in ngrams(word_tokenize(samp_text),3): #trigrams, since n=3
    print(grm)

('Hey', 'How', 'are')
('How', 'are', 'you')
('are', 'you', '?')
('you', '?', 'How')
('?', 'How', "'s")
('How', "'s", 'NLP')
("'s", 'NLP', 'going')
('NLP', 'going', '?')
('going', '?', 'Is')
('?', 'Is', "n't")
('Is', "n't", 'it')
("n't", 'it', 'cool')
('it', 'cool', 'to')
('cool', 'to', 'extract')
('to', 'extract', 'patterns')
('extract', 'patterns', 'from')
('patterns', 'from', 'text')
('from', 'text', '!')


In [1]:
import nltk
nltk.download()
s1='Hey David, how are you? Are you going to IBM Corp tomorrow?'
print(s1)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
Hey David, how are you? Are you going to IBM Corp tomorrow?


In [4]:
s1='Hey David, how are you? Are you going to IBM Corp tomorrow?'
print(s1)

Hey David, how are you? Are you going to IBM Corp tomorrow?


In [14]:
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
tokens=nltk.word_tokenize(s1)#need to tokenize
pos=nltk.pos_tag(tokens)#get pos tags for tokens
chunks=nltk.ne_chunk(pos)#get named entity chunks from parts of speech tags


In [17]:
for chunk in chunks:#printing all chunks
    print(chunk)

(PERSON Hey/NNP)
(PERSON David/NNP)
(',', ',')
('how', 'WRB')
('are', 'VBP')
('you', 'PRP')
('?', '.')
('Are', 'VBP')
('you', 'PRP')
('going', 'VBG')
('to', 'TO')
(ORGANIZATION IBM/NNP Corp/NNP)
('tomorrow', 'NN')
('?', '.')


In [19]:
#but of all chunks printed earlier, only few of them are named entities

for chunk in chunks:
    if hasattr(chunk,'label'):
       print(chunk) #below words are named entities

(PERSON Hey/NNP)
(PERSON David/NNP)
(ORGANIZATION IBM/NNP Corp/NNP)
