# Tokenization

In [1]:
import nltk

In [2]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [3]:
a = "Hello and welcome friends to NLP workshop. My name is shridhar mankar. I will be teaching you NLP from scratch"

In [4]:
A = word_tokenize(a)
A

['Hello',
 'and',
 'welcome',
 'friends',
 'to',
 'NLP',
 'workshop',
 '.',
 'My',
 'name',
 'is',
 'shridhar',
 'mankar',
 '.',
 'I',
 'will',
 'be',
 'teaching',
 'you',
 'NLP',
 'from',
 'scratch']

In [5]:
S = sent_tokenize(a)
S

['Hello and welcome friends to NLP workshop.',
 'My name is shridhar mankar.',
 'I will be teaching you NLP from scratch']

# Type, Length and Frequency Checking

In [6]:
type(A),len(A)

(list, 20)

In [7]:
from nltk.probability import FreqDist
frequency = FreqDist()

In [8]:
for i in A:
 frequency[i] = frequency[i]+1

frequency

FreqDist({'NLP': 2, 'Hello': 1, 'and': 1, 'welcome': 1, 'friends': 1, 'to': 1, 'workshop': 1, 'My': 1, 'name': 1, 'is': 1, ...})

# Stemming

In [9]:
from nltk.stem import PorterStemmer
pst = PorterStemmer()

In [10]:
pst.stem('Making')

'make'

In [11]:
for i in A:
    print(pst.stem(i))

hello
and
welcom
friend
to
nlp
workshop
my
name
is
shridhar
mankar
i
will
be
teach
you
nlp
from
scratch


In [12]:
pst.stem('universal')

'univers'

In [13]:
pst.stem('universe')

'univers'

In [14]:
pst.stem('university')

'univers'

In [15]:
pst.stem('alumni')

'alumni'

In [16]:
pst.stem('alumnus')

'alumnu'

# Lemmatization

In [17]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shridharmankar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
lemmatizer = WordNetLemmatizer()

In [19]:
pst.stem('trouble')

'troubl'

In [20]:
lemmatizer.lemmatize('trouble')

'trouble'

In [21]:
for i in A:
    print(lemmatizer.lemmatize(i))

Hello
and
welcome
friend
to
NLP
workshop
My
name
is
shridhar
mankar
I
will
be
teaching
you
NLP
from
scratch


In [22]:
lemmatizer.lemmatize('alumnus')

'alumnus'

In [23]:
lemmatizer.lemmatize('alumni')

'alumnus'

In [24]:
lemmatizer.lemmatize('universe')

'universe'

In [25]:
lemmatizer.lemmatize('university')

'university'

# pos_tag

In [26]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shridharmankar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [27]:
for i in A:
 print(nltk.pos_tag([i]))

[('Hello', 'NN')]
[('and', 'CC')]
[('welcome', 'NN')]
[('friends', 'NNS')]
[('to', 'TO')]
[('NLP', 'NN')]
[('workshop', 'NN')]
[('My', 'PRP$')]
[('name', 'NN')]
[('is', 'VBZ')]
[('shridhar', 'NN')]
[('mankar', 'NN')]
[('I', 'PRP')]
[('will', 'MD')]
[('be', 'VB')]
[('teaching', 'VBG')]
[('you', 'PRP')]
[('NLP', 'NN')]
[('from', 'IN')]
[('scratch', 'NN')]


# Named entity recognition

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [10]:
text= '''Harry Lives in New York'''
words= word_tokenize(text)
postags=pos_tag(words)

In [11]:
tree = nltk.ne_chunk(postags)
print(tree)

(S (PERSON Harry/NNP) Lives/VBZ in/IN (GPE New/NNP York/NNP))


In [18]:
text= 'John wants a new Samsung device from Pune'
words= word_tokenize(text)
postags=pos_tag(words)

In [19]:
tree = nltk.ne_chunk(postags)
print(tree)

(S
  (PERSON John/NNP)
  wants/VBZ
  a/DT
  new/JJ
  (ORGANIZATION Samsung/NNP)
  device/NN
  from/IN
  (GPE Pune/NNP))


# Stopwords

In [33]:
from nltk.corpus import stopwords

In [34]:
stop_words = set(stopwords.words('english'))

In [35]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [36]:
msg = "My name is shridhar mankar, I love making videos and watching kdrama. My speciality is making things easy"

words = word_tokenize(msg)

filtered_sentence = []

for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

print(words)
print(filtered_sentence)

['My', 'name', 'is', 'shridhar', 'mankar', ',', 'I', 'love', 'making', 'videos', 'and', 'watching', 'kdrama', '.', 'My', 'speciality', 'is', 'making', 'things', 'easy']
['My', 'name', 'shridhar', 'mankar', ',', 'I', 'love', 'making', 'videos', 'watching', 'kdrama', '.', 'My', 'speciality', 'making', 'things', 'easy']
