In [0]:
import nltk

# Tokenization

In [0]:
  # Fetch the "punkt" tokenizer package used by the tokenization cells below.
  nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# Word tokenization: break a sentence into word and punctuation tokens.
from nltk.tokenize import word_tokenize

text = "CDAC workshop is on 5th and 6th july."
word_tokens = word_tokenize(text)
print(word_tokens)


['CDAC', 'workshop', 'is', 'on', '5th', 'and', '6th', 'july', '.']


In [0]:
# Sentence tokenization: break a passage into its individual sentences.
from nltk.tokenize import sent_tokenize

text = "CDAC workshop is on 5th and 6th july. Will get to machine learning, deep learning, NLP and many more."
sentences = sent_tokenize(text)
print(sentences)

['CDAC workshop is on 5th and 6th july.', 'Will get to machine learning, deep learning, NLP and many more.']


# POS Tagging

In [0]:
# Fetch the averaged-perceptron tagger model used by the POS-tagging cells below.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
from nltk import pos_tag

# Plain whitespace splitting: punctuation stays attached to its word ("july.").
sentence = "CDAC workshop is on 5th and 6th july."
text = sentence.split()
print("After Split:", text)

# Pair every token with its part-of-speech tag; `tokens_tag` is read again
# by the "mychunk" chunking cell further down.
tokens_tag = pos_tag(text)
print("After Token:", tokens_tag)

After Split: ['CDAC', 'workshop', 'is', 'on', '5th', 'and', '6th', 'july.']
After Token: [('CDAC', 'NNP'), ('workshop', 'NN'), ('is', 'VBZ'), ('on', 'IN'), ('5th', 'CD'), ('and', 'CC'), ('6th', 'CD'), ('july.', 'NN')]


In [0]:
from collections import Counter

import nltk

text = "CDAC workshop is on 5th and 6th july."

# The sentence is lower-cased before tokenizing and tagging.
lower_case = text.lower()
print(lower_case)

tokens = nltk.word_tokenize(lower_case)
tags = nltk.pos_tag(tokens)

# Tally how many tokens received each POS tag.
counts = Counter(pos for _word, pos in tags)
print(counts)

cdac workshop is on 5th and 6th july.
Counter({'NN': 3, 'CD': 2, 'VBZ': 1, 'IN': 1, 'CC': 1, '.': 1})


#  Chunking

In [0]:
from nltk import RegexpParser

text = "learn machine learning and NLP."

tokens = nltk.word_tokenize(text)
print(tokens)

tag = nltk.pos_tag(tokens)
print(tag)

# Single chunk rule labelled "S": a run of one or more consecutive NN tags.
grammar = "S: {<NN>*<NN>}"
cp = RegexpParser(grammar)

result = cp.parse(tag)
print(result)

['learn', 'machine', 'learning', 'and', 'NLP', '.']
[('learn', 'JJ'), ('machine', 'NN'), ('learning', 'NN'), ('and', 'CC'), ('NLP', 'NNP'), ('.', '.')]
(S learn/JJ (S machine/NN learning/NN) and/CC NLP/NNP ./.)


POS tags used above:

- CC: coordinating conjunction
- NNP: proper noun, singular
- NN: noun, singular
- JJ: adjective

In [0]:
# Chunk rule "mychunk": any run of NN-family tags, then VBD-family tags,
# then JJ-family tags, optionally followed by one coordinating conjunction.
patterns = "mychunk:{<NN.?>*<VBD.?>*<JJ.?>*<CC>?}"
chunker = RegexpParser(patterns)
print("After Regex:", chunker)

# `tokens_tag` is the tagged sentence produced in the earlier POS-tagging cell.
output = chunker.parse(tokens_tag)
print("After Chunking", output)

After Regex: chunk.RegexpParser with 1 stages:
RegexpChunkParser with 1 rules:
       <ChunkRule: '<NN.?>*<VBD.?>*<JJ.?>*<CC>?'>
After Chunking (S
  (mychunk CDAC/NNP workshop/NN)
  is/VBZ
  on/IN
  5th/CD
  (mychunk and/CC)
  6th/CD
  (mychunk july./NN))


# Stemmer

In [0]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
e_words = ["wait", "waiting", "waited", "waits"]

# Print the Porter stem of each inflected form, one per line.
for rootWord in map(ps.stem, e_words):
    print(rootWord)

wait
wait
wait
wait


# Lemmatization

In [0]:
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()

text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)

# Show each token next to its Porter stem.
for w in tokenization:
    stem = porter_stemmer.stem(w)
    print("Stemming for {} is {}".format(w, stem))

Stemming for studies is studi
Stemming for studying is studi
Stemming for cries is cri
Stemming for cry is cri


In [0]:
# Fetch the WordNet corpus used by the WordNet lemmatizer cell below.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)

# lemmatize() is called with the word only (no part-of-speech argument);
# in the recorded run "studying" comes back unchanged.
for w in tokenization:
    lemma = wordnet_lemmatizer.lemmatize(w)
    print("Lemma for {} is {}".format(w, lemma))

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


# Stop words

In [0]:
# Fetch the stop-word lists used by the stop-word cells below.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
from nltk.corpus import stopwords

# Load the English stop-word list and hold it as a set for membership checks.
english_stop_list = stopwords.words("english")
stop_words = set(english_stop_list)
print(stop_words)

{'should', 'me', 'where', 'shouldn', "didn't", 'his', 'only', 'them', 'how', 'wouldn', "isn't", 'that', 'he', 'whom', 'be', 'mightn', 'were', 'hers', 'are', 'more', 'on', 'do', 'my', 'yourselves', 'being', 'up', 'own', "mustn't", "weren't", 'have', 'm', 'myself', 'than', 'to', 'shan', 'i', 'doing', "you'll", 'and', 'in', 'its', 'him', 'mustn', 'won', 'or', "it's", 'it', "she's", 're', 'other', 'which', 'll', 'off', 'hadn', 'does', 'herself', 'we', 'having', 'd', 'once', 's', 'all', 'aren', 'why', "needn't", 't', 'can', 'above', 'itself', 'has', 'until', "don't", 'yourself', "aren't", 'yours', 'am', 'had', "shan't", "wouldn't", 'most', 'about', 'few', 'from', 'ma', "you'd", 'but', 'our', 'themselves', 'because', 'over', 'both', 'don', 'been', 'when', 'these', 'then', 'y', 'each', 'himself', 'was', 'haven', 'ain', 'if', 'o', 'this', 'before', "wasn't", 'theirs', 'too', 'of', 'during', "you're", 'through', 'a', "won't", 'same', 'an', 've', "haven't", 'no', 'who', 'hasn', "you've", 'agains

In [0]:
text = "CDAC workshop is on 5th and 6th july."
tokenized_sent = word_tokenize(text)

# Drop stop words. The stop-word entries are all lowercase, so the lowercased
# token is compared; a case-sensitive test would let capitalised stop words
# such as "The" or "Is" slip through. Kept tokens retain their original casing.
filtered_sent = [w for w in tokenized_sent if w.lower() not in stop_words]

print("Tokenized Sentence:", tokenized_sent)
print("Filterd Sentence:", filtered_sent)

Tokenized Sentence: ['CDAC', 'workshop', 'is', 'on', '5th', 'and', '6th', 'july', '.']
Filterd Sentence: ['CDAC', 'workshop', '5th', '6th', 'july', '.']
