In [8]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk import pos_tag,ne_chunk

In [12]:
nltk.download('punkt') #For tokenizers
nltk.download('punkt_tab')
nltk.download('averaged_preparation')#For PDS tagging
nltk.download('maxent_ne_chunker_tab')#For named entity recognition
nltk.download('words')#For named entity recognition
nltk.download('wordnet')#For lemmatization
nltk.download('stopwords')#For stopword
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Error loading averaged_preparation: Package
[nltk_data]     'averaged_preparation' not found in index
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Un

True

In [4]:
text="""NLTK is a powerful Python library for natural language processing.
It provides tools for tokenization stemming,lemmatization,
and more.Apple Inc. is located in California."""

In [5]:
# 1. Tokenization
words=word_tokenize(text)
sentences=sent_tokenize(text)
print("Words",words)
print("Sentences:",sentences)

Words ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'natural', 'language', 'processing', '.', 'It', 'provides', 'tools', 'for', 'tokenization', 'stemming', ',', 'lemmatization', ',', 'and', 'more.Apple', 'Inc.', 'is', 'located', 'in', 'California', '.']
Sentences: ['NLTK is a powerful Python library for natural language processing.', 'It provides tools for tokenization stemming,lemmatization,\nand more.Apple Inc. is located in California.']


In [13]:
# 2. Part-of-Speech(Pos) Tagging
pos_tags=pos_tag(words)
print("POS Tags:",pos_tags)

POS Tags: [('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('Python', 'NNP'), ('library', 'NN'), ('for', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.'), ('It', 'PRP'), ('provides', 'VBZ'), ('tools', 'NNS'), ('for', 'IN'), ('tokenization', 'NN'), ('stemming', 'NN'), (',', ','), ('lemmatization', 'NN'), (',', ','), ('and', 'CC'), ('more.Apple', 'NNP'), ('Inc.', 'NNP'), ('is', 'VBZ'), ('located', 'VBN'), ('in', 'IN'), ('California', 'NNP'), ('.', '.')]


In [15]:
# 3. Stemming and Lemmatization
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

stemmed_words=[stemmer.stem(word) for word in words]
lemmatized_words=[lemmatizer.lemmatize(word) for word in words]
print("Stemmed Words :",stemmed_words)
print("Lemmatized Words:",lemmatized_words)

Stemmed Words : ['nltk', 'is', 'a', 'power', 'python', 'librari', 'for', 'natur', 'languag', 'process', '.', 'it', 'provid', 'tool', 'for', 'token', 'stem', ',', 'lemmat', ',', 'and', 'more.appl', 'inc.', 'is', 'locat', 'in', 'california', '.']
Lemmatized Words: ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'natural', 'language', 'processing', '.', 'It', 'provides', 'tool', 'for', 'tokenization', 'stemming', ',', 'lemmatization', ',', 'and', 'more.Apple', 'Inc.', 'is', 'located', 'in', 'California', '.']


In [16]:
# 4. Named Entity Recognition(NER)
ner_tags=ne_chunk(pos_tags)
print("NER Tags:",ner_tags)

NER Tags: (S
  (ORGANIZATION NLTK/NNP)
  is/VBZ
  a/DT
  powerful/JJ
  (PERSON Python/NNP)
  library/NN
  for/IN
  natural/JJ
  language/NN
  processing/NN
  ./.
  It/PRP
  provides/VBZ
  tools/NNS
  for/IN
  tokenization/NN
  stemming/NN
  ,/,
  lemmatization/NN
  ,/,
  and/CC
  more.Apple/NNP
  Inc./NNP
  is/VBZ
  located/VBN
  in/IN
  (GPE California/NNP)
  ./.)


In [18]:
#5. Stop words removal
stop_words=set(stopwords.words('english'))
filtered_words=[word for  word in  words if word.lower() not in stop_words]
print("Filtered Words(Stop Words Removed:",filtered_words)

Filtered Words(Stop Words Removed: ['NLTK', 'powerful', 'Python', 'library', 'natural', 'language', 'processing', '.', 'provides', 'tools', 'tokenization', 'stemming', ',', 'lemmatization', ',', 'more.Apple', 'Inc.', 'located', 'California', '.']


In [21]:
# basic classification

from nltk.classify import NaiveBayesClassifier

def word_feats(words):
    return dict([(word,True) for word in words])

positive_reviews=[("This is an excellent movie","pos"),
                 ("The plot was amazing","pos")]
negative_reviews=[("The moview was terrible ","neg"),
                 ("I did not like the acting","neg")]

positive_features=[(word_feats(word_tokenize(review)),label) for review,label in positive_reviews]
negative_features=[(word_feats(word_tokenize(review)),label) for review,label in negative_reviews]

train_set=positive_features + negative_features

classifier=NaiveBayesClassifier.train(train_set)

test_review="The movie was not great"
test_features=word_feats(word_tokenize(test_review))
print(f"Classification of '{test_review}': {classifier.classify(test_features)}")
    

Classification of 'The movie was not great': pos


In [25]:
from nltk.tokenize import word_tokenize

test_review2 = "The movie was boring"
test_features2 = word_feats(word_tokenize(test_review2))

print(f"Classification of '{test_review2}': {classifier.classify(test_features2)}")


Classification of 'The movie was boring': pos
