In [26]:
import nltk

# Fetch the NLTK data packages this notebook relies on, in a fixed order.
for resource in ('punkt_tab', 'stopwords', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4'):
    nltk.download(resource)

# Left as the cell's final expression so its return value is still displayed.
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package vader

True

# Tokenization

In [29]:
#!pip install nltk
import nltk

In [31]:
# Tokenization splits a body of text into sentences and into individual words

from nltk.tokenize import sent_tokenize, word_tokenize

text = "Wow, Natural language processing is an exciting area. Huge budget have been allocated for this."

# Sentence-level split first, then word-level split, each printed on its own line
for tokenizer in (sent_tokenize, word_tokenize):
    print(tokenizer(text))

['Wow, Natural language processing is an exciting area.', 'Huge budget have been allocated for this.']
['Wow', ',', 'Natural', 'language', 'processing', 'is', 'an', 'exciting', 'area', '.', 'Huge', 'budget', 'have', 'been', 'allocated', 'for', 'this', '.']


# Stop Words removal

In [34]:
# Removes stop words such as "the", "he", "her", etc.

from nltk.corpus import stopwords

# Build the stop-word collection once, as a set. Calling stopwords.words('english')
# inside the comprehension would rebuild the whole list for every token and then
# scan it linearly.
english_stopwords = set(stopwords.words('english'))

# NOTE: text.split() splits on whitespace only, so punctuation stays attached to the
# token (e.g. "this." is compared as-is and therefore kept), and the membership test
# is case-sensitive.
words = [w for w in text.split() if w not in english_stopwords]
print(words)

['Wow,', 'Natural', 'language', 'processing', 'exciting', 'area.', 'Huge', 'budget', 'allocated', 'this.']


# Stemming

In [37]:
# Stemming strips affixes from each word to reduce it to a crude root form

from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

# Run every remaining token through the Porter stemmer
stemmed_words = list(map(porter_stemmer.stem, words))
print("Stemmed words:", stemmed_words)

Stemmed words: ['wow,', 'natur', 'languag', 'process', 'excit', 'area.', 'huge', 'budget', 'alloc', 'this.']


# Lemmatization

In [40]:
# Lemmatization maps a word to its base (dictionary) form

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

# No POS is passed here, so the lemmatizer's default part of speech is used for every token
lemmatized_words = list(lemmatizer.lemmatize(token) for token in words)
print("lemmatized words:", lemmatized_words)

lemmatized words: ['Wow,', 'Natural', 'language', 'processing', 'exciting', 'area.', 'Huge', 'budget', 'allocated', 'this.']


In [44]:
# Fetch the English averaged-perceptron tagger data; presumably this is the resource
# nltk.pos_tag needs in the cells below — TODO confirm against the installed NLTK version.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [46]:
# Sentence used to demonstrate POS-aware lemmatization
sentence = "The striped cat jumped over the fence"

# Split the sentence into word tokens, then tag every token with its part of speech
words = nltk.word_tokenize(sentence)
pos_tags = nltk.pos_tag(words)

# Translate a POS tag into the single-letter POS code passed to lemmatize()
def get_wordnet_pos(tag):
    """Return the one-letter POS code for ``tag`` based on its leading letter.

    Tags starting with 'J' map to 'a' (adjective), 'V' to 'v' (verb) and
    'R' to 'r' (adverb). Every other tag, including those starting with 'N',
    maps to 'n' (noun).
    """
    prefix_table = (('J', 'a'), ('V', 'v'), ('R', 'r'))
    for prefix, wordnet_code in prefix_table:
        if tag.startswith(prefix):
            return wordnet_code
    # Nouns and anything unrecognised both fall back to the noun code
    return 'n'

# Lemmatize every (token, tag) pair, passing the POS code derived from its tag
lemmatized_words = [
    lemmatizer.lemmatize(token, get_wordnet_pos(tag))
    for token, tag in pos_tags
]

# Show each original token next to its lemma
for original, lemmatized in zip(words, lemmatized_words):
    print(f"Original: {original}, Lemmatized: {lemmatized}")



Original: The, Lemmatized: The
Original: striped, Lemmatized: striped
Original: cat, Lemmatized: cat
Original: jumped, Lemmatized: jump
Original: over, Lemmatized: over
Original: the, Lemmatized: the
Original: fence, Lemmatized: fence


# POS tagging

In [49]:
# Part-of-speech (POS) tagging labels each token with its grammatical category, which
# helps tell apart words that are spelled the same but are used differently.
# `words` here is whatever the most recently executed tokenization cell assigned.

tagged = nltk.pos_tag(words)
print(tagged)

[('The', 'DT'), ('striped', 'JJ'), ('cat', 'NN'), ('jumped', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('fence', 'NN')]


In [51]:


# Sentence to tag
sentence = "The quick brown fox jumps over the lazy dog"

# Word-level tokens, then one (token, tag) pair per token
words = nltk.word_tokenize(sentence)
pos_tags = nltk.pos_tag(words)

# Report every token with the tag assigned to it
for token, label in pos_tags:
    print(f"Word: {token}, POS Tag: {label}")


Word: The, POS Tag: DT
Word: quick, POS Tag: JJ
Word: brown, POS Tag: NN
Word: fox, POS Tag: NN
Word: jumps, POS Tag: VBZ
Word: over, POS Tag: IN
Word: the, POS Tag: DT
Word: lazy, POS Tag: JJ
Word: dog, POS Tag: NN


## Explanation of some POS tags:

# DT: Determiner
# JJ: Adjective
# NN: Noun, singular or mass
# VBZ: Verb, 3rd person singular present
# IN: Preposition or subordinating conjunction
# This example demonstrates how to tokenize a sentence into words and then tag each word with its corresponding part of speech using NLTK's pos_tag function.

In [54]:
# Named Entity Recognition (NER)

In [60]:
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

# Fetch the NLTK data packages requested for this cell, in a fixed order
for resource in ('maxent_ne_chunker', 'maxent_ne_chunker_tab', 'words',
                 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource)

# Sentence to analyse
sentence = "Barack Obama was born in Hawaii and served as the 44th President of the United States."

# Pipeline: tokens -> (token, tag) pairs -> tree with labelled entity subtrees
words = word_tokenize(sentence)
pos_tags = pos_tag(words)
named_entities = ne_chunk(pos_tags)

# Only subtrees are reported; the remaining top-level items are skipped
for chunk in named_entities:
    if not isinstance(chunk, Tree):
        continue
    entity = " ".join(leaf[0] for leaf in chunk)
    entity_type = chunk.label()
    print(f"Entity: {entity}, Type: {entity_type}")



[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Entity: Barack, Type: PERSON
Entity: Obama, Type: PERSON
Entity: Hawaii, Type: GPE
Entity: United States, Type: GPE


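## Explanation of some entity labels:
# PERSON: a person's name. In the output above "Barack" and "Obama" are reported as two separate PERSON entities, not as a single "Barack Obama".
# GPE: geo-political entity (country, state, city).
# Entity: Hawaii, Type: GPE
# Entity: United States, Type: GPE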

In [63]:
# Chunking

In [65]:
import nltk
from nltk import word_tokenize, pos_tag, RegexpParser
nltk.download('averaged_perceptron_tagger')

# Chunk grammar, one regular-expression rule per phrase type:
#   NP - optional determiner, any number of adjectives, then one NN token
#   PP - a preposition followed by an NP chunk
#   VP - a verb followed by any number of NP or PP chunks
chunk_grammar = r"""
    NP: {<DT>?<JJ>*<NN>}  # Chunk sequences of determiner, adjectives, and nouns as NP
    PP: {<IN><NP>}       # Chunk prepositions followed by NP as PP
    VP: {<VB.*><NP|PP>*}  # Chunk verbs followed by NP or PP as VP
"""

# Build the parser from the grammar before preparing the input
chunk_parser = RegexpParser(chunk_grammar)

# Input text
text = "Mr Alok is take a franchise of Apple Inc. is planning to open a new store in Delhi on January 1, 2022"

# Tokens, then (token, tag) pairs
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

# Group the tagged tokens into chunks and display the resulting tree
chunked_result = chunk_parser.parse(pos_tags)
print(chunked_result)


(S
  Mr/NNP
  Alok/NNP
  (VP is/VBZ)
  (VP take/VB (NP a/DT franchise/NN))
  of/IN
  Apple/NNP
  Inc./NNP
  (VP is/VBZ)
  (VP planning/VBG)
  to/TO
  (VP open/VB (NP a/DT new/JJ store/NN))
  in/IN
  Delhi/NNP
  on/IN
  January/NNP
  1/CD
  ,/,
  2022/CD)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [67]:
# Sentiment analysis

In [69]:
from nltk.sentiment import SentimentIntensityAnalyzer

# One analyzer instance, kept in `sia`
sia = SentimentIntensityAnalyzer()

# Final expression of the cell: the full score dictionary is displayed
sia.polarity_scores(
    "Wow, Natural language processing is an exciting area. Huge budget have been allocated for this."
)

{'neg': 0.0, 'neu': 0.482, 'pos': 0.518, 'compound': 0.8957}

In [71]:
def analyze_sentiment(text, analyzer=None):
    """Classify ``text`` as "Positive", "Negative" or "Neutral" from its compound score.

    Args:
        text: The string to score.
        analyzer: Optional object exposing ``polarity_scores(text)`` that returns a
            mapping with a ``'compound'`` entry. When omitted, a new
            ``SentimentIntensityAnalyzer`` is created for this call. Pass one shared
            instance when scoring many texts so that an analyzer is not constructed
            on every call.

    Returns:
        "Positive" when the compound score is >= 0.05, "Negative" when it is
        <= -0.05, and "Neutral" otherwise.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
    sentiment_score = analyzer.polarity_scores(text)['compound']

    if sentiment_score >= 0.05:
        return "Positive"
    if sentiment_score <= -0.05:
        return "Negative"
    return "Neutral"

In [73]:
# Classify the same sample text used in the tokenization section and print the label
sentiment = analyze_sentiment("Wow, Natural language processing is an exciting area. Huge budget have been allocated for this.")
print(f"Sentiment: {sentiment}")

Sentiment: Positive


In [75]:
pip install gensim


Note: you may need to restart the kernel to use updated packages.


In [77]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Training corpus: a single text entry
sentences = [ 'Wow, Natural language processing is an exciting area. Huge budget have been allocated for this.']

# Lower-case every entry, then split it into word tokens
tokenized_sentences = [word_tokenize(lowered) for lowered in map(str.lower, sentences)]

# Fit a small Word2Vec model on the tokenized corpus
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=10,
    window=2,
    min_count=1,
    workers=1,
)

# Persist the trained model to disk (optional)
model.save("word2vec.model")

# Look up the learned vector for one word and show it
word_vector = model.wv['exciting']
print("Vector representation of 'exciting':", word_vector)


Vector representation of 'exciting': [ 0.07313307  0.05071469  0.0676039   0.00765088  0.06350219 -0.03407759
 -0.00945148  0.05771925 -0.07524488 -0.03937855]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SENAPATHI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
