# Natural Language Processing

## Using NLTK

In [1]:
import nltk

In [4]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/satyapattnaik/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [31]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/satyapattnaik/nltk_data...


True

In [40]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/satyapattnaik/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [47]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/satyapattnaik/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [49]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /Users/satyapattnaik/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [67]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/satyapattnaik/nltk_data...


True

In [87]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/satyapattnaik/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Tokenization

In [2]:
sentence = "We are having a class on Natural Language Processing"

In [5]:
tokens = nltk.word_tokenize(sentence)

In [6]:
tokens

['We',
 'are',
 'having',
 'a',
 'class',
 'on',
 'Natural',
 'Language',
 'Processing']

# Lemmatization/Stemming

- https://en.wikipedia.org/wiki/Stemming
- https://en.wikipedia.org/wiki/Inflection

### Porter
- It works by stripping well-known suffixes from a word

In [7]:
from nltk.stem import PorterStemmer

In [22]:
stemmer = PorterStemmer()

In [14]:
word = "Processing"
second_word = "Processed"
third_word = "Process"

In [10]:
print(stemmer.stem(word))

process


In [12]:
print(stemmer.stem(second_word))

process


In [15]:
print(stemmer.stem(third_word))

process


In [17]:
# Show the stem that the current `stemmer` yields for each inflected form of "play".
words = ["play", "played", "playing", "plays"]
for each_word in words:
    stemmed = stemmer.stem(each_word)
    print(each_word, "-->>>>>", stemmed)

play -->>>>> play
played -->>>>> play
playing -->>>>> play
plays -->>>>> play


In [23]:
# Same comparison for "give": the irregular forms "gave" and "given" have no
# plain suffix to strip, so the recorded output shows them unchanged.
words = ["give", "gave", "given", "giving"]
for each_word in words:
    stemmed = stemmer.stem(each_word)
    print(each_word, "-->>>>>", stemmed)

give -->>>>> give
gave -->>>>> gave
given -->>>>> given
giving -->>>>> give


### Snowball

In [18]:
from nltk.stem import SnowballStemmer

In [19]:
# NOTE: this rebinds `stemmer`, replacing the PorterStemmer instance created earlier.
# Every later `stemmer.stem(...)` call uses whichever stemmer cell was executed last,
# so results depend on cell execution order.
stemmer = SnowballStemmer(language="english")

In [20]:
# Repeat the "play" comparison with the stemmer currently bound to `stemmer`.
words = ["play", "played", "playing", "plays"]
for each_word in words:
    stemmed = stemmer.stem(each_word)
    print(each_word, "-->>>>>", stemmed)

play -->>>>> play
played -->>>>> play
playing -->>>>> play
plays -->>>>> play


In [21]:
# Repeat the "give" comparison with the stemmer currently bound to `stemmer`.
words = ["give", "gave", "given", "giving"]
for each_word in words:
    stemmed = stemmer.stem(each_word)
    print(each_word, "-->>>>>", stemmed)

give -->>>>> give
gave -->>>>> gave
given -->>>>> given
giving -->>>>> give


## Lemmatization

In [24]:
from nltk.stem import WordNetLemmatizer

In [25]:
lemmatizer = WordNetLemmatizer()

In [28]:
sentence = "I am giving a lecture on Natutal Language Processing. After that towards 8pm, I will have dinner."

In [29]:
tokenized_sentence = nltk.word_tokenize(sentence)
print(tokenized_sentence)

['I', 'am', 'giving', 'a', 'lecture', 'on', 'Natutal', 'Language', 'Processing', '.', 'After', 'that', 'towards', '8pm', ',', 'I', 'will', 'have', 'dinner', '.']


In [32]:
# Lemmatize token by token. Each lemma is followed by one space, so the
# resulting string ends with a trailing space.
# lemmatize() is called without a POS argument; NLTK then treats every token
# as a noun, which is why verb forms such as "giving" come back unchanged.
lemmatized_sentence = "".join(
    lemmatizer.lemmatize(token) + " " for token in tokenized_sentence
)

In [33]:
print(lemmatized_sentence)

I am giving a lecture on Natutal Language Processing . After that towards 8pm , I will have dinner . 


In [34]:
# Stem token by token with whatever `stemmer` is currently bound to.
# Each stem is followed by one space, so the result ends with a trailing space.
stemmed_sentence = "".join(
    stemmer.stem(token) + " " for token in tokenized_sentence
)

In [35]:
print(stemmed_sentence)

i am give a lectur on natut languag process . after that toward 8pm , i will have dinner . 


### Stemming - the resulting stem might not be an actual word

In [36]:
stemmer.stem("studies")

'studi'

### Lemmatizer - slower, but the result is an actual word (the lemma)

In [37]:
lemmatizer.lemmatize("studies")

'study'

# POS Tagging

https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk

In [41]:
sentence = "Welcome to SkillVertex"
tokens = nltk.word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)

[('Welcome', 'VB'), ('to', 'TO'), ('SkillVertex', 'NNP')]


In [42]:
sentence = "Welcoming to SkillVertex"
tokens = nltk.word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)

[('Welcoming', 'VBG'), ('to', 'TO'), ('SkillVertex', 'NNP')]


In [43]:
# Print only the tokens whose POS tag is NNP (proper noun, singular).
for token, tag in pos_tags:
    if tag != 'NNP':
        continue
    print(token)

SkillVertex


# Named Entity Recognition

In [60]:
sentence = " Mahatma Gandhi is regarded as the father of the nation."
#sentence = "New Delhi is the capital of India."
#sentence = "Google Invented Transformers. Openai came up with ChatGPT"

In [61]:
tokens = nltk.word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)

[('Mahatma', 'NNP'), ('Gandhi', 'NNP'), ('is', 'VBZ'), ('regarded', 'VBN'), ('as', 'IN'), ('the', 'DT'), ('father', 'NN'), ('of', 'IN'), ('the', 'DT'), ('nation', 'NN'), ('.', '.')]


In [62]:
ner_tags = nltk.ne_chunk(pos_tags)
print(ner_tags)

(S
  (PERSON Mahatma/NNP)
  (ORGANIZATION Gandhi/NNP)
  is/VBZ
  regarded/VBN
  as/IN
  the/DT
  father/NN
  of/IN
  the/DT
  nation/NN
  ./.)


# Sentiment Classification

In [65]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [68]:
sentiment_classifer = SentimentIntensityAnalyzer()

In [69]:
sentence = "This Song is amazing."
sentiment_classifer.polarity_scores(sentence)

{'neg': 0.0, 'neu': 0.441, 'pos': 0.559, 'compound': 0.5859}

In [70]:
sentence = "This Song is horrible."
sentiment_classifer.polarity_scores(sentence)

{'neg': 0.538, 'neu': 0.462, 'pos': 0.0, 'compound': -0.5423}

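The compound score is computed by summing the valence scores of the individual words in the text (adjusted by VADER's rules) and then normalizing that sum to lie between -1 (most extreme negative) and +1 (most extreme positive). The `neg`, `neu` and `pos` values are the proportions of the text that fall into each category, so they add up to 1.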

# Sentence Tokenizer

In [71]:
text = "Arijit Singh is a good Singer. He is from India"

In [72]:
nltk.sent_tokenize(text)

['Arijit Singh is a good Singer.', 'He is from India']

# Remove punctuation

In [73]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [74]:
def remove_puncts(input_string):
    """Return ``input_string`` with all punctuation removed.

    Walks over ``input_string`` item by item (character by character for a
    ``str``), keeps the items that are not found in ``string.punctuation``,
    and joins the survivors back together with no separator.
    """
    punctuations = string.punctuation

    kept = []
    for item in input_string:
        if item in punctuations:
            continue
        kept.append(item)

    return "".join(kept)

In [77]:
text = "Natural Language, Processing is great!"

In [81]:
clean_text = remove_puncts(text)
clean_text

'Natural Language Processing is great'

In [79]:
def get_lower(word):
    """Return the lower-cased form of ``word`` via its ``lower()`` method."""
    lowered = word.lower()
    return lowered

In [82]:
clean_tokens = nltk.word_tokenize(clean_text)
clean_tokens

['Natural', 'Language', 'Processing', 'is', 'great']

In [90]:
# Normalize every cleaned token to lower case.
clean_lower_case_tokens = list(map(get_lower, clean_tokens))

In [91]:
clean_lower_case_tokens

['natural', 'language', 'processing', 'is', 'great']

# Stop Word Removal

In [84]:
from nltk.corpus import stopwords

In [89]:
stop_words_to_be_removed = set(stopwords.words("english"))

In [93]:
# Keep only the tokens that are absent from the stop-word set.
filtered_tokens = list(
    filter(
        lambda token: token not in stop_words_to_be_removed,
        clean_lower_case_tokens,
    )
)

In [94]:
filtered_tokens

['natural', 'language', 'processing', 'great']