# Natural Language Processing (NLP) with Python

In [None]:
# Read the whole text file into memory. The context manager closes the
# file handle as soon as the read finishes, even if reading raises.
with open("nlp.txt") as text_file:
    text = text_file.read()
# Print the text:
print(text)

## Exploring Features of NLTK:

In [None]:
#Import required libraries :
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

In [None]:
# Sentence tokenization: split the raw text into a list of sentences.
sentences = sent_tokenize(text)
# Report the sentence count on the first line and the sentences on the next.
print(len(sentences), sentences, sep="\n")

In [None]:
# Word tokenization: split the raw text into a list of tokens.
words = word_tokenize(text)
# Report the token count on the first line and the tokens on the next.
print(len(words), words, sep="\n")

In [None]:
# The FreqDist class is used to encode "frequency distributions":
# it counts the number of times that each outcome of an experiment occurs.
# Here the outcomes are the tokens in `words`; no filtering has been applied yet.
from nltk.probability import FreqDist
fdist = FreqDist(words)
fdist.most_common(5)                  # The 5 most common tokens (shown as the cell's result)

In [None]:
fdist.plot(5)                         # Plot the frequency graph for fdist, limited to 5 samples

In [None]:
# Remove punctuation marks: keep only purely alphabetic tokens and
# lower-case them so that "The" and "the" are counted as the same word.
words_no_punc = [token.lower() for token in words if token.isalpha()]
print(len(words_no_punc))               # Number of remaining words
print(words_no_punc)                    # The words without punctuation marks

In [None]:
# Recompute the frequency distribution on the lower-cased, alphabetic-only words.
fdist = FreqDist(words_no_punc)
fdist.most_common(5)                     # The 5 most common words (shown as the cell's result)

In [None]:
fdist.plot(5)                            # Plot the most common words on a graph

In [None]:
# Import the corpus reader under an alias so that binding the word list to
# `stopwords` below does not overwrite the reader itself; this keeps the
# cell safe to re-run.
from nltk.corpus import stopwords as stopwords_corpus
# List of English stopwords
stopwords = stopwords_corpus.words("english")
print(stopwords)

In [None]:
# Remove stopwords.
# Build a set once so each membership test is a hash lookup instead of a
# scan over the whole stopword list for every word.
stopword_set = set(stopwords)
clean_words = [token for token in words_no_punc if token not in stopword_set]
print(len(clean_words))
print(clean_words)

In [None]:
# Frequency distribution of the words left after stopword removal:
fdist = FreqDist(clean_words)
fdist.most_common(5)                      # The 5 most common words (shown as the cell's result)

In [None]:
fdist.plot(5)                             # Plot the most common words on a graph

## Word Cloud
A word cloud is a technique for visualising the frequent words in a text, where the size of each word represents its frequency.

In [None]:
# Library that builds the word cloud:
from wordcloud import WordCloud
# Library used to plot the word cloud:
import matplotlib.pyplot as plt
# Generate the word cloud from the raw text, using WordCloud's default settings:
wordcloud = WordCloud().generate(text)
# Plot the word cloud on a 12 x 12 figure:
plt.figure(figsize = (12, 12)) 
plt.imshow(wordcloud) 
# Turn the axes off so only the image is shown:
plt.axis("off") 
plt.show()

## Stemming
Stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base, or root form. We use stemming to normalize words.

In [None]:
# Stemming example with the Porter stemmer.
from nltk.stem import PorterStemmer
porter = PorterStemmer()
# Capitalised inflections of the same verb:
word_list = ["Carry", "Carrying", "Carried"]
# Print the stem of each word on its own line.
for stem in map(porter.stem, word_list):
    print(stem)

In [None]:
# Stemming example with the Porter stemmer on a mixed word list.
from nltk.stem import PorterStemmer
porter = PorterStemmer()
# Words to stem:
word_list = ["carry", "eating", "decreases", "reading"]
# One stem per output line.
print("\n".join(porter.stem(token) for token in word_list))

In [None]:
# Snowball stemmer: inspect the languages it supports.
# Import the stemming library:
from nltk.stem import SnowballStemmer
# The supported languages (shown as the cell's result):
SnowballStemmer.languages

In [None]:
# Stemming example with the English Snowball stemmer.
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer("english")
# Inflections of the same verb:
word_list = ["carry", "carrying", "carried"]
# Print each stem on its own line.
print(*map(snowball.stem, word_list), sep="\n")

## Lemmatization
Lemmatization, in linguistics, is the process of grouping together the inflected forms of a word so that they can be analysed as a single item, identified by the word's lemma (its dictionary form).

In [None]:
from nltk import WordNetLemmatizer

# Lemmatize three inflections of one verb, treating each as a verb (pos="v").
lemma = WordNetLemmatizer()
word_list = ["carry", "carrying", "carried"]
for inflected in word_list:
    base_form = lemma.lemmatize(inflected, pos="v")
    print(base_form)

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# pos: part of speech. Lemmatize the same word under each tag in turn:
# "v" = verb, "n" = noun, "a" = adjective, "r" = adverb.
print(
    *(lemmatizer.lemmatize('reading', pos=tag) for tag in ("v", "n", "a", "r")),
    sep="\n",
)

## Part-of-Speech (PoS) tagging

In [None]:
# print(nltk.pos_tag.__doc__)

In [None]:
# PoS tagging example:

sentence = "Almost every sentence contains at least one noun phrase."

# Tokenize the sentence into words:
tokenized_words = word_tokenize(sentence)

# nltk.pos_tag tags the whole token list in one call, so no loop is needed.
# (Looping re-tagged the full sentence once per token and rebound the
# global `words` token list to a single string.)
tagged_words = nltk.pos_tag(tokenized_words)
tagged_words

## WordNet

In [None]:
# Import wordnet.
# Run nltk.download('wordnet') once if the corpus is not installed yet.
from nltk.corpus import wordnet

# Print every synset found for "Fun". The loop variable is named `synset`
# so that it does not overwrite the `words` token list built earlier.
for synset in wordnet.synsets("Fun"):
    print(synset)

In [None]:
# Word meanings with definitions.
# For each synset of "Fun", print its name, definition, usage examples and
# its lemmas (the corresponding synonyms).
# The loop variables are named `synset` / `synset_lemma` so they do not
# overwrite the `words` token list or the `lemma` lemmatizer defined earlier.
for synset in wordnet.synsets("Fun"):
    print(synset.name())
    print(synset.definition())
    print(synset.examples())
    for synset_lemma in synset.lemmas():   # Corresponding synonyms
        print(synset_lemma)
    print("....")

In [None]:
# Finding synonyms: collect the name of every lemma of every synset of 'Fun'.
# Loop names are chosen so they do not overwrite the earlier `words` token
# list or `lemma` lemmatizer.
synonyms = []
for synset in wordnet.synsets('Fun'):
    synonyms.extend(synset_lemma.name() for synset_lemma in synset.lemmas())

synonyms

In [None]:
# Finding antonyms: for every lemma of every synset of 'Good' that has at
# least one antonym, record the name of its first antonym.
# Loop names are chosen so they do not overwrite the earlier `words` token
# list or `lemma` lemmatizer.
antonyms = []
for synset in wordnet.synsets('Good'):
    for synset_lemma in synset.lemmas():
        opposites = synset_lemma.antonyms()   # look up once, reuse below
        if opposites:
            antonyms.append(opposites[0].name())
# The collected antonyms (shown as the cell's result):
antonyms

In [None]:
# Similarity between words: take the first noun synset of each term.
word1, word2 = (wordnet.synsets(term, pos="n")[0] for term in ("ship", "boat"))
# Wu-Palmer similarity of the two synsets:
print(word1.wup_similarity(word2))

In [None]:
# Similarity between words: take the first noun synset of each term.
word1, word2 = (wordnet.synsets(term, pos="n")[0] for term in ("ship", "bat"))
# Wu-Palmer similarity of the two synsets:
print(word1.wup_similarity(word2))

## Integer_Encoding and One_Hot_Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot']
print(data)
# Integer encode: map each distinct label to an integer code.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(data)
print(integer_encoded)
# One-hot encode, asking for a dense array rather than a sparse matrix.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old name was later removed, so try the new name first and fall back
# to the old one on releases that do not know it.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
# The encoder expects a 2-D input: turn the codes into a single-column matrix.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

## Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

sentences = ['Shyam is a bad boy','Ram is a good boy']
cv = CountVectorizer()                              # Create the vectorizer
B_O_W = cv.fit_transform(sentences).toarray()       # Dense Bag of Words matrix
print(cv.vocabulary_)
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. It returns an array, so convert it to a list to keep
# the printed output in the familiar list form.
print(cv.get_feature_names_out().tolist())
print('..........Bag of Words...........')
print(B_O_W)

In [None]:
# n-gram Bag of Words: ngram_range=(1,2) counts single words and word pairs.
sentences = ['Shyam is a good boy','Ram is not a good boy']
cv = CountVectorizer(ngram_range=(1,2))             # Create the vectorizer
B_O_W = cv.fit_transform(sentences).toarray()       # Dense Bag of Words matrix
print(cv.vocabulary_)
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. It returns an array, so convert it to a list to keep
# the printed output in the familiar list form.
print(cv.get_feature_names_out().tolist())
print('..........Bag of Words...........')
print(B_O_W)

## TF-IDF (Term Frequency Inverse Document Frequency)

In [None]:
# Import required libraries:
from sklearn.feature_extraction.text import TfidfVectorizer
# Sentences for analysis:
sentences = ['it is a good book', 'it is not a good book', 'It is a very good book', 'It is very very good book']
# Create the vectorizer; norm=None leaves the TF-IDF rows un-normalized:
vectorizer = TfidfVectorizer(norm = None)
# Generate the dense TF-IDF matrix:
X = vectorizer.fit_transform(sentences).toarray()
# Words with their column index in the model:
print(vectorizer.vocabulary_)
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. It returns an array, so convert it to a list to keep
# the printed output in the familiar list form.
print(vectorizer.get_feature_names_out().tolist())
# Show the output:
print(X)