
# Text Preprocessing Using `NLTK` on a Given Dataset

## 19BCE221 || `IRS` || Practical 2
### Text Preprocessing using NLTK.
#### Visualization 
#### Word Cloud 
#### Histogram of top N frequent terms

In [None]:
import matplotlib.pyplot as plt
import string
import numpy as np
import pandas as pd 
import os

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [None]:
# Empty frame; later cells add the raw text and one column per preprocessing stage.
data=pd.DataFrame()
# Training and test CSVs, read from paths relative to the notebook's working directory.
# NOTE(review): df_test is not referenced by any other cell visible in this notebook.
df=pd.read_csv('../input/nlp-getting-started/train.csv')
df_test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Print the frame's column summary (dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Keep only the first 200 texts, cast to str so every value supports string methods.
data['text'] = df['text'].iloc[:200].astype('str')

# Concatenate the first 7 texts into one sample string. Each text is prefixed with a
# single space (so the result starts with a space, exactly as before).
# Positional .iloc slicing is used instead of integer-label lookups, so this neither
# depends on the index labels nor raises when fewer than 7 rows are available.
sampledata = ''.join(' ' + text for text in data['text'].iloc[:7])

# Show a preview rather than the whole 200-row frame.
data.head()

In [None]:
# Split the sample text into sentences; each sentence is tokenised separately below.
sentences = sent_tokenize(sampledata)
data1 = pd.DataFrame()

# Shared with later cells: NLTK's English stop-word list and the ASCII punctuation characters.
stop_words = set(stopwords.words("english"))
punc = string.punctuation
punc_chars = set(punc)

# Collect, per sentence, the distinct tokens that are neither stop words nor single
# punctuation characters.
#  * dict.fromkeys() de-duplicates within a sentence while keeping first-seen order;
#    the previous set arithmetic produced an order that could change between kernel
#    restarts, which made data1 (and anything built from it) non-reproducible.
#  * The stop-word test is done on the lower-cased token so capitalised stop words
#    such as "The" or "Our" are removed too; the token itself keeps its original case.
no_sw = []
for sentence in sentences:
    for token in dict.fromkeys(word_tokenize(sentence)):
        if token.lower() not in stop_words and token not in punc_chars:
            no_sw.append(token)

data1['Tokens'] = no_sw

# Stemmers reused by the following cells.
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [None]:
# Stem every filtered token with two algorithms so the results can be compared
# side by side:
#   1) Porter   2) Lancaster
ps = [porter.stem(tok) for tok in no_sw]
lc = [lancaster.stem(tok) for tok in no_sw]

data1['Porter'] = ps
data1['Lancaster'] = lc
data1.head()

In [None]:
# Strip punctuation from every text.
# The translation table does not depend on the row, so it is built once instead of
# once per text.
punc_table = str.maketrans('', '', punc)
no_puncts = [text.translate(punc_table) for text in data['text']]

# Assign the list directly: values are placed row by row, so the result does not
# depend on `data` having a default 0..n-1 index (wrapping the list in a DataFrame
# would align on index labels and yield NaN for any label that does not match).
data["no_punc"] = no_puncts
data.head()

In [None]:
# Remove stop words from the punctuation-free texts.
# Words are compared in lower case because the stop-word list is lower-case; the
# words that are kept retain their original casing.
no_stops = [
    ' '.join(word for word in str(text).split() if word.lower() not in stop_words)
    for text in data['no_punc']
]

# Direct list assignment is positional, so it cannot be misaligned by index labels.
data['no_punc_stops'] = no_stops
# Show a preview rather than the whole frame.
data.head()

In [None]:
# Stem each word of the cleaned texts with both stemmers and re-join the words
# with single spaces.
por = [
    ' '.join(porter.stem(word) for word in str(text).split())
    for text in data['no_punc_stops']
]
lan = [
    ' '.join(lancaster.stem(word) for word in str(text).split())
    for text in data['no_punc_stops']
]

# Direct list assignment is positional; wrapping the lists in a DataFrame would
# align on index labels and silently produce NaN if `data` had a non-default index.
data['porter'] = por
data['lancaster'] = lan
# Show a preview rather than the whole frame.
data.head()

# More towards text processing

## Tokenization

In [None]:
# One simple example of a tokenizer: split a short string into word tokens.
from nltk.tokenize import word_tokenize
str2 = "Rathod mayur 19bce221"
quotes_tokens1 = word_tokenize(str2)
quotes_tokens1

In [None]:
# Sample text (two paragraphs separated by a blank line) used by the tokenisation,
# frequency-distribution and punctuation-removal cells below.
str1 = """According to the father of Artificial Intelligence, John McCarthy, it is “The science and engineering of making intelligent machines, 
especially intelligent computer programs”.Artificial Intelligence is a way of making a computer, a computer-controlled robot, 
or a software think intelligently, in the similar manner the intelligent humans think. 

AI is accomplished by studying how human brain thinks, and how humans learn, decide, and work while trying to solve a problem, 
and then using the outcomes of this study as a basis of developing intelligent software and systems."""

# Display the type of the sample text object.
type(str1)

In [None]:
# Tokenise the sample text into word tokens (word_tokenize is re-imported here).
from nltk.tokenize import word_tokenize
str1_tokens = word_tokenize(str1)
str1_tokens

In [None]:
# Total number of tokens produced from the sample text (duplicates included).
len(str1_tokens)

In [None]:
from nltk.probability import FreqDist

# Frequency distribution of the case-folded tokens. Passing the iterable to the
# constructor tallies each token in order, the same as incrementing one key at a time.
fdist = FreqDist(tok.lower() for tok in str1_tokens)
fdist

In [None]:
# Count recorded for the lower-cased token 'humans'.
print(fdist['humans'])

# Number of distinct lower-cased tokens in the distribution.
print(len(fdist))

# Author's note: len(fdist) is 61, compared with 101 before counting
# (presumably the total token count, len(str1_tokens) — TODO confirm).

In [None]:
# blankline_tokenize splits the text into paragraphs at blank lines, so any
# particular paragraph can be picked out by index.
from nltk.tokenize import blankline_tokenize
str_blank = blankline_tokenize(str1)
# Number of paragraphs found.
print(len(str_blank))

# First paragraph.
str_blank[0]

In [None]:
# Helpers for building n-grams, plus a sample sentence tokenised for the next cells.
from nltk.util import bigrams, trigrams, ngrams
str3 = "The best and the most beautiful things in the world cannot be seen or even touched, they must be felt with the heart"
quotes_tokens = word_tokenize(str3)
quotes_tokens

In [None]:
# Bigrams of the sample sentence's tokens, materialised as a list.
quotes_bigrams = list(bigrams(quotes_tokens))
quotes_bigrams

In [None]:
# Trigrams of the sample sentence's tokens, materialised as a list.
quotes_trigrams = list(trigrams(quotes_tokens))
quotes_trigrams

In [None]:
# General n-grams with N = 5, using the ngrams helper imported together with
# bigrams and trigrams.
quotes_ngrams = list(ngrams(quotes_tokens, 5))
quotes_ngrams

## Stemming

In [None]:
#  Stemming : Normalize words into their base or root form (the stem).
#  Ex : Affectation, Affects, Affections, Affected ====> Affect 

#  Three Type : 
#      PorterStemmer
#      LancasterStemmer
#      SnowballStemmer (In this we tell language name)

In [None]:
# The three stemmers compared in this section (Porter and Lancaster are re-imported here).
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

pst = PorterStemmer()
lst = LancasterStemmer()
# The Snowball stemmer is constructed with a language name.
sbts = SnowballStemmer('english')

# Compare the Porter and Lancaster stems of the same word.
print(pst.stem("having"))
print(lst.stem("having"))

In [None]:
# Print "word:stem" for each sample word under the Porter, Lancaster and Snowball
# stemmers in turn, with a dashed separator line between the stemmers' outputs.
word_to_stem = ["give", "giving", "given", "gave"]
for position, stemmer in enumerate((pst, lst, sbts)):
    if position > 0:
        print("------------------")
    for words in word_to_stem:
        print(words + ":" + stemmer.stem(words))

## Lemmatization

In [None]:
# Lemmatisation in linguistics is the process of grouping together the inflected forms of a word 
# so they can be analysed as a single item, identified by the word's lemma.

# Somewhat similar to stemming, as it maps several words onto one common root.
# Unlike stemming, the output of lemmatisation is a proper word.
# Ex : gone, going => go.

In [None]:
# Create a WordNet lemmatizer and try it on a single word.
# NOTE(review): the `wordnet` import is not used by any cell visible in this notebook.
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer
word_len = WordNetLemmatizer()
word_len.lemmatize("corpora")

In [None]:
# Print "word:lemma" for each sample word.
for words in word_to_stem:
    print(words, word_len.lemmatize(words), sep=":")

# Author's note: each word comes back unchanged because no POS (part-of-speech)
# argument is passed to lemmatize().

## Stop Words

In [None]:
# Stop words are useful in the English language but usually not useful in
# natural language processing (NLP).
from nltk.corpus import stopwords
# Number of entries in NLTK's English stop-word list.
print(len(stopwords.words('english')))

In [None]:
# Display the English stop-word list itself.
stopwords.words('english')

In [None]:
# Pattern used by the next cell to strip punctuation characters and digits from the
# tokens (it does not remove stop words).
# The character class matches - . ? ! , : ; ( ) [ and the digits 0-9.
# NOTE(review): the literal '[' inside the class may be a typo for '|' — confirm.
import re
punctuation = re.compile(r'[-.?!,:;()[0-9]')

In [None]:
# Remove every character matched by `punctuation` from each token and keep only
# the tokens that are still non-empty afterwards.
after_puncturation = [
    cleaned
    for cleaned in (punctuation.sub("", tok) for tok in str1_tokens)
    if len(cleaned) > 0
]

after_puncturation

### POS = Parts of Speech


In [None]:
# Grammar categories: verbs, prepositions, nouns, articles, adjectives, conditionals.
# Print NLTK's help text for the Penn Treebank tag set.
nltk.help.upenn_tagset()

In [None]:
# Sample sentence for part-of-speech tagging, split into word tokens.
sentance = "Mayur is a honest when it comes to coding"
sentance_tokens = word_tokenize(sentance)
sentance_tokens

In [None]:
# Tag the whole sentence in a single call so the tagger can use the neighbouring
# words as context; tagging one-token lists treats every word as if it stood alone.
# Each (token, tag) pair is still printed on its own line as a one-element list,
# matching the previous output format.
for tagged_token in nltk.pos_tag(sentance_tokens):
    print([tagged_token])

### Chunking

Chunking means picking up individual pieces of information and grouping them into bigger pieces.

# Word Clouds

Hi! This is Rathod Mayur | `19bce221` || This is part of my 2nd practical for the IRS course.

In [None]:
# Imports for the word-cloud section (numpy and pyplot are re-imported here).
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator


# Load the mask image as a numpy array and preview it.
background_image = np.array(Image.open('/kaggle/input/image/masks-wordclouds/upvote.png'))
plt.imshow(background_image)

In [None]:
# Word cloud of the Porter stems, shaped by the mask image.
# width/height are not passed: WordCloud ignores them whenever a mask is supplied
# and uses the mask's shape instead.
word_cloud2 = WordCloud(background_color='white',
                        mask=background_image).generate(" ".join(data1['Porter']))
# Font colours taken from the mask image.
img_colors = ImageColorGenerator(background_image)
word_cloud2.recolor(color_func=img_colors)

# Save under its own file name: the later word-cloud cell writes 'photo.png',
# which previously overwrote this image.
word_cloud2.to_file('porter_wordcloud.png')

# Show the word cloud.
plt.imshow(word_cloud2, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Load the second mask image as a numpy array and preview it.
background_image2 = np.array(Image.open('/kaggle/input/image/masks-wordclouds/user.png'))
plt.imshow(background_image2)

In [None]:
# Read the text file first. The with-block closes the file handle as soon as the
# contents are read (the bare open(...).read() never closed it).
with open('../input/txtfile/Week1Summary.txt', 'r') as summary_file:
    textfile = summary_file.read()

# Word cloud of the file's text, shaped by the second mask image.
# width/height are not passed: WordCloud ignores them whenever a mask is supplied
# and uses the mask's shape instead.
word_cloud2 = WordCloud(background_color='white',
                        mask=background_image2).generate(textfile)
# Font colours taken from the mask image.
img_colors = ImageColorGenerator(background_image2)
word_cloud2.recolor(color_func=img_colors)

# Save the image.
word_cloud2.to_file('photo.png')

# Show the word cloud with a title.
plt.imshow(word_cloud2, interpolation='bilinear')
title = 'Most common words in this file'
plt.title(title, fontdict={'size': 25, 'color': 'green',
                           'verticalalignment': 'bottom'})
plt.axis("off")
plt.show()

In [None]:
# Advantages of Word Clouds : 
#     Analyzing customer and employee feedback.
#     Identifying new SEO keywords to target.

# Drawbacks of Word Clouds : 
#     Word Clouds are not perfect for every situation.
#     Data should be optimized for context.