In [1]:
import nltk
import string
import re

## Step-1
### Convert text to lower case.

In [2]:
def doLowercase(text):
    return text.lower()

inputStr="I am doing a good JOB here.I can do ANyting,Today!!"
doLowercase(inputStr)

'i am doing a good job here.i can do anyting,today!!'

## Step-2
### Remove numbers.

In [3]:
def remove_num(text):
    return re.sub(r'\d+','',text)
inputStr = "You bought 6 candies from shop, and 4 candies are in home."
remove_num(inputStr)

'You bought  candies from shop, and  candies are in home.'

### Or convert number to text using inflect library

In [5]:
!pip install inflect

Collecting inflect
  Downloading inflect-5.3.0-py3-none-any.whl (32 kB)
Installing collected packages: inflect
Successfully installed inflect-5.3.0


In [11]:
import inflect
q=inflect.engine()

def convert_num(text):
    temp_string=text.split()
    
    new_str=[]
    
    for word in temp_string:
        if word.isdigit():
            temp=q.number_to_words(word)
            new_str.append(temp)
        else:
            new_str.append(word)
    temp_str=' '.join(new_str)
    return temp_str
input_str = 'You bought 6 candies from shop, and 4 candies are in home.'
convert_num(input_str)

'You bought six candies from shop, and four candies are in home.'

## Step-3 
### Remove Punctuation


In [13]:
def rem_punc(text):
    translator=str.maketrans('','',string.punctuation)
    return text.translate(translator)
input_str = "Hey, Are you excited??, After a week, we will be in Shimla!!!"
rem_punc(input_str)

'Hey Are you excited After a week we will be in Shimla'

## Step-4
### Remove Stop words

Words that conributes a negligible probabiliy to the context eg. is, the ,a etc.

In [15]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

nltk.download('stopwords')
nltk.download('punkt')

def rem_stop_words(text):
    stop_words=set(stopwords.words('english'))
    word_tokens=word_tokenize(text)
    filtered_text=[word for word in word_tokens if word not in stop_words]
    return filtered_text
input_text = "Data is the new oil. A.I is the last invention"
rem_stop_words(input_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Data', 'new', 'oil', '.', 'A.I', 'last', 'invention']

## Step-4
### Stemming
From Stemming we will process of getting the root form of a word.

In [18]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
stem1=PorterStemmer()

def stem_words(text):
    words_tokens=word_tokenize(text)
    stems=[stem1.stem(word) for word in words_tokens]
    return stems

text = 'Data is the new revolution in the World, in a day one individual would generate terabytes of data.'
stem_words(text)

['data',
 'is',
 'the',
 'new',
 'revolut',
 'in',
 'the',
 'world',
 ',',
 'in',
 'a',
 'day',
 'one',
 'individu',
 'would',
 'gener',
 'terabyt',
 'of',
 'data',
 '.']

## Step-5
### Lemmatization
As stemming, lemmatization do the same but the only difference is that lemmatization ensures that root word belongs to the language. Because of the use of lemmatization we will get the valid words. 

In [21]:
from nltk.stem import wordnet
from nltk.tokenize import word_tokenize

lemma=wordnet.WordNetLemmatizer()

nltk.download('wordnet')

def lemmatize_word(text):
    word_tokens=word_tokenize(text)
    lemmas=[lemma.lemmatize(word,pos='v') for word in word_tokens]
    return lemmas
text = 'Data is the newly revolution in the World, in a days one individual would generate terabytes of data.'
lemmatize_word(text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['Data',
 'be',
 'the',
 'newly',
 'revolution',
 'in',
 'the',
 'World',
 ',',
 'in',
 'a',
 'days',
 'one',
 'individual',
 'would',
 'generate',
 'terabytes',
 'of',
 'data',
 '.']

## Step-6
### Parts of Speech (POS) Tagging
The pos(parts of speech) explain you how a word is used in a sentence. In the sentence, a word have different contexts and semantic meanings.

In [23]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

def pos_tagg(text):
    word_tokens = word_tokenize(text) 
    return pos_tag(word_tokens)
pos_tagg('Are you afraid of something?')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('Are', 'NNP'),
 ('you', 'PRP'),
 ('afraid', 'IN'),
 ('of', 'IN'),
 ('something', 'NN'),
 ('?', '.')]

### Chunking

Chunking is the process of extracting phrases from the Unstructured text and give them more structure to it. We also called them shallow parsing.We can do it on top of pos tagging. It groups words into chunks mainly for noun phrases. chunking we do by using regular expression

In [24]:
from nltk.tokenize import word_tokenize  
from nltk import pos_tag 

def chunking(text, grammar): 
    word_tokens = word_tokenize(text) 
  
    # label words with pos 
    word_pos = pos_tag(word_tokens) 
  
    # create chunk parser using grammar 
    chunkParser = nltk.RegexpParser(grammar) 
  
    # test it on the list of word tokens with tagged pos 
    tree = chunkParser.parse(word_pos) 
      
    for subtree in tree.subtrees(): 
        print(subtree) 
    #tree.draw() 
      
sentence = 'the little red parrot is flying in the sky'
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunking(sentence, grammar) 

(S
  (NP the/DT little/JJ red/JJ parrot/NN)
  is/VBZ
  flying/VBG
  in/IN
  (NP the/DT sky/NN))
(NP the/DT little/JJ red/JJ parrot/NN)
(NP the/DT sky/NN)


### Named Entity Recognition

It is used to extract information from unstructured text. It is used to classy the entities which is present in the text into categories like a person, organization, event, places, etc. This will give you a detail knowledge about the text and the relationship between the different entities.

In [25]:
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, ne_chunk 
nltk.download('maxent_ne_chunker')
nltk.download('words')
  
def ner(text): 
    # tokenize the text 
    word_tokens = word_tokenize(text) 
  
    # pos tagging of words 
    word_pos = pos_tag(word_tokens) 
  
    # tree of word entities 
    print(ne_chunk(word_pos)) 
  
text = 'Brain Lara scored the highest 400 runs in a test match which played in between WI and England.'
ner(text) 

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...


(S
  (PERSON Brain/NNP)
  (PERSON Lara/NNP)
  scored/VBD
  the/DT
  highest/JJS
  400/CD
  runs/NNS
  in/IN
  a/DT
  test/NN
  match/NN
  which/WDT
  played/VBD
  in/IN
  between/IN
  (ORGANIZATION WI/NNP)
  and/CC
  (GPE England/NNP)
  ./.)


[nltk_data]   Unzipping corpora\words.zip.


## Word Count

.By word frequency we can find out how many times each tokens appear in the text. When talking about word frequency, we distinguished between types and tokens.Types are the distinct words in a corpus, whereas tokens are the words, including repeats.

In [26]:
from nltk.tokenize.regexp import WhitespaceTokenizer
m = "'There is no need to panic. We need to work together, take small yet importat measures to ensure self-protection,' the Prime Minister tweeted."
tokens = WhitespaceTokenizer().tokenize(m)
print(len(tokens))

23


In [30]:
my_vocab = set(tokens)
print(len(my_vocab))

20


In [28]:
my_st = "'There is no need to panic. We need to work together, take small yet important measures to ensure self-protection,' the Prime Minister tweeted."
from nltk.tokenize.regexp import WordPunctTokenizer
m_t = WordPunctTokenizer().tokenize(my_st)

print(len(m_t))

30


In [29]:
my_vocab = set(tokens)
print(len(my_vocab))

20


## Frequency distribution

What is Frequency distribution? This is basically counting words in your texts.To give a brief example of how it works,


In [31]:
#from nltk.book import *
import nltk
#nltk.download('gutenberg')
print("\n\n\n")
text1 = "'There is no need to panic. We need to work together, take small yet important measures to ensure self-protection,' the Prime Minister tweeted."
freqDist = nltk.FreqDist(word_tokenize(text1))
print(freqDist)





<FreqDist with 23 samples and 28 outcomes>


In [32]:
print(freqDist["person"])
print(freqDist["need"])

0
2
