In [None]:
import nltk
nltk.download('popular')
nltk.download('universal_tagset')

## Tokenization
- Tokenization is breaking the raw text into small chunks. Tokenization breaks the raw text into words, sentences called tokens.

In [2]:
from nltk.tokenize import sent_tokenize,word_tokenize
example="Hello world! Welcome to this amazing party."
print(f"Sentence tokenizer: {sent_tokenize(example)}")
print(f"Sentence tokenizer: {word_tokenize(example)}")

Sentence tokenizer: ['Hello world!', 'Welcome to this amazing party.']
Sentence tokenizer: ['Hello', 'world', '!', 'Welcome', 'to', 'this', 'amazing', 'party', '.']


## Stop Words Removal
- Stop words are words with very little meaning such as a,an,the,etc.
- Stop words are filtered out before processing  natural language as they don't reveal much information.

In [3]:
from nltk.corpus import stopwords
stop_words=stopwords.words('english')
print("Stop words examples :",stop_words[:10])

Stop words examples : ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [4]:
from nltk.tokenize import word_tokenize

example="An apple a day keeps a doctor away."
tokens=word_tokenize(example)

#removing stop words from tokens
filtered_sentence=[tok for tok in tokens if tok not in stop_words]

print("tokens          :",tokens)
print("filtered tokens :",filtered_sentence)

tokens          : ['An', 'apple', 'a', 'day', 'keeps', 'a', 'doctor', 'away', '.']
filtered tokens : ['An', 'apple', 'day', 'keeps', 'doctor', 'away', '.']


## Stemming
- Stemming involves reducing a word to a base(root) form by removing suffixes 
- There are various stemmng algo: Porter Stemmer, Lancaster stemmer,Snowball Stemmer

In [5]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps=PorterStemmer()
example="Stemming involves reducing a word to a base(root) form by removing suffixes"
tokens=word_tokenize(example)
print("Befor stemming tokens:",example)
print("After stemming tokens: ",end='')
for word in tokens:
    print(ps.stem(word),end=" ")

Befor stemming tokens: Stemming involves reducing a word to a base(root) form by removing suffixes
After stemming tokens: stem involv reduc a word to a base ( root ) form by remov suffix 

## Lemmatization
Lemmatization is a text normalization technique used in Natural Language Processing (NLP), that switches any kind of a word to its base root mode. Lemmatization is responsible for grouping different inflected forms of words into the root form, having the same meaning.

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
example=["cacti","geese"]
print("Befor lemmatizing :",example)
print("After lemmatizing : ",end='')
for ex in example:
    print(lemmatizer.lemmatize(ex),end=", ")

Befor lemmatizing : ['cacti', 'geese']
After lemmatizing : cactus, goose, 

### Stemming v/s Lemmatization
- Stemming is a process that stems or removes last few characters from a word, often leading to incorrect meanings and spelling.	Lemmatization considers the context and converts the word to its meaningful base form, which is called Lemma.
- For instance, stemming the word ‘reducing‘ would return ‘reduc‘ and lemmatizing the word ‘geese‘ would return ‘goose‘.
- Stemming is used in case of large dataset where performance is an issue.	
- Lemmatization is computationally expensive since it involves look-up tables and what not.

## POS Taggging
- Parts Of Speech Tagging is used to tag words with their corresponding parts of speech tag based on its context and definition.
- Pos tags are useful for lemmatization,named entity recognition (NER), and extracting relationshib between words

In [7]:
example="Alice wrote a program"
#tokenizing
tokens=word_tokenize(example)
#removing stop words
filtered_tokens=[token for token in tokens if token not in stop_words]
#pos tagging
print(nltk.pos_tag(filtered_tokens ,tagset='universal'))
print(nltk.pos_tag(filtered_tokens ),"you can get  full list of abbrevations on websites")

[('Alice', 'NOUN'), ('wrote', 'VERB'), ('program', 'NOUN')]
[('Alice', 'NNP'), ('wrote', 'VBD'), ('program', 'NN')] you can get  full list of abbrevations on websites


## Named Entity Recognition (NER)
- NER extracts real world entity from text and sorts it into predefined caegories like name, location, organization,etc.

In [8]:
example="Satya is CEO of Microsoft"
#tokenizing
tokens=word_tokenize(example)
#removing stop words
filtered_tokens=[token for token in tokens if token not in stop_words]
#pos tagging
tagged_sentences=nltk.pos_tag(filtered_tokens)
#Use NLTK's currently recommended named entity chunker to chunk the given list of tagged tokens.
ne_chunk_sentences=nltk.ne_chunk(tagged_sentences)


named_entities=[]
for tagged_tree in ne_chunk_sentences:
    if hasattr(tagged_tree,'label'):
        entity_name=' '.join( [name[0] for name in tagged_tree.leaves()] )
        entity_type=tagged_tree.label()
        named_entities.append((entity_name,entity_type))
        
print('example        :',example)       
print('named_entities :',named_entities)

example        : Satya is CEO of Microsoft
named_entities : [('Satya', 'PERSON'), ('CEO Microsoft', 'ORGANIZATION')]


## NLP Process Workflow

- Tokenization - Stop word removal - Stemming and Lemmatization - POS Tagging - Information Retrieval

In [9]:
import nltk

In [10]:
with open('brown_corpus_ca10.txt','r') as myfile:
    data=myfile.read().lower().replace('\n',' ')

In [11]:
data[:500]

'  \tvincent/np g./np ierulli/np has/hvz been/ben appointed/vbn temporary/jj assistant/nn district/nn attorney/nn ,/, it/pps was/bedz announced/vbn monday/nr by/in charles/np e./np raymond/np ,/, district/nn-tl attorney/nn-tl ./.   \tierulli/np will/md replace/vb desmond/np d./np connall/np who/wps has/hvz been/ben called/vbn to/in active/jj military/jj service/nn but/cc is/bez expected/vbn back/rb on/in the/at job/nn by/in march/np 31/cd ./.   \tierulli/np ,/, 29/cd ,/, has/hvz been/ben practicing/'

In [12]:
#As we can see many words have ending in format /something which dosent make sense, so remove it with regex.
#Example: has/hvz, attorney/nn-tl, etc.
import re
data=re.sub('/[a-z0-9]*\D?\D?[a-z0-9]* ',' ',data)   #This replaces given pattern with space in 'data' text
                                                    #\D Matches any non-digits for cases like '-' or '$'
                                                    # '?' matches 0 or 1 occurence

data=re.sub('\s',' ',data)      # '\s' Matches whitespace characters, which include the \t, \n, \r, and space characters.
data=re.sub("''",'',data)  
data=re.sub("``",'',data) 

In [13]:
data[:500]

'   vincent g. ierulli has been appointed temporary assistant district attorney , it was announced monday by charles e. raymond , district attorney .   ierulli will replace desmond d. connall who has been called to active military service but is expected back on the job by march 31 .   ierulli , 29 , has been practicing in portland since november , 1959 . he is a graduate of portland university and the northwestern college of law . he is married and the father of three children .   helping foreig'

In [14]:
from nltk import sent_tokenize,word_tokenize
word_tokens=word_tokenize(data)

In [15]:
#remove stopwords and punchuations
from nltk.corpus import stopwords
from string import punctuation   #punctuation='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

stopwords_en=stopwords.words('english')
stopwords_with_punctuations=set(stopwords_en).union(set(punctuation)) #combining stopwords and punctuations

filtered_tokens=[token for token in word_tokens if token not in stopwords_with_punctuations]

print(filtered_tokens[:10])

['vincent', 'g.', 'ierulli', 'appointed', 'temporary', 'assistant', 'district', 'attorney', 'announced', 'monday']


In [68]:
from nltk.stem import PorterStemmer,WordNetLemmatizer
ps=PorterStemmer()
wnl=WordNetLemmatizer()

stem_tokens=[ps.stem(token) for token in filtered_tokens ]
wnl_tokens=[wnl.lemmatize(token) for token in filtered_tokens ]   

print("Stemmer    :",stem_tokens[:10])
print("Lemmatizer :",wnl_tokens[:10])

Stemmer    : ['vincent', 'g.', 'ierulli', 'appoint', 'temporari', 'assist', 'district', 'attorney', 'announc', 'monday']
Lemmatizer : ['vincent', 'g.', 'ierulli', 'appointed', 'temporary', 'assistant', 'district', 'attorney', 'announced', 'monday']


In [75]:
#POS Tagging
## Full form of abbrevations : https://www.guru99.com/pos-tagging-chunking-nltk.html#:~:text=POS%20Tagging%20(Parts%20of%20Speech,of%20Speech)%20to%20each%20word.
pos_tagged=nltk.pos_tag(wnl_tokens)
print(pos_tagged[:10])

[('vincent', 'NN'), ('g.', 'NN'), ('ierulli', 'NN'), ('appointed', 'VBD'), ('temporary', 'JJ'), ('assistant', 'NN'), ('district', 'NN'), ('attorney', 'NN'), ('announced', 'VBD'), ('monday', 'JJ')]


In [76]:
chunked_pos_tagged=nltk.ne_chunk(pos_tagged)

In [174]:
for tree in chunked_pos_tagged:
    if hasattr(tree,'label'):
        print(tree,tree.label())
        if tree.label=='NN':
            print(' '.join([ child[0] for child in tree] ))
        else:
            for c in tree:
                print('other',c)
            

(PERSON mr./JJ) PERSON
other ('mr.', 'JJ')
(PERSON mr./NN brandt/NNP) PERSON
other ('mr.', 'NN')
other ('brandt', 'NNP')
(PERSON mr./JJ) PERSON
other ('mr.', 'JJ')
(GPE u.s./JJ) GPE
other ('u.s.', 'JJ')
(GPE u.s./JJ) GPE
other ('u.s.', 'JJ')
(GPE u.s./JJ) GPE
other ('u.s.', 'JJ')


In [207]:
sentences=sent_tokenize(data)
tokenized_sentences=[word_tokenize(sentence) for sentence in sentences]
tagged_sentences=[nltk.pos_tag(sentence) for sentence in tokenized_sentences]
chunked_sentences=nltk.ne_chunk_sents(tagged_sentences,binary=True)

In [208]:
def extract_entity_names(tree):
    entity_names=[]
    
    if hasattr(tree,'label') and tree.label():
        
        if tree.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in tree]) )
        else:
            for child in tree:
                entity_names.extend(extract_entity_names(child))
                
    return entity_names

In [209]:
entity_names=[]
for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))

In [211]:
print(entity_names)

['mr.', 'u.s.', 'u.s.', 'u.s.']


In [225]:
for t in nltk.ne_chunk(nltk.pos_tag(word_tokenize(data))):
    if hasattr(t,'label') and  t.label() !='S':
        print(t.label(),':', ' '.join([ child[0] for child in t]) )

PERSON : mr. zimmerman
GPE : u.s.
GPE : u.s.
GPE : u.s.
