# Experiment 1 :
<b>Perform the following tasks using NLTK: tokenize and tag some text, identify named entities, display a parse tree, and find the ambiguity of a sentence using its parse trees.</b>

In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk
from nltk.corpus import gutenberg, stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tree import Tree
from nltk.parse import RecursiveDescentParser
import string

# Work on the first 999 characters of 'austen-emma.txt' only; the slice is by
# character count, so the last sentence may be cut off mid-word.
sentences = gutenberg.raw('austen-emma.txt')[:999]

# Sentence tokenization: split the raw excerpt with nltk.sent_tokenize.
sent_tokens = sent_tokenize(sentences)
divider = '-' * 50
print(divider + "\nTOKENIZED SENTENCES\n" + divider)
for number, raw_sentence in enumerate(sent_tokens, start=1):
    # Replace embedded line breaks so every sentence prints on one line.
    flattened = raw_sentence.replace('\n', ' ')
    print(f"{number}. {flattened}")

# Word tokenization: delete punctuation from each sentence, then split it
# into words with nltk.word_tokenize. `word_tokens` ends up as one list of
# tokens per sentence and is reused by the later processing steps.
word_tokens = []
print()
print('-'*50 + "\nTOKENIZED WORDS\n" + '-'*50)

# The translation table only depends on string.punctuation, so build it once
# instead of once per sentence.
translator = str.maketrans('', '', string.punctuation)

for sentence in sent_tokens:
    # NOTE: punctuation is removed *before* tokenizing, so hyphens and
    # apostrophes vanish and the surrounding characters fuse into one token
    # (e.g. "twenty-one" becomes "twentyone").
    sentence = sentence.translate(translator)
    word_tokens.append(word_tokenize(sentence))
    print(word_tokens[-1])

# Stop-word removal: drop every token that appears in NLTK's English
# stop-word list (nltk.corpus.stopwords).
stops = set(stopwords.words('english'))
print()
print('-'*50 + "\nSENTENCES WITHOUT STOPWORDS\n" + '-'*50)
for number, sentence_tokens in enumerate(word_tokens, start=1):
    # Tokens are lower-cased for the membership test only; the surviving
    # tokens keep their original casing.
    kept = [word for word in sentence_tokens if word.lower() not in stops]
    print(f"{number}. {' '.join(kept)}")

# Stemming: map every token to its stem with nltk.stem.PorterStemmer and
# print each sentence as a space-joined string of stems.
stemmer = PorterStemmer()
print()
print('-'*50 + "\nSENTENCES AFTER STEMMING\n" + '-'*50)
for number, sentence_tokens in enumerate(word_tokens, start=1):
    stems = [stemmer.stem(word) for word in sentence_tokens]
    print(f"{number}. {' '.join(stems)}")

# Lemmatization: map every token to its lemma with
# nltk.stem.WordNetLemmatizer (called without a part-of-speech argument) and
# print each sentence as a space-joined string of lemmas.
lemmatizer = WordNetLemmatizer()
print()
print('-'*50 + "\nSENTENCES AFTER LEMMATIZATION\n" + '-'*50)
for number, sentence_tokens in enumerate(word_tokens, start=1):
    lemmas = map(lemmatizer.lemmatize, sentence_tokens)
    print(f"{number}. {' '.join(lemmas)}")


--------------------------------------------------
TOKENIZED SENTENCES
--------------------------------------------------
1. [Emma by Jane Austen 1816]  VOLUME I  CHAPTER I   Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence; and had lived nearly twenty-one years in the world with very little to distress or vex her.
2. She was the youngest of the two daughters of a most affectionate, indulgent father; and had, in consequence of her sister's marriage, been mistress of his house from a very early period.
3. Her mother had died too long ago for her to have more than an indistinct remembrance of her caresses; and her place had been supplied by an excellent woman as governess, who had fallen little short of a mother in affection.
4. Sixteen years had Miss Taylor been in Mr. Woodhouse's family, less as a governess than a friend, very fond of both daughters, but particularly of Emma.
5. Between _t

In [2]:
# POS tagging: label every token of each sentence with its part of speech
# using nltk.pos_tag. `tags` collects one list of (word, tag) pairs per
# sentence, in the same order as `word_tokens`.
print('-'*50 + "\nTAGGED TOKENS\n" + '-'*50)
tags = []
for number, sentence_tokens in enumerate(word_tokens, start=1):
    tagged_sentence = pos_tag(sentence_tokens)
    tags.append(tagged_sentence)
    # The trailing "\n" leaves a blank line between sentences.
    print(f"{number}. {tagged_sentence}\n")

--------------------------------------------------
TAGGED TOKENS
--------------------------------------------------
1. [('Emma', 'NN'), ('by', 'IN'), ('Jane', 'NNP'), ('Austen', 'NNP'), ('1816', 'CD'), ('VOLUME', 'NNP'), ('I', 'PRP'), ('CHAPTER', 'VBP'), ('I', 'PRP'), ('Emma', 'NNP'), ('Woodhouse', 'NNP'), ('handsome', 'VBD'), ('clever', 'NN'), ('and', 'CC'), ('rich', 'JJ'), ('with', 'IN'), ('a', 'DT'), ('comfortable', 'JJ'), ('home', 'NN'), ('and', 'CC'), ('happy', 'JJ'), ('disposition', 'NN'), ('seemed', 'VBD'), ('to', 'TO'), ('unite', 'VB'), ('some', 'DT'), ('of', 'IN'), ('the', 'DT'), ('best', 'JJS'), ('blessings', 'NNS'), ('of', 'IN'), ('existence', 'NN'), ('and', 'CC'), ('had', 'VBD'), ('lived', 'VBN'), ('nearly', 'RB'), ('twentyone', 'CD'), ('years', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('with', 'IN'), ('very', 'RB'), ('little', 'JJ'), ('to', 'TO'), ('distress', 'VB'), ('or', 'CC'), ('vex', 'VB'), ('her', 'PRP$')]

2. [('She', 'PRP'), ('was', 'VBD'), ('the', 'DT

In [3]:
# Named-entity recognition: chunk each POS-tagged sentence with nltk.ne_chunk
# and print every entity found together with its label. `ne` keeps the
# chunked result for each sentence, in the same order as `tags`.
print('-'*50 + "\nNAMED ENTITIES\n" + '-'*50)
ne = []
for tagged_sentence in tags:
    chunked = ne_chunk(tagged_sentence)
    ne.append(chunked)
    for subtree in chunked:
        # Only children that are themselves Tree objects are reported as
        # entities; every other child is skipped.
        if not isinstance(subtree, Tree):
            continue
        # Each leaf is a (word, pos_tag) pair; only the word is needed here.
        words = [word for word, _ in subtree.leaves()]
        label = subtree.label()
        print(f"ENTITY : {' '.join(words)}, LABEL : {label}")
    

--------------------------------------------------
NAMED ENTITIES
--------------------------------------------------
ENTITY : Emma, LABEL : GPE
ENTITY : Jane Austen, LABEL : PERSON
ENTITY : Emma Woodhouse, LABEL : PERSON
ENTITY : Miss Taylor, LABEL : PERSON
ENTITY : Emma, LABEL : GPE
ENTITY : Miss Taylor, LABEL : PERSON


In [4]:
# To load the full Penn Treebank POS tagset bundled with NLTK, uncomment:
# tagdict = nltk.data.load('help/tagsets/upenn_tagset.pickle')

In [3]:
import nltk
from nltk import word_tokenize
from nltk.parse import RecursiveDescentParser

In [4]:
# Display the parse trees of a sentence and expose its ambiguity: every tree
# printed below is a distinct analysis of the same token sequence, so more
# than one tree means the sentence is ambiguous under this grammar.
# The grammar allows a PP both inside the VP (VP -> V NP PP) and inside an NP
# (NP -> AT NN PP), so "with a spoon" can attach to either "ate" or "the cake".
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> AT NNS | AT NN | AT NNS PP | AT NN PP
    VP -> V NP PP | V NP
    PP -> IN NP
    AT -> "The" | "the" | "a"
    NNS -> "children"
    V -> "ate"
    NN -> "cake" | "spoon"
    IN -> "with"
""")

# Fixed: the parser was previously built from the undefined name `grammer`,
# which raises NameError on a fresh kernel.
# nltk.ChartParser(grammar) can be used here as well.
parser = RecursiveDescentParser(grammar)
print("PARSE TREES : ")

tokens = word_tokenize("The children ate the cake with a spoon")

# parser.parse yields one tree per valid analysis of the tokens.
for tree in parser.parse(tokens):
    tree.pretty_print()

PARSE TREES : 
                      S                             
      ________________|_______                       
     |                        VP                    
     |             ___________|_________             
     |            |       |             PP          
     |            |       |         ____|___         
     NP           |       NP       |        NP      
  ___|_____       |    ___|___     |     ___|____    
 AT       NNS     V   AT      NN   IN   AT       NN 
 |         |      |   |       |    |    |        |   
The     children ate the     cake with  a      spoon

                      S                         
      ________________|___                       
     |                    VP                    
     |             _______|____                  
     |            |            NP               
     |            |    ________|____             
     |            |   |   |         PP          
     |            |   |   |     ____|___         
