# Project : 
We will perform a **natural language parsing** analysis to gain deeper insight into three famous and often discussed works in the public domain:   
- Oscar Wilde’s The Picture of Dorian Gray   
- Homer’s The Iliad! 
- With Lawrence in Arabia, by Lowell Thomas

[Interesting link for NLTK libraries in other languages (French, German, etc.)](https://stackoverflow.com/questions/35275001/use-of-punktsentencetokenizer-in-nltk)

### NOTE: there is no pre-processing provided here; we only do the parsing part in this notebook.

### Important: the cell just below needs to be run once, because it contains the helper functions used later on (you can also put this part at the very beginning of your own code)

In [23]:
# before we go further we need first to create this specific tokenization function:------------

# tokenizes text in sentences via specific library, then tokenizes each word in it.
# "PunktSentenceTokenizer" = divides text into list of sentences (use unsupervised algorithm)
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

def word_sentence_tokenize(text):
    """Split *text* into sentences, then split every sentence into words.

    A ``PunktSentenceTokenizer`` is built from *text* itself and is then
    used to cut that same text into sentences; each sentence is passed
    through ``word_tokenize``.

    Args:
        text: the raw text to tokenize.

    Returns:
        A list with one entry per sentence, each entry being the value
        returned by ``word_tokenize`` for that sentence.
    """
    # the tokenizer receives the text at construction time
    punkt = PunktSentenceTokenizer(text)

    # one word-token list per detected sentence, in document order
    return [word_tokenize(sentence) for sentence in punkt.tokenize(text)]
# --------


# A function np_chunk_counter() that returns the 30 most common NP-chunks
# from a list of chunked sentences
from collections import Counter

# function that pulls chunks out of chunked sentence and finds the most common chunks
def np_chunk_counter(chunked_sentences, top_n=30):
    """Count the noun-phrase (NP) chunks found in chunked sentences.

    Every sub-tree labelled ``'NP'`` is pulled out of each chunked sentence,
    converted to a tuple (so it is hashable and can be counted) and tallied.

    Args:
        chunked_sentences: iterable of chunked sentences; each one must
            offer ``subtrees(filter=...)`` yielding sub-trees that have a
            ``label()`` method and can be turned into a tuple.
        top_n: how many of the most frequent chunks to return. Defaults to
            30; ``None`` returns every chunk.

    Returns:
        A list of ``(chunk, count)`` pairs sorted from most to least
        frequent; chunks with equal counts keep first-seen order.
    """
    # Counter consumes the generator directly, so no intermediate list of
    # chunks has to be built before counting
    chunk_counter = Counter(
        tuple(subtree)
        for chunked_sentence in chunked_sentences
        for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'NP')
    )

    return chunk_counter.most_common(top_n)
# ----------


# vp_chunk_counter() that returns the 30 most common VP-chunks from a list of chunked sentences
from collections import Counter

# function that pulls chunks out of chunked sentence and finds the most common chunks
def vp_chunk_counter(chunked_sentences, top_n=30):
    """Count the verb-phrase (VP) chunks found in chunked sentences.

    Every sub-tree labelled ``'VP'`` is pulled out of each chunked sentence,
    converted to a tuple (so it is hashable and can be counted) and tallied.

    Args:
        chunked_sentences: iterable of chunked sentences; each one must
            offer ``subtrees(filter=...)`` yielding sub-trees that have a
            ``label()`` method and can be turned into a tuple.
        top_n: how many of the most frequent chunks to return. Defaults to
            30; ``None`` returns every chunk.

    Returns:
        A list of ``(chunk, count)`` pairs sorted from most to least
        frequent; chunks with equal counts keep first-seen order.
    """
    # Counter consumes the generator directly, so no intermediate list of
    # chunks has to be built before counting
    chunk_counter = Counter(
        tuple(subtree)
        for chunked_sentence in chunked_sentences
        for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'VP')
    )

    return chunk_counter.most_common(top_n)



In [18]:
# THE ILIAD OF HOMER
# Pipeline for this cell: read the text, tokenize it into sentences and words,
# POS-tag every sentence, chunk the tagged sentences with one NP grammar and
# two VP grammars, then print the most common chunks of each kind.


print("THE ILIAD OF HOMER\n")
# 1 ) Import and Preprocess Text Data
from nltk import pos_tag, RegexpParser
from nltk.tokenize import word_tokenize

# read the chosen text and convert it to lowercase; the context manager
# closes the file handle as soon as the text has been read
with open("the_iliad.txt", encoding='utf-8') as text_file:
    text = text_file.read().lower()
# sentence and word tokenized text (function "word_sentence_tokenize" above)
word_tokenized_text = word_sentence_tokenize(text)

# store and print one word tokenized sentence as a sanity check
# (index 1945 is an arbitrary sample; the text must contain that many sentences)
single_word_tokenized_sentence = word_tokenized_text[1945]
print("- single_word_tokenized_sentence (the 1945th): \n", single_word_tokenized_sentence) # check


# 2 ) Part-of-speech (POS)  Tag Text
# list holding one POS-tagged sentence per word tokenized sentence
pos_tagged_text = []

# tag each sentence with nltk's "pos_tag()" and collect the result
for word_tokenized_sentence in word_tokenized_text:
    pos_tagged_text.append(pos_tag(word_tokenized_sentence))


# store and print the POS-tagged version of the same sample sentence
# (the label names index 1945, matching the index actually used)
single_pos_sentence = pos_tagged_text[1945]
print("\n- single_pos_sentence  (the 1945th  ):\n", single_pos_sentence)


# 3) Chunk Sentences (meaning extraction based on noun or verb focus)
# noun phrase (NP) chunk grammar: optional determiner, any adjectives, a noun
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# RegexpParser built from the noun phrase chunk grammar
np_chunk_parser = RegexpParser(np_chunk_grammar)

# verb phrase (VP) chunk grammars
vp_first_chunk_grammar = "VP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}" # first structure: verb, then noun phrase
vp_second_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}" # second structure: noun phrase, then verb

# one RegexpParser per verb phrase chunk grammar
vp_first_chunk_parser = RegexpParser(vp_first_chunk_grammar) # first structure
vp_second_chunk_parser = RegexpParser(vp_second_chunk_grammar)  # second structure

# list holding the noun phrase chunked sentences
np_chunked_text = []
# and lists holding the verb phrase chunked sentences
vp_first_chunked_text = []  # first structure
vp_second_chunked_text = [] # second structure

# chunk every POS-tagged sentence with each parser; each list is filled by
# the parser built from the grammar of the same name, so the grammar quoted
# in the printed labels below is the one that produced the results
for pos_tagged_sentence in pos_tagged_text:
    np_chunked_text.append(np_chunk_parser.parse(pos_tagged_sentence)) # noun focus
    vp_first_chunked_text.append(vp_first_chunk_parser.parse(pos_tagged_sentence)) # verb focus, first structure
    vp_second_chunked_text.append(vp_second_chunk_parser.parse(pos_tagged_sentence)) # verb focus, second structure


# 4 ) Analyze Chunks
# store and print the most common NP-chunks (via "np_chunk_counter" defined above)
most_common_np_chunks = np_chunk_counter(np_chunked_text)
print("\n- most_common_np_chunks: \n", most_common_np_chunks)  # check

# store and print the most common VP-chunks for each structure
most_common_vp_chunks_first_structure = vp_chunk_counter(vp_first_chunked_text)
print("\n- most_common_vp_chunks_first_structure, {<VB.*><DT>?<JJ>*<NN><RB.?>?} : \n", most_common_vp_chunks_first_structure)
most_common_vp_chunks_seconde_structure = vp_chunk_counter(vp_second_chunked_text)
print("\n- most_common_vp_chunks_seconde_structure, {<DT>?<JJ>*<NN><VB.*><RB.?>?} : \n", most_common_vp_chunks_seconde_structure)



THE ILIAD OF HOMER

- single_word_tokenized_sentence (the 1945th): 
 ['on', 'the', 'bright', 'axle', 'turns', 'the', 'bidden', 'wheel', 'of', 'sounding', 'brass', ';', 'the', 'polished', 'axle', 'steel', '.']

- single_pos_sentence  (the 1645th  ):
 [('on', 'IN'), ('the', 'DT'), ('bright', 'JJ'), ('axle', 'NN'), ('turns', 'VBZ'), ('the', 'DT'), ('bidden', 'JJ'), ('wheel', 'NN'), ('of', 'IN'), ('sounding', 'VBG'), ('brass', 'NN'), (';', ':'), ('the', 'DT'), ('polished', 'JJ'), ('axle', 'NN'), ('steel', 'NN'), ('.', '.')]

- most_common_np_chunks: 
 [((('hector', 'NN'),), 322), ((('i', 'NN'),), 277), ((('jove', 'NN'),), 257), ((('troy', 'NN'),), 208), ((('vain', 'NN'),), 195), ((('war', 'NN'),), 193), ((('son', 'NN'),), 170), ((('thou', 'NN'),), 158), ((('the', 'DT'), ('plain', 'NN')), 157), ((('the', 'DT'), ('field', 'NN')), 154), ((('the', 'DT'), ('ground', 'NN')), 138), ((('death', 'NN'),), 134), ((('hand', 'NN'),), 134), ((('greece', 'NN'),), 128), ((('heaven', 'NN'),), 127), ((('fat

In [19]:
# The Picture of Dorian Gray
# Pipeline for this cell: read the text, tokenize it into sentences and words,
# POS-tag every sentence, chunk the tagged sentences with one NP grammar and
# two VP grammars, then print the most common chunks of each kind.


print("The Picture of Dorian Gray\n")
# 1 ) Import and Preprocess Text Data
from nltk import pos_tag, RegexpParser
from nltk.tokenize import word_tokenize

# read the chosen text and convert it to lowercase; the context manager
# closes the file handle as soon as the text has been read
with open("dorian_gray.txt", encoding='utf-8') as text_file:
    text = text_file.read().lower()
# sentence and word tokenized text (function "word_sentence_tokenize" above)
word_tokenized_text = word_sentence_tokenize(text)

# store and print one word tokenized sentence as a sanity check
# (index 1945 is an arbitrary sample; the text must contain that many sentences)
single_word_tokenized_sentence = word_tokenized_text[1945]
print("- single_word_tokenized_sentence (the 1945th): \n", single_word_tokenized_sentence) # check


# 2 ) Part-of-speech (POS)  Tag Text
# list holding one POS-tagged sentence per word tokenized sentence
pos_tagged_text = []

# tag each sentence with nltk's "pos_tag()" and collect the result
for word_tokenized_sentence in word_tokenized_text:
    pos_tagged_text.append(pos_tag(word_tokenized_sentence))


# store and print the POS-tagged version of the same sample sentence
# (the label names index 1945, matching the index actually used)
single_pos_sentence = pos_tagged_text[1945]
print("\n- single_pos_sentence  (the 1945th  ):\n", single_pos_sentence)


# 3) Chunk Sentences (meaning extraction based on noun or verb focus)
# noun phrase (NP) chunk grammar: optional determiner, any adjectives, a noun
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# RegexpParser built from the noun phrase chunk grammar
np_chunk_parser = RegexpParser(np_chunk_grammar)

# verb phrase (VP) chunk grammars
vp_first_chunk_grammar = "VP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}" # first structure: verb, then noun phrase
vp_second_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}" # second structure: noun phrase, then verb

# one RegexpParser per verb phrase chunk grammar
vp_first_chunk_parser = RegexpParser(vp_first_chunk_grammar) # first structure
vp_second_chunk_parser = RegexpParser(vp_second_chunk_grammar)  # second structure

# list holding the noun phrase chunked sentences
np_chunked_text = []
# and lists holding the verb phrase chunked sentences
vp_first_chunked_text = []  # first structure
vp_second_chunked_text = [] # second structure

# chunk every POS-tagged sentence with each parser; each list is filled by
# the parser built from the grammar of the same name, so the grammar quoted
# in the printed labels below is the one that produced the results
for pos_tagged_sentence in pos_tagged_text:
    np_chunked_text.append(np_chunk_parser.parse(pos_tagged_sentence)) # noun focus
    vp_first_chunked_text.append(vp_first_chunk_parser.parse(pos_tagged_sentence)) # verb focus, first structure
    vp_second_chunked_text.append(vp_second_chunk_parser.parse(pos_tagged_sentence)) # verb focus, second structure


# 4 ) Analyze Chunks
# store and print the most common NP-chunks (via "np_chunk_counter" defined above)
most_common_np_chunks = np_chunk_counter(np_chunked_text)
print("\n- most_common_np_chunks: \n", most_common_np_chunks)  # check

# store and print the most common VP-chunks for each structure
most_common_vp_chunks_first_structure = vp_chunk_counter(vp_first_chunked_text)
print("\n- most_common_vp_chunks_first_structure, {<VB.*><DT>?<JJ>*<NN><RB.?>?} : \n", most_common_vp_chunks_first_structure)
most_common_vp_chunks_seconde_structure = vp_chunk_counter(vp_second_chunked_text)
print("\n- most_common_vp_chunks_seconde_structure, {<DT>?<JJ>*<NN><VB.*><RB.?>?} : \n", most_common_vp_chunks_seconde_structure)



The Picture of Dorian Gray

- single_word_tokenized_sentence (the 1945th): 
 ['he', 'had', 'something', 'on', 'his', 'mind', 'to', 'ask', 'of', 'her', ',', 'something', 'that', 'he', 'had', 'brooded', 'on', 'for', 'many', 'months', 'of', 'silence', '.']

- single_pos_sentence  (the 1645th  ):
 [('he', 'PRP'), ('had', 'VBD'), ('something', 'NN'), ('on', 'IN'), ('his', 'PRP$'), ('mind', 'NN'), ('to', 'TO'), ('ask', 'VB'), ('of', 'IN'), ('her', 'PRP'), (',', ','), ('something', 'NN'), ('that', 'IN'), ('he', 'PRP'), ('had', 'VBD'), ('brooded', 'VBN'), ('on', 'IN'), ('for', 'IN'), ('many', 'JJ'), ('months', 'NNS'), ('of', 'IN'), ('silence', 'NN'), ('.', '.')]

- most_common_np_chunks: 
 [((('i', 'NN'),), 962), ((('henry', 'NN'),), 200), ((('lord', 'NN'),), 197), ((('life', 'NN'),), 170), ((('harry', 'NN'),), 136), ((('dorian', 'JJ'), ('gray', 'NN')), 127), ((('something', 'NN'),), 126), ((('nothing', 'NN'),), 93), ((('basil', 'NN'),), 85), ((('the', 'DT'), ('world', 'NN')), 70), ((('everyth

In [24]:
# With Lawrence in Arabia, by Lowell Thomas
# Pipeline for this cell: read the text, tokenize it into sentences and words,
# POS-tag every sentence, chunk the tagged sentences with one NP grammar and
# two VP grammars, then print the most common chunks of each kind.


print("With Lawrence in Arabia, by Lowell Thomas\n")
# 1 ) Import and Preprocess Text Data
from nltk import pos_tag, RegexpParser
from nltk.tokenize import word_tokenize

# read the chosen text and convert it to lowercase; the context manager
# closes the file handle as soon as the text has been read
with open("With_lawrence_in_arabia.txt", encoding='utf-8') as text_file:
    text = text_file.read().lower()
# sentence and word tokenized text (function "word_sentence_tokenize" above)
word_tokenized_text = word_sentence_tokenize(text)

# store and print one word tokenized sentence as a sanity check
# (index 1945 is an arbitrary sample; the text must contain that many sentences)
single_word_tokenized_sentence = word_tokenized_text[1945]
print("- single_word_tokenized_sentence (the 1945th): \n", single_word_tokenized_sentence) # check


# 2 ) Part-of-speech (POS)  Tag Text
# list holding one POS-tagged sentence per word tokenized sentence
pos_tagged_text = []

# tag each sentence with nltk's "pos_tag()" and collect the result
for word_tokenized_sentence in word_tokenized_text:
    pos_tagged_text.append(pos_tag(word_tokenized_sentence))


# store and print the POS-tagged version of the same sample sentence
# (the label names index 1945, matching the index actually used)
single_pos_sentence = pos_tagged_text[1945]
print("\n- single_pos_sentence  (the 1945th  ):\n", single_pos_sentence)


# 3) Chunk Sentences (meaning extraction based on noun or verb focus)
# noun phrase (NP) chunk grammar: optional determiner, any adjectives, a noun
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# RegexpParser built from the noun phrase chunk grammar
np_chunk_parser = RegexpParser(np_chunk_grammar)

# verb phrase (VP) chunk grammars
vp_first_chunk_grammar = "VP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}" # first structure: verb, then noun phrase
vp_second_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}" # second structure: noun phrase, then verb

# one RegexpParser per verb phrase chunk grammar
vp_first_chunk_parser = RegexpParser(vp_first_chunk_grammar) # first structure
vp_second_chunk_parser = RegexpParser(vp_second_chunk_grammar)  # second structure

# list holding the noun phrase chunked sentences
np_chunked_text = []
# and lists holding the verb phrase chunked sentences
vp_first_chunked_text = []  # first structure
vp_second_chunked_text = [] # second structure

# chunk every POS-tagged sentence with each parser; each list is filled by
# the parser built from the grammar of the same name, so the grammar quoted
# in the printed labels below is the one that produced the results
for pos_tagged_sentence in pos_tagged_text:
    np_chunked_text.append(np_chunk_parser.parse(pos_tagged_sentence)) # noun focus
    vp_first_chunked_text.append(vp_first_chunk_parser.parse(pos_tagged_sentence)) # verb focus, first structure
    vp_second_chunked_text.append(vp_second_chunk_parser.parse(pos_tagged_sentence)) # verb focus, second structure


# 4 ) Analyze Chunks
# store and print the most common NP-chunks (via "np_chunk_counter" defined above)
most_common_np_chunks = np_chunk_counter(np_chunked_text)
print("\n- most_common_np_chunks: \n", most_common_np_chunks)  # check

# store and print the most common VP-chunks for each structure
most_common_vp_chunks_first_structure = vp_chunk_counter(vp_first_chunked_text)
print("\n- most_common_vp_chunks_first_structure, {<VB.*><DT>?<JJ>*<NN><RB.?>?} : \n", most_common_vp_chunks_first_structure)
most_common_vp_chunks_seconde_structure = vp_chunk_counter(vp_second_chunked_text)
print("\n- most_common_vp_chunks_seconde_structure, {<DT>?<JJ>*<NN><VB.*><RB.?>?} : \n", most_common_vp_chunks_seconde_structure)



With Lawrence in Arabia, by Lowell Thomas

- single_word_tokenized_sentence (the 1945th): 
 ['the', 'temple', 'was', 'carved', 'from', 'the', 'cliff', 'almost', 'two', 'thousand', 'years', 'ago', 'during', 'the', 'reign', 'of', 'the', 'roman', 'emperor', 'hadrian', ',', 'who', 'visited', 'petra', 'in', 'a.', 'd.', '131', '.']

- single_pos_sentence  (the 1645th  ):
 [('the', 'DT'), ('temple', 'NN'), ('was', 'VBD'), ('carved', 'VBN'), ('from', 'IN'), ('the', 'DT'), ('cliff', 'NN'), ('almost', 'RB'), ('two', 'CD'), ('thousand', 'CD'), ('years', 'NNS'), ('ago', 'RB'), ('during', 'IN'), ('the', 'DT'), ('reign', 'NN'), ('of', 'IN'), ('the', 'DT'), ('roman', 'NN'), ('emperor', 'NN'), ('hadrian', 'JJ'), (',', ','), ('who', 'WP'), ('visited', 'VBD'), ('petra', 'NN'), ('in', 'IN'), ('a.', 'NN'), ('d.', 'NN'), ('131', 'CD'), ('.', '.')]

- most_common_np_chunks: 
 [((('lawrence', 'NN'),), 718), ((('s', 'NN'),), 299), ((('the', 'DT'), ('desert', 'NN')), 183), ((('i', 'NN'),), 180), ((('feisal', '