# Discover Insights into Classic Texts

Novels and other texts contain insights into ideologies and places that are often initially unknown to the reader. By reading a written piece, you uncover the opinions of the author on their chosen topic and come to understand both the topic and how the author thinks.

In this project you will perform a natural language parsing analysis to gain deeper insight into one of two famous and often discussed novels in the public domain: [Oscar Wilde’s The Picture of Dorian Gray](http://www.gutenberg.org/ebooks/174) or [Homer’s The Iliad](http://www.gutenberg.org/ebooks/6130)! Fear not if you haven’t heard of or read the novels: one of the beauties of natural language parsing with regular expressions is the ability to gain insight into lengthy pieces of text without a formal read!

By the end of this project, you will find out the main topics of discussion in the novel of your choosing and can begin to discern some of the author’s thoughts and beliefs!

In [12]:
# Define Tokenized Words

from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

def word_sentence_tokenize(text):
    """Split ``text`` into sentences, then split every sentence into words.

    A ``PunktSentenceTokenizer`` is constructed from ``text`` itself and is
    then used to segment that same text. Each resulting sentence is handed
    to ``word_tokenize``.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with one entry per detected sentence, in document order;
        each entry is whatever ``word_tokenize`` returns for that sentence.
    """
    # build the sentence splitter from the very text it is about to segment
    splitter = PunktSentenceTokenizer(text)

    # word tokenize each detected sentence, keeping the sentence order
    return [word_tokenize(sentence) for sentence in splitter.tokenize(text)]
    
# Define Chunking Counters

from collections import Counter

# function that pulls chunks out of chunked sentence and finds the most common chunks
def np_chunk_counter(chunked_sentences, top_n=30):
    """Count noun-phrase chunks and return the most frequent ones.

    Args:
        chunked_sentences: Iterable of chunked sentences. Every item must
            offer ``subtrees(filter=...)`` yielding subtrees that expose a
            ``label()`` method (presumably ``nltk.Tree`` objects produced by
            a ``RegexpParser`` -- confirm against the caller).
        top_n: How many of the most frequent chunks to return. Defaults to
            30; pass ``None`` to get every distinct chunk.

    Returns:
        A list of ``(chunk, count)`` pairs ordered from most to least
        frequent, where ``chunk`` is the tuple of the subtree's children.
        An empty input yields an empty list.
    """
    # each 'NP' subtree is turned into a tuple so it can serve as a Counter key
    np_chunks = (
        tuple(subtree)
        for chunked_sentence in chunked_sentences
        for subtree in chunked_sentence.subtrees(
            filter=lambda tree: tree.label() == 'NP'
        )
    )

    # Counter tallies the chunks itself; no manual increment loop is needed
    return Counter(np_chunks).most_common(top_n)

from collections import Counter

# function that pulls chunks out of chunked sentence and finds the most common chunks
def vp_chunk_counter(chunked_sentences, top_n=30):
    """Count verb-phrase chunks and return the most frequent ones.

    Args:
        chunked_sentences: Iterable of chunked sentences. Every item must
            offer ``subtrees(filter=...)`` yielding subtrees that expose a
            ``label()`` method (presumably ``nltk.Tree`` objects produced by
            a ``RegexpParser`` -- confirm against the caller).
        top_n: How many of the most frequent chunks to return. Defaults to
            30; pass ``None`` to get every distinct chunk.

    Returns:
        A list of ``(chunk, count)`` pairs ordered from most to least
        frequent, where ``chunk`` is the tuple of the subtree's children.
        An empty input yields an empty list.
    """
    # each 'VP' subtree is turned into a tuple so it can serve as a Counter key
    vp_chunks = (
        tuple(subtree)
        for chunked_sentence in chunked_sentences
        for subtree in chunked_sentence.subtrees(
            filter=lambda tree: tree.label() == 'VP'
        )
    )

    # Counter tallies the chunks itself; no manual increment loop is needed
    return Counter(vp_chunks).most_common(top_n)

In [16]:
# Identifying the Most Common Word Occurrences

from nltk import pos_tag, RegexpParser

# Analysis pipeline: load the text, tokenize it, part-of-speech tag it, chunk
# it with the noun-phrase and verb-phrase grammars, and print the most common
# chunks of each kind.

# import text of choice here
# the context manager closes the file as soon as its contents have been read
with open("the_iliad.txt", encoding='utf-8') as text_file:
    text = text_file.read().lower()

# sentence and word tokenize text here
word_tokenized_text = word_sentence_tokenize(text)

# store and print any word tokenized sentence here
# NOTE: index 10 is an arbitrary sample; the text must have more than 10 sentences
single_word_tokenized_sentence = word_tokenized_text[10]
print("The Single word Token Sentence is: \n")
print(single_word_tokenized_sentence)
print('\n')

# part-of-speech tag every word tokenized sentence
pos_tagged_text = [
    pos_tag(tokenized_sentence) for tokenized_sentence in word_tokenized_text
]

# store and print any part-of-speech tagged sentence here
# NOTE: index 100 is an arbitrary sample; the text must have more than 100 sentences
single_pos_sentence = pos_tagged_text[100]
print("The part-of-speech tagged sentence for 100th index is: \n")
print(single_pos_sentence)
print('\n')

# noun phrase grammar: optional determiner, any number of adjectives, then a
# word tagged NN
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# create noun phrase RegexpParser object here
np_chunk_parser = RegexpParser(np_chunk_grammar)

# verb phrase grammar: the noun phrase pattern above, followed by a word whose
# tag starts with VB and an optional word whose tag is RB plus at most one
# further character
vp_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"

# create verb phrase RegexpParser object here
vp_chunk_parser = RegexpParser(vp_chunk_grammar)

# one list of noun phrase chunked sentences and one of verb phrase chunked sentences
np_chunked_text = list()
vp_chunked_text = list()

# chunk each pos-tagged sentence with both parsers
for tagged_sentence in pos_tagged_text:
    np_chunked_text.append(np_chunk_parser.parse(tagged_sentence))
    vp_chunked_text.append(vp_chunk_parser.parse(tagged_sentence))

# store and print the most common NP-chunks here
most_common_np_chunks = np_chunk_counter(np_chunked_text)
print("The most common NP-chunks here are: \n")
print(most_common_np_chunks)
print('\n')

# store and print the most common VP-chunks here
most_common_vp_chunks = vp_chunk_counter(vp_chunked_text)
print("The most common VP-chunks here are: \n")
print(most_common_vp_chunks)
print('\n')

The Single word Token Sentence is: 

['illustrations', 'homer', 'invoking', 'the', 'muse', '.']


The part-of-speech tagged sentence for 100th index is: 

[('he', 'PRP'), ('appears', 'VBZ'), ('as', 'IN'), ('the', 'DT'), ('enunciator', 'NN'), ('of', 'IN'), ('opinions', 'NNS'), ('as', 'IN'), ('different', 'JJ'), ('in', 'IN'), ('their', 'PRP$'), ('tone', 'NN'), ('as', 'IN'), ('those', 'DT'), ('of', 'IN'), ('the', 'DT'), ('writers', 'NNS'), ('who', 'WP'), ('have', 'VBP'), ('handed', 'VBN'), ('them', 'PRP'), ('down', 'RP'), ('.', '.')]


The most common NP-chunks here are: 

[((('hector', 'NN'),), 322), ((('i', 'NN'),), 277), ((('jove', 'NN'),), 257), ((('troy', 'NN'),), 208), ((('vain', 'NN'),), 195), ((('war', 'NN'),), 193), ((('son', 'NN'),), 170), ((('thou', 'NN'),), 158), ((('the', 'DT'), ('plain', 'NN')), 157), ((('the', 'DT'), ('field', 'NN')), 154), ((('the', 'DT'), ('ground', 'NN')), 138), ((('death', 'NN'),), 134), ((('hand', 'NN'),), 134), ((('greece', 'NN'),), 128), ((('heaven',