In [2]:
# HW-2 
# Exploratory exercise for sentiment analysis
# Finding adverb and adjective phrases, and computing basic statistics

# importing required nltk libraries
import nltk
from nltk import sent_tokenize

# In HW-1, I used books written by Shakespeare - Caesar and Hamlet. We will continue exploratory 
# sentiment analysis on the same books
nltk.corpus.gutenberg.fileids()

# Get Shakespeare books in the Gutenberh corpus
shakespeare_books = [book for book in nltk.corpus.gutenberg.fileids( ) \
                     if 'shakespeare' in book]

# Book-1: Caesar (Genre: Tragedy)
caesar = nltk.corpus.gutenberg.raw(shakespeare_books[0])

# Book-2: Hamlet (Genre: Tragedy/Comedy)
hamlet = nltk.corpus.gutenberg.raw(shakespeare_books[1])

print(caesar[:50])
print(hamlet[:50])

[The Tragedie of Julius Caesar by William Shakespe
[The Tragedie of Hamlet by William Shakespeare 159


In [3]:
# Separate the text into sentences first
caesar_split = nltk.sent_tokenize(caesar)
print(caesar_split[:10])

hamlet_split = nltk.sent_tokenize(hamlet)
print(hamlet_split[:10])

['[The Tragedie of Julius Caesar by William Shakespeare 1599]\n\n\nActus Primus.', 'Scoena Prima.', 'Enter Flauius, Murellus, and certaine Commoners ouer the Stage.', 'Flauius.', 'Hence: home you idle Creatures, get you home:\nIs this a Holiday?', 'What, know you not\n(Being Mechanicall) you ought not walke\nVpon a labouring day, without the signe\nOf your Profession?', 'Speake, what Trade art thou?', 'Car.', 'Why Sir, a Carpenter\n\n   Mur.', 'Where is thy Leather Apron, and thy Rule?']
['[The Tragedie of Hamlet by William Shakespeare 1599]\n\n\nActus Primus.', 'Scoena Prima.', 'Enter Barnardo and Francisco two Centinels.', 'Barnardo.', "Who's there?", 'Fran.', 'Nay answer me: Stand & vnfold\nyour selfe\n\n   Bar.', 'Long liue the King\n\n   Fran.', 'Barnardo?', 'Bar.']


In [19]:
# Apply the word tokenizer to each sentence
token_caesar = [nltk.word_tokenize(sent) for sent in caesar_split]
print(token_caesar[:2])

# the output is a list of strings that contains the sentences
print('Type %s' %(type(token_caesar)))
print('Caesar tokens: %d' %(len(token_caesar)))

# Repeat the same for hamlet text
token_hamlet = [nltk.word_tokenize(sent) for sent in hamlet_split]
print(token_hamlet[:2])
print('Hamlet tokens: %d' %(len(token_hamlet)))

[['[', 'The', 'Tragedie', 'of', 'Julius', 'Caesar', 'by', 'William', 'Shakespeare', '1599', ']', 'Actus', 'Primus', '.'], ['Scoena', 'Prima', '.']]
Type <class 'list'>
Caesar tokens: 1592
[['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']', 'Actus', 'Primus', '.'], ['Scoena', 'Prima', '.']]
Hamlet tokens: 2355


In [21]:
## POS Tagging, to retrieve adjective (JJs) and adverb (RBs) tags

# use the Stanford POS tagger to POS tag tokens of each sentence
# this is the default tagger in nltk
caesar_tagged = [nltk.pos_tag(tokens) for tokens in token_caesar]
print(caesar_tagged[:2])

hamlet_tagged = [nltk.pos_tag(tokens) for tokens in token_hamlet]
print(hamlet_tagged[:2])

[[('[', 'IN'), ('The', 'DT'), ('Tragedie', 'NNP'), ('of', 'IN'), ('Julius', 'NNP'), ('Caesar', 'NNP'), ('by', 'IN'), ('William', 'NNP'), ('Shakespeare', 'NNP'), ('1599', 'CD'), (']', 'NNP'), ('Actus', 'NNP'), ('Primus', 'NNP'), ('.', '.')], [('Scoena', 'NNP'), ('Prima', 'NNP'), ('.', '.')]]
[[('[', 'IN'), ('The', 'DT'), ('Tragedie', 'NNP'), ('of', 'IN'), ('Hamlet', 'NNP'), ('by', 'IN'), ('William', 'NNP'), ('Shakespeare', 'NNP'), ('1599', 'CD'), (']', 'NNP'), ('Actus', 'NNP'), ('Primus', 'NNP'), ('.', '.')], [('Scoena', 'NNP'), ('Prima', 'NNP'), ('.', '.')]]


In [47]:
# Following our NLTK textbook, chapter on Information Extraction--Chunking (https://www.nltk.org/book/ch07.html)

# Using CHUNKING to parse sentences 
# to look for "adjective phrases", i.e. phrases (or chunks) that have adverbs and adjectives ('RB'+'JJ')
# First step: writing a grammar that defines the POS in the chunk
# we name this grammar "ADJPH" ("ADJective PHrase") using regexes 

import re

def chunking(grammar, taggedtext, metadata):
    # Extract the metadata
    label = metadata['label']
    desc = metadata['desc']
    text = metadata['text']
    
    # Second step: import the nltk parser to process each sentence
    chunk_parser = nltk.RegexpParser(grammar)

    _tags = []
    for sent in taggedtext:
        if len(sent) > 0:
            tree = chunk_parser.parse(sent)
            for subtree in tree.subtrees():
                if subtree.label() == label:
                    _tags.append(subtree)
                
    # Visualizing the actual adj/adv phrase
    _phrases = []
    for sent in _tags:
        temp = ''
        for w, t in sent:
            temp += w+ ' '    
        _phrases.append(temp)
    
    print('First 10 %s phrases (%s): %s' %(desc, text, _phrases[:10]))
    
    # Following our NLTK textbook, chapter 1 on Language Processing (https://www.nltk.org/book/ch01.html)
    # FREQUENCY DISTRIBUTIONS
    # Top 50 adjective phrases
    _freq = nltk.FreqDist(_phrases)

    print('Top %s phrases by frequency (%s): ' %(desc, text))
    for word, freq in _freq.most_common(50):
        print(word, freq)

            
    #print the list of our sentences:
    print('Length of %s phrase sentences (%s): %d' %(desc, text, len(_tags)))
    
    return _tags

grammar_adjph = "ADJPH: {<RB.?>+<JJ.?>}"
# This regex reads as: "find groups ("< >") of RBs (adverbs) together with groups of JJs (adjectives), with groups defineds as
# RBs with any ending (the "." is a placeholder or wildcard for the "R" and the "S" at the end of RBR and RBS, 
# while "?" indicates "optional character" so RB can be found alone as well). Same regex operators apply to JJs.

caesar_adjph_tags = chunking(grammar_adjph, caesar_tagged, {'text':'Caesar', 'label':'ADJPH', 'desc':'adjective'})
print('---------')
hamlet_adjph_tags = chunking(grammar_adjph, hamlet_tagged, {'text':'Hamlet', 'label':'ADJPH', 'desc':'adjective'})

First 10 adjective phrases (Caesar): ['Truly sir ', 'Truly sir ', 'then senslesse ', 'there haue ', 'most exalted ', 'not mou ', 'yet againe ', 'once againe ', 'too stubborne ', 'too strange ']
Top adjective phrases by frequency (Caesar): 
so much  7
too much  4
so great  3
then thy  3
so good  3
Truly sir  2
so many  2
most Noble  2
more worthy  2
not backe  2
not meete  2
not so haue  2
so strong  2
then senslesse  1
there haue  1
most exalted  1
not mou  1
yet againe  1
once againe  1
too stubborne  1
too strange  1
not deceiu  1
very much  1
Therefore good  1
not iealous  1
thus much  1
not dangerous  1
well giuen  1
very dangerous  1
so sad  1
then other  1
very loath  1
so hee  1
thus sad  1
so firme  1
too sawcie  1
Not sensible  1
very pleasing  1
most mightie  1
yet prodigious  1
most strong  1
So euery  1
so bestow  1
thus seal  1
too bold  1
Then secret  1
very strong  1
not fit  1
not meet  1
so well belou  1
Length of adjective phrase sentences (Caesar): 143
---------
Firs

In [48]:
# Now we look for "adverb phrases" or chunks that have 2 consecutive adverbs ('RB')
# First step: writing a grammar that defines POS rules of the adverb phrase the chunk
# we name this grammar "ADVPH" ("ADVerb PHrase")
grammar_advph = "ADVPH: {<RB>+<RB>}"

caesar_advph_tags = chunking(grammar_advph, caesar_tagged, {'text':'Caesar', 'label':'ADVPH', 'desc':'adverb'})
print('---------')
hamlet_advph_tags = chunking(grammar_advph, hamlet_tagged, {'text':'Hamlet', 'label':'ADVPH', 'desc':'adverb'})

First 10 adverb phrases (Caesar): ['art not ', 'So well ', 'heere so long ', 'as well ', 'as well ', 'so indeed ', 'till now ', 'not so ', 'So soone ', 'not then ']
Top adverb phrases by frequency (Caesar): 
not so  9
not well  4
as well  3
heere so  3
So well  2
art not  1
heere so long  1
so indeed  1
till now  1
So soone  1
not then  1
not laugh  1
so conioyntly  1
right well  1
so soundly  1
so much  1
beene often  1
directly heere  1
so well  1
not Wrathfully  1
faile not then  1
too impatiently  1
too much  1
not sicke  1
so farre  1
so earely too  1
so neere  1
marke well  1
not yet  1
not fond  1
fast together  1
yet vnknowne  1
So often  1
so lowe  1
thee well  1
Though now  1
as fast  1
heere lye  1
'Twere best  1
so poore  1
so vnkindely  1
doth therefore  1
not thus  1
durst not  1
then yee  1
as much  1
better then  1
very wisely  1
well aueng  1
not dye  1
Length of adverb phrase sentences (Caesar): 74
---------
First 10 adverb phrases (Hamlet): ['spoke too ', 'Thus twice

In [62]:
# Top 50 tokens (grouped by Adjective, Adverb or Nouns)

def top_tokens(taggedtext, pos_list):
    _tokens = []
    for sentence in taggedtext:
        for word, pos in sentence:
            if pos in pos_list: 
                if len(word)>1:
                    _tokens.append(word)
    freq_pos = nltk.FreqDist(_tokens)

    for word, freq in freq_pos.most_common(50):
        print(word,freq)

# Top 50 adjective tokens
print('Top 50 Adjective tokens:')
top_tokens(caesar_tagged, ['JJ', 'JJR', 'JJS'])
print('----')
top_tokens(hamlet_tagged, ['JJ', 'JJR', 'JJS'])

Top 50 Adjective tokens:
good 48
thy 41
Noble 32
great 25
thou 24
such 23
much 22
true 18
Good 18
many 15
dead 14
other 13
thee 13
bad 12
best 10
mine 10
better 10
full 10
strong 10
most 10
common 9
sure 9
wrong 9
more 8
strange 8
worthy 8
dangerous 8
last 8
Most 8
Honourable 8
hard 7
haue 7
vs 7
gentle 7
Such 7
same 6
high 6
owne 6
euery 6
old 5
angry 5
sicke 5
first 5
welcome 5
bloody 5
mou 4
againe 4
new 4
free 4
feeble 4
----
good 76
thy 54
more 37
such 34
most 30
much 25
dead 25
true 21
Good 21
thou 20
great 19
owne 18
old 16
other 15
sweet 15
mine 15
many 15
same 13
Most 12
deere 12
Noble 12
mad 12
Other 12
thee 11
last 11
farre 10
first 10
little 10
second 10
selfe 9
seene 9
strange 9
young 9
full 9
late 9
excellent 9
free 9
welcome 8
haue 8
heere 8
whole 8
better 8
common 8
best 8
vs 8
fine 8
againe 7
oft 7
long 7
bad 7


In [63]:
# Top 50 adverb tokens
print('Top 50 Adverb tokens:')
top_tokens(caesar_tagged, ['RB', 'RBR', 'RBS'])
print('-----')
top_tokens(hamlet_tagged, ['RB', 'RBR', 'RBS'])

Top 50 Adverb tokens:
not 255
so 103
then 79
well 40
now 39
too 30
Then 28
yet 27
heere 26
So 24
more 24
Now 24
there 15
once 13
thus 13
else 12
very 12
away 12
still 11
as 11
most 9
directly 8
Not 8
rather 8
onely 7
indeed 6
alone 6
together 6
enough 6
Yet 6
much 6
first 6
hence 6
Well 5
Enter 5
Therefore 4
long 4
therefore 4
almost 4
downe 4
here 4
neere 4
presently 4
wisely 4
yong 4
Truly 3
wherefore 3
sayes 3
Indeed 3
yee 3
-----
not 313
so 139
then 75
now 68
well 53
too 50
more 46
very 44
most 35
So 33
Then 33
thus 33
Now 24
yet 23
heere 21
there 20
n't 20
away 19
indeed 18
as 18
once 17
much 16
Not 14
still 14
else 13
Well 9
Indeed 9
alone 8
long 7
together 7
therefore 7
neuer 7
better 7
Thus 6
further 6
ere 6
enough 6
neere 6
here 6
almost 6
first 6
freely 5
poore 5
thee 5
presently 5
twice 4
sometimes 4
Therefore 4
truly 4
longer 4


In [64]:
## TO DO / YOUR TURN NOW!
## NOUN EXTRACTION
## VERB EXTRACTION
## REMEMBER TO CHECK THE PENN POS TAGS LIST: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
## TO FIND ALL TAGS

print('Top 50 Noun tokens:')
top_tokens(caesar_tagged, ['NN', 'NNS', 'NNP', 'NNPS']) #Noun, Noun-plural, Noun-Proper, Noun-Proper-plural
print('----')
top_tokens(hamlet_tagged, ['NN', 'NNS', 'NNP', 'NNPS'])


print('\nTop 50 Verb tokens:')
top_tokens(caesar_tagged, ['VB', 'VBD', 'VBG', 'VBP', 'VBZ']) # Verb, Verb-past-tense, Verb-present participle,
                                                              # Verb-past participle, singular present (non-3rd)
                                                              # singular present (3rd)
print('----')
top_tokens(hamlet_tagged, ['VB', 'VBD', 'VBG', 'VBP', 'VBZ'])

Top 50 Noun tokens:
Caesar 187
Brutus 160
Bru 152
Cassi 107
Cassius 85
Antony 75
Enter 58
men 57
man 54
thou 49
Ant 48
Lord 44
day 41
Caesars 38
Caes 38
Cask 38
Rome 37
Brut 37
selfe 33
vs 33
Caska 33
'd 28
Luc 28
Will 26
hand 26
death 26
Cinna 26
Lucius 26
thee 25
Messala 25
time 24
Gods 24
Friends 23
Titinius 23
Exeunt 22
Which 22
Octauius 22
Cas 21
Come 21
night 21
word 20
Messa 19
haue 18
Haue 18
blood 18
Romans 18
thing 18
heart 18
Octa 18
Sir 17
----
Ham 334
Lord 210
King 170
Hamlet 98
Hor 95
Enter 80
Qu 62
Laer 59
Ile 58
Ophe 55
Pol 48
Sir 47
Father 45
thou 43
Rosin 43
man 42
Horatio 40
vs 40
Queene 39
Polon 38
time 37
Mother 37
Heauen 36
death 36
thee 34
selfe 34
Mar 31
life 30
night 29
loue 29
Clo 29
'd 28
Ophelia 28
hath 26
Laertes 26
Come 26
Nay 25
God 25
heart 24
thing 24
matter 24
nothing 23
vp 23
Exeunt 23
againe 22
world 22
Play 22
Ghost 21
day 21
Denmarke 20

Top 50 Verb tokens:
is 247
be 132
do 107
haue 100
are 96
was 64
know 63
did 61
am 52
let 41
come 38
were 34
say 

In [61]:
# Now we have two lists of POS tags combinations we can compare
# We need to get the sentences back from the tagging exercise and run some stats

def get_len_sent(_tags):
    # Create a list of original sentences from the ADJECTIVE/ADVERB phrase subset:
    _whole_sentences = []

    # loop over the sentences in the adjective phrase sentences we created:
    for sents in _tags:
        temp=''
        for (word,tag) in sents:
            temp += word+' '
            _whole_sentences.append(temp)
        
    return len(_whole_sentences)

# Get stats - like len() of all adjective/adverb phrase words
print('Caeser - adjph/sentences (count): %d' %(get_len_sent(caesar_adjph_tags)))
print('Hamlet - adjph/sentences (count): %d' %(get_len_sent(hamlet_adjph_tags)))
get_len_sent(hamlet_adjph_tags)

# Get stats 
print('Caeser - advph/sentences (count): %d' %(get_len_sent(caesar_advph_tags)))
print('Hamlet - advph/sentences (count): %d' %(get_len_sent(hamlet_advph_tags)))

Caeser - adjph/sentences (count): 298
Hamlet - adjph/sentences (count): 484
Caeser - advph/sentences (count): 151
Hamlet - advph/sentences (count): 176


In [68]:
# Following our NLTK textbook, Writing Structural Programs chapter
# section on Procedural vs Declarative style (http://www.nltk.org/book_1ed/ch04.html) 

## CORPUS STATISTICS--SENTENCES LENGTH

# Calculating the average length of sentences in the entire corpus
# from http://www.nltk.org/book_1ed/ch04.html
caesar_total_corpus = sum(len(sent) for sent in caesar_split) 
print('Average len of sentence (Caesar): %s' %(caesar_total_corpus / len(caesar_split)))

hamlet_total_corpus = sum(len(sent) for sent in hamlet_split) 
print('Average len of sentence (Hamlet): %s' %(hamlet_total_corpus / len(hamlet_split)))

Average len of sentence (Caesar): 69.16394472361809
Average len of sentence (Caesar): 67.78853503184713
