In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from string import punctuation

In [2]:
#load the data from the .txt file
filepath = 'datasets/12dancingprincesses.txt'
file = open(filepath, 'r')
content = file.read()
print(content)

THE TWELVE DANCING PRINCESSES

There was a king who had twelve beautiful daughters. They slept in
twelve beds all in one room; and when they went to bed, the doors were
shut and locked up; but every morning their shoes were found to be quite
worn through as if they had been danced in all night; and yet nobody
could find out how it happened, or where they had been.

Then the king made it known to all the land, that if any person could
discover the secret, and find out where it was that the princesses
danced in the night, he should have the one he liked best for his
wife, and should be king after his death; but whoever tried and did not
succeed, after three days and nights, should be put to death.

A king’s son soon came. He was well entertained, and in the evening was
taken to the chamber next to the one where the princesses lay in their
twelve beds. There he was to sit and watch where they went to dance;
and, in order that nothing might pass without his hearing it, the door
of his cham

In [3]:
#close the original text file since we no longer need it
file.close()

In [4]:
#first, change all the words to lowercase
content = content.lower()

#then tokenize each part of the text
tknz_wct = word_tokenize(content)

In [5]:
# count of parts of text (tokens) - words and punctuation items - before any removal
len(tknz_wct)

1849

In [6]:
#look at the first five tokens in the list
tknz_wct[:5]

['the', 'twelve', 'dancing', 'princesses', 'there']

In [7]:
#check frequency distribution at this point
fd_wct = FreqDist(tknz_wct)
fd_wct

FreqDist({'the': 139, ',': 102, 'and': 78, 'to': 42, '.': 35, ';': 35, 'he': 33, 'they': 32, 'of': 28, '’': 27, ...})

In [8]:
#remove the punctuation tokens from the list; leaves the words in

for token in tknz_wct:
    if token in punctuation:
        tknz_wct.remove(token)

In [9]:
#number of tokens in list after punctuation removal
len(tknz_wct)

1663

In [10]:
#check frequency distribution at this point
fd_wct = FreqDist(tknz_wct)
fd_wct

FreqDist({'the': 139, 'and': 78, 'to': 42, 'he': 33, 'they': 32, 'of': 28, '’': 27, 'in': 25, 'was': 24, 'all': 24, ...})

In [11]:
#Add extra punctuation that wasn't caught with default list
new_punct = punctuation + "‘" + "’"

In [12]:
#rerun to get rid of additional characters
for token in tknz_wct:
    if token in new_punct:
        tknz_wct.remove(token)

In [13]:
#recheck number of tokens remaining
len(tknz_wct)

1618

In [14]:
#put text back together with remaining clean words to check if all punctuation is now gone
clean_content = ' '.join(tknz_wct)

In [15]:
clean_content

'the twelve dancing princesses there was a king who had twelve beautiful daughters they slept in twelve beds all in one room and when they went to bed the doors were shut and locked up but every morning their shoes were found to be quite worn through as if they had been danced in all night and yet nobody could find out how it happened or where they had been then the king made it known to all the land that if any person could discover the secret and find out where it was that the princesses danced in the night he should have the one he liked best for his wife and should be king after his death but whoever tried and did not succeed after three days and nights should be put to death a king s son soon came he was well entertained and in the evening was taken to the chamber next to the one where the princesses lay in their twelve beds there he was to sit and watch where they went to dance and in order that nothing might pass without his hearing it the door of his chamber was left open but t

In [20]:
new_new_punct = new_punct + "‘" + "-" + "." + "--" + "."

In [21]:
#rerun to get rid of additional characters
for token in tknz_wct:
    if token in new_new_punct:
        tknz_wct.remove(token)

In [22]:
#recheck number of tokens remaining
len(tknz_wct)

1612

In [23]:
#put text back together with remaining clean words to check if all punctuation is now gone
clean_content = ' '.join(tknz_wct)
clean_content

'the twelve dancing princesses there was a king who had twelve beautiful daughters they slept in twelve beds all in one room and when they went to bed the doors were shut and locked up but every morning their shoes were found to be quite worn through as if they had been danced in all night and yet nobody could find out how it happened or where they had been then the king made it known to all the land that if any person could discover the secret and find out where it was that the princesses danced in the night he should have the one he liked best for his wife and should be king after his death but whoever tried and did not succeed after three days and nights should be put to death a king s son soon came he was well entertained and in the evening was taken to the chamber next to the one where the princesses lay in their twelve beds there he was to sit and watch where they went to dance and in order that nothing might pass without his hearing it the door of his chamber was left open but t

In [24]:
#list of english stopwords
eng_stopwords = stopwords.words('english')

In [25]:
#Keep all words NOT in the English stopwords list

new_content = []  #list to hold new words

for token in tknz_wct:
    if token not in eng_stopwords:
        new_content.append(token)

In [26]:
# number of meaningful words remaining
len(new_content)

668

In [27]:
#Make new frequency distribution dictionary and see what top words are now
fd_nc = FreqDist(new_content)
fd_nc.most_common(10)

[('soldier', 19),
 ('princesses', 17),
 ('said', 16),
 ('king', 15),
 ('twelve', 11),
 ('went', 11),
 ('came', 10),
 ('eldest', 10),
 ('one', 7),
 ('night', 7)]

In [28]:
#shows the top 10 words in the text from before, for comparison
fd_wct.most_common(10)

[('the', 139),
 ('and', 78),
 ('to', 42),
 ('he', 33),
 ('they', 32),
 ('of', 28),
 ('’', 27),
 ('in', 25),
 ('was', 24),
 ('all', 24)]