In [32]:
from urllib.request import urlopen
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import FreqDist

# Download and tokenise a book
In this exercise, we will download the text of a book. We will use Les Misérables by Victor Hugo for the other sections, but you can try any other book available from Project Gutenberg or elsewhere.

In [4]:
# Plain-text edition of Les Misérables on Project Gutenberg.
target_url0 = 'http://www.gutenberg.org/files/135/135-0.txt'

# Open the response as a context manager so the HTTP connection is closed
# as soon as the body has been read, then decode the bytes into a str.
# NOTE(review): the cells below slice from index 1, which suggests the decoded
# text starts with one extra character (presumably a byte-order mark) - confirm.
with urlopen(target_url0) as response:
    book_raw = response.read().decode('utf-8')

In [5]:
# Check that decoding the downloaded bytes produced a Python string.
type(book_raw)

str

In [6]:
# Size of the downloaded text, in characters.
len(book_raw)

3324222

In [10]:
# Peek at the beginning of the text. The slice starts at index 1, so the very
# first character of the download is skipped. Displaying the bare expression
# shows the repr, with line endings visible as '\r\n' escapes.
book_raw[1:250]

'The Project Gutenberg eBook of Les Misérables, by Victor Hugo\r\n\r\nThis eBook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it aw'

In [9]:
# Same slice as above, but print() renders the line endings as real line
# breaks instead of showing them as escape sequences.
print(book_raw[1:250])

The Project Gutenberg eBook of Les Misérables, by Victor Hugo

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it aw


In [29]:
# Split the raw text into a list of tokens; punctuation marks become tokens
# of their own (see the ',' entry in the preview below).
word_tokens = word_tokenize(book_raw)
# Preview tokens 1-39 (the slice skips the token at index 0).
print(word_tokens[1:40])

['Project', 'Gutenberg', 'eBook', 'of', 'Les', 'Misérables', ',', 'by', 'Victor', 'Hugo', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'United', 'States', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever']


In [30]:
# Load NLTK's list of English stop words (common function words such as
# pronouns and articles that carry little meaning on their own).
stop_words = stopwords.words('english')
# Preview entries 1-39 of the list (the slice skips the entry at index 0).
print(stop_words[1:40])

['me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this']


In [31]:
# Remove stop words from the tokenised words.
# Build a set once so that each membership test is a constant-time lookup
# instead of a scan through the whole stop-word list for every token.
stop_word_set = set(stop_words)

# The comparison is case-sensitive: the stop-word list is lower-case, so
# capitalised tokens such as 'This' or 'You' are kept.
word_tokens_filtered = [word for word in word_tokens if word not in stop_word_set]

# Preview the filtered tokens (the slice skips the token at index 0).
print(word_tokens_filtered[1:40])

['Project', 'Gutenberg', 'eBook', 'Les', 'Misérables', ',', 'Victor', 'Hugo', 'This', 'eBook', 'use', 'anyone', 'anywhere', 'United', 'States', 'parts', 'world', 'cost', 'almost', 'restrictions', 'whatsoever', '.', 'You', 'may', 'copy', ',', 'give', 'away', 're-use', 'terms', 'Project', 'Gutenberg', 'License', 'included', 'eBook', 'online', 'www.gutenberg.org', '.', 'If']


# Lengthy books

Can you find other books that are longer than Les Misérables? For example, compare it with The Count of Monte Cristo:

https://www.gutenberg.org/files/1184/1184-0.txt  
Or what about that very very very long book by Leo Tolstoy?

https://www.gutenberg.org/files/2600/2600-0.txt  

You can try books in other languages as well, like Don Quijote in Spanish:
http://www.gutenberg.org/cache/epub/2000/pg2000.txt  

Find out which book has more words. (Be aware that Project Gutenberg adds a preamble before the book and some notices after it; this may have a stronger impact on shorter books.) 

**NOTE**: if you get an error downloading a book, try a different version; for instance, for Don Quijote you can try the English version:

https://www.gutenberg.org/cache/epub/996/pg996.txt

Count how **many sentences** each book has. We could expect that a longer book will have more sentences, but one interesting comparison is how long the average sentence is in each book. Find out the **average length of a sentence** in each book.

In [35]:
# Split the raw text into a list of sentences (one string per sentence).
sentence_tokens = sent_tokenize(book_raw)

In [36]:
# Spot-check one sentence from the tokenised list. The list is named
# `sentence_tokens`; any other spelling raises NameError on a fresh kernel.
print(sentence_tokens[1001])

He very soon returned to D—— He was
interrogated as to this speedy return, and he replied: _“I embarrassed
them.


In [37]:
# Number of sentences found in the downloaded text.
len(sentence_tokens)

29649

In [40]:
# Average sentence length, measured in characters: the total number of
# characters in the raw text divided by the number of sentences.
print(f"Average length: {len(book_raw) / len(sentence_tokens)}")

Average length: 112.11919457654558


Finally, do authors repeat the same words many times, or do they have a large vocabulary and use many different words? We can explore that with another function from nltk.

In [41]:
# Count how many times each token occurs in the text. Punctuation tokens are
# counted too, because the input is the unfiltered token list.
freq_dist = FreqDist(word_tokens)
# Display the distribution; the repr lists the most frequent tokens.
freq_dist

FreqDist({',': 48757, 'the': 36547, '.': 26190, 'of': 19596, 'and': 14028, 'a': 13412, 'to': 13325, 'in': 10239, 'was': 8536, 'that': 7252, ...})

In [46]:
# Copy the token -> count pairs of the frequency distribution into a plain dict.
freq_dist_dict = dict(freq_dist.items())

# The number of keys is the number of distinct tokens in the text.
print(len(freq_dist_dict))

30354
