In [0]:
import numpy as np
import os 
import sys 
import nltk
import numpy
import re


# Download the NLTK data packages this notebook relies on.
# Newer NLTK releases (3.9+) look for 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older 'punkt' and 'averaged_perceptron_tagger' packages, so we fetch
# both generations. Packages that are already present are simply reported as up to date.
import nltk

for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)




In [0]:
# Let's download the complete text of War and Peace (translated into English by Louise and Aylmer Maude)
URL='http://www.gutenberg.org/files/2600/2600-0.txt'
!wget -O WaP.txt $URL
# The exclamation point above is changing the programming language from Python to Bash.
# If you don't understand this, don't worry about it for now.
# It's possible to download the data using the Python package "requests", but it makes things a little messier.
# Feel free to ask me more about this at the end of the workshop :)


#  Reads ‘WaP.txt’ file 
wap = open("WaP.txt", "r") 
raw = wap.read()


# We just assigned the text of War and Peace to a variable called "raw"


# Some info on the type of variable and the how long it is
print('The variable "raw" contains a', type(raw), 'type object.')
print('The variable "raw" is', len(raw), 'characters long.')

In [0]:
# We only want to analyze the text of the novel itself, so the title, publication info,
# table of contents, etc. have to go.

# That means finding the points in the text where the actual novel begins and ends.

# A good first step is to look at the first 1000 characters, i.e. the characters
# at positions 0 up to (but not including) 1000.

# Note: "printing" just means displaying something on the screen.


print(raw[:1000])

In [0]:
# Continue exactly where the previous preview stopped: raw[0:1000] ended just before
# index 1000, so this slice has to start at 1000 (not 1001) to avoid skipping a character.
print(raw[1000:10000])

In [0]:
# A negative start index counts from the end of the string,
# so this shows the last 20,000 characters of the download.
tail_length = 20000
print(raw[-tail_length:])

In [0]:
# Project Gutenberg ebooks wrap the text in a header and a footer that we need to cut out.
# From the previews above we picked a phrase that opens the novel and a phrase that opens the footer.
# str.find() tells us the index (character position) at which each phrase first occurs.
# (Note that in a Python string, "\n" indicates a new line).

novel_start_marker = "BOOK ONE: 1805\n\n\n\n\n\nCHAPTER I"
footer_marker = "End of the Project Gutenberg EBook of War and Peace, by Leo Tolstoy"

print(raw.find(novel_start_marker))
print(raw.find(footer_marker))

In [0]:
#Overwrite Previous Data with Cleaned
#######

#We previously worked with the variable "raw"
#The goal is to overwrite that variable with the new, trimmed data

# Rather than typing character positions in by hand, we look the markers up again here.
# That way the cell keeps working if the downloaded file shifts by a few characters,
# and running the cell a second time does not chop the text up any further.
# NOTE(review): the earlier version of this cell used the fixed positions 7257 and 3208883,
# presumably the results of these same two searches - confirm against the printed values.
START_MARKER = "BOOK ONE: 1805\n\n\n\n\n\nCHAPTER I"
END_MARKER = "End of the Project Gutenberg EBook of War and Peace, by Leo Tolstoy"

start_index = raw.find(START_MARKER)
if start_index == -1:
    # find() returns -1 when the phrase is missing; slicing with -1 would silently give garbage.
    raise ValueError('Could not find the start of the novel in "raw". Re-run the download cell and check the marker text.')

end_index = raw.find(END_MARKER, start_index)
if end_index == -1:
    if start_index != 0:
        # The text has not been trimmed yet, so the footer wording must differ from what we expect.
        print('Warning: footer marker not found; keeping everything up to the end of the file.')
    # Either way (already trimmed, or unknown footer) keep the text through its last character.
    end_index = len(raw)

#Overwrite raw with the new slice containing only the text of the novel
raw = raw[start_index:end_index]

In [0]:
# Sanity check: look at the first and the last 200 characters of the trimmed "raw"
# to confirm that the header and footer are really gone.

opening_chars = raw[:200]
closing_chars = raw[-200:]

print('beginning:\n\n', opening_chars, '\n\n\n\n\n')
print('end:\n\n', closing_chars)

In [0]:
# Normalization step: convert every letter to lowercase with the string method lower().
# Why do you think we might want to do this?

ti_lower = raw.lower()


# Print the same stretch of both versions to check that the conversion worked.
comparison_window = slice(100, 300)
print(raw[comparison_window], '\n\n\n')
print(ti_lower[comparison_window])

In [0]:
# Tokenizing = splitting the text into tokens, i.e. individual word or punctuation forms.
tokens = nltk.word_tokenize(ti_lower)

# The text can also be split into sentences (which is trickier than you might think!)
# Sentence tokens are useful for analysing things like sentence structure or parts of speech.
# I will send more materials on this after the workshop.
sent_tokens = nltk.sent_tokenize(ti_lower)

# Let's take a look at items 100 through 199 of each list:
sample_window = slice(100, 200)
print(tokens[sample_window])
print(sent_tokens[sample_window])


In [0]:
# It looks like this text has no spaces between long dashes and the adjacent words.
# The newline characters (\n) also seem to be messing with the sentence tokenizer.
# We will use regular expressions to (1) pad every long dash with white space and
# (2) replace every newline character with a single space.
# Regular expressions are a handy way of specifying patterns in a text that we want to remove, replace, find, etc.

# re.compile() turns a pattern into a reusable object; .sub(replacement, text) then does the replacing.
long_dash = re.compile(r'—')
newline = re.compile(r'\n')

fixed_dashes = long_dash.sub(' — ', ti_lower)
fixed_newlines = newline.sub(' ', fixed_dashes)

In [0]:
# Tokenize again, this time using the cleaned-up text.
# Note that this overwrites the variables "tokens" and "sent_tokens" from before.
tokens = nltk.word_tokenize(fixed_newlines)
sent_tokens = nltk.sent_tokenize(fixed_newlines)

# Same items as last time (100 through 199), so the two results are easy to compare.
sample_window = slice(100, 200)
print(tokens[sample_window])
print(sent_tokens[sample_window])


In [0]:
# Wrapping the token list in an nltk.Text object gives us some handy exploration tools.
ti_text = nltk.Text(tokens)

# One of them is the concordance: every occurrence of a word shown with its surrounding context.
# Here we build one around the word "war".
search_word = 'war'
ti_text.concordance(search_word)


# Based on this concordance, how would you say 'war' (or some other concept or character) is characterized in War and Peace?
# Experiment with other words!



In [0]:
#Frequency distributions
from nltk import FreqDist


#We can calculate frequency distributions:
#a FreqDist counts how many times each token occurs.
#most_common(25) hands back just the 25 most frequent tokens with their counts,
#so we don't have to sort the entire vocabulary and then throw most of it away.


FreqDist(tokens).most_common(25)
#Our frequency doesn't work very well since it is mostly punctuation and stop words

In [0]:
# First step: get rid of the punctuation.

# War and Peace contains a lot of hyphenated words and contractions.
# The regex tokenizer lets us keep those in one piece while dropping all other punctuation.

from nltk.tokenize import RegexpTokenizer

# Pattern: a run of word characters, optionally followed by more runs joined by "-" or "'".
# NOTE(review): only the straight apostrophe (') is matched here; if the text uses
# typographic apostrophes, contractions would still be split - worth confirming.
word_pattern = r"(?x)\w+(?:[-']\w+)*"
tokenizer = RegexpTokenizer(word_pattern)
better_tokens = tokenizer.tokenize(fixed_newlines)


# Compare the first 200 tokens from each tokenizer
first_200 = slice(200)
print(tokens[first_200])
print(better_tokens[first_200])

In [0]:
# Next we remove the stopwords (words like "of", "the", "a", "to", "and", etc.)
# NLTK ships with a handy list of stopwords for a few languages.
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
# stopWords is a "set", which you can just think of as a list of words to scan through.

filtered_tokens = []

for word in better_tokens:
    if word in stopWords:
        continue  # a stop word: skip it and move on to the next word
    filtered_tokens.append(word)
# This is a loop!
# It repeats an action (here: look at one word of the text and, unless it is a stop word,
# add it to the list "filtered_tokens") over and over until it reaches some stop criterion
# (in this case, reaching the end of the text).

In [0]:
# The same opening stretch of the text, before and after removing the stop words
first_200 = slice(200)
print(better_tokens[first_200])
print(filtered_tokens[first_200])

In [0]:
# You might notice how quickly the filtered list starts to look very different
# from the slice that still contained all the stop words.
# That just shows you how many stop words there are in a text!
# Let's look further ahead in the text to get an idea of the difference.
# Then we can see how much shorter the text is after removing the stop words.


later_window = slice(600, 650)
print(better_tokens[later_window])
print(filtered_tokens[later_window])

In [0]:
# How many tokens did removing the stop words take out, in absolute and relative terms?
bt_length, ft_length = len(better_tokens), len(filtered_tokens)
token_diff = bt_length - ft_length
percent_stop_words = (token_diff / bt_length) * 100

print('Including stop words, the text was', bt_length, 'tokens long.')
print('But after removing the stop words, the text was only', ft_length, 'tokens long.')
print('That means there were', token_diff, "stop words in the original text. That's", f'{percent_stop_words}% of the original text!')


In [0]:
#Now let's try a word frequency again, this time on the tokens without punctuation and stop words

word_freq = FreqDist(filtered_tokens)
# most_common(25) returns only the 25 most frequent words (with their counts)
word_freq.most_common(25)


#Nice!
#What does this frequency list suggest about the most common themes in the novel?
#This is a useful way of looking at thematic trends in the novel, but can you think of ways in which this could be somewhat misleading? What about different word forms?

In [0]:
# Keep in mind that stop words are not always noise - sometimes we want them in our analysis.
# Stop words are very useful in identifying a particular author's style.
# They, along with punctuation, are often crucial for tasks like part-of-speech tagging, syntactic parsing, and many others.


# Graphing basic frequencies is very easy in NLTK thanks to the built-in .plot function.
# The number we pass in is how many of the most frequent words to draw.

words_to_plot = 20
word_freq.plot(words_to_plot)

In [0]:
# We can also see the most common bigrams (sequences of two words) in the text.
# We'll use the older list of tokens leaving the stopwords in so the bigrams are intact.
ti_bigrams = list(nltk.bigrams(better_tokens))
# most_common(25) returns only the 25 most frequent bigrams (with their counts)
FreqDist(ti_bigrams).most_common(25)

In [0]:
# And now let's try it again with the stop words removed:

ti_filt_bigrams = list(nltk.bigrams(filtered_tokens))
# most_common(25) returns only the 25 most frequent bigrams (with their counts)
FreqDist(ti_filt_bigrams).most_common(25)

Comparative analysis:


*   Try to think about how you might use these tools to compare different novels.

*   How would the bigram frequencies differ from those of Alice in Wonderland?

*   Could we compare word frequencies across different Tolstoy novels?

*   What if we broke up War and Peace into sections to see how the frequencies change over time?



Shortcomings & pitfalls:


*   When might this type of analysis be misleading or problematic?

*   What problems might we run into if we are dealing with languages other than English?

Non-literary analysis:

*   How might some of these approaches translate into the social sciences?

