# Load the book

In [1]:
# Read the full text of the book into a single string.
with open('miracle_in_the_andes.txt', mode='r', encoding='utf-8') as book_file:
    book = book_file.read()

# Count the chapters

In [2]:
# Plain substring count of "Chapter" anywhere in the text.
# NOTE(review): the regex search for "Chapter <number>" in this notebook finds
# 10 headings, so this count presumably includes one "Chapter" that is not a
# numbered heading - confirm against the text.
book.count('Chapter')

11

# With regex

In [3]:
import re

In [4]:
# Compile a reusable pattern for numbered chapter headings.
# "[0-9]+" means one or more digits, so "Chapter 10" is matched in full.
pattern = re.compile(r"Chapter [0-9]+")
pattern

re.compile(r'Chapter [0-9]+', re.UNICODE)

In [5]:
# Collect every numbered chapter heading that appears in the text
findings = pattern.findall(book)
findings

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 10']

In [6]:
len(findings)

10

# Which are the sentences where "love" was used?

In [7]:
# Regex pieces, left to right:
#   [A-Z]        exactly one capital letter (start of the sentence)
#   [^.]*        zero or more characters that are not a period
#   [^a-zA-Z]+   one or more non-letters, so "love" is not the tail of a longer word
#   love         the literal word
#   [^a-zA-Z]+   one or more non-letters, so "love" is not the head of a longer word
#   [^.]*        zero or more characters that are not a period
#   \.           the literal period that ends the sentence
# The final period is escaped: a bare "." matches any character, which let a
# match end on an ordinary character when the text does not end with a period.
# NOTE(review): the non-letter runs around "love" may themselves contain a
# period, so a match can still run across a sentence boundary at that point.
pattern = re.compile(r"[A-Z][^.]*[^a-zA-Z]+love[^a-zA-Z]+[^.]*\.")
findings = re.findall(pattern, book)
len(findings)

67

# What are the most common words in the book?

In [8]:
# Break the lower-cased text into runs of ASCII letters and peek at the first few
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())
findings[:5]

['chapter', 'before', 'it', 'was', 'friday']

In [9]:
# Tally how many times each word occurs
d = {}
for word in findings:
    d[word] = d.get(word, 0) + 1

In [10]:
# Flip each (word, count) pair into a (count, word) tuple so that sorting the
# tuples orders them by count; reverse=True puts the most frequent words first
d_list = [(count, word) for word, count in d.items()]
sorted_d_list = list(d_list)
sorted_d_list.sort(reverse=True)

# Extract the paragraphs where 'love' was used

In [11]:
# Match text that starts at a capital letter, contains "love" with at least one
# non-letter on each side, and is bounded by newlines rather than periods
pattern = re.compile("[A-Z]{1}[^\n]*[^a-zA-Z]+love[^a-zA-Z]+[^\n]*.")
findings = pattern.findall(book)

# Extract the chapter titles

In [12]:
# A chapter title is a run of letters, spaces and commas followed by a blank line
pattern = re.compile("[a-zA-Z ,]+\n\n")
# str.strip treats its argument as a set of characters, so this removes the
# newlines that the pattern captured after each title
findings = [title.strip("\n\n") for title in pattern.findall(book)]
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

In [13]:
# The parentheses capture only the title text, so findall returns the titles
# without the two trailing newlines
pattern = re.compile("([a-zA-Z ,]+)\n\n")
findings = pattern.findall(book)
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

# A function that finds the occurrence of any word

In [14]:
def find(word):
    """Count how often ``word`` occurs in the book, ignoring letter case.

    The module-level ``book`` string is lower-cased and split into runs of
    ASCII letters. ``word`` is lower-cased the same way before it is compared,
    so ``find('Love')`` and ``find('love')`` give the same answer.

    Parameters
    ----------
    word : str
        The word to look up.

    Returns
    -------
    int or str
        The number of occurrences when the word is present, otherwise a
        message saying that the word does not exist in this book.
    """
    words = re.findall("[a-zA-Z]+", book.lower())
    # str() keeps a non-string argument on the "not found" path instead of raising
    countings = words.count(str(word).lower())
    if countings == 0:
        countings = f'The word "{word}" you are looking for does not exist in this book'
    return countings

# Call the function

In [15]:
find('love')

83

In [16]:
find('hate')

'The word "hate" you are looking for does not exist in this book'

# Find the most common non-article (non-stopword) words in the book

In [28]:
# Rebuild the word tally from the lower-cased text
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())
d = {}
for w in findings:
    d[w] = d.get(w, 0) + 1
# (count, word) tuples sort by count first; reverse=True puts the largest first
d_list = sorted(((count, token) for token, count in d.items()), reverse=True)
d_list[:10]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

In [30]:
from platform import python_version
python_version()

'3.10.8'

In [33]:
!pip3.10 install nltk




[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shaked\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [45]:
english_stopwords = stopwords.words("english")

In [50]:
# Keep only the (count, word) pairs whose word is not an English stopword.
# A set makes each membership test constant-time instead of scanning the list.
stopword_set = set(english_stopwords)
filtered_words = [(count, word) for count, word in d_list if word not in stopword_set]

# Using nltk for sentiment analysis: Which are the most positive and most negative chapters in the book?

In [54]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Shaked\AppData\Roaming\nltk_data...


True

In [56]:
analyzer = SentimentIntensityAnalyzer()

In [57]:
analyzer.polarity_scores('I hate this life')

{'neg': 0.649, 'neu': 0.351, 'pos': 0.0, 'compound': -0.5719}

In [58]:
analyzer.polarity_scores('I love this life')

{'neg': 0.0, 'neu': 0.323, 'pos': 0.677, 'compound': 0.6369}

In [59]:
import re

# Split the text at every "Chapter <number>" heading; the piece before the
# first heading is not a chapter, so it is dropped
pattern = re.compile("Chapter [0-9]+")
chapters = pattern.split(book)[1:]

In [66]:
# Print the analyzer's polarity scores for each chapter, numbering from 1.
# NOTE(review): the compound values printed for this cell are all at or near
# +/-1, so whole chapters may be too long for that score to discriminate -
# consider scoring sentence by sentence and averaging.
for chapter_no, chapter in enumerate(chapters, start=1):
    score = analyzer.polarity_scores(chapter)
    print("Chapter ", chapter_no, ' - ', score)

Chapter  1  -  {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
Chapter  2  -  {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
Chapter  3  -  {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
Chapter  4  -  {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
Chapter  5  -  {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
Chapter  6  -  {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
Chapter  7  -  {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
Chapter  8  -  {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
Chapter  9  -  {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
Chapter  10  -  {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
