# Reading the book 

In [1]:
# Read the whole book into memory as a single string.
# The encoding is spelled out because, without it, open() falls back to the
# platform's locale encoding and the same notebook could decode the book
# differently (or fail) on another machine.
# NOTE(review): assumes the text file is UTF-8 encoded — confirm.
with open("miracle_in_the_andes.txt", "r", encoding="utf-8") as file:
    book = file.read()

# Processing Class

In [2]:
# Counter class from collections has methods to perform counting on words in text 
from collections import Counter 

# Import regex module 
import re 

class Regex_processing:
    """Regex-based search, counting and splitting helpers bound to one text.

    Attributes:
        text: The string that every method searches, counts or splits.
    """

    # A "word" is a maximal run of ASCII letters. The `+` keeps whole words
    # together instead of yielding one match per letter. Digits, apostrophes
    # and hyphens act as separators (e.g. "don't" -> "don", "t").
    _WORD_PATTERN = re.compile("[a-zA-Z]+")

    def __init__(self, text):
        """Store the text that the other methods operate on."""
        self.text = text

    def _word_counts(self):
        """Return a Counter mapping each lower-cased word in the text to its frequency."""
        return Counter(self._WORD_PATTERN.findall(self.text.lower()))

    def find_word(self, regex_statement):
        """Return every non-overlapping match of ``regex_statement`` in the text.

        Args:
            regex_statement: Regular expression to search for.

        Returns:
            The list produced by ``findall``: matched strings, or groups when
            the pattern contains capturing groups.
        """
        return re.compile(regex_statement).findall(self.text)

    def count_word(self, word):
        """Return how many times ``word`` occurs in the text, ignoring case.

        The text is lower-cased before counting, so the query is lower-cased
        too; otherwise any query containing a capital letter would always
        report 0.

        Args:
            word: The word to look up.

        Returns:
            The number of occurrences (0 when the word is absent).
        """
        return self._word_counts()[word.lower()]

    def most_common_word(self, n=None):
        """Return ``(word, count)`` pairs ordered from most to least frequent.

        Args:
            n: Optional cap on the number of pairs returned. ``None`` (the
                default) returns every distinct word.

        Returns:
            A list of ``(word, count)`` tuples; words are lower-cased.
        """
        return self._word_counts().most_common(n)

    def split_text(self, regex_statement):
        """Split the text at every match of ``regex_statement``.

        Args:
            regex_statement: Regular expression marking the split points.

        Returns:
            The list of pieces between matches. The text before the first
            match is always element 0, even when it is empty.
        """
        return re.compile(regex_statement).split(self.text)

In [3]:
# Wrap the book text in a processing object used by the cells below
rg = Regex_processing(text=book)

# Find the most used words that aren't stopwords (e.g., 'a', 'the', 'is')

In [4]:
import nltk
from nltk.corpus import stopwords

In [5]:
# NLTK's English stopword list (very common words such as 'a' and 'the'),
# used below to filter the word-frequency results.
# NOTE: the NLTK "stopwords" corpus must already be available locally
# (nltk.download("stopwords")); otherwise this call raises LookupError.
english_stopwords = stopwords.words("english")

In [6]:
# Remove stopwords from the word-frequency list

# Set membership is O(1); testing against the original list would rescan
# all the stopwords for every distinct word in the book.
stopword_set = set(english_stopwords)

# (word, count) pairs for the whole book, most frequent first
raw_word_lst = rg.most_common_word()

# Keep only the pairs whose word is not a stopword; the frequency order is preserved
filtered_words = [
    (word, count) for word, count in raw_word_lst if word not in stopword_set
]

# Show the ten most frequent remaining words
filtered_words[:10]

[('would', 575),
 ('us', 519),
 ('said', 292),
 ('roberto', 284),
 ('could', 252),
 ('one', 249),
 ('snow', 227),
 ('mountain', 183),
 ('time', 182),
 ('like', 165)]

# Sentiment Analysis: Find the most positive and most negative chapters

In [7]:
# Import sentiment intensity analyzer 
from nltk.sentiment import SentimentIntensityAnalyzer

In [8]:
# Create the sentiment analyzer whose polarity_scores() is applied to each chapter below.
# NOTE: NLTK's SentimentIntensityAnalyzer needs the "vader_lexicon" resource
# to be available locally (nltk.download("vader_lexicon")).
analyzer = SentimentIntensityAnalyzer() 

In [25]:
# Regex matching a chapter heading such as "Chapter 7"
chapter_heading = "Chapter [0-9]+"

# Heading of every chapter, in order of appearance
chapter_title = rg.find_word(chapter_heading)

# Text of every chapter: splitting on the headings leaves whatever precedes
# the first heading at index 0, so that first piece is dropped
chapters = rg.split_text(chapter_heading)[1:]

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 10']

In [28]:
# Score each chapter's text and print the scores next to its heading.
# NOTE(review): the recorded compound scores sit at roughly +/-1 for every
# chapter; scoring a whole chapter in one call may saturate this value —
# consider scoring per sentence and averaging.
for title, chapter_text in zip(chapter_title, chapters):
    sentiment_score = analyzer.polarity_scores(chapter_text)
    print(f"{title} = {sentiment_score}")

Chapter 1 = {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
Chapter 2 = {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
Chapter 3 = {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
Chapter 4 = {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
Chapter 5 = {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
Chapter 6 = {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
Chapter 7 = {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
Chapter 8 = {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
Chapter 9 = {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
Chapter 10 = {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}


# Interpretation 
### We can see from the results that the book is written in a mostl