# Reading Frankenstein

*This notebook was prepared by Russell Williams, but the real under-the-bonnet work was done by Ruth Corran. We both work at the American University of Paris.*

In [None]:
import urllib.request

# Download the letters file and turn the raw bytes into a Python string.
# decode() with no argument assumes the bytes are UTF-8.
# The `with` block closes the connection once the text has been read.
with urllib.request.urlopen("https://raw.githubusercontent.com/rwilliamsparis/AUPCL1099/main/corpora/FrankensteinLetters.txt") as f:
    frank_text=f.read().decode()

In [None]:
print(frank_text[:2000])

In [None]:
def get_num_sents(string):
    """Estimate how many sentences ``string`` contains.

    Every '.', '?' and '!' character is counted as one sentence ending, so
    abbreviations and ellipses are counted too.

    Returns the total number of those three characters in ``string``.
    """
    return sum(string.count(terminator) for terminator in ('.', '?', '!'))

In [None]:
def get_num_words(string):
    """Count the words in ``string``.

    A word is any run of characters separated by whitespace. Splitting on
    whitespace (rather than counting ' ' characters) means that:

    * the last word is counted even though no space follows it,
    * words separated by newlines or tabs are counted,
    * several spaces in a row do not create phantom words.

    Returns 0 for an empty or all-whitespace string.
    """
    # str.split() with no argument splits on any run of whitespace and
    # discards empty pieces.
    return len(string.split())

In [None]:
def get_words_per_sent(string):
    """Return the average number of words per sentence in ``string``.

    The totals come from ``get_num_words`` and ``get_num_sents``. When no
    sentences are counted, the function returns 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    num_sents = get_num_sents(string)
    if num_sents == 0:
        # Nothing to average over (e.g. empty text, or no '.', '?' or '!').
        return 0.0
    return get_num_words(string) / num_sents

In [None]:
get_words_per_sent(frank_text)

*Your challenge is to use your coding skills to calculate the average number of words per sentence in Cat Person and the opening four chapters of Dracula. (Meaningless bonus points are awarded if you are able to provide an analysis of an alternative text.) Use the following empty cell to experiment.*

In [None]:
# numpy is your basic numerical analysis package for Python
# np is shorthand for numpy
import numpy as np

# matplotlib allows us to plot graphs. Here 'fivethirtyeight' is the style guide: https://matplotlib.org/stable/gallery/style_sheets/fivethirtyeight.html 
import matplotlib
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# pandas is Python's data-analysis library. It allows us to construct dataframes: https://pandas.pydata.org/docs/user_guide/10min.html#min
import pandas as pd
from pandas import Series, DataFrame


In [None]:
# I'm going to be working initially with Mary Shelley's Frankenstein, as available in Project Gutenberg, so I need to pull that into the notebook for analysis
# This allows me to import URLs and the instructions allow the text to be read and cleaned up:

from urllib.request import urlopen 
import re
def read_url(url):
    """Fetch ``url`` and return its text with whitespace normalised.

    The response body is decoded as UTF-8, and every run of whitespace
    (spaces, tabs, line breaks) is collapsed into a single space.

    Parameters
    ----------
    url : str
        Address of the document to download.

    Returns
    -------
    str
        The decoded text on a single line.
    """
    # The context manager closes the connection as soon as the body is read.
    with urlopen(url) as response:
        raw_text = response.read().decode('utf-8')
    return re.sub(r'\s+', ' ', raw_text)

In [None]:
shelley_url = 'https://www.gutenberg.org/cache/epub/42324/pg42324.txt'
shelley_text = read_url(shelley_url)

In [None]:
print(shelley_text[:2000])

In [None]:
shelley_text.count('monster')

In [None]:
#What words would you like to search for, using your by now excellent Python coding skills?

print(shelley_text.count('put'), shelley_text.count('your'),shelley_text.count('words'),shelley_text.count('here'))

In [None]:
#Let's lose the Project Gutenberg stuff at the very start and end of the text

# Cut the text at every '*** ' and keep the third piece (index 2).
# NOTE(review): this presumably relies on the Project Gutenberg START/END
# banner lines being wrapped in '***' -- confirm against the downloaded file.
shelley_body=shelley_text.split('*** ')[2]
# Show the first 2000 characters to check where the body begins.
shelley_body[:2000]

In [None]:
shelley_body[-2000:]

In [None]:
# Keep what follows the first 'INTRODUCTION. ' marker, then cut that piece
# off at the first 'PREFACE. ' marker.
shelley_intro=shelley_text.split('INTRODUCTION. ')[1].split('PREFACE. ')[0]
# Print the first and last 100 characters to check both cut points.
print(shelley_intro[:100])
print(shelley_intro[-100:])

In [None]:
# Keep what follows the first 'PREFACE. ' marker, then cut that piece off at
# the first 'FRANKENSTEIN;' marker.
shelley_preface=shelley_text.split('PREFACE. ')[1].split('FRANKENSTEIN;')[0]
# Print the first and last 100 characters to check both cut points.
print(shelley_preface[:100])
print(shelley_preface[-100:])

In [None]:
# This splits Frankenstein into chapters (splitting the text whenever the word 'CHAPTER' occurs)

shelley_chapters = shelley_text.split('CHAPTER ')[1:]

In [None]:
# Report how many chapters the split produced. A bare expression is only
# displayed when it is the last line of a cell, so it must be printed here.
print(len(shelley_chapters))

# Preview the first 100 characters of every chapter.
for chapter in shelley_chapters:
    print(chapter[:100])

In [None]:
#Let's check it's there by creating a wordcloud

def show_wordcloud(data, title = None):
    """Draw a word cloud built from ``data``.

    Parameters
    ----------
    data : object
        Text to visualise. It is converted with ``str(data)`` before being
        handed to ``WordCloud``, so a list is rendered from its string form.
    title : str, optional
        Title placed above the figure. Nothing is drawn when it is omitted
        or empty.

    Notes
    -----
    NOTE(review): ``WordCloud`` and ``stopwords`` are not defined in any cell
    visible in this notebook. They presumably come from the third-party
    ``wordcloud`` package -- confirm and define them in an earlier cell,
    otherwise this function raises ``NameError``.
    """
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=100,
        max_font_size=40,
        scale=3,
        random_state=1
    ).generate(str(data))

    # This notebook imports matplotlib.pyplot as `plots`; the name `plt` is
    # never defined, so the pyplot calls go through `plots`.
    fig = plots.figure(1, figsize=(12, 12))
    plots.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plots.imshow(wordcloud)
    plots.show()

In [None]:
show_wordcloud(shelley_chapters)

In [None]:
show_wordcloud(shelley_chapters[0])

In [None]:
show_wordcloud(shelley_chapters[6])

In [None]:
#Let's build some graphs

word_1='Elizabeth'
word_2='Victor'
word_3='monster'
word_4='creature'
word_5='devil'
word_6='fiend'

In [None]:
# For each chosen word, count its occurrences in every chapter.
# The result maps word -> array with one count per chapter.
T1_counts = {
    word: np.char.count(shelley_chapters, word)
    for word in (word_1, word_2, word_3, word_4, word_5, word_6)
}
type(T1_counts)

In [None]:
T1_counts_DF=pd.DataFrame(T1_counts)
type(T1_counts_DF)

In [None]:
T1_counts_DF['Creature']=T1_counts_DF['monster']+T1_counts_DF['creature']+T1_counts_DF['devil']+T1_counts_DF['fiend']

In [None]:
#These are the various ways the creature might be described. Are there any more you can think of?

del T1_counts_DF['monster']
del T1_counts_DF['creature']
del T1_counts_DF['devil']
del T1_counts_DF['fiend']


In [None]:
T1_counts_DF.head()

In [None]:
T1_counts_DF['Chapter']=np.arange(1,len(shelley_chapters)+1,1)

In [None]:
T1_counts_DF.plot('Chapter')
plots.title('Number of Times Names Appear', y=1.08);

In [None]:
#Graph of words used to describe the "monster"

name_1='monster'
name_2='creature'
name_3='devil'
name_4='fiend'

In [None]:
# For each name given to the creature, count its occurrences in every chapter.
# The result maps name -> array with one count per chapter.
T2_counts = {
    name: np.char.count(shelley_chapters, name)
    for name in (name_1, name_2, name_3, name_4)
}
type(T2_counts)

In [None]:
T2_counts_DF=pd.DataFrame(T2_counts)
type(T2_counts_DF)

In [None]:
T2_counts_DF.head()

In [None]:
T2_counts_DF['Chapter']=np.arange(1,len(shelley_chapters)+1,1)

In [None]:
T2_counts_DF.plot('Chapter')
plots.title('Words used to describe the creature', y=1.08);

In [None]:
#Words used to describe the environment

term_1='ice'
term_2='cold'
term_3='mountain'
term_4='green'
term_5='river'
term_6='sublime'

In [None]:
# For each environment term, count its occurrences in every chapter.
# The result maps term -> array with one count per chapter.
T3_counts = {
    term: np.char.count(shelley_chapters, term)
    for term in (term_1, term_2, term_3, term_4, term_5, term_6)
}
type(T3_counts)

In [None]:
T3_counts_DF=pd.DataFrame(T3_counts)
type(T3_counts_DF)

In [None]:
T3_counts_DF.head()

In [None]:
T3_counts_DF['Chapter']=np.arange(1,len(shelley_chapters)+1,1)

In [None]:
# Plot each term's per-chapter count, using the 'Chapter' column as the x-axis.
T3_counts_DF.plot('Chapter')
# The trailing semicolon suppresses the text repr of the title object.
plots.title('Environment words', y=1.08);