In [1]:
import requests
import re
from bs4 import BeautifulSoup
import string

In [2]:
page = requests.get("http://www.vatican.va/archive/bible/genesis/documents/bible_genesis_en.html")
soup = BeautifulSoup(page.content, 'html.parser')

for script in soup(["script", "style"]):
    script.decompose()

In [3]:
genesis = soup.get_text()
genesis = re.sub(r'[\n]+','\n', genesis)

## Questions

1) How many times the word 'God' appear (as an isolated word)?

2) What are the 5 most common words?

3) What are the words that appear only once (hapaxes)?

**for** loops and **if** statements are required to answer the questions

In [4]:
# Removing undesirable characters
for j in string.punctuation:
    genesis = genesis.replace(j,' ')

### Question 1

In [5]:
# wrong way! We are not counting 'God' as an isolated word here.
genesis.count('God')

227

In [6]:
genesis.split().count('God')

227

Coincidentally, there are no words that contain 'God'. Ex: Goddess

### Question 2

To accomplish this task we need to convert all uppercase characters into lowercase characters

In [7]:
genesis = genesis.lower()

**word_list**: list that contains every word of the string (no duplicates)

In [8]:
word_list = [] 
# using the data structure 'set' to remove duplicates
for isolated_string in list(set(genesis.split())):
# ignoring numeric strings  
    if isolated_string.isalpha():   
        word_list.append(isolated_string)

**count_list**: list that contains the frequency of every element of **word_list**

In [9]:
count_list = []
for word in word_list:
    count_list.append(genesis.split().count(word))

For the same index, we have the word and the associated frequency. The natural data structure here would be dictionaries or dataframe.

In [10]:
len(count_list) == len(word_list)

True

Making a copy in order to create an ordered version of **count_list**. Bearing in mind that `list.sort` is an inplace method, i.e., our **count_list**'s order would be modified and we want to preserve it.

In [11]:
sorted_count_list = count_list.copy()

In [12]:
sorted_count_list.sort(reverse = True)

In [13]:
list_5_freq = sorted_count_list[:5]

In [14]:
list_5_freq

[2475, 2018, 1271, 1078, 650]

In [15]:
#Let's find the index in count_list to find the index in word_list

for i in range(5):
    word_index = count_list.index(list_5_freq[i])
    print(word_list[word_index])

the
and
of
to
you


### Question 3

Let's create **list_one_time** to receive the **hapaxes**

In [16]:
list_one_time = []
for freq,word in zip(count_list,word_list):
    if freq == 1:
        list_one_time.append(word)

In [17]:
list_one_time

['scatter',
 'caphtorim',
 'delighted',
 'confused',
 'fury',
 'carrying',
 'kedar',
 'trained',
 'migrated',
 'cakes',
 'abimael',
 'change',
 'reserve',
 'gracious',
 'muppim',
 'perish',
 'naphish',
 'posterity',
 'want',
 'doubt',
 'daylight',
 'surrounded',
 'fourteenth',
 'prayer',
 'interpreter',
 'breeding',
 'carcasses',
 'list',
 'reward',
 'entreat',
 'close',
 'welfare',
 'misfortunes',
 'hang',
 'hashum',
 'songs',
 'pangs',
 'guni',
 'impossible',
 'stranger',
 'paid',
 'dreamer',
 'clings',
 'pray',
 'joint',
 'anointed',
 'reumah',
 'falsely',
 'stepped',
 'thousand',
 'shemeber',
 'plane',
 'longed',
 'eternal',
 'paying',
 'conspired',
 'sprung',
 'lud',
 'olive',
 'marries',
 'subdue',
 'perpetuated',
 'foremost',
 'pleaded',
 'tools',
 'sered',
 'unstable',
 'emim',
 'quiver',
 'captured',
 'suitable',
 'befall',
 'pau',
 'tamarisk',
 'pays',
 'using',
 'belly',
 'seba',
 'ludim',
 'clothing',
 'throne',
 'disaster',
 'plainly',
 'south',
 'meshech',
 'oversight',
 

In [18]:
len(list_one_time)

1106

Double check

In [19]:
for word in list_one_time:
    counting = genesis.split().count(word)
    if counting > 1:
        print(word)