Coffee and Brimstone Notebook
-----------------------------

This notebook contains the code used to calculate the key statistics for the "Coffee and Brimstone" project, completed at Foundations and Applications of Humanities Analytics, July 18-22, 2022. Note that due to the probabilistic nature of Word2Vec, exact statistics may differ between runs.

In [1]:
# Importing modules
import re
import numpy as np
import pandas as pd
import os
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from wordcloud import WordCloud
from pprint import pprint
from bs4 import BeautifulSoup
import requests

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davidkinney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The following block of code defines all functions necessary to run the code.

In [2]:
def data_import(fname, encoding=None):
    """Read a plain-text file and return it as a list of lowercase token lists.

    Every physical line of the file becomes one inner list (blank lines become
    empty lists), so the result has exactly one entry per line.

    Cleaning applied to each line, in order:
      1. drop the line break, the literal sequence backslash-backslash-t, and "'s";
      2. drop every character that is neither a word character nor whitespace,
         plus underscores;
      3. drop every token that contains at least one digit;
      4. lowercase and split on whitespace.

    The original code also ran ``re.sub(r'\\d{1, 3}', ...)`` between steps 2 and 3.
    Because of the space inside the braces Python treats ``{1, 3}`` as literal
    text rather than a quantifier, and by that point all braces and commas had
    already been stripped, so the pass could never match. It has been removed;
    step 3 already discards every standalone number.

    Parameters
    ----------
    fname : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        previous behaviour of using the platform default encoding.

    Returns
    -------
    list[list[str]]
        One list of cleaned, lowercase tokens per line of the file.
    """
    # newline='' disables newline translation, so '\r' may survive step 1;
    # it is whitespace and is discarded by split() below.
    with open(fname, newline='', encoding=encoding) as f:
        lines = f.readlines()

    cleaned = []
    for line in lines:
        # NOTE(review): '\\\\t' matches two literal backslashes followed by 't',
        # not a real tab character; real tabs are handled by split() below.
        line = re.sub(r'\n|\\\\t|\'s', '', line)
        line = re.sub(r'[^\w\s]|_', '', line)   # punctuation and underscores
        line = re.sub(r'\w*\d\w*', '', line)    # any token containing a digit
        cleaned.append(line.lower().split())

    return cleaned

def word_count(word_list, text_data):
    """Count how often each word of `word_list` occurs in `text_data`.

    Parameters
    ----------
    word_list : iterable of str
        Words to look up.
    text_data : iterable of iterables of str
        The tokenised text: an iterable of sentences, each an iterable of tokens.

    Returns
    -------
    dict
        Maps every word of `word_list` to its number of exact-match occurrences
        (0 when the word never appears), in `word_list` order.
    """
    from collections import Counter  # local import keeps this definition self-contained

    # Tally all tokens in a single pass instead of rescanning the whole text
    # once per looked-up word. This also gives correct counts when `text_data`
    # is a one-shot iterable (e.g. a generator), which the per-word rescans
    # would have exhausted after the first word.
    token_counts = Counter(token for sentence in text_data for token in sentence)

    # Counter returns 0 for missing keys without inserting them.
    return {word: token_counts[word] for word in word_list}

def rich_valuation(richwords,goodwords,badwords,text,model):
    """Valence score of wealth-related words in `text`.

    Computes the frequency-weighted average similarity between the wealth words
    and the positively valenced words, minus the frequency-weighted average
    similarity between the wealth words and the negatively valenced words.

    Parameters
    ----------
    richwords, goodwords, badwords : list of str
        Wealth-related, positively valenced and negatively valenced words.
    text : iterable of iterables of str
        Tokenised text passed to `word_count` to obtain word frequencies.
    model :
        Object exposing `model.wv.similarity(word_a, word_b)`.

    Returns
    -------
    float
        Positive when wealth words sit closer to the good words than to the bad
        words. If none of the words in one of the lists occurs in `text`, the
        weights are 0/0 and the result is NaN.
    """
    def frequency_weights(words):
        # Call word_count once per word list; the previous code re-ran it
        # (a full pass over the text) for every single word in the list.
        counts = word_count(words, text)
        total = np.sum([counts[word] for word in words])
        return [counts[word] / total for word in words]

    richwords_weights = frequency_weights(richwords)
    goodwords_weights = frequency_weights(goodwords)
    badwords_weights = frequency_weights(badwords)

    def weighted_similarity(valence_words, valence_weights):
        # Inner dot: average similarity of one wealth word to the valence list.
        # Outer dot: average of those values over the wealth words.
        per_richword = [
            np.dot(valence_weights,
                   [model.wv.similarity(richword, other) for other in valence_words])
            for richword in richwords
        ]
        return np.dot(richwords_weights, per_richword)

    return (weighted_similarity(goodwords, goodwords_weights)
            - weighted_similarity(badwords, badwords_weights))

def poor_valuation(poorwords,goodwords,badwords,text,model):
    """Valence score of poverty-related words in `text`.

    Computes the frequency-weighted average similarity between the poverty words
    and the positively valenced words, minus the frequency-weighted average
    similarity between the poverty words and the negatively valenced words.

    Parameters
    ----------
    poorwords, goodwords, badwords : list of str
        Poverty-related, positively valenced and negatively valenced words.
    text : iterable of iterables of str
        Tokenised text passed to `word_count` to obtain word frequencies.
    model :
        Object exposing `model.wv.similarity(word_a, word_b)`.

    Returns
    -------
    float
        Positive when poverty words sit closer to the good words than to the bad
        words. If none of the words in one of the lists occurs in `text`, the
        weights are 0/0 and the result is NaN.
    """
    def frequency_weights(words):
        # Call word_count once per word list; the previous code re-ran it
        # (a full pass over the text) for every single word in the list.
        counts = word_count(words, text)
        total = np.sum([counts[word] for word in words])
        return [counts[word] / total for word in words]

    poorwords_weights = frequency_weights(poorwords)
    goodwords_weights = frequency_weights(goodwords)
    badwords_weights = frequency_weights(badwords)

    def weighted_similarity(valence_words, valence_weights):
        # Inner dot: average similarity of one poverty word to the valence list.
        # Outer dot: average of those values over the poverty words.
        per_poorword = [
            np.dot(valence_weights,
                   [model.wv.similarity(poorword, other) for other in valence_words])
            for poorword in poorwords
        ]
        return np.dot(poorwords_weights, per_poorword)

    return (weighted_similarity(goodwords, goodwords_weights)
            - weighted_similarity(badwords, badwords_weights))

The next block of code calculates all relevant statistics for the King James Bible.

In [3]:
# Load the King James Bible: data_import returns one list of cleaned tokens per line.
fname = 'kjb.txt'
data = data_import(fname)
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # Each item is a token list, so str() hands its list repr to the tokeniser.
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
bible_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['poor', 'wretched', 'least', 'beggar', 'destitute', 'poverty', 'needy', 'hungry', 'deprived', 'famished', 'indebted', 'austere']

rich_words = ['rich', 'wealthy', 'gold', 'wealth', 'riches', 'ruler', 'fat', 'prosperous', 'lavish', 'abundance']

good_words = ['good', 'blessed', 'righteous', 'virtuous', 'devout', 'valued', 'clean', 'holy', 'glorious', 'pure', 'proper']

bad_words = ['bad', 'woe', 'covet', 'oppress', 'smite', 'inferior', 'evil', 'thieves', 'sinful', 'unholy']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
bible_rich = rich_valuation(rich_words, good_words, bad_words, data_words, bible_model)
bible_poor = poor_valuation(poor_words, good_words, bad_words, data_words, bible_model)
# [rich valuation, poor valuation, rich - poor]
bible_stats = [bible_rich, bible_poor, bible_rich - bible_poor]
bible_stats

[-0.026667942384106413, -0.09202150037075718, 0.06535355798665077]

The next block of code calculates all relevant statistics for the Siri Guru Granth Sahib.

In [None]:
# Load the Siri Guru Granth Sahib: data_import returns one list of cleaned tokens per line.
fname = 'siri.txt'
data = data_import(fname)
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # Each item is a token list, so str() hands its list repr to the tokeniser.
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
siri_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['poor', 'wretched', 'least', 'beggar', 'destitute', 'poverty', 'pauper', 'unfortunate', 'needy', 'hungry', 'homeless', 'deprived', 'austere', 'ascetic']

rich_words = ['rich', 'wealthy', 'gold', 'wealth', 'riches', 'ruler', 'prosperous', 'dynasty', 'abundance']

good_words = ['good', 'blessed', 'righteous', 'virtuous', 'pious', 'superior', 'valued', 'clean', 'holy', 'glorious', 'pure', 'proper']

bad_words = ['bad', 'covet', 'oppress', 'exploit', 'immoral', 'inferior', 'evil', 'thieves', 'sinful', 'dirty', 'impure']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
siri_rich = rich_valuation(rich_words, good_words, bad_words, data_words, siri_model)
siri_poor = poor_valuation(poor_words, good_words, bad_words, data_words, siri_model)
# [rich valuation, poor valuation, rich - poor]
siri_stats = [siri_rich, siri_poor, siri_rich - siri_poor]
siri_stats

The next block of code calculates all relevant statistics for the Kojiki.

In [None]:
# Load the Kojiki: data_import returns one list of cleaned tokens per line.
fname = 'Kojiki_Horne.txt'
data = data_import(fname)
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # Each item is a token list, so str() hands its list repr to the tokeniser.
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
koji_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['least']

rich_words = ['ruler', 'jeweled']

good_words = ['good', 'pure']

bad_words = ['bad', 'woe', 'smite', 'evil']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
koji_rich = rich_valuation(rich_words, good_words, bad_words, data_words, koji_model)
koji_poor = poor_valuation(poor_words, good_words, bad_words, data_words, koji_model)
# [rich valuation, poor valuation, rich - poor]
koji_stats = [koji_rich, koji_poor, koji_rich - koji_poor]
koji_stats

The next block of code calculates all relevant statistics for the Quran.

In [None]:
# Load the Quran: data_import returns one list of cleaned tokens per line.
fname = 'Quran.txt'
data = data_import(fname)
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # Each item is a token list, so str() hands its list repr to the tokeniser.
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
quran_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['poor', 'wretched', 'least', 'beggar', 'destitute', 'poverty', 'unfortunate', 'needy', 'hungry', 'deprived']

rich_words = ['rich', 'wealthy', 'affluent', 'gold', 'wealth', 'riches', 'ruler', 'fat', 'luxury', 'luxurious', 'abundance']

good_words = ['good', 'blessed', 'righteous', 'virtuous', 'pious', 'devout', 'superior', 'clean', 'holy', 'glorious', 'pure', 'proper']

bad_words = ['bad', 'woe', 'covet', 'oppress', 'smite', 'evil', 'thieves', 'sinful', 'repentant']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
quran_rich = rich_valuation(rich_words, good_words, bad_words, data_words, quran_model)
quran_poor = poor_valuation(poor_words, good_words, bad_words, data_words, quran_model)
# [rich valuation, poor valuation, rich - poor]
quran_stats = [quran_rich, quran_poor, quran_rich - quran_poor]
quran_stats

The next block of code calculates all relevant statistics for the Upanishads.

In [None]:
# Load the Upanishads: data_import returns one list of cleaned tokens per line.
fname = 'Upanishads'
data = data_import(fname)
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # Each item is a token list, so str() hands its list repr to the tokeniser.
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
upanishads_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['least']

rich_words = ['rich', 'gold', 'wealth', 'ruler', 'abundance']

good_words = ['good', 'blessed', 'righteous', 'pious', 'superior', 'holy', 'glorious', 'pure', 'proper']

bad_words = ['bad', 'covet', 'inferior', 'evil', 'impure']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
upanishads_rich = rich_valuation(rich_words, good_words, bad_words, data_words, upanishads_model)
upanishads_poor = poor_valuation(poor_words, good_words, bad_words, data_words, upanishads_model)
# [rich valuation, poor valuation, rich - poor]
upanishads_stats = [upanishads_rich, upanishads_poor, upanishads_rich - upanishads_poor]
upanishads_stats

The next block of code calculates all relevant statistics for the Popol Vuh.

In [None]:
# Load the Popol Vuh: data_import returns one list of cleaned tokens per line.
fname = 'Popul_Vuh'
data = data_import(fname)
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # Each item is a token list, so str() hands its list repr to the tokeniser.
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
popul_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['poor', 'least', 'hungry']

rich_words = ['wealthy', 'ruler']

good_words = ['good', 'glorious', 'pure', 'proper']

bad_words = ['bad', 'evil']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
popul_rich = rich_valuation(rich_words, good_words, bad_words, data_words, popul_model)
popul_poor = poor_valuation(poor_words, good_words, bad_words, data_words, popul_model)
# [rich valuation, poor valuation, rich - poor]
popul_stats = [popul_rich, popul_poor, popul_rich - popul_poor]
popul_stats

The next block of code calculates all relevant statistics for the Bhagavad Gita.

In [None]:
# Load the Bhagavad Gita from a saved HTML page and keep only its <p> elements.
with open('Bhagavad-Gita', 'r') as f:
    page_content = BeautifulSoup(f, 'html.parser').find_all('p')

# Flatten every paragraph into its whitespace-stripped text fragments,
# leaving the HTML markup behind.
data = []
for paragraph in page_content:
    data.extend(paragraph.stripped_strings)

stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
bg_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['least', 'beggar', 'deprived']

rich_words = ['rich', 'gold', 'wealth', 'abundance']

good_words = ['good', 'blessed', 'righteous', 'pious', 'devout', 'superior', 'clean', 'holy', 'glorious', 'pure']

bad_words = ['bad', 'woe', 'evil', 'sinful']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
bg_rich = rich_valuation(rich_words, good_words, bad_words, data_words, bg_model)
bg_poor = poor_valuation(poor_words, good_words, bad_words, data_words, bg_model)
# [rich valuation, poor valuation, rich - poor]
bg_stats = [bg_rich, bg_poor, bg_rich - bg_poor]
bg_stats

The next block of code calculates all relevant statistics for the Dao De Jing.

In [None]:
# Load the Dao De Jing from a saved HTML page and keep only its <p> elements.
with open('Dao_De_Jing_html', 'r') as f:
    page_content = BeautifulSoup(f, 'html.parser').find_all('p')

# Flatten every paragraph into its whitespace-stripped text fragments,
# leaving the HTML markup behind.
data = []
for paragraph in page_content:
    data.extend(paragraph.stripped_strings)

stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
ddj_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['least']

rich_words = ['gold', 'wealth', 'ruler']

good_words = ['good', 'virtuous', 'moral', 'superior', 'pure', 'proper']

bad_words = ['bad', 'inferior', 'evil', 'thieves']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
ddj_rich = rich_valuation(rich_words, good_words, bad_words, data_words, ddj_model)
ddj_poor = poor_valuation(poor_words, good_words, bad_words, data_words, ddj_model)
# [rich valuation, poor valuation, rich - poor]
ddj_stats = [ddj_rich, ddj_poor, ddj_rich - ddj_poor]
ddj_stats

The next block of code calculates all relevant statistics for the Zend Avesta.

In [None]:
# Load the Zend Avesta: data_import returns one list of cleaned tokens per line.
fname = 'zend_avesta.txt'
data = data_import(fname)
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # Each item is a token list, so str() hands its list repr to the tokeniser.
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
zend_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['poor', 'wretched', 'least', 'poverty', 'unfortunate', 'deprived', 'indebted']

rich_words = ['rich', 'wealthy', 'affluent', 'gold', 'wealth', 'riches', 'ruler', 'fat', 'prosperous', 'dynasty', 'exorbitant']

good_words = ['good', 'blessed', 'righteous', 'virtuous', 'pious', 'devout', 'orthodox', 'moral', 'superior', 'valued', 'clean', 'ethical', 'holy', 'glorious', 'pure', 'proper']

bad_words = ['bad', 'woe', 'oppress', 'immoral', 'smite', 'inferior', 'evil', 'bandits', 'thieves', 'sinful', 'unholy', 'impure']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
zend_rich = rich_valuation(rich_words, good_words, bad_words, data_words, zend_model)
zend_poor = poor_valuation(poor_words, good_words, bad_words, data_words, zend_model)
# [rich valuation, poor valuation, rich - poor]
zend_stats = [zend_rich, zend_poor, zend_rich - zend_poor]
zend_stats

The next block of code calculates all relevant statistics for the Tanakh.

In [None]:
# Load the Tanakh: data_import returns one list of cleaned tokens per line.
fname = 'Tanakh1917.txt'
data = data_import(fname)
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences` with gensim's simple_preprocess."""
    for sentence in sentences:
        # Each item is a token list, so str() hands its list repr to the tokeniser.
        # deacc=True strips accent marks from the tokens.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    """Re-tokenise every document and drop the English stop words."""
    stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]

data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

# NOTE(review): id2word, texts and corpus are not read by the statistics below;
# they are kept in case other cells rely on them.
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# No seed is passed, so the vectors (and the statistics below) vary between runs.
tanakh_model = gensim.models.Word2Vec(sentences = data_words, workers = 8, min_count = 0, vector_size = 100)

poor_words = ['poor', 'least', 'destitute', 'poverty', 'unfortunate', 'needy', 'hungry', 'deprived', 'famished']

rich_words = ['rich', 'gold', 'wealth', 'riches', 'ruler', 'fat', 'prosperous', 'lavish', 'abundance']

good_words = ['good', 'blessed', 'righteous', 'virtuous', 'valued', 'clean', 'holy', 'glorious', 'pure', 'proper']

bad_words = ['bad', 'woe', 'covet', 'oppress', 'smite', 'inferior', 'evil', 'thieves', 'sinful', 'impure']

# Evaluate each valuation once and reuse it for the difference
# (previously both were computed twice).
tanakh_rich = rich_valuation(rich_words, good_words, bad_words, data_words, tanakh_model)
tanakh_poor = poor_valuation(poor_words, good_words, bad_words, data_words, tanakh_model)
# [rich valuation, poor valuation, rich - poor]
tanakh_stats = [tanakh_rich, tanakh_poor, tanakh_rich - tanakh_poor]
tanakh_stats