In [None]:
import wordcloud as wc
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

Make sure to download the Dutch spaCy model first using:

`spacy download nl_core_news_sm`

In [None]:
import spacy
# Load the 'nl_core_news_sm' spaCy pipeline once; the resulting `nlp` object
# is what get_counts uses to tokenize the text and assign part-of-speech tags.
# The model package has to be installed separately before this call succeeds.
nlp = spacy.load('nl_core_news_sm')

Load data from CSV file using pandas

In [None]:
# Read the pipe-separated source file; later cells use its 'book' and 'text'
# columns.
df = pd.read_csv('data/BGT-Bijbel.csv', sep='|')

# Distinct book names, in the order they first appear in the file.
books = df['book'].unique()

# The first 39 names are treated as the Old Testament. This relies on the CSV
# listing the books in canonical order -- TODO confirm against the data file.
old_books = books[:39]

# Show both listings, one per line.
print(books, old_books, sep='\n')

Generate word counts

In [None]:
# Part-of-speech tags (spaCy `pos_` values) of the tokens that get counted.
poss = ('NOUN', 'PROPN', 'ADJ', 'VERB')

def get_counts(book):
    """Count the selected word classes in one book.

    The 'text' values of every row in `df` whose 'book' column equals `book`
    are joined with single spaces and run through the spaCy pipeline `nlp`.
    Only tokens whose part-of-speech tag is listed in `poss` are kept.

    Returns a Counter mapping each kept token's surface text (case-sensitive,
    not lemmatized) to the number of times it occurs.
    """
    book_text = ' '.join(df.loc[df['book'] == book, 'text'])
    doc = nlp(book_text)

    counts = Counter()
    for token in doc:
        if token.pos_ in poss:
            counts[token.text] += 1
    return counts

# get_counts('Genesis').most_common(10)

Generate word clouds

In [None]:
def get_wordcloud(counts, colormap='rainbow', width=800, height=500):
    """Build a word cloud from word frequencies.

    Parameters
    ----------
    counts : mapping of str to number
        Word -> frequency, e.g. the Counter returned by get_counts.
    colormap : str, optional
        Name of the matplotlib colormap used to colour the words.
    width, height : int, optional
        Canvas size in pixels. The defaults reproduce the 800x500 canvas
        that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    wordcloud.WordCloud
        The generated cloud; it can be passed to `imshow` or exported with
        `to_svg`.
    """
    return wc.WordCloud(
        width=width,
        height=height,
        colormap=colormap,
        # background_color=None combined with mode='RGBA' yields a
        # transparent background instead of a solid fill.
        background_color=None,
        mode='RGBA',
    ).generate_from_frequencies(counts)

Generate a word cloud for every book, then plot it with matplotlib and/or save it as an SVG file

In [None]:
# Output switches: show each cloud with matplotlib and/or write '<book>.svg'.
plot = False
svg = True

# Running totals per testament. They are re-created here so that re-running
# this cell does not double count.
old_counts = Counter()
new_counts = Counter()

for book in books:
    counts = get_counts(book)
    wordcloud = get_wordcloud(counts)

    # Add this book's counts to the testament it belongs to.
    if book in old_books:
        old_counts.update(counts)
    else:
        new_counts.update(counts)

    if plot:
        fig, ax = plt.subplots(figsize=(6, 6))

        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_axis_off()
        ax.set_title(book)
        plt.show()
    if svg:
        # Write UTF-8 explicitly: open() otherwise falls back to the
        # platform's locale encoding, which can mangle or reject non-ASCII
        # letters in the words (SVG/XML readers assume UTF-8 when no
        # encoding is declared).
        with open(f'{book}.svg', 'w', encoding='utf-8') as f:
            f.write(wordcloud.to_svg(embed_font=True))
        print(f'wrote {book}.svg')

Generate word clouds for the Old Testament, the New Testament, and the entire Bible

In [None]:
# Output switches: show each cloud with matplotlib and/or write '<name>.svg'.
svg = True
plot = True

# One cloud per testament plus one for the combined counts. `old_counts` and
# `new_counts` must already have been filled by the per-book loop.
for name, counts in (
        ('old_testament', old_counts),
        ('new_testament', new_counts),
        ('bible', old_counts + new_counts),
    ):

    wordcloud = get_wordcloud(counts, colormap='rainbow')

    if plot:
        fig, ax = plt.subplots(figsize=(10, 10))

        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_axis_off()
        # 'old_testament' -> 'Old testament'
        ax.set_title(name.replace('_', ' ').capitalize())
        plt.show()

    if svg:
        # Write UTF-8 explicitly: open() otherwise falls back to the
        # platform's locale encoding, which can mangle or reject non-ASCII
        # letters in the words (SVG/XML readers assume UTF-8 when no
        # encoding is declared).
        with open(f'{name}.svg', 'w', encoding='utf-8') as f:
            f.write(wordcloud.to_svg(embed_font=True))
        print(f'wrote {name}.svg')