In [None]:
# Word mapping with python WordCloud and pdfminer
# ART350
# April 2021
#----------------------------------
# Install the pdfminer package; it provides the PDF parsing classes imported in a later cell.
!pip install pdfminer

In [16]:
import os, sys
from io import BytesIO
import requests
import numpy
import pandas
import PIL
from PIL import Image
import matplotlib.pyplot as plt
import nltk
from collections import Counter
from wordcloud import WordCloud, STOPWORDS

In [None]:
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# `output_string` is the in-memory text buffer used by the PDF extraction cell:
# it is handed to TextConverter and the extracted text is read back with getvalue().
output_string = StringIO()

In [23]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files there by path (the data folder defined in the next cell lives under it).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
# Project root on the mounted Drive, and the data folder inside it.
# Both end with a slash so file names can be appended directly.
start = '/content/drive/MyDrive/ART/data_visualization/'
datapath = f"{start}data/"

In [None]:
!wget https://antilogicalism.com/wp-content/uploads/2018/04/malcom-x.pdf -O 'malcom.pdf'

with open('malcom.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

text = output_string.getvalue()

In [None]:
# Show only the beginning of the extracted text as a sanity check; printing
# the entire document floods the cell output and bloats the saved notebook.
preview_chars = 2000
print(text[:preview_chars])

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# load the 'punkt' resource, newer ones look for 'punkt_tab' instead, so both
# are requested. A resource unknown to the installed NLTK version only
# produces a download message and does not stop the cell.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Split the extracted text into a flat list of word and punctuation tokens.
token_text = word_tokenize(text)

# Show the token count and a short sample rather than the entire list.
preview_tokens = 50
print(len(token_text), 'tokens; first', preview_tokens, 'shown:')
print(token_text[:preview_tokens])

In [14]:
# Keep only the tokens worth counting: drop stopwords, numbers and very short tokens
def filter_words(x):
    """Return True when token `x` should be kept.

    A token is kept only if all of the following hold:
      * its lowercase form is not in STOPWORDS,
      * it is not numeric (per str.isnumeric),
      * it is at least 3 characters long.
    """
    is_stopword = x.lower() in STOPWORDS
    is_number = x.isnumeric()
    long_enough = len(x) >= 3
    # Equivalent to: (not stopword) and (not number) and long enough
    return not (is_stopword or is_number) and long_enough

In [None]:
# Tally how many times each token occurs
counts = Counter(token_text)

# Build a (word, count) table: the tokens start out as the index, get the
# name 'word', and are then moved into a regular column so they can be filtered
word_table = (
    pandas.DataFrame.from_dict(counts, orient='index', columns=['count'])
    .rename_axis('word')
    .reset_index()
)

# Keep only the rows whose word passes filter_words, then index the result by word
keep_mask = word_table['word'].apply(filter_words)
counts_df = word_table[keep_mask].set_index('word')

# Print the 20 most frequent remaining words, highest count first
print(counts_df.sort_values('count', ascending=False).head(20))

In [None]:
# Alternate image access approach: use a custom-made background image ----------
# 1. Copy the image (.png) into the data folder on your Google Drive (the drive mounted above)
# 2. Load the file into a variable, as done below
# 3. Skip the code block just after this one

img_path = f"{datapath}T_silhouette.png"
img = Image.open(img_path)

In [27]:
# Select a background image found on the web. List the URL ---------------------
url = 'https://i2.wp.com/saccityexpress.com/wp-content/uploads/2014/02/MLK.jpg?w=1900&ssl=1'

# Download the image bytes with requests. The timeout keeps the cell from
# hanging on an unresponsive server, and raise_for_status() stops with a clear
# HTTP error instead of handing an error page to Image.open().
response = requests.get(url, timeout=30)
response.raise_for_status()

# Wrap the downloaded bytes in BytesIO so PIL can open them like a file
img = Image.open(BytesIO(response.content))

In [28]:
# Make mask, define the dimensions (in pixels)
xdim = 1000
ydim = 1000

# Resize the image to xdim x ydim. Image.ANTIALIAS was removed in Pillow 10;
# Image.LANCZOS is the same resampling filter under its current name and is
# also available in older Pillow releases.
img = img.resize((xdim, ydim), Image.LANCZOS)

In [None]:
# Convert the resized image into a numpy array; WordCloud receives it as the mask
wave_mask = numpy.array(img)

# Configure the cloud, then build it from all tokens joined into one
# space-separated string
wordcloud = WordCloud(
    width=xdim,
    height=ydim,
    mask=wave_mask,
    random_state=80,
    contour_width=1,
    contour_color='orange',
)
wordcloud = wordcloud.generate(" ".join(token_text))

# Create the matplotlib figure used for the preview at the end of the cell
fig = plt.figure(figsize=(8, 8))

# Render the cloud to a PIL image, show it, and save it as a JPEG in the data folder
quality_val = 100
image = wordcloud.to_image()
image.show()
image.save(f"{datapath}wc_mask_malcom.jpg", 'JPEG', quality=quality_val)

# Preview the result inline with the axes hidden
plt.axis('off')
plt.imshow(wordcloud)
plt.show()