# Goal:

Find the average word length from a Shakespeare text after removing stopwords.  This example is taken from the book <u>Teach Yourself Apache Spark in 24 Hours</u> by Jeffrey Aven (Chapter 12)

### Setup

In [1]:
import pandas as pd
import numpy as np
from pyspark import SparkContext

sc = SparkContext()

stopwords = pd.read_csv("https://s3.amazonaws.com/sty-spark/stopwords/stop-word-list.csv", header=None).values
stopwords = sc.broadcast(stopwords)

### Accumulators

In [2]:
word_count = sc.accumulator(0)
total_len = sc.accumulator(0.0)

def add_values(word, word_count, total_len):
    word_count += 1
    total_len  += len(word)

### Load words

In [3]:
words = (sc.textFile("data/shakespeare.txt")
         .flatMap(lambda line: line.split())
         .map(lambda x: x.lower())
         .filter(lambda x: x not in stopwords.value))

### Find average word length

In [4]:
words.foreach(lambda x: add_values(x, word_count, total_len))
avgwordlen = total_len.value / word_count.value

print(f"Total number of words: {str(word_count.value)}")
print(f"Average word length: {str(avgwordlen)}")

Total number of words: 966958
Average word length: 3.608722405730135


### Quit Context

In [5]:
sc.stop()