### Packages

In [1]:
# Must run before the pyspark imports in the next cell.
# NOTE(review): findspark.init() is expected to make the local Spark install importable - confirm for this environment
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType

### SparkContext

In [3]:
# Create (or reuse) the SparkContext: the driver-side handle through which work is sent to Spark.
# master="local[4]" runs Spark on this machine with 4 worker threads.
# getOrCreate keeps this cell re-runnable: only one SparkContext can be active at a time,
# so calling the SparkContext(...) constructor a second time would raise.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[4]"))

In [4]:
print(sc)
# To shut this context down, call sc.stop(). Only one SparkContext can exist at a time,
# so stop the current one before starting a new one.

<SparkContext master=local[4] appName=pyspark-shell>


In [5]:
# Spark version reported by the active context
sc.version

'2.2.1'

### Download data

In [6]:
# Remote source of the text
url = "https://mas-dse-open.s3.amazonaws.com/Moby-Dick.txt"
# Local target: the directory, and the file inside it named after the last URL segment
data_dir = './data/'
data_file = data_dir + url.rsplit('/', 1)[-1]

In [7]:
# option 2
from os.path import split,join,exists

if exists(data_file):
    print(data_file,"exists")
else:
    command="wget %s -P %s "%(url, data_dir)
    print(command)
    !$command
!ls -lh $data_file

./data/Moby-Dick.txt exists
-rw-rw-r-- 1 sb sb 1.2M Apr  6  2016 ./data/Moby-Dick.txt


### Word count

In [8]:
# Open the downloaded text file as an RDD and check the resulting type
txt_rdd = sc.textFile(data_file)
type(txt_rdd)

pyspark.rdd.RDD

In [9]:
%%time

# Split text to words, remove empty words, count using reducebykey
words = txt_rdd.flatMap(lambda txt: txt.split(" "))\
                .filter(lambda word: word!='')
word_count = words.map(lambda word: (word,1))\
                    .reduceByKey(lambda cnt1, cnt2: cnt1+cnt2)

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 63.5 ms


In [10]:
# Show the debug string (lineage) of `words`, one entry per line
words.toDebugString().decode().split('\n')

['(2) PythonRDD[6] at RDD at PythonRDD.scala:48 []',
 ' |  ./data/Moby-Dick.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []',
 ' |  ./data/Moby-Dick.txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []']

In [11]:
# Show the debug string (lineage) of `word_count`, one entry per line
word_count.toDebugString().decode().split('\n')

['(2) PythonRDD[7] at RDD at PythonRDD.scala:48 []',
 ' |  MapPartitionsRDD[5] at mapPartitions at PythonRDD.scala:436 []',
 ' |  ShuffledRDD[4] at partitionBy at NativeMethodAccessorImpl.java:0 []',
 ' +-(2) PairwiseRDD[3] at reduceByKey at <timed exec>:4 []',
 '    |  PythonRDD[2] at reduceByKey at <timed exec>:4 []',
 '    |  ./data/Moby-Dick.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []',
 '    |  ./data/Moby-Dick.txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []']

In [12]:
%%time

# Number of distinct words (word_count holds one (word, count) entry per word)
word_count.count()

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 767 ms


33781

In [13]:
%%time

# Total number of words
word_count.map(lambda wordcnt: wordcnt[1])\
            .reduce(lambda cnt1, cnt2: cnt1+cnt2)

CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 54.9 ms


215133

In [14]:
%%time

# Total number of words - again to check if anything is cached
word_count.map(lambda wordcnt: wordcnt[1])\
            .reduce(lambda cnt1, cnt2: cnt1+cnt2)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 52.8 ms


215133

In [15]:
%%time

# Total number of words - again with explicit cache
word_count.cache().map(lambda wordcnt: wordcnt[1])\
            .reduce(lambda cnt1, cnt2: cnt1+cnt2)

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 102 ms


215133

In [16]:
%%time

# Total number of words - again with explicit cache used
word_count.map(lambda wordcnt: wordcnt[1])\
            .reduce(lambda cnt1, cnt2: cnt1+cnt2)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 46 ms


215133

### Find common words

In [17]:
%%time

# Option 1: Python way: Collect and compute at head node

txt = word_count.collect()
print("# of words:",len(txt))
txt.sort(key = lambda word_cnt: word_cnt[1])
prnt = "\n".join(['%s: %d'%wcpair for wcpair in txt[-5:]])
print("common words:\n" + prnt)

# of words: 33781
common words:
to: 4510
a: 4533
and: 5951
of: 6587
the: 13766
CPU times: user 28 ms, sys: 4 ms, total: 32 ms
Wall time: 56.8 ms


In [18]:
%%time

# Option 2: Spark way: Reverse word_count to count_word and sortbykey

# Sort
count_word = word_count.map(lambda wordcnt: (wordcnt[1],wordcnt[0]))
count_word_sorted = count_word.sortByKey(ascending=False)

# Execution plan
[print(x) for x in count_word_sorted.toDebugString().decode().split('\n')]

# Execute
txt = count_word_sorted.take(10)
prnt = "\n".join(['%d: %s'%wcpair for wcpair in txt[:5]])
print("common words:\n" + prnt)

(2) PythonRDD[19] at RDD at PythonRDD.scala:48 []
 |  MapPartitionsRDD[18] at mapPartitions at PythonRDD.scala:436 []
 |  ShuffledRDD[17] at partitionBy at NativeMethodAccessorImpl.java:0 []
 +-(2) PairwiseRDD[16] at sortByKey at <timed exec>:6 []
    |  PythonRDD[15] at sortByKey at <timed exec>:6 []
    |  PythonRDD[7] at RDD at PythonRDD.scala:48 []
    |      CachedPartitions: 2; MemorySize: 630.8 KB; ExternalBlockStoreSize: 0.0 B; DiskSize: 0.0 B
    |  MapPartitionsRDD[5] at mapPartitions at PythonRDD.scala:436 []
    |  ShuffledRDD[4] at partitionBy at NativeMethodAccessorImpl.java:0 []
    +-(2) PairwiseRDD[3] at reduceByKey at <timed exec>:4 []
       |  PythonRDD[2] at reduceByKey at <timed exec>:4 []
       |  ./data/Moby-Dick.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []
       |  ./data/Moby-Dick.txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []
common words:
13766: the
6587: of
5951: and
4533: a
4510: to
CPU times: user 16 ms, 