In [1]:
# Locate the Spark installation so that `pyspark` becomes importable.
# A non-empty SPARK_HOME environment variable takes precedence; otherwise
# fall back to the original local install path.
import os
import findspark
findspark.init(os.environ.get('SPARK_HOME') or '/Users/Steve/spark-2.0.2-bin-hadoop2.6')

# Import PySpark (must come after findspark.init) and obtain the SparkContext.
# getOrCreate() reuses a context that is already running, so re-running this
# cell does not fail because a SparkContext already exists.
import pyspark
sc = pyspark.SparkContext.getOrCreate()

In [2]:

# Load the text file as an RDD of lines and preview its first five elements.
raw_data = sc.textFile(name="hamlet.txt")
raw_data.take(num=5)


['\tHAMLET', '', '', '\tDRAMATIS PERSONAE', '']

In [3]:
# The preview above shows tab-delimited lines, so map every line to the
# list of its tab-separated fields.

splice_data = raw_data.map(lambda text_line: text_line.split('\t'))
splice_data.take(num=5)

[['', 'HAMLET'], [''], [''], ['', 'DRAMATIS PERSONAE'], ['']]

In [4]:
#Goal: Find lines where Hamlet speaks

def hamlet_speaks_filter(row):
    """Yield a marker pair when the row's first field contains 'HAMLET'.

    This is a generator: a matching row produces exactly one element and a
    non-matching row produces none.

    Args:
        row: Indexable sequence of fields for one line; only ``row[0]`` is
            inspected.

    Yields:
        ``(row[0], 'Hamlet Speaks!')`` when ``'HAMLET'`` occurs in ``row[0]``.
    """
    # Named `speaker` rather than `id` so the builtin `id` is not shadowed.
    speaker = row[0]
    if 'HAMLET' in speaker:
        yield speaker, 'Hamlet Speaks!'

In [5]:
# flatMap flattens what the generator yields, so rows that yield nothing
# simply disappear from the result.
hamlet_lines = splice_data.flatMap(hamlet_speaks_filter)
hamlet_lines.take(num=10)

[('HAMLET', 'Hamlet Speaks!'),
 ('HAMLET', 'Hamlet Speaks!'),
 ('HAMLET', 'Hamlet Speaks!'),
 ('HAMLET', 'Hamlet Speaks!'),
 ('HAMLET', 'Hamlet Speaks!'),
 ('HAMLET', 'Hamlet Speaks!'),
 ('HAMLET', 'Hamlet Speaks!'),
 ('HAMLET', 'Hamlet Speaks!'),
 ('HAMLET', 'Hamlet Speaks!'),
 ('HAMLET', 'Hamlet Speaks!')]

In [7]:
#Goal: Use filter to return lines of Hamlet dialogue

def hamlet_speaks_filter2(x):
    """Predicate: report whether the first field of ``x`` contains 'HAMLET'.

    Args:
        x: Indexable sequence of fields for one line; only ``x[0]`` is read.

    Returns:
        True when ``'HAMLET'`` occurs in ``x[0]``, otherwise False.
    """
    return 'HAMLET' in x[0]

# Keep the full rows (speaker and text) whose first field passes the predicate.
hamlet_lines2 = splice_data.filter(hamlet_speaks_filter2)
hamlet_lines2.take(num=5)

[['HAMLET', 'son to the late, and nephew to the present king.'],
 ['HAMLET', '[Aside]  A little more than kin, and less than kind.'],
 ['HAMLET', "Not so, my lord; I am too much i' the sun."],
 ['HAMLET', 'Ay, madam, it is common.'],
 ['HAMLET', "Seems, madam! nay it is; I know not 'seems.'"]]

In [16]:
# Drop the first matching row, which is a header rather than dialogue.
# The comparison is by text (field 1), so any row whose text equals the
# header's text is removed as well.

hamlet_first_line = hamlet_lines2.take(1)
hamlet_lines3 = hamlet_lines2.filter(
    lambda entry: not (entry[1] == hamlet_first_line[0][1])
)
hamlet_lines3.take(num=5)

[['HAMLET', '[Aside]  A little more than kin, and less than kind.'],
 ['HAMLET', "Not so, my lord; I am too much i' the sun."],
 ['HAMLET', 'Ay, madam, it is common.'],
 ['HAMLET', "Seems, madam! nay it is; I know not 'seems.'"],
 ['HAMLET', 'I shall in all my best obey you, madam.']]

In [17]:
# Count the lines Hamlet speaks and show his 101st line.
# The RDD is collected once and the count is taken from the collected list,
# so the uncached filter pipeline is evaluated a single time instead of once
# for count() and again for collect().
hamlet_dialogue = hamlet_lines3.collect()
num_hamlet_lines = len(hamlet_dialogue)
print('Number of Hamlet Lines = ', num_hamlet_lines)
# Zero-based index 100 is the 101st row; element [1] of a row is the spoken text.
hamlet_line_101 = hamlet_dialogue[100][1]
print('Hamlet\'s 101\'st line is: ', hamlet_line_101)
        

Number of Hamlet Lines =  265
Hamlet's 101'st line is:  [Aside]  Nay, then, I have an eye of you.--If you


In [18]:
# Stop the SparkContext now that the analysis is finished.
sc.stop()
