# Transformations and Actions

We will perform some text analysis on the file `hamlet.txt`, which contains the entire text of Shakespeare's play *Hamlet*.

### Cleaning and reformatting the Data

In [1]:
import findspark

In [2]:
# Let findspark locate the Spark installation before `import pyspark` runs in the next cell.
findspark.init()

In [3]:
import pyspark

In [4]:
# getOrCreate() returns the already-running SparkContext when there is one,
# so re-running this cell in a live kernel does not fail with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

In [5]:
# Read hamlet.txt through the SparkContext; each element is one line of the file as a string.
raw_hamlet = sc.textFile("hamlet.txt")

In [6]:
# Preview the first five raw lines: a line id, optionally followed by tab-separated text.
raw_hamlet.take(5)

['hamlet@0\t\tHAMLET',
 'hamlet@8',
 'hamlet@9',
 'hamlet@10\t\tDRAMATIS PERSONAE',
 'hamlet@29']

### The `map` method

In [8]:
# Break every raw line on tab characters so each record becomes a list of fields,
# with the line id first.
split_hamlet = raw_hamlet.map(lambda raw_line: raw_line.split('\t'))

In [9]:
# Preview the first five records after splitting on tabs.
split_hamlet.take(5)

[['hamlet@0', '', 'HAMLET'],
 ['hamlet@8'],
 ['hamlet@9'],
 ['hamlet@10', '', 'DRAMATIS PERSONAE'],
 ['hamlet@29']]

### The `flatMap` method

In [10]:
def hamlet_speaks(line):
    """Yield ``(line_id, "hamlet speaketh!")`` when a field of ``line`` is "HAMLET".

    Written as a generator so it can be handed to ``flatMap``: a matching
    record contributes exactly one pair and any other record contributes
    nothing at all.

    Args:
        line: A record already split into a list of fields, with the line id
            at index 0 (e.g. ``['hamlet@75', 'HAMLET', '...']``).

    Yields:
        tuple: ``(line[0], "hamlet speaketh!")`` if some field equals
        ``"HAMLET"``; otherwise nothing is yielded.
    """
    # ``in`` on a list is an exact-element match, so "HAMLET" must be a whole
    # field rather than a substring of the spoken text. Testing membership
    # before touching line[0] also means an empty record yields nothing
    # instead of raising IndexError.
    if "HAMLET" in line:
        yield line[0], "hamlet speaketh!"

In [11]:
# flatMap accepts any callable, so the generator function is passed directly
# instead of being wrapped in a pass-through lambda.
hamlet_spoken = split_hamlet.flatMap(hamlet_speaks)

In [12]:
# Preview the first ten (line id, message) pairs produced for records containing "HAMLET".
hamlet_spoken.take(10)

[('hamlet@0', 'hamlet speaketh!'),
 ('hamlet@75', 'hamlet speaketh!'),
 ('hamlet@1004', 'hamlet speaketh!'),
 ('hamlet@9144', 'hamlet speaketh!'),
 ('hamlet@12313', 'hamlet speaketh!'),
 ('hamlet@12434', 'hamlet speaketh!'),
 ('hamlet@12760', 'hamlet speaketh!'),
 ('hamlet@12858', 'hamlet speaketh!'),
 ('hamlet@14821', 'hamlet speaketh!'),
 ('hamlet@15261', 'hamlet speaketh!')]

### Filtering using a named function

In [13]:
def filter_hamlet_speaks(line):
    """Return whether one of the fields of ``line`` is exactly "HAMLET".

    Intended as a predicate for ``filter``.

    Args:
        line: A record already split into a list of fields
            (e.g. ``['hamlet@75', 'HAMLET', '...']``).

    Returns:
        bool: ``True`` if ``"HAMLET"`` is an element of ``line``,
        ``False`` otherwise.
    """
    # The membership test already evaluates to a bool, so no if/else is needed.
    return "HAMLET" in line

In [14]:
# The named predicate is passed to filter directly; wrapping it in a lambda adds nothing.
hamlet_spoken_lines = split_hamlet.filter(filter_hamlet_speaks)

In [15]:
# Preview the first ten full records that passed the "HAMLET" filter.
hamlet_spoken_lines.take(10)

[['hamlet@0', '', 'HAMLET'],
 ['hamlet@75', 'HAMLET', 'son to the late, and nephew to the present king.'],
 ['hamlet@1004', '', 'HAMLET'],
 ['hamlet@9144', '', 'HAMLET'],
 ['hamlet@12313',
  'HAMLET',
  '[Aside]  A little more than kin, and less than kind.'],
 ['hamlet@12434', 'HAMLET', "Not so, my lord; I am too much i' the sun."],
 ['hamlet@12760', 'HAMLET', 'Ay, madam, it is common.'],
 ['hamlet@12858', 'HAMLET', "Seems, madam! nay it is; I know not 'seems.'"],
 ['hamlet@14821', 'HAMLET', 'I shall in all my best obey you, madam.'],
 ['hamlet@15261', 'HAMLET', 'O, that this too too solid flesh would melt']]

### Actions

In [16]:
# Count the records that passed the "HAMLET" filter. No placeholder
# initialisation is needed: the name is bound directly to the result.
spoken_count = hamlet_spoken_lines.count()
spoken_count

381

In [17]:
# Materialise every filtered record as a Python list so it can be indexed.
spoken_collect = hamlet_spoken_lines.collect()
# Index 100 is the 101st record (indexing is zero-based), hence the name `spoken_101`.
spoken_101 = spoken_collect[100]
spoken_101

['hamlet@58478', 'HAMLET', 'A goodly one; in which there are many confines,']