In [1]:
import findspark

In [2]:
# findspark locates the local Spark installation and adds its pyspark to
# sys.path, so this runs before `import pyspark` (needed when pyspark is not
# already importable on its own).
findspark.init()

In [3]:
import pyspark

In [4]:
# Reuse the already-running SparkContext if there is one; otherwise start a
# new one. This keeps the cell safe to re-run.
sc = pyspark.SparkContext.getOrCreate()

In [5]:
# Load the play as an RDD of strings, one element per line of the file.
raw_data = sc.textFile('hamlet.txt')

In [6]:
# Break every raw line into its tab-separated fields.
split_data = raw_data.map(lambda raw_line: raw_line.split('\t'))

In [7]:
# Preview the first 10 split records to check the field layout.
split_data.take(10)

[['hamlet@0', '', 'HAMLET'],
 ['hamlet@8'],
 ['hamlet@9'],
 ['hamlet@10', '', 'DRAMATIS PERSONAE'],
 ['hamlet@29'],
 ['hamlet@30'],
 ['hamlet@31', 'CLAUDIUS', 'king of Denmark. (KING CLAUDIUS:)'],
 ['hamlet@74'],
 ['hamlet@75', 'HAMLET', 'son to the late, and nephew to the present king.'],
 ['hamlet@131']]

Convert each line into its numeric line id followed by its text fields

In [8]:
def format_ids(line):
    """Replace the leading ``name@offset`` field of a split line with its offset.

    Parameters
    ----------
    line : list of str
        Tab-split fields of one input line. The first field is expected to
        look like ``'hamlet@31'``.

    Returns
    -------
    list of str
        A new list whose first element is the part of ``line[0]`` between the
        first and second ``'@'`` (``'31'`` for ``'hamlet@31'``), followed by
        the remaining fields unchanged.

        If the first field contains no ``'@'`` it is kept as-is, and an empty
        ``line`` yields an empty list, so a blank or malformed record does not
        raise ``IndexError`` inside the Spark job.
    """
    if not line:
        return []

    id_parts = line[0].split('@')
    # Fall back to the raw field when there is no '@' to split on instead of
    # indexing past the end of a one-element list.
    line_id = id_parts[1] if len(id_parts) > 1 else id_parts[0]

    # Slicing already yields an empty sequence for id-only lines, so no
    # explicit length check is needed.
    return [line_id] + list(line[1:])

In [9]:
# format_ids takes a single record, so it can be handed to map directly.
hamlet_with_ids = split_data.map(format_ids)
hamlet_with_ids.take(5)

[['0', '', 'HAMLET'], ['8'], ['9'], ['10', '', 'DRAMATIS PERSONAE'], ['29']]

Drop id-only lines, then remove empty fields and pipe characters from the remaining lines

In [10]:
# Keep only the records that carry something besides the line id.
real_text = hamlet_with_ids.filter(lambda fields: len(fields) > 1)

In [11]:
# Discard the empty-string fields from each record.
hamlet_text_only = real_text.map(
    lambda fields: [field for field in fields if field != ""]
)
hamlet_text_only.take(5)

[['0', 'HAMLET'],
 ['10', 'DRAMATIS PERSONAE'],
 ['31', 'CLAUDIUS', 'king of Denmark. (KING CLAUDIUS:)'],
 ['75', 'HAMLET', 'son to the late, and nephew to the present king.'],
 ['132', 'POLONIUS', 'lord chamberlain. (LORD POLONIUS:)']]

In [12]:
def clean_pipes(line):
    """Strip pipe characters from the fields of a line.

    Fields that are exactly ``'|'`` are discarded. Every other field is kept,
    with any ``'|'`` characters inside it removed.

    Parameters
    ----------
    line : iterable of str
        Fields of one record.

    Returns
    -------
    list of str
        The cleaned fields, in their original order.
    """
    # str.replace leaves a field untouched when it has no pipe, so a single
    # expression covers both the "contains a pipe" and "no pipe" cases.
    return [field.replace('|', '') for field in line if field != '|']

In [13]:
# clean_pipes takes a single record, so it can be handed to map directly.
clean_hamlet = hamlet_text_only.map(clean_pipes)
clean_hamlet.take(5)

[['0', 'HAMLET'],
 ['10', 'DRAMATIS PERSONAE'],
 ['31', 'CLAUDIUS', 'king of Denmark. (KING CLAUDIUS:)'],
 ['75', 'HAMLET', 'son to the late, and nephew to the present king.'],
 ['132', 'POLONIUS', 'lord chamberlain. (LORD POLONIUS:)']]