In [None]:
# From a terminal, download the NASA web server access log for July 1995:
# wget ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz

In [1]:
from pyspark import SparkConf, SparkContext

# Use getOrCreate so this cell can be re-run safely: constructing a second
# SparkContext in the same kernel raises an error because only one context
# may be active at a time. The conf below is only applied when a new context
# is actually created; an already-running context is returned unchanged.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('weblog_analysis')
)

In [2]:
# Rather than pushing data out to the workers from the driver with
# parallelize, hand Spark the path of a text file and let the workers read
# it themselves; the RDD is built from that file's contents.
logs_rdd = sc.textFile(name='NASA_access_log_Jul95.gz')

In [3]:
# Peek at the first five records to see what a log line looks like.
logs_rdd.take(5)

['199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245',
 'unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985',
 '199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085',
 'burger.letters.com - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/countdown/liftoff.html HTTP/1.0" 304 0',
 '199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0" 200 4179']

In [8]:
# We didn't specify the number of partitions, so let's see what Spark defaults to.
# NOTE(review): presumably 1 here because the input is a single gzip file
# (not splittable) and the master is plain 'local' -- confirm against the Spark docs.
logs_rdd.getNumPartitions()

1

In [4]:
# How many lines are in this file?  Is this an ACTION, or a TRANSFORMATION?
# Hint: count() hands a plain number back to the driver, which is why it can
# be printed immediately below.
num_lines = logs_rdd.count()
print(num_lines)

1891715


In [6]:
# Create an RDD that contains the length (number of characters) of each line.
# The built-in len is passed directly; wrapping it in a lambda adds nothing.
# persist() marks the RDD to be kept once it has been computed, so that later
# cells can reuse it instead of re-reading and re-mapping the file. It returns
# the RDD itself, which lets the call be chained onto the map. Marking is lazy:
# nothing is stored until an action actually evaluates the data.
line_lengths_rdd = logs_rdd.map(len).persist()
line_lengths_rdd.take(5)

[86, 97, 116, 104, 119]

## Exercise

Figure out the total number of characters in the file (excluding newlines)

In [7]:
# Fold all the per-line lengths into a single total by repeatedly adding
# pairs of values together; the result is the character count for the file.
total_chars = line_lengths_rdd.reduce(lambda running, length: running + length)
total_chars

203350652