In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext

import py4j
# Configuration for a local Spark application running on two worker threads.
conf = SparkConf().setAppName("SparkJupyterIntroRdd").setMaster("local[2]")
# getOrCreate() reuses a SparkContext that is already active in this kernel,
# so re-executing this cell does not fail with "Cannot run multiple
# SparkContexts at once". NOTE: when a context already exists it is returned
# unchanged and `conf` is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)

try:
    # Try to access HiveConf through the JVM gateway; this raises when Hive
    # is not on the classpath, in which case we fall back to SQLContext.
    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    sqlContext = HiveContext(sc)
except (py4j.protocol.Py4JError, TypeError):
    # Both failure modes mean the same thing here: Hive is unavailable.
    sqlContext = SQLContext(sc)
# Last expression of the cell: display the context that was obtained.
sc

<pyspark.context.SparkContext at 0x1130cbfd0>

In [2]:
# Distribute the integers 10 down to 1 and keep the RDD cached, since every
# following cell reuses it. range() (rather than the Python-2-only xrange())
# keeps this cell runnable on both Python 2 and Python 3.
rdd = sc.parallelize(range(10, 0, -1)).cache()

In [3]:
# Action: count the elements held by the RDD.
rdd.count()

10

In [4]:
# Action: bring every element back to the driver as a Python list.
rdd.collect()

[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

In [5]:
# Random sample without replacement; each element is kept with probability
# 0.2, so the result differs from run to run (no seed is supplied).
rdd.sample(withReplacement=False, fraction=0.2).collect()

[4, 3]

In [6]:
# Sort using each element as its own key, smallest first.
rdd.sortBy(keyfunc=lambda value: value, ascending=True).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [7]:
# Keep only the elements strictly greater than 5.
rdd.filter(lambda value: value > 5).collect()

[10, 9, 8, 7, 6]

In [8]:
# Transform every element by doubling it.
rdd.map(lambda value: value * 2).collect()

[20, 18, 16, 14, 12, 10, 8, 6, 4, 2]

In [9]:
from operator import add
# Action: fold all elements together with operator.add, i.e. sum the RDD.
rdd.reduce(add)

55

In [10]:
from random import random
def f(_):
    """Draw one random point in the square [-1, 1) x [-1, 1) and test it.

    The argument is ignored; it only exists so the function can be used as
    the callback of ``map`` over a range of sample indices.

    Returns:
        1 if the point lies strictly inside the unit circle, otherwise 0.
    """
    # The horizontal coordinate is drawn first, then the vertical one.
    x_coord = 2 * random() - 1
    y_coord = 2 * random() - 1
    inside_circle = x_coord ** 2 + y_coord ** 2 < 1
    return int(inside_circle)
# Number of random points used for the Monte Carlo estimate.
n = 1000000
# Evaluate f on exactly n sample indices (1..n inclusive) so that the number
# of samples matches the divisor below; an exclusive upper bound of n would
# draw only n - 1 points and bias the estimate low. range() is used instead
# of xrange() so the cell also runs on Python 3.
count = sc.parallelize(range(1, n + 1)).map(f).reduce(add)
# The fraction of points falling inside the unit circle approximates pi / 4.
print("Pi is roughly %f" % (4.0 * count / n))

Pi is roughly 3.140120


In [11]:
# Stop the SparkContext now that the notebook is finished with it.
sc.stop()