In [1]:
import keras

Using TensorFlow backend.


# RDD 

Spark uses Resilient Distributed Datasets (RDDs) to perform parallel processing across the nodes of a cluster or across the processor cores of a single machine.

In [1]:
from pyspark import SparkContext
# Reuse the active SparkContext if this kernel already has one. A second bare
# SparkContext() call (e.g. when this cell is re-run) raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [3]:
# Small local list of floats used to demonstrate the basic RDD actions below.
pythonList = [
    2.3, 3.4, 4.3,
    2.4, 2.3, 4.0,
]

In [4]:
# Distribute the local list as an RDD split into two partitions.
parPythonData = sc.parallelize(pythonList, numSlices=2)

In [5]:
parPythonData.collect()

[2.3, 3.4, 4.3, 2.4, 2.3, 4.0]

In [6]:
parPythonData.first()

2.3

In [7]:
parPythonData.take(2)

[2.3, 3.4]

In [8]:
parPythonData.getNumPartitions()

2

In [9]:
# Temperature readings in degrees Fahrenheit (converted to Celsius further down).
# The first reading is deliberately left as an int, exactly as in the original data.
tempData = [
    59,
    57.2,
    53.6,
    55.4,
    51.8,
    53.6,
    55.4,
]

In [10]:
# Distribute the temperature readings as an RDD split into two partitions.
parTempData = sc.parallelize(tempData, numSlices=2)

In [11]:
parTempData.collect()

[59, 57.2, 53.6, 55.4, 51.8, 53.6, 55.4]

In [12]:
def fahrenheitToCentigrade(temperature):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Parameters
    ----------
    temperature : int or float
        Temperature in degrees Fahrenheit.

    Returns
    -------
    float
        The equivalent temperature in degrees Celsius, computed as
        ``(temperature - 32) * 5 / 9``.
    """
    # Keep the original evaluation order (subtract, multiply, divide) so the
    # floating-point results are bit-for-bit the same.
    offset_from_freezing = temperature - 32
    return offset_from_freezing * 5 / 9

In [13]:
fahrenheitToCentigrade(59)

15.0

In [14]:
# Apply the Fahrenheit -> Celsius conversion to every element of parTempData.
parCentigradeData = parTempData.map(fahrenheitToCentigrade)

In [15]:
parCentigradeData.collect()

[15.0, 14.000000000000002, 12.0, 13.0, 10.999999999999998, 12.0, 13.0]

In [16]:
def tempMoreThanThirteen(temperature):
    """Return True when ``temperature`` is 13 or higher.

    Despite the function name, the comparison is inclusive: a value of
    exactly 13 passes the check.
    """
    return 13 <= temperature

In [17]:
# Keep only the Celsius readings for which tempMoreThanThirteen returns True
# (i.e. values >= 13).
filteredTemprature = parCentigradeData.filter(tempMoreThanThirteen)

In [18]:
filteredTemprature.collect()

[15.0, 14.000000000000002, 13.0, 13.0]

In [19]:
# Same filter as the named-function version, written inline as a lambda:
# keep readings of 13 degrees Celsius or more.
filteredTemprature = parCentigradeData.filter(lambda celsius: celsius >= 13)

In [20]:
filteredTemprature.collect()

[15.0, 14.000000000000002, 13.0, 13.0]

## Work with text data

In [9]:
# Word count over a text file: one (word, count) pair per distinct word.
rdd = sc.textFile("for_pyspark.txt")

# split() with no argument splits on any run of whitespace, so repeated
# spaces, tabs and blank lines do not yield empty-string "words" the way
# split(" ") does.
words = rdd.flatMap(lambda line: line.split())

# Pair every word with 1, then sum the ones per word.
result = words.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)

In [10]:
result.collect()

[('this', 2), ('is', 2), ('book', 2), ('a', 1), ('about', 1), ('DS', 1)]

In [11]:
result

PythonRDD[28] at collect at <ipython-input-10-e43ab5090625>:1

In [13]:
words.collect()

['this', 'is', 'a', 'book', 'this', 'book', 'is', 'about', 'DS']

In [15]:
words.map(lambda x: (x, 1)).collect()

[('this', 1),
 ('is', 1),
 ('a', 1),
 ('book', 1),
 ('this', 1),
 ('book', 1),
 ('is', 1),
 ('about', 1),
 ('DS', 1)]

## Map Running Time

In [2]:
# NOTE: this rebinds parPythonData (earlier bound to the RDD built from
# pythonList) to a new RDD holding the integers 0..999.
parPythonData = sc.parallelize(range(1000))

In [3]:
# Square every element of the RDD.
a = parPythonData.map(lambda value: value * value)

In [4]:
# Collect the squared values into a local Python list.
b = a.collect()

# Display only the first 100 results rather than the whole list.
b[:100]

[0,
 1,
 4,
 9,
 16,
 25,
 36,
 49,
 64,
 81,
 100,
 121,
 144,
 169,
 196,
 225,
 256,
 289,
 324,
 361,
 400,
 441,
 484,
 529,
 576,
 625,
 676,
 729,
 784,
 841,
 900,
 961,
 1024,
 1089,
 1156,
 1225,
 1296,
 1369,
 1444,
 1521,
 1600,
 1681,
 1764,
 1849,
 1936,
 2025,
 2116,
 2209,
 2304,
 2401,
 2500,
 2601,
 2704,
 2809,
 2916,
 3025,
 3136,
 3249,
 3364,
 3481,
 3600,
 3721,
 3844,
 3969,
 4096,
 4225,
 4356,
 4489,
 4624,
 4761,
 4900,
 5041,
 5184,
 5329,
 5476,
 5625,
 5776,
 5929,
 6084,
 6241,
 6400,
 6561,
 6724,
 6889,
 7056,
 7225,
 7396,
 7569,
 7744,
 7921,
 8100,
 8281,
 8464,
 8649,
 8836,
 9025,
 9216,
 9409,
 9604,
 9801]