In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *

spark = SparkSession.builder\
       .appName("Pyspark")\
       .getOrCreate()

In [2]:
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data) # to create RDD on an iterable

In [14]:
distData.reduce(lambda a, b: a + b) # reduce function will aggregate elements of rdd

15

In [5]:
distFile = sc.textFile("data.txt") # load rdd from textfile
#he textFile method also takes an optional second argument for controlling the number of partitions of the file. 
#By default, Spark creates one partition for each block of the file (blocks being 128MB by default in HDFS),
#but you can also ask for a higher number of partitions by passing a larger value.
#Note that you cannot have fewer partitions than blocks.

In [11]:
sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x)).collect() # map function operates on each element of rdd


[(1, 'a'), (2, 'aa'), (3, 'aaa')]

In [None]:
#By default, each transformed RDD may be recomputed each time you run an action on it. 
#However, you may also persist an RDD in memory using the persist (or cache) method, 
#in which case Spark will keep the elements around on the cluster for much faster access the next time you query it. 
#There is also support for persisting RDDs on disk, or replicated across multiple nodes.

In [7]:
spark.version

'2.3.2'

In [21]:
nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7])
nums.collect()

[1, 2, 3, 4, 5, 6, 7]

In [20]:
filtered1 = nums.filter(lambda x : x % 2 == 1)
filtered1.collect()

[1, 3, 5, 7]

In [22]:
filtered2 = nums.filter(lambda x : x % 2 == 0)
filtered2.collect()

[2, 4, 6]

In [37]:
sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest.txt").take(5)

['this is my test file']

In [14]:
R = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest.txt")
R.take(5)

['k1,v1', 'k1,v2', 'k2,v3', 'k2,v4', 'k3,v7']

In [45]:
S= sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest1.txt")
S.take(5)

['k1,v11', 'k1,v22', 'k1,v33', 'k2,v55', 'k4,v77']

In [43]:
r1=R.map(lambda x:x.split(','))
r1.take(5)

[['k1', 'v1'], ['k1', 'v2'], ['k2', 'v3'], ['k2', 'v4'], ['k3', 'v7']]

In [63]:
#r2 = r1.map(lambda s: (s[0],s[1]))
#r2.take(10)

r2 = r1.flatMap(lambda s: [(s[0],s[1])])
r2.take(10)

[('k1', 'v1'),
 ('k1', 'v2'),
 ('k2', 'v3'),
 ('k2', 'v4'),
 ('k3', 'v7'),
 ('k3', 'v8'),
 ('k3', 'v9')]

In [65]:
s1 = S.map(lambda s: s.split(","))
s1.take(10)

[['k1', 'v11'], ['k1', 'v22'], ['k1', 'v33'], ['k2', 'v55'], ['k4', 'v77']]

In [66]:
s2 = s1.flatMap(lambda s: [(s[0],s[1])])
s2.take(10)

[('k1', 'v11'),
 ('k1', 'v22'),
 ('k1', 'v33'),
 ('k2', 'v55'),
 ('k4', 'v77'),
 ('k5', 'v88')]

In [68]:
RjoinedS = r2.join(s2)
RjoinedS.take(10)

[('k1', ('v1', 'v11')),
 ('k1', ('v1', 'v22')),
 ('k1', ('v1', 'v33')),
 ('k1', ('v2', 'v11')),
 ('k1', ('v2', 'v22')),
 ('k1', ('v2', 'v33')),
 ('k2', ('v3', 'v55')),
 ('k2', ('v4', 'v55'))]

In [69]:
nums = sc.parallelize([1, 2, 3, 4, 5])

In [77]:
sum2 = nums.map(lambda x: x + 2)
sum2.take(10)

[3, 4, 5, 6, 7]

In [72]:
sq = nums.map(lambda x: x * x)
sq.take(10)

[1, 4, 9, 16, 25]

In [90]:
n = sc.parallelize([1, 2, 3, 4])
n.reduce(lambda x, y: x * y)
#n.fold(1,(lambda x, y: x * y))


24

In [4]:
rdd1 = sc.parallelize(["b", "a", "c"])
sorted(rdd1.map(lambda x: (x, 1)).collect())


[('a', 1), ('b', 1), ('c', 1)]

In [8]:
rdd1 = sc.parallelize(["b", "a", "c"])
rdd1.flatMap(lambda x: (x, 1)).collect()

['b', 1, 'a', 1, 'c', 1]

In [18]:
lines = sc.textFile('file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest3.txt')
lines.take(10)

['crazy crazy fox jumped',
 'crazy fox jumped',
 'fox is fast',
 'fox is smart',
 'dog is smart']

In [23]:
lines.flatMap(lambda x: x.split(' ')).take(10)

['crazy',
 'crazy',
 'fox',
 'jumped',
 'crazy',
 'fox',
 'jumped',
 'fox',
 'is',
 'fast']

In [29]:
assigninitial = lines.flatMap(lambda x: x.split(' ')).map(lambda w:(w,1))
assigninitial.take(10)

[('crazy', 1),
 ('crazy', 1),
 ('fox', 1),
 ('jumped', 1),
 ('crazy', 1),
 ('fox', 1),
 ('jumped', 1),
 ('fox', 1),
 ('is', 1),
 ('fast', 1)]

In [44]:
frequencies=assigninitial.reduceByKey(lambda x,y:x+y)
x=frequencies.take(10)
x

[('crazy', 3),
 ('is', 3),
 ('smart', 2),
 ('fox', 4),
 ('jumped', 2),
 ('fast', 1),
 ('dog', 1)]

In [31]:
frequencies.count()

7

In [41]:
frequencies.keys().take(10)

['crazy', 'is', 'smart', 'fox', 'jumped', 'fast', 'dog']

In [38]:
frequencies.zipWithIndex().take(10)

[(('crazy', 3), 0),
 (('is', 3), 1),
 (('smart', 2), 2),
 (('fox', 4), 3),
 (('jumped', 2), 4),
 (('fast', 1), 5),
 (('dog', 1), 6)]

In [None]:
frequencies.first()

In [53]:
frequencies.sortByKey().collect()

[('crazy', 3),
 ('dog', 1),
 ('fast', 1),
 ('fox', 4),
 ('is', 3),
 ('jumped', 2),
 ('smart', 2)]

In [54]:
frequencies.sortByKey(ascending=False).collect() #frequencies.sortByKey(False).collect()

[('smart', 2),
 ('jumped', 2),
 ('is', 3),
 ('fox', 4),
 ('fast', 1),
 ('dog', 1),
 ('crazy', 3)]

In [62]:
tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
sc.parallelize(tmp).sortBy(lambda x: x[0]).collect() #sortBy() needs key func as argument

[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]

In [60]:
tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]
tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])

sc.parallelize(tmp2).sortByKey(True,1,keyfunc=lambda k: k[0].lower()).collect() # sc.parallelize(tmp2).sortByKey(True,1,keyfunc=lambda k: k.lower()).collect()

[('a', 3),
 ('fleece', 7),
 ('had', 2),
 ('little', 4),
 ('lamb', 5),
 ('Mary', 1),
 ('whose', 6),
 ('was', 8),
 ('white', 9)]

In [70]:
numbers = sc.parallelize([1, 2, 3, 4])
sum1 = numbers.fold(0, (lambda x, y: x + y))#alt way fr addition of all elements
print(sum1)
mult1=numbers.fold(1, (lambda x, y: x * y))#way fr multiplication of all elements
mult1

10


24

In [68]:
numbers = sc.parallelize([1, 2, 3, 4])
sum1 = numbers.sum() #numbers.subtract()
sum1

10

In [80]:
d1= [('k1', 1), ('k2', 2), ('k3', 5)]
d2= [('k1', 3), ('k2',4), ('k4', 8)]
rdd1 = sc.parallelize(d1)
rdd2 = sc.parallelize(d2)

In [81]:
rdd3=rdd1.union(rdd2)
rdd3.take(10)

[('k1', 1), ('k2', 2), ('k3', 5), ('k1', 3), ('k2', 4), ('k4', 8)]

In [84]:
rdd3.reduceByKey(lambda x,y: x+y).take(10)


[('k1', 4), ('k3', 5), ('k2', 6), ('k4', 8)]

In [92]:
lines = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest3.txt")
lines.take(10)

['crazy crazy fox jumped',
 'crazy fox jumped',
 'fox is fast',
 'fox is smart',
 'dog is smart']

In [102]:
lines = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest3.txt")

r1 = lines.map(lambda s : s.split(" ")).collect()
r1

[['crazy', 'crazy', 'fox', 'jumped'],
 ['crazy', 'fox', 'jumped'],
 ['fox', 'is', 'fast'],
 ['fox', 'is', 'smart'],
 ['dog', 'is', 'smart']]

In [103]:
r = lines.map(lambda s : s.split(" ")).flatMap(lambda s: [((s[i],s[i+1]),1) for i in range(len(s)-1)]).take(10)                                                                                                        
r

[(('crazy', 'crazy'), 1),
 (('crazy', 'fox'), 1),
 (('fox', 'jumped'), 1),
 (('crazy', 'fox'), 1),
 (('fox', 'jumped'), 1),
 (('fox', 'is'), 1),
 (('is', 'fast'), 1),
 (('fox', 'is'), 1),
 (('is', 'smart'), 1),
 (('dog', 'is'), 1)]

In [105]:
f=lines.map(lambda s : s.split(" ")).flatMap(lambda s: [((s[i],s[i+1]),1) for i in range(len(s)-1)]).reduceByKey(lambda x,y:x+y)
f.take(10)

[(('crazy', 'crazy'), 1),
 (('fox', 'jumped'), 2),
 (('is', 'smart'), 2),
 (('crazy', 'fox'), 2),
 (('fox', 'is'), 2),
 (('is', 'fast'), 1),
 (('dog', 'is'), 1)]

In [106]:
f.count()

7

In [114]:
data2 = ["abc,de", "xyz,deeee,ze", "abc,de,ze,pe", "xyz,bababa"]
rdd4 = sc.parallelize(data2)
rdd5 = rdd4.map(lambda x : (x.split(",")[0],x.split(",")[1])) # rdd5 = rdd4.map(lambda x : tuple(x.split(",")))
rdd5.take(5)

[('abc', 'de'), ('xyz', 'deeee'), ('abc', 'de'), ('xyz', 'bababa')]

In [129]:
nums = [10, 1, 2, 9, 3, 4, 5, 6, 7]
sc.parallelize(nums).top(3) #top (in desc order) n elements
#sc.parallelize(nums).sortBy(lambda x:-x).take(3)# sc.parallelize(nums).sortBy(lambda x:x,False).take(3)
#sc.parallelize(nums).takeOrdered(3, lambda x: -x) #sc.parallelize(nums).takeOrdered(3, key=lambda x: -x)

[10, 9, 7]

In [126]:
nums = [10, 1, 2, 9, 3, 4, 5, 6, 7]
sc.parallelize(nums).takeOrdered(3) #ascending by default
#sc.parallelize(nums).sortBy(lambda x:x).take(3)


[1, 2, 3]

In [131]:
kv = [(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")]
sc.parallelize(kv).takeOrdered(3)

[(1, 'z2'), (2, 'z3'), (3, 'z5')]

In [134]:
sc.parallelize(kv).takeOrdered(3, lambda x:-x[0])

[(10, 'z1'), (9, 'z4'), (7, 'z9')]

In [135]:
sc.parallelize(kv).top(3) #top (in desc order by key) n elements

[(10, 'z1'), (9, 'z4'), (7, 'z9')]

In [136]:
input = [("k1", 1), ("k1", 2), ("k1", 3), ("k1", 4), ("k1", 5), 
             ("k2", 6), ("k2", 7), ("k2", 8), 
             ("k3", 10), ("k3", 12)]
rdd_c=sc.parallelize(input)

In [140]:
rddc=rdd_c.combineByKey((lambda v:(v,1)),(lambda x,v1:(x[0]+v1,x[1]+1)),(lambda x,y:(x[0]+y[0],x[1]+y[1])))
rddc.take(10)

[('k1', (15, 5)), ('k2', (21, 3)), ('k3', (22, 2))]

In [142]:
rddc.mapValues(lambda v : v[0]/float(v[1])).take(10)

[('k1', 3.0), ('k2', 7.0), ('k3', 11.0)]

Standard deviation using combinebykey():

In [145]:
data=[ ('A', 2.0),  ('A', 4.0),  ('A', 9.0),  ('B', 10.0),  ('B', 20.0),  ('Z', 3.0), ('Z', 5.0),  ('Z', 8.0),  ('Z', 12.0)]
rdd = sc.parallelize( data )

sumCount = rdd.combineByKey(lambda value: (value, value*value, 1),lambda x, value: (x[0] + value, x[1] + value*value, x[2] + 1),lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]))

sumCount.take(10)



[('A', (15.0, 101.0, 3)), ('B', (30.0, 500.0, 2)), ('Z', (28.0, 242.0, 4))]

In [148]:
import math
def  stdDev( sumX, sumSquared, n ):
                mean = sumX / n
                stdDeviation = math.sqrt ((sumSquared - n*mean*mean) /n)
                return (mean, stdDeviation)


meanAndStdDev = sumCount.mapValues(lambda x : stdDev(x[0], x[1], x[2]))
meanAndStdDev.take(10)


[('A', (5.0, 2.943920288775949)),
 ('B', (15.0, 5.0)),
 ('Z', (7.0, 3.391164991562634))]

In [162]:
def f(iterator):
    for x in iterator:
        print(x)
sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(f)


In [163]:
def f(x):
    print(x)
sc.parallelize([1, 2, 3, 4, 5]).foreach(f)

In [149]:
data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10]
rdd = sc.parallelize(data, 3)

In [150]:
rdd.getNumPartitions()

3

In [151]:
rdd.collect()

[10, 20, 3, 4, 5, 2, 2, 20, 20, 10]

In [166]:
print(rdd.getStorageLevel())

Serialized 1x Replicated


In [168]:
rdd.glom().collect() # to combine elements in different partitions

[[10, 20, 3], [4, 5, 2], [2, 20, 20, 10]]

Min and max of elements using mapPartitions():

In [194]:
def minmax(iterator):
    i = 0
    for x in iterator:
        if i == 0:
            min1 = x
            max1 = x
            i = 1
        else:
            if x > max1:
                max1 = x
            if x < min1:
                min1 = x
    return (min1, max1)

In [201]:
minmaxlist = rdd.mapPartitions(minmax).collect() #get min and max of each partition
minmaxlist

[3, 20, 2, 5, 2, 20]

In [None]:
min(minmaxlist)  # Min of elements

In [None]:
max(minmaxlist)   # Max of elements