In [101]:
import builtins

import pyspark
from pyspark.sql import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *

# Create (or reuse) the SparkSession that backs this notebook.
spark = (
    SparkSession.builder
    .appName("Pyspark")
    .getOrCreate()
)

# Every RDD example below goes through `sc`. It is only predefined when the
# kernel is started via the pyspark shell, so bind it explicitly here to keep
# the notebook runnable from a fresh kernel.
sc = spark.sparkContext

In [3]:
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)  # parallelize() distributes a local iterable as an RDD

In [4]:
distData.reduce(lambda a, b: a + b)  # action: reduce() folds all RDD elements into a single value (here their sum)

15

In [5]:
distFile = sc.textFile("data.txt")  # load an RDD from a text file
# The textFile method also takes an optional second argument for controlling the number of partitions of the file.
# By default, Spark creates one partition for each block of the file (blocks being 128MB by default in HDFS),
# but you can also ask for a higher number of partitions by passing a larger value.
# Note that you cannot have fewer partitions than blocks.

In [6]:
sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x)).collect()  # map() applies the function to every element of the RDD


[(1, 'a'), (2, 'aa'), (3, 'aaa')]

In [7]:
#By default, each transformed RDD may be recomputed each time you run an action on it. 
#However, you may also persist an RDD in memory using the persist (or cache) method, 
#in which case Spark will keep the elements around on the cluster for much faster access the next time you query it. 
#There is also support for persisting RDDs on disk, or replicated across multiple nodes.

In [8]:
spark.version  # version string of the Spark runtime behind this session

'2.3.2'

In [9]:
nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7])
nums.collect()  # action: bring every element back to the driver as a list

[1, 2, 3, 4, 5, 6, 7]

In [10]:
filtered1 = nums.filter(lambda x : x % 2 == 1)  # transformation: keep only the odd numbers
filtered1.collect()

[1, 3, 5, 7]

In [11]:
filtered2 = nums.filter(lambda x : x % 2 == 0)  # transformation: keep only the even numbers
filtered2.collect()

[2, 4, 6]

In [3]:
sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest.txt").take(5)  # action: first five lines of the local file

['k1,v1', 'k1,v2', 'k2,v3', 'k2,v4', 'k3,v7']

In [4]:
R = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest.txt")  # RDD with one "key,value" string per line
R.take(5)

['k1,v1', 'k1,v2', 'k2,v3', 'k2,v4', 'k3,v7']

In [5]:
S= sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest1.txt")  # second RDD of "key,value" lines, joined with R further down
S.take(5)

['k1,v11', 'k1,v22', 'k1,v33', 'k2,v55', 'k4,v77']

In [6]:
r1=R.map(lambda x:x.split(','))  # map(): each line becomes one [key, value] list
r1.take(5)

[['k1', 'v1'], ['k1', 'v2'], ['k2', 'v3'], ['k2', 'v4'], ['k3', 'v7']]

In [10]:
r3=R.flatMap(lambda x:x.split(','))  # flatMap(): the split pieces of all lines are flattened into one RDD of strings
r3.take(11)

['k1', 'v1', 'k1', 'v2', 'k2', 'v3', 'k2', 'v4', 'k3', 'v7', 'k3']

In [8]:
# r2 is joined with s2 further down, and join() works on (key, value) pairs,
# so r2 has to be built with map(): one tuple per input row.
r2 = r1.map(lambda s: (s[0], s[1]))

# For comparison: flatMap() with the same function flattens every returned
# tuple, producing a flat sequence of strings instead of pairs.
r1.flatMap(lambda s: (s[0], s[1])).take(10)

['k1', 'v1', 'k1', 'v2', 'k2', 'v3', 'k2', 'v4', 'k3', 'v7']

In [17]:
s1 = S.map(lambda s: s.split(","))  # each line becomes one [key, value] list
s1.take(10)

[['k1', 'v11'],
 ['k1', 'v22'],
 ['k1', 'v33'],
 ['k2', 'v55'],
 ['k4', 'v77'],
 ['k5', 'v88']]

In [18]:
# Turn every [key, value] list into a (key, value) tuple so the RDD can be joined.
s2 = s1.map(lambda s: (s[0], s[1]))
s2.take(10)

[('k1', 'v11'),
 ('k1', 'v22'),
 ('k1', 'v33'),
 ('k2', 'v55'),
 ('k4', 'v77'),
 ('k5', 'v88')]

In [19]:
RjoinedS = r2.join(s2)  # inner join on key: one (key, (left_value, right_value)) element per matching pair
RjoinedS.take(10)

[('k1', ('v1', 'v11')),
 ('k1', ('v1', 'v22')),
 ('k1', ('v1', 'v33')),
 ('k1', ('v2', 'v11')),
 ('k1', ('v2', 'v22')),
 ('k1', ('v2', 'v33')),
 ('k2', ('v3', 'v55')),
 ('k2', ('v4', 'v55'))]

In [20]:
nums = sc.parallelize([1, 2, 3, 4, 5])  # rebinds nums to a new five-element RDD

In [21]:
sum2 = nums.map(lambda x: x + 2)  # add 2 to every element
sum2.take(10)

[3, 4, 5, 6, 7]

In [22]:
sq = nums.map(lambda x: x * x)  # square every element
sq.take(10)

[1, 4, 9, 16, 25]

In [23]:
n = sc.parallelize([1, 2, 3, 4])
n.reduce(lambda x, y: x * y)  # action: product of all elements
# Equivalent with an explicit neutral element: n.fold(1,(lambda x, y: x * y))


24

In [24]:
rdd1 = sc.parallelize(["b", "a", "c"])
sorted(rdd1.map(lambda x: (x, 1)).collect())  # map() yields exactly one (element, 1) tuple per input element


[('a', 1), ('b', 1), ('c', 1)]

In [25]:
rdd1 = sc.parallelize(["b", "a", "c"])
rdd1.flatMap(lambda x: (x, 1)).collect()  # flatMap() flattens each returned tuple into separate elements

['b', 1, 'a', 1, 'c', 1]

In [26]:
lines = sc.textFile('file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest3.txt')  # input text for the word-count example
lines.take(10)

['crazy crazy fox jumped',
 'crazy fox jumped',
 'fox is fast',
 'fox is smart',
 'dog is smart']

In [27]:
lines.flatMap(lambda x: x.split(' ')).take(10)  # one element per word, across all lines

['crazy',
 'crazy',
 'fox',
 'jumped',
 'crazy',
 'fox',
 'jumped',
 'fox',
 'is',
 'fast']

In [28]:
# Tokenise every line on single spaces and pair each word with an initial count of 1.
assigninitial = lines.flatMap(lambda x: [(w, 1) for w in x.split(' ')])
assigninitial.take(10)

[('crazy', 1),
 ('crazy', 1),
 ('fox', 1),
 ('jumped', 1),
 ('crazy', 1),
 ('fox', 1),
 ('jumped', 1),
 ('fox', 1),
 ('is', 1),
 ('fast', 1)]

In [31]:
frequencies=assigninitial.reduceByKey(lambda x,y:x+y)  # transformation: add up the 1s of each word -> (word, count)
x=frequencies.take(10)  # take() is the action that actually triggers the computation
x

[('crazy', 3),
 ('is', 3),
 ('smart', 2),
 ('fox', 4),
 ('jumped', 2),
 ('fast', 1),
 ('dog', 1)]

In [32]:
frequencies.count()  # action: number of elements, i.e. number of distinct words

7

In [122]:
frequencies.keys()  # transformation (lazy): returns a new RDD holding only the keys; nothing is displayed for this line

# to view
frequencies.keys().take(12)

['crazy', 'is', 'smart', 'fox', 'jumped', 'fast', 'dog']

In [123]:
frequencies.values()  # transformation (lazy): returns a new RDD holding only the values; nothing is displayed for this line

# to view
frequencies.values().take(12)

[3, 3, 2, 4, 2, 1, 1]

In [38]:
frequencies.zipWithIndex().take(10)  # pairs every element with its index: (element, index)

[(('crazy', 3), 0),
 (('is', 3), 1),
 (('smart', 2), 2),
 (('fox', 4), 3),
 (('jumped', 2), 4),
 (('fast', 1), 5),
 (('dog', 1), 6)]

In [35]:
frequencies.first()  # action: first element of the RDD

('crazy', 3)

In [36]:
frequencies.sortByKey().collect()  # sort by key (the word), ascending by default

[('crazy', 3),
 ('dog', 1),
 ('fast', 1),
 ('fox', 4),
 ('is', 3),
 ('jumped', 2),
 ('smart', 2)]

In [37]:
frequencies.sortByKey(ascending=False).collect()  # descending by key; positional form: frequencies.sortByKey(False).collect()

[('smart', 2),
 ('jumped', 2),
 ('is', 3),
 ('fox', 4),
 ('fast', 1),
 ('dog', 1),
 ('crazy', 3)]

In [38]:
tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
sc.parallelize(tmp).sortBy(lambda x: x[0]).collect()  # sortBy() takes a key function (here: the tuple's first item); ascending by default

[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]

In [39]:
tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]
tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])

# sortByKey(ascending, numPartitions, keyfunc): keyfunc receives the key itself (a string here),
# so k[0].lower() compares only the lower-cased FIRST CHARACTER of each key. That is why the
# output lists 'little' before 'lamb' and 'whose' before 'was'.
# To sort case-insensitively on the whole key use: sortByKey(True, 1, keyfunc=lambda k: k.lower())
sc.parallelize(tmp2).sortByKey(True,1,lambda k: k[0].lower()).collect()

[('a', 3),
 ('fleece', 7),
 ('had', 2),
 ('little', 4),
 ('lamb', 5),
 ('Mary', 1),
 ('whose', 6),
 ('was', 8),
 ('white', 9)]

In [70]:
numbers = sc.parallelize([1, 2, 3, 4])
sum1 = numbers.fold(0, (lambda x, y: x + y))  # action: fold() with neutral element 0 adds up all elements
print(sum1)
mult1=numbers.fold(1, (lambda x, y: x * y))  # action: fold() with neutral element 1 multiplies all elements
mult1

10


24

In [71]:
numbers = sc.parallelize([1, 2, 3, 4])
sum1 = numbers.sum()  # action: adds up all elements
sum1

10

In [134]:
x = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 3)])
y = sc.parallelize([("a", 3), ("c", None)])
sorted(x.subtract(y).collect())  # subtract() is a transformation: elements of x that do not occur in y (whole element compared)


[('a', 1), ('b', 4), ('b', 5)]

In [72]:
# Two small (key, value) collections that share the keys k1 and k2.
d1= [('k1', 1), ('k2', 2), ('k3', 5)]
d2= [('k1', 3), ('k2',4), ('k4', 8)]
rdd1 = sc.parallelize(d1)
rdd2 = sc.parallelize(d2)

In [73]:
rdd3=rdd1.union(rdd2)  # union() concatenates both RDDs; repeated keys are kept
rdd3.take(10)

[('k1', 1), ('k2', 2), ('k3', 5), ('k1', 3), ('k2', 4), ('k4', 8)]

In [74]:
rdd3.reduceByKey(lambda x,y: x+y).take(10)  # sum the values of each key

# reduce() is an action, whereas reduceByKey() is a transformation

[('k1', 4), ('k3', 5), ('k2', 6), ('k4', 8)]

In [75]:
lines = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest3.txt")  # reload the word-count text for the word-pair example
lines.take(10)

['crazy crazy fox jumped',
 'crazy fox jumped',
 'fox is fast',
 'fox is smart',
 'dog is smart']

In [76]:
lines = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\mytest3.txt")

r1 = lines.map(lambda s : s.split(" ")).collect()  # collect() returns a plain Python list, so r1 is no longer an RDD from here on
r1

[['crazy', 'crazy', 'fox', 'jumped'],
 ['crazy', 'fox', 'jumped'],
 ['fox', 'is', 'fast'],
 ['fox', 'is', 'smart'],
 ['dog', 'is', 'smart']]

In [77]:
# Emit ((word, next_word), 1) for every pair of adjacent words on a line.
x = (
    lines
    .map(lambda s: s.split(" "))
    .flatMap(lambda s: [(pair, 1) for pair in zip(s, s[1:])])
)
x.take(12)

[(('crazy', 'crazy'), 1),
 (('crazy', 'fox'), 1),
 (('fox', 'jumped'), 1),
 (('crazy', 'fox'), 1),
 (('fox', 'jumped'), 1),
 (('fox', 'is'), 1),
 (('is', 'fast'), 1),
 (('fox', 'is'), 1),
 (('is', 'smart'), 1),
 (('dog', 'is'), 1),
 (('is', 'smart'), 1)]

In [78]:
f=x.reduceByKey(lambda x,y:x+y)  # number of occurrences of each adjacent word pair
f.take(12)

[(('crazy', 'crazy'), 1),
 (('fox', 'jumped'), 2),
 (('is', 'smart'), 2),
 (('crazy', 'fox'), 2),
 (('fox', 'is'), 2),
 (('is', 'fast'), 1),
 (('dog', 'is'), 1)]

In [79]:
f.count()  # action: number of distinct word pairs

7

In [3]:
data2 = ["abc,de", "xyz,deeee,ze", "abc,de,ze,pe", "xyz,bababa"]
rdd4 = sc.parallelize(data2)
# Split each record once, then keep only its first two fields as a (key, value) pair.
rdd5 = rdd4.map(lambda x: x.split(",")).map(lambda fields: (fields[0], fields[1]))
rdd5.take(5)

[('abc', 'de'), ('xyz', 'deeee'), ('abc', 'de'), ('xyz', 'bababa')]

In [81]:
nums = [10, 1, 2, 9, 3, 4, 5, 6, 7]
sc.parallelize(nums).top(3)  # action: the 3 largest elements, in descending order
# Alternatives that give the same result:
#sc.parallelize(nums).sortBy(lambda x:-x).take(3)# sc.parallelize(nums).sortBy(lambda x:x,False).take(3)
#sc.parallelize(nums).takeOrdered(3, lambda x: -x) #sc.parallelize(nums).takeOrdered(3, key=lambda x: -x)

[10, 9, 7]

In [82]:
nums = [10, 1, 2, 9, 3, 4, 5, 6, 7]
sc.parallelize(nums).takeOrdered(3)  # action: the 3 smallest elements (ascending by default); a key function can be given as second argument
#sc.parallelize(nums).sortBy(lambda x:x).take(3)


[1, 2, 3]

In [5]:
kv = [(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")]
kv1 = {(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")}  # same pairs as kv, held in a set
sc.parallelize(kv1).takeOrdered(3)  # tuples compare item by item, so the pairs are ordered by key first (ascending by default)

[(1, 'z2'), (2, 'z3'), (3, 'z5')]

In [84]:
sc.parallelize(kv).takeOrdered(3, lambda x:-x[0])  # ordering on the negated key returns the 3 pairs with the largest keys

[(10, 'z1'), (9, 'z4'), (7, 'z9')]

In [85]:
sc.parallelize(kv).top(3)  # action: the 3 largest pairs in descending order (tuples compare by key first)

[(10, 'z1'), (9, 'z4'), (7, 'z9')]

In [86]:
# Sample (key, value) pairs for the combineByKey() examples.
# Named keyed_values rather than `input` so the builtin input() is not shadowed.
keyed_values = [("k1", 1), ("k1", 2), ("k1", 3), ("k1", 4), ("k1", 5),
                ("k2", 6), ("k2", 7), ("k2", 8),
                ("k3", 10), ("k3", 12)]
rdd_c=sc.parallelize(keyed_values)

In [87]:
# combineByKey() builds, for every key, a (sum of values, number of values) tuple.
rddc = rdd_c.combineByKey(
    createCombiner=lambda v: (v, 1),                         # first value seen for a key
    mergeValue=lambda x, v1: (x[0] + v1, x[1] + 1),          # fold one more value into the tuple
    mergeCombiners=lambda x, y: (x[0] + y[0], x[1] + y[1]),  # merge tuples built on different partitions
)
rddc.take(10)

[('k1', (15, 5)), ('k2', (21, 3)), ('k3', (22, 2))]

In [88]:
rddc.mapValues(lambda v : v[0]/float(v[1])).take(10)  # average per key = sum / count; take() is the action

[('k1', 3.0), ('k2', 7.0), ('k3', 11.0)]

Standard deviation using combineByKey():

In [89]:
data = [('A', 2.0), ('A', 4.0), ('A', 9.0),
        ('B', 10.0), ('B', 20.0),
        ('Z', 3.0), ('Z', 5.0), ('Z', 8.0), ('Z', 12.0)]
rdd = sc.parallelize(data)

# Per key, accumulate (sum of values, sum of squared values, number of values).
sumCount = rdd.combineByKey(
    createCombiner=lambda value: (value, value*value, 1),
    mergeValue=lambda x, value: (x[0] + value, x[1] + value*value, x[2] + 1),
    mergeCombiners=lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]),
)

sumCount.take(10)



[('A', (15.0, 101.0, 3)), ('B', (30.0, 500.0, 2)), ('Z', (28.0, 242.0, 4))]

In [6]:
import math
def stdDev( sumX, sumSquared, n ):
    """Return (mean, population standard deviation) from running totals.

    sumX       -- sum of the values
    sumSquared -- sum of the squared values
    n          -- number of values (must be non-zero, otherwise ZeroDivisionError)
    """
    mean = sumX / n
    variance = (sumSquared - n*mean*mean) / n
    # Floating-point rounding in the accumulated sums can leave the variance a
    # hair below zero when all values are (nearly) equal; math.sqrt would then
    # raise ValueError. Clamp with a comparison instead of max(), because the
    # notebook's `from pyspark.sql.functions import *` shadows the builtin max.
    if variance < 0.0:
        variance = 0.0
    stdDeviation = math.sqrt(variance)
    return (mean, stdDeviation)


# Each value is a (sum, sum of squares, count) triple; unpack it straight into stdDev().
meanAndStdDev = sumCount.mapValues(lambda x: stdDev(*x))
meanAndStdDev.take(10)


NameError: name 'sumCount' is not defined

In [91]:
def f(iterator):
    """Print every element of the given iterator, one per line."""
    for element in iterator:
        print(element)
sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(f)  # action: calls f once per partition with an iterator over that partition's elements


In [92]:
def f(x):
    """Print a single element."""
    print(x)
sc.parallelize([1, 2, 3, 4, 5]).foreach(f)  # action: calls f once for every element of the RDD

In [3]:
data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10]
rdd = sc.parallelize(data, 3)  # second argument: number of partitions

In [94]:
rdd.getNumPartitions()  # number of partitions of the RDD

3

In [95]:
rdd.collect()  # action: all elements as a list on the driver

[10, 20, 3, 4, 5, 2, 2, 20, 20, 10]

In [115]:
rdd.min()  # action: smallest element

2

In [116]:
rdd.max()  # action: largest element

20

In [117]:
rdd.mean()  # action: arithmetic mean of the elements (RDDs offer mean(), there is no avg())

9.6

In [119]:
rdd.stdev()  # action: population standard deviation of the elements

7.323933369440223

In [120]:
rdd.cache()  # not an action: only marks the RDD to be kept in memory once an action computes it; returns the RDD itself

ParallelCollectionRDD[224] at parallelize at PythonRDD.scala:194

In [96]:
print(rdd.getStorageLevel())  # current storage level of the RDD (a metadata lookup, not an RDD action)


Serialized 1x Replicated


In [97]:
rdd.glom().collect()  # glom() turns every partition into one list, which shows how the elements are distributed

[[10, 20, 3], [4, 5, 2], [2, 20, 20, 10]]

Min and max of elements using mapPartitions():

In [4]:
def minmax(iterator):
    """Return (smallest, largest) of the elements produced by `iterator`.

    Written for RDD.mapPartitions: the returned tuple is iterated by Spark, so
    each non-empty partition contributes two elements (its min and its max).
    An empty partition returns an empty tuple and therefore contributes
    nothing, instead of failing on unassigned locals.
    """
    elements = iter(iterator)
    try:
        lowest = highest = next(elements)
    except StopIteration:
        return ()
    # Plain comparisons on purpose: the builtins min/max are shadowed in this
    # notebook by `from pyspark.sql.functions import *`.
    for x in elements:
        if x > highest:
            highest = x
        if x < lowest:
            lowest = x
    return (lowest, highest)

In [8]:
# NOTE(review): this rebinds the name `minmax` from the function to the resulting RDD, so this
# cell can only be re-run after re-running the cell that defines the minmax() function.
minmax = rdd.mapPartitions(minmax)  # min and max of each partition, flattened into one RDD
minmax.take(10)

[3, 20, 2, 5, 2, 20]

In [9]:
minmax.min()  # action: smallest of the per-partition results, i.e. the overall minimum

2

In [10]:
minmax.max()  # action: largest of the per-partition results, i.e. the overall maximum

20

DNA base count (counting letters in a text file):

In [8]:
def letcount(it):
    """Count how often each item occurs in `it`.

    Returns a list of (item, count) tuples, in the order the counting dict
    yields its keys. When `it` is a string, the items are its characters.
    """
    tally = {}
    for item in list(it):
        tally[item] = tally.get(item, 0) + 1
    return [(item, tally[item]) for item in tally]
                   
        

In [9]:
recs=sc.textFile('file:///D:\\spark-2.3.2-bin-hadoop2.7\\dna.txt')
lc=recs.flatMap(letcount)  # per line: (letter, count) pairs
#lc=recs.flatMap(lambda c: [(i,1) for i in list(c)])
# Add up the per-line counts of each letter. The result gets its own name:
# binding it to `letcount` would replace the function and make this cell
# fail when it is run a second time.
letter_totals=lc.reduceByKey(lambda x,y:x+y)
letter_totals.collect()

Py4JJavaError: An error occurred while calling o70.partitions.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/D:/spark-2.3.2-bin-hadoop2.7/dna.txt
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:194)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
	at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [112]:
recs.take(5)  # action: first lines of the DNA file

['ATATCCCCGGGAT', 'ATCGATCGATAT']

In [108]:
lines = sc.parallelize(["pandas", "i like pandas"])
lines.countByValue()  # action: dict-like mapping of each distinct element to its number of occurrences

defaultdict(int, {'pandas': 1, 'i like pandas': 1})

In [109]:
lines = sc.parallelize(["pandas", "i like pandas"])
result = lines.flatMap(lambda x: x.split()).countByValue()  # action (countByKey() is an action as well)

# countByValue() returns a dictionary of element -> count; show it as a list
# of single-entry dicts. A comprehension keeps the loop variables local, so
# the notebook-wide name `x` is not overwritten by a temporary dict.
x1 = [{word: count} for word, count in result.items()]

print(x1)

[{'pandas': 2}, {'i': 1}, {'like': 1}]


In [111]:
def perKeyAvg(nums):
    """Collect, for every key of `nums`, the tuple (sum of values, number of values).

    Despite the name no division is performed: the caller receives a list of
    (key, (sum, count)) elements and can compute sum / count itself.
    """
    def start(value):
        # First value seen for a key.
        return (value, 1)

    def add_value(acc, value):
        # Fold one more value into an existing (sum, count) tuple.
        return (acc[0] + value, acc[1] + 1)

    def merge(left, right):
        # Combine two partial (sum, count) tuples.
        return (left[0] + right[0], left[1] + right[1])

    return nums.combineByKey(start, add_value, merge).collect()

nums = sc.parallelize([("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)])
avg = perKeyAvg(nums)  # example of combineByKey(); holds (key, (sum, count)) pairs, not the averages themselves
avg

[('coffee', (4, 2)), ('pandas', (2, 1)), ('very', (4, 1))]

In [114]:
m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()  # action: returns the (key, value) pairs as a dictionary
m

{1: 2, 3: 4}

In [128]:
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
#rdd.countByKey() # action: returns a dictionary mapping each key to its number of occurrences
sorted(rdd.countByKey().items())  # items() yields (key, count) tuples; sorted() returns them as a sorted list


[('a', 2), ('b', 1)]

In [129]:
rdd1=sc.parallelize([1,2,3,4,2,3])
rdd1.distinct().take(2)  # distinct() drops duplicate elements from the RDD

[4, 1]

In [131]:
b = sc.broadcast([1, 2, 3, 4, 5])  # broadcast(): wrap a value in a Broadcast variable
b.value  # the wrapped value



[1, 2, 3, 4, 5]

In [132]:
b.destroy()  # release the broadcast variable; presumably `b` must not be used afterwards, verify against the Broadcast docs

In [133]:
x = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 2)])
y = sc.parallelize([("a", 3), ("c", None)])
sorted(x.subtractByKey(y).collect())  # subtractByKey() is a transformation: pairs of x whose key does not occur in y

[('b', 4), ('b', 5)]

In [135]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2)])
sorted(y.rightOuterJoin(x).collect())  # keeps every key of x (the right side); returns (key, (y_value or None, x_value))


[('a', (2, 1)), ('b', (None, 4))]

In [138]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2)])
sorted(x.leftOuterJoin(y).collect())  # keeps every key of x (the left side); returns (key, (x_value, y_value or None))


[('a', (1, 2)), ('b', (4, None))]

In [137]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2)])
sorted(y.fullOuterJoin(x).collect())  # keeps the keys of both sides; a missing value is reported as None

[('a', (2, 1)), ('b', (None, 4))]

In [139]:
rdd = sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)])
rdd2 = rdd.repartitionAndSortWithinPartitions(2)  # repartition into 2 partitions and sort by key within each; optional arguments: partition function, sort order (ascending by default) and key function
rdd2.glom().collect()  # glom() shows the content of every partition as one list

[[(0, 5), (0, 8), (2, 6)], [(1, 3), (3, 8), (3, 8)]]

In [140]:
sc.parallelize([]).isEmpty()  # action: True when the RDD contains no elements

True

In [146]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
sorted(x.join(y).collect())  # inner join: only keys present in both RDDs, one result per matching pair of values

[('a', (1, 2)), ('a', (1, 3))]

In [14]:
x = sc.parallelize(
    ["Joseph", "Jimmy", "Tina", "Thomas", "James", "Cory", "Christine", "Jackeline", "Juan"],
    3,  # number of partitions
)

# groupBy(): group the names by their first letter.
y = x.groupBy(lambda word: word[0])

# Each group is an iterable; turn it into a list so its members are printed.
for t in y.collect():
    print((t[0], list(t[1])))

('J', ['Joseph', 'Jimmy', 'James', 'Jackeline', 'Juan'])
('T', ['Tina', 'Thomas'])
('C', ['Cory', 'Christine'])


In [19]:
x = sc.parallelize(
    [("USA", 1), ("USA", 2), ("India", 1), ("UK", 1),
     ("India", 4), ("India", 9), ("USA", 8), ("USA", 3),
     ("India", 4), ("UK", 6), ("UK", 9), ("UK", 5)],
    3,
)

# groupByKey(): gather all values that share a key.
y = x.groupByKey()

## With a predefined number of partitions:
#y = x.groupByKey(2)
#print('Output: ',y.getNumPartitions())
## Output: 2

# The grouped values are an iterable; list() materialises them for printing.
for i in y.collect():
    print(i[0], list(i[1]))

Output:  2
USA [1, 2, 8, 3]
India [1, 4, 9, 4]
UK [1, 6, 9, 5]


In [21]:
# Minimum value of each group produced by groupByKey(), via mapValues().
# builtins.min is referenced explicitly because the notebook's
# `from pyspark.sql.functions import *` rebinds the bare name `min` to the
# SQL column aggregate, which cannot be applied to a group of Python values.
y.mapValues(builtins.min).collect()

[('USA', 1), ('India', 1), ('UK', 1)]

In [22]:
# Finding the minimum of each group with reduceByKey() instead of groupByKey() + mapValues() (more efficient)

y = x.reduceByKey(lambda a,b: a if a<b else b)  # keep the smaller of two values of the same key

y.collect()



[('USA', 1), ('UK', 1), ('India', 1)]