In [1]:
import findspark

# Must run before pyspark is imported (see the findspark README): it locates the
# Spark installation so that the pyspark package becomes importable.
findspark.init()

from pyspark import SparkConf, SparkContext

# getOrCreate() reuses an already-running context, so this cell can be re-run
# without failing with "Cannot run multiple SparkContexts at once", which is what
# an unconditional SparkContext(...) call raises on the second execution.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("transformation 1")
)

In [2]:
# Pair RDD of (n, n) for n in 0..19, created with 4 partitions.
data = (
    sc.parallelize(range(20), 4)
    .map(lambda n: (n, n))
)

### `partitionBy(numPartitions, partitionFunc=<function portable_hash>)`

Return a copy of the RDD partitioned using the specified partitioner.

In [3]:
# Show the initial layout: one inner list per partition of `data`.
data.glom().collect()

[[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],
 [(5, 5), (6, 6), (7, 7), (8, 8), (9, 9)],
 [(10, 10), (11, 11), (12, 12), (13, 13), (14, 14)],
 [(15, 15), (16, 16), (17, 17), (18, 18), (19, 19)]]

In [5]:
data.partitionBy(4).glom().collect() # pairs are placed into partitions as decided by the hash function

[[(0, 0), (4, 4), (8, 8), (12, 12), (16, 16)],
 [(1, 1), (5, 5), (9, 9), (13, 13), (17, 17)],
 [(2, 2), (6, 6), (10, 10), (14, 14), (18, 18)],
 [(3, 3), (7, 7), (11, 11), (15, 15), (19, 19)]]

In [6]:
# `data` was never passed through partitionBy, so it has no partitioner (prints None).
print(data.partitioner)

None


In [8]:
# Same pairs, hash-partitioned into 4 and into 2 partitions respectively.
data1 = (
    sc.parallelize(range(20), 4)
    .map(lambda n: (n, n))
    .partitionBy(4)
)
data2 = (
    sc.parallelize(range(20), 4)
    .map(lambda n: (n, n))
    .partitionBy(2)
)

# When the partitioners differ, the transformation cannot be narrow; it becomes wide.
data1.partitioner == data2.partitioner
# False

False

In [10]:
# Same partition count as data1 (4), but with a custom partition function.
data3 = (
    sc.parallelize(range(20), 4)
    .map(lambda n: (n, n))
    .partitionBy(4, lambda key: key * 37)
)
# Equal partition counts are not enough: the partitioners still compare unequal.
data1.partitioner == data3.partitioner
# False

False

### union(other)
Return the union of this RDD and another one.

In [17]:
data1 = sc.parallelize(
    [1, 1, 1, 1, 2, 2, 3, 3, 4],
    numSlices=2,
)
data2 = sc.parallelize(
    [3, 3, 4, 4, 1, 1, 2],
    numSlices=2,
)

# union() does not remove duplicates.
# Only a cell's last expression is echoed, so this result is not displayed; it is:
# [1, 1, 1, 1, 2, 2, 3, 3, 4, 3, 3, 4, 4, 1, 1, 2]
data1.union(data2).collect()

# Partition layout of the union: data1's two partitions followed by data2's two.
# [[1, 1, 1, 1], [2, 2, 3, 3, 4], [3, 3, 4], [4, 1, 1, 2]]
data1.union(data2).glom().collect()

[[1, 1, 1, 1], [2, 2, 3, 3, 4], [3, 3, 4], [4, 1, 1, 2]]

In [21]:
# Partition layout of the first union input, for comparison with the union's layout.
data1.glom().collect()

[[1, 1, 1, 1], [2, 2, 3, 3, 4]]

In [22]:
# Partition layout of the second union input, for comparison with the union's layout.
data2.glom().collect()

[[3, 3, 4], [4, 1, 1, 2]]

### intersection(other)
Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.

**Note** This method performs a shuffle internally.

In [20]:
# intersection() removes duplicates from the result.
data1 = sc.parallelize(
    [1, 1, 1, 1, 2, 2, 3, 3, 4],
    numSlices=2,
)
data2 = sc.parallelize(
    [3, 3, 4, 4, 1, 1, 2],
    numSlices=2,
)
data1.intersection(data2).collect()

[4, 1, 2, 3]

In [23]:
# Elements move between partitions (a shuffle occurs): this is a wide transformation.
data1.intersection(data2).glom().collect()

[[4], [1], [2], [3]]

### cogroup(other, numPartitions=None)
For each key k in self or other, return a resulting RDD that contains a tuple with the list of values for that key in self as well as other.

In [26]:
# Two identical pair RDDs of (n, n) for n in 0..19, each created with 4 partitions.
rdd1 = (
    sc.parallelize(range(20), 4)
    .map(lambda n: (n, n))
)
rdd2 = (
    sc.parallelize(range(20), 4)
    .map(lambda n: (n, n))
)

# groupByKey() groups within a single RDD; cogroup() groups by key across two
# different RDDs and yields (key, tuple(Iterable, Iterable, ...)).
# Collected directly, the grouped values show up as opaque iterables:
# rdd1.cogroup(rdd2).collect()
#     [(0,
#       (<pyspark.resultiterable.ResultIterable at 0x7fe4d2b23a20>,
#        <pyspark.resultiterable.ResultIterable at 0x7fe4d2b23128>)),
#      (8,
#       (<pyspark.resultiterable.ResultIterable at 0x7fe4d2b23518>,
#        <pyspark.resultiterable.ResultIterable at 0x7fe4d2b239b0>)),
#      ...]

In [27]:
# Turn both grouped iterables of every key into lists so the values are readable.
(
    rdd1.cogroup(rdd2)
    .mapValues(lambda groups: (list(groups[0]), list(groups[1])))
    .collect()
)
# [(0, ([0], [0])),
#  (8, ([8], [8])),
#  (16, ([16], [16])),
#  (1, ([1], [1])),
#  ...]

[(0, ([0], [0])),
 (8, ([8], [8])),
 (16, ([16], [16])),
 (1, ([1], [1])),
 (9, ([9], [9])),
 (17, ([17], [17])),
 (2, ([2], [2])),
 (10, ([10], [10])),
 (18, ([18], [18])),
 (3, ([3], [3])),
 (11, ([11], [11])),
 (19, ([19], [19])),
 (4, ([4], [4])),
 (12, ([12], [12])),
 (5, ([5], [5])),
 (13, ([13], [13])),
 (6, ([6], [6])),
 (14, ([14], [14])),
 (7, ([7], [7])),
 (15, ([15], [15]))]

In [29]:
# Layout of rdd1 before any partitionBy: contiguous key ranges, one list per partition.
rdd1.glom().collect()
# [[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],
#  [(5, 5), (6, 6), (7, 7), (8, 8), (9, 9)],
#  [(10, 10), (11, 11), (12, 12), (13, 13), (14, 14)],
#  [(15, 15), (16, 16), (17, 17), (18, 18), (19, 19)]]

[[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],
 [(5, 5), (6, 6), (7, 7), (8, 8), (9, 9)],
 [(10, 10), (11, 11), (12, 12), (13, 13), (14, 14)],
 [(15, 15), (16, 16), (17, 17), (18, 18), (19, 19)]]

In [30]:
# Layout of rdd2 before any partitionBy: contiguous key ranges, one list per partition.
rdd2.glom().collect()
# [[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],
#  [(5, 5), (6, 6), (7, 7), (8, 8), (9, 9)],
#  [(10, 10), (11, 11), (12, 12), (13, 13), (14, 14)],
#  [(15, 15), (16, 16), (17, 17), (18, 18), (19, 19)]]

[[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],
 [(5, 5), (6, 6), (7, 7), (8, 8), (9, 9)],
 [(10, 10), (11, 11), (12, 12), (13, 13), (14, 14)],
 [(15, 15), (16, 16), (17, 17), (18, 18), (19, 19)]]

In [31]:
# Wide transformation: the result has 8 partitions, up from 4 in each input.
(
    rdd1.cogroup(rdd2)
    .mapValues(lambda groups: (list(groups[0]), list(groups[1])))
    .glom()
    .collect()
)
# [[(0, ([0], [0])), (8, ([8], [8])), (16, ([16], [16]))],
#  [(1, ([1], [1])), (9, ([9], [9])), (17, ([17], [17]))],
#  [(2, ([2], [2])), (10, ([10], [10])), (18, ([18], [18]))],
#  [(3, ([3], [3])), (11, ([11], [11])), (19, ([19], [19]))],
#  [(4, ([4], [4])), (12, ([12], [12]))],
#  [(5, ([5], [5])), (13, ([13], [13]))],
#  [(6, ([6], [6])), (14, ([14], [14]))],
#  [(7, ([7], [7])), (15, ([15], [15]))]]

[[(0, ([0], [0])), (8, ([8], [8])), (16, ([16], [16]))],
 [(1, ([1], [1])), (9, ([9], [9])), (17, ([17], [17]))],
 [(2, ([2], [2])), (10, ([10], [10])), (18, ([18], [18]))],
 [(3, ([3], [3])), (11, ([11], [11])), (19, ([19], [19]))],
 [(4, ([4], [4])), (12, ([12], [12]))],
 [(5, ([5], [5])), (13, ([13], [13]))],
 [(6, ([6], [6])), (14, ([14], [14]))],
 [(7, ([7], [7])), (15, ([15], [15]))]]

In [33]:
# Hash-partition rdd1 into 4 partitions, then look at the resulting layout:
# each partition now holds the keys that share the same remainder modulo 4.
rdd1 = rdd1.partitionBy(4)
rdd1.glom().collect()
# [[(0, 0), (4, 4), (8, 8), (12, 12), (16, 16)],
#  [(1, 1), (5, 5), (9, 9), (13, 13), (17, 17)],
#  [(2, 2), (6, 6), (10, 10), (14, 14), (18, 18)],
#  [(3, 3), (7, 7), (11, 11), (15, 15), (19, 19)]]

[[(0, 0), (4, 4), (8, 8), (12, 12), (16, 16)],
 [(1, 1), (5, 5), (9, 9), (13, 13), (17, 17)],
 [(2, 2), (6, 6), (10, 10), (14, 14), (18, 18)],
 [(3, 3), (7, 7), (11, 11), (15, 15), (19, 19)]]

In [35]:
# rdd1 went through partitionBy and has a partitioner; rdd2 did not and has none.
print(rdd1.partitioner)  # <pyspark.rdd.Partitioner object at 0x7fe4d2a77e10>
print(rdd2.partitioner)  # None

<pyspark.rdd.Partitioner object at 0x7fe4d2a77e10>
None


In [38]:
# Co-group the partitioned rdd1 with the unpartitioned rdd2 into 4 partitions.
rdd3 = (
    rdd1.cogroup(rdd2, 4)
    .mapValues(lambda groups: (list(groups[0]), list(groups[1])))
)
# Author's note: rdd1 already has a partitioner, so its data carries over as-is,
# while rdd2 has to be shuffled.
# NOTE(review): the layout below matches rdd1's, but glom() output alone does not
# show which side was shuffled -- confirm via the Spark UI or toDebugString().
rdd3.glom().collect()
# [[(0, ([0], [0])), (4, ([4], [4])), (8, ([8], [8])), (12, ([12], [12])), (16, ([16], [16]))],
#  [(1, ([1], [1])), (5, ([5], [5])), (9, ([9], [9])), (13, ([13], [13])), (17, ([17], [17]))],
#  [(2, ([2], [2])), (6, ([6], [6])), (10, ([10], [10])), (14, ([14], [14])), (18, ([18], [18]))],
#  [(3, ([3], [3])), (7, ([7], [7])), (11, ([11], [11])), (15, ([15], [15])), (19, ([19], [19]))]]

[[(0, ([0], [0])),
  (4, ([4], [4])),
  (8, ([8], [8])),
  (12, ([12], [12])),
  (16, ([16], [16]))],
 [(1, ([1], [1])),
  (5, ([5], [5])),
  (9, ([9], [9])),
  (13, ([13], [13])),
  (17, ([17], [17]))],
 [(2, ([2], [2])),
  (6, ([6], [6])),
  (10, ([10], [10])),
  (14, ([14], [14])),
  (18, ([18], [18]))],
 [(3, ([3], [3])),
  (7, ([7], [7])),
  (11, ([11], [11])),
  (15, ([15], [15])),
  (19, ([19], [19]))]]

In [39]:
# Check whether the cogroup result reports the same partitioner as rdd1.
rdd1.partitioner == rdd3.partitioner
# True

True

In [40]:
### Give rdd2 the same partitioner as rdd1 by applying partitionBy
rdd2 = rdd2.partitionBy(4)
rdd2.glom().collect()


[[(0, 0), (4, 4), (8, 8), (12, 12), (16, 16)],
 [(1, 1), (5, 5), (9, 9), (13, 13), (17, 17)],
 [(2, 2), (6, 6), (10, 10), (14, 14), (18, 18)],
 [(3, 3), (7, 7), (11, 11), (15, 15), (19, 19)]]

In [41]:
# After partitionBy(4) on rdd2, all three RDDs report equal partitioners.
print(rdd1.partitioner == rdd2.partitioner)  # True
print(rdd2.partitioner == rdd3.partitioner)  # True

True
True


In [42]:
# Both inputs now share an equal partitioner (author's note: narrow transformation).
rdd4 = (
    rdd1.cogroup(rdd2, 4)
    .mapValues(lambda groups: (list(groups[0]), list(groups[1])))
)
rdd4.glom().collect()
# [[(0, ([0], [0])), (4, ([4], [4])), (8, ([8], [8])), (12, ([12], [12])), (16, ([16], [16]))],
#  [(1, ([1], [1])), (5, ([5], [5])), (9, ([9], [9])), (13, ([13], [13])), (17, ([17], [17]))],
#  [(2, ([2], [2])), (6, ([6], [6])), (10, ([10], [10])), (14, ([14], [14])), (18, ([18], [18]))],
#  [(3, ([3], [3])), (7, ([7], [7])), (11, ([11], [11])), (15, ([15], [15])), (19, ([19], [19]))]]

[[(0, ([0], [0])),
  (4, ([4], [4])),
  (8, ([8], [8])),
  (12, ([12], [12])),
  (16, ([16], [16]))],
 [(1, ([1], [1])),
  (5, ([5], [5])),
  (9, ([9], [9])),
  (13, ([13], [13])),
  (17, ([17], [17]))],
 [(2, ([2], [2])),
  (6, ([6], [6])),
  (10, ([10], [10])),
  (14, ([14], [14])),
  (18, ([18], [18]))],
 [(3, ([3], [3])),
  (7, ([7], [7])),
  (11, ([11], [11])),
  (15, ([15], [15])),
  (19, ([19], [19]))]]

### join(other, numPartitions=None)
Return an RDD containing all pairs of elements with matching keys in self and other.

Each pair of elements will be returned as a (k, (v1, v2)) tuple, where (k, v1) is in self and (k, v2) is in other.

Performs a hash join across the cluster.

In [43]:
# Key every element by itself, then join the two pair RDDs on that key.
rdd1 = sc.parallelize([1, 1, 2, 2]).map(lambda n: (n, n))
rdd2 = sc.parallelize([1, 4, 2, 2]).map(lambda n: (n, n))
# Key 2 appears twice on each side -> 2 x 2 = 4 pairs; key 1 appears 2 x 1 -> 2 pairs;
# key 4 exists only in rdd2, so it is absent from the result.
rdd1.join(rdd2).collect()
# [(2, (2, 2)), (2, (2, 2)), (2, (2, 2)), (2, (2, 2)), (1, (1, 1)), (1, (1, 1))]

[(2, (2, 2)), (2, (2, 2)), (2, (2, 2)), (2, (2, 2)), (1, (1, 1)), (1, (1, 1))]