In [1]:
# findspark.init() is called before pyspark is imported; presumably it makes
# the pyspark package importable in this environment, so keep this order.
import findspark
findspark.init()
from pyspark import SparkContext
# SparkContext reused by the cells below: master "local", app name "transformation 1".
sc = SparkContext("local", "transformation 1")

### groupBy(f, numPartitions=None, partitionFunc="function portable_hash")

Return an RDD of grouped items.

In [2]:
# As with pandas groupby, the grouped result still needs an aggregation step
# before its contents are readable.
# groupBy(f) builds an object keyed by the value f returns for each element.
numbers = [1, 1, 2, 3, 4, 5, 5, 6, 6, 6]
data = sc.parallelize(numbers)
result = data.groupBy(lambda value: value % 3)
print(result.collect())

[(1, <pyspark.resultiterable.ResultIterable object at 0x7ff1d2b332b0>), (2, <pyspark.resultiterable.ResultIterable object at 0x7ff1d2b33320>), (0, <pyspark.resultiterable.ResultIterable object at 0x7ff1d2b33390>)]


In [3]:
# Materialize each group as a list so the grouped values can be printed.
data = sc.parallelize([1, 1, 2, 3, 4, 5, 5, 6, 6, 6])
grouped = data.groupBy(lambda value: value % 3)
result = grouped.mapValues(list).collect()
print(result)

[(1, [1, 1, 4]), (2, [2, 5, 5]), (0, [3, 6, 6, 6])]


### groupByKey(numPartitions=None, partitionFunc="function portable_hash")

Group the values for each key in the RDD into a single sequence. Hash-partitions the resulting RDD with numPartitions partitions.

Note: If you are grouping in order to perform an aggregation (such as a sum or average) over each key, using reduceByKey or aggregateByKey will provide much better performance.

In [4]:
# Collect the values of each key into a list.
data = sc.parallelize([("a", 1), ("b", 2), ("c", 3), ("a", 3)])
values_by_key = data.groupByKey().mapValues(list)
print(values_by_key.collect())

[('a', [1, 3]), ('b', [2]), ('c', [3])]


In [5]:
# Count how many values each key has.
data = sc.parallelize([("a", 1), ("b", 2), ("c", 3), ("a", 3)])
count_by_key = data.groupByKey().mapValues(len)
print(count_by_key.collect())

[('a', 2), ('b', 1), ('c', 1)]


### GroupByKey vs. ReduceByKey

In [7]:
# reduceByKey has the lower network cost of the two approaches.
numbers = sc.parallelize(range(10))
data = numbers.map(lambda n: (n % 3, n))
print(data.collect())
# Sum the values of each key; the bare expression is the cell's displayed result.
data.reduceByKey(lambda left, right: left + right).collect()

[(0, 0), (1, 1), (2, 2), (0, 3), (1, 4), (2, 5), (0, 6), (1, 7), (2, 8), (0, 9)]


[(0, 18), (1, 12), (2, 15)]

In [8]:
# Think of each groupByKey() result element as: [0] is the key, [1] holds the values.
# groupByKey has the higher network cost.
grouped = data.groupByKey()
grouped.map(lambda pair: (pair[0], sum(pair[1]))).collect()

[(0, 18), (1, 12), (2, 15)]

### sortByKey(ascending=True, numPartitions=None, keyfunc="lambda function")

Sorts this RDD, which is assumed to consist of (key, value) pairs.

In [9]:
# Sort the (key, value) pairs by key in ascending order (the default).
pairs = [("b", 1), ("a", 1), ("c", 1), ("a", 1)]
data = sc.parallelize(pairs)
data.sortByKey(ascending=True).collect()

[('a', 1), ('a', 1), ('b', 1), ('c', 1)]

In [10]:
# Sort the (key, value) pairs by key in descending order.
pairs = [("b", 1), ("a", 1), ("c", 1), ("a", 1)]
data = sc.parallelize(pairs)
data.sortByKey(ascending=False).collect()

[('c', 1), ('b', 1), ('a', 1), ('a', 1)]

### sortBy(keyfunc, ascending=True, numPartitions=None)
Sorts this RDD by the given keyfunc

In [12]:
# sortBy(f): sort the elements by the value f returns for each of them.
numbers = [4, 3, 2, 4, 2, 7, 9, 4, 5, 2]
data = sc.parallelize(numbers)
data.sortBy(lambda value: value, ascending=True).collect()

[2, 2, 2, 3, 4, 4, 4, 5, 7, 9]

In [13]:
# sortBy(f): sort by the value f returns, here in descending order.
numbers = [4, 3, 2, 4, 2, 7, 9, 4, 5, 2]
data = sc.parallelize(numbers)
data.sortBy(lambda value: value, ascending=False).collect()

[9, 7, 5, 4, 4, 4, 3, 2, 2, 2]

### coalesce(numPartitions, shuffle=False)
Return a new RDD that is reduced into numPartitions partitions.

In [15]:
# Spread 0..14 over 4 partitions and show the contents of each partition.
data = sc.parallelize(range(15), numSlices=4)
partitions = data.glom()
partitions.collect()

[[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10], [11, 12, 13, 14]]

In [17]:
# Reduce the 4 partitions to 3 and show the resulting layout.
data = sc.parallelize(range(15), numSlices=4)
merged = data.coalesce(3)
merged.glom().collect()

[[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10, 11, 12, 13, 14]]

In [18]:
# coalesce reduces the number of partitions without a shuffle;
# it cannot increase it, so asking for 10 still leaves 4 partitions.
data = sc.parallelize(range(15), numSlices=4)
unchanged = data.coalesce(10)
unchanged.glom().collect()

[[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10], [11, 12, 13, 14]]

### repartition(numPartitions)
Return a new RDD that has exactly numPartitions partitions.

Can increase or decrease the level of parallelism in this RDD. Internally, this uses a shuffle to redistribute data. ***If you are decreasing the number of partitions in this RDD, consider using coalesce, which can avoid performing a shuffle.***

In [19]:
# Redistribute the existing `data` RDD into exactly 2 partitions.
two_partitions = data.repartition(2)
two_partitions.glom().collect()

[[0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14], [3, 4, 5, 6]]

In [20]:
# Original note: elements are distributed across partitions by a hash function.
# NOTE(review): confirm the exact redistribution scheme against the Spark docs;
# the resulting partitions can be uneven or even empty.
three_partitions = data.repartition(3)
three_partitions.glom().collect()

[[], [0, 1, 2, 7, 8, 9, 10], [3, 4, 5, 6, 11, 12, 13, 14]]

### sample(withReplacement, fraction, seed=None)
Return a sampled subset of this RDD.

Parameters
* withReplacement – can elements be sampled multiple times (replaced when sampled out)
* fraction – expected size of the sample as a fraction of this RDD’s size. Without replacement: probability that each element is chosen; fraction must be in \[0, 1\]. With replacement: expected number of times each element is chosen; fraction must be >= 0.
* seed – seed for the random number generator

In [21]:
# Arguments: whether to sample with replacement, the fraction, and the seed.
# The fraction is an expected size, so the sample is not guaranteed to be
# exactly 50% of the data.
sampled = data.sample(withReplacement=True, fraction=0.5, seed=13)
sampled.collect()

[0, 2, 2, 2, 3, 5, 12, 13, 14]

### distinct(numPartitions=None)
Return a new RDD containing the distinct elements in this RDD.

In [22]:
# Drop duplicate elements from the RDD.
numbers = [1, 1, 2, 3, 4, 5, 6, 2, 3, 5]
data = sc.parallelize(numbers)
unique_values = data.distinct()
unique_values.collect()

[1, 2, 3, 4, 5, 6]