In [4]:
from pyspark import SparkConf, SparkContext

# Create the SparkContext, or reuse the one that is already running.
# Constructing SparkContext(...) directly raises an error when this cell is
# re-run in the same kernel, because only one context may be active at a time.
# getOrCreate() returns the existing context instead (the conf is only applied
# when a new context actually has to be created).
sc = SparkContext.getOrCreate(
    SparkConf()
    .setMaster("local")   # local: run on the local machine
    .setAppName("MyApp")  # "MyApp": application name
)

In [5]:
### 1. Creating an RDD and basic transformations

# The SparkContext `sc` is assumed to exist already.
# Base RDD that the examples below are derived from.
numbers = sc.parallelize([1, 2, 3, 4, 5])

# map: apply a function to every element.
mapped_rdd = numbers.map(lambda n: n * 2)
print("Mapped RDD:", mapped_rdd.collect())  # [2, 4, 6, 8, 10]

# filter: keep only the elements for which the predicate holds.
filtered_rdd = numbers.filter(lambda n: n % 2 == 0)
print("Filtered RDD:", filtered_rdd.collect())  # [2, 4]

# flatMap: map each element to a sequence and flatten the results.
flattened_rdd = (
    sc.parallelize([[1, 2], [3, 4]])
    .flatMap(lambda inner: inner)
)
print("Flattened RDD:", flattened_rdd.collect())  # [1, 2, 3, 4]

Mapped RDD: [2, 4, 6, 8, 10]
Filtered RDD: [2, 4]
Flattened RDD: [1, 2, 3, 4]


In [6]:
### 4. Action examples

# Actions run the computation and return plain Python values to the driver.
count = numbers.count()          # number of elements
first_element = numbers.first()  # first element
collected = numbers.collect()    # all elements as a list
taken = numbers.take(3)          # first three elements

# Display the results; without this the cell computes values but shows nothing.
print("Count:", count)  # 5
print("First element:", first_element)  # 1
print("Collected:", collected)  # [1, 2, 3, 4, 5]
print("Taken:", taken)  # [1, 2, 3]

In [7]:
### 2. Narrow transformation examples

# One-to-one transformations (narrow transformations)
# A fixed seed keeps the sample reproducible across re-runs of the notebook;
# without it every run prints a different subset.
sample_rdd = numbers.sample(False, 0.5, seed=42)
union_rdd = numbers.union(sc.parallelize([6, 7, 8]))

# Use mapValues on a key-value RDD: only the value is transformed, the key is kept.
kv_rdd = sc.parallelize([("apple", 1), ("banana", 2), ("cherry", 3)])
mapped_values = kv_rdd.mapValues(lambda x: x * 10)

print("Sample RDD:", sample_rdd.collect())  # randomly sampled result (fixed by the seed)
print("Union RDD:", union_rdd.collect())  # [1, 2, 3, 4, 5, 6, 7, 8]
print("MapValues RDD:", mapped_values.collect())  # [("apple", 10), ("banana", 20), ("cherry", 30)]

Sample RDD: [2, 5]
Union RDD: [1, 2, 3, 4, 5, 6, 7, 8]
MapValues RDD: [('apple', 10), ('banana', 20), ('cherry', 30)]


In [8]:
### 3. Wide transformation examples

# Transformations that require shuffling (wide transformations)
distinct_rdd = sc.parallelize([1, 1, 2, 2, 3, 3]).distinct()
grouped_rdd = kv_rdd.groupByKey()
reduced_rdd = kv_rdd.reduceByKey(lambda x, y: x + y)

# Transformations are lazy, so run an action to actually execute the grouped
# and reduced examples and show their results.
# groupByKey yields an iterable per key; convert it to a list so it prints readably.
# sorted() is used because element order after a shuffle is not guaranteed.
print("Grouped RDD:", sorted(grouped_rdd.mapValues(list).collect()))  # [('apple', [1]), ('banana', [2]), ('cherry', [3])]
print("Reduced RDD:", sorted(reduced_rdd.collect()))  # [('apple', 1), ('banana', 2), ('cherry', 3)]

In [9]:
# Joining two RDDs
rdd1 = sc.parallelize([("apple", 2), ("banana", 1)])
rdd2 = sc.parallelize([("apple", "fruit"), ("banana", "fruit"), ("carrot", "vegetable")])

# Inner join on the key: "carrot" exists only in rdd2, so it does not appear in the result.
joined_rdd = rdd1.join(rdd2)

# Collect once and reuse the list: every collect() call launches a separate Spark job.
# sorted() pins the printed order, which is otherwise not guaranteed after a shuffle.
joined_pairs = sorted(joined_rdd.collect())
print(joined_pairs)
# Output: [('apple', (2, 'fruit')), ('banana', (1, 'fruit'))]

print("Distinct RDD:", sorted(distinct_rdd.collect()))  # [1, 2, 3]
print("Joined RDD:", joined_pairs)  # [("apple", (2, "fruit")), ("banana", (1, "fruit"))]

[('apple', (2, 'fruit')), ('banana', (1, 'fruit'))]
Distinct RDD: [1, 2, 3]
Joined RDD: [('apple', (2, 'fruit')), ('banana', (1, 'fruit'))]
