In [1]:
import os

from pyspark import SparkConf, SparkContext

In [5]:
# Single-process local Spark for this study notebook.
conf = SparkConf().setMaster("local").setAppName("rdd_study")
# getOrCreate() reuses an already-running context, so re-running this cell no
# longer fails with "Cannot run multiple SparkContexts at once". When a context
# already exists it is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

In [6]:
# Location of the input CSV. Set SPARK_REVIEW_DATA_DIR to an absolute directory
# to run this notebook on another machine; when it is unset the original local
# path is used, so existing runs behave exactly as before.
directory = os.environ.get(
    "SPARK_REVIEW_DATA_DIR",
    "/Users/robertmin/PycharmProjects/study/data_engineering/spark_review/data",
)
filename = "restaurant_reviews.csv"

In [7]:
# Read the CSV as an RDD of raw text lines using the local-filesystem scheme.
# NOTE(review): when `directory` is an absolute path (starts with "/"), this
# builds "file:////..." with four slashes; the collect() output in this
# notebook shows the read still succeeds.
lines = sc.textFile(f"file:///{directory}/{filename}")

In [8]:
# Bring every line back to the driver to inspect the raw file, header included.
lines.collect()

                                                                                

['id,item,cateogry,reviews',
 '0,짜장면,중식,125',
 '1,짬뽕,중식,235,',
 '2,김밥,분식,32',
 '3,떡볶이,분식,534',
 '4,라멘,일식,223   ',
 '5,돈가스,일식,52',
 '6,우동,일식,12',
 '7,쌀국수,아시안,312',
 '8,햄버거,패스트푸드,12',
 '9,치킨,패스트푸드,23']

In [9]:
# The first line of the file holds the column names.
header = lines.first()

                                                                                

In [10]:
# Drop the header so only data rows remain (every line equal to the header
# text is removed, wherever it appears).
filtered_lines = lines.filter(lambda line: line != header)
filtered_lines.collect()

['0,짜장면,중식,125',
 '1,짬뽕,중식,235,',
 '2,김밥,분식,32',
 '3,떡볶이,분식,534',
 '4,라멘,일식,223   ',
 '5,돈가스,일식,52',
 '6,우동,일식,12',
 '7,쌀국수,아시안,312',
 '8,햄버거,패스트푸드,12',
 '9,치킨,패스트푸드,23']

In [11]:
def parse(row):
    """Convert one CSV data row into a ``(category, reviews)`` pair.

    Args:
        row: A comma-separated line laid out as ``id,item,category,reviews``.
            Extra trailing fields (for example the empty one produced by a
            trailing comma) are ignored.

    Returns:
        A tuple ``(category, reviews)`` where ``category`` is the third field
        with surrounding whitespace removed and ``reviews`` is the fourth
        field converted to ``int``.

    Raises:
        IndexError: If the row has fewer than four comma-separated fields.
        ValueError: If the fourth field cannot be converted to ``int``.
    """
    fields = row.split(",")
    # Strip so stray padding cannot split one category into several keys.
    category = fields[2].strip()
    # int() itself tolerates surrounding whitespace, e.g. "223   ".
    reviews = int(fields[3])
    return (category, reviews)

In [13]:
# Turn each data row into a (category, reviews) pair.
categoryReviews = filtered_lines.map(parse)
categoryReviews.collect()

[('중식', 125),
 ('중식', 235),
 ('분식', 32),
 ('분식', 534),
 ('일식', 223),
 ('일식', 52),
 ('일식', 12),
 ('아시안', 312),
 ('패스트푸드', 12),
 ('패스트푸드', 23)]

In [14]:
# Pair RDD layout: key = category, value = reviews.
# Wrap each review count as (reviews, 1); mapValues() leaves the key untouched.
categoryReviewsCount = categoryReviews.mapValues(lambda reviews: (reviews, 1))
categoryReviewsCount.collect()

[('중식', (125, 1)),
 ('중식', (235, 1)),
 ('분식', (32, 1)),
 ('분식', (534, 1)),
 ('일식', (223, 1)),
 ('일식', (52, 1)),
 ('일식', (12, 1)),
 ('아시안', (312, 1)),
 ('패스트푸드', (12, 1)),
 ('패스트푸드', (23, 1))]

In [15]:
# persist() marks the parsed RDD for caching so the two actions below can reuse
# it instead of re-reading and re-parsing the file each time.
# NOTE(review): nothing in this notebook calls unpersist(); the cache lives
# until sc.stop().
categoryReviews = filtered_lines.map(parse).persist()
result1 = categoryReviews.take(10)  # first action on the persisted RDD
result2 = categoryReviews.mapValues(lambda x: (x, 1)).collect()  # second action on the same RDD

                                                                                

In [16]:
# reduce() expects a commutative and associative function; (x*2) + y is neither.
# The 26 shown below matches one left-to-right fold: ((1*2 + 2)*2 + 3)*2 + 4.
sc.parallelize([1, 2, 3, 4]).reduce(lambda x, y: (x*2) + y)

26

In [17]:
# (x*2) + y is not associative, so splitting the data into 2 partitions changes
# the answer. The 18 shown below matches [1, 2] -> 1*2 + 2 = 4 and
# [3, 4] -> 3*2 + 4 = 10 being reduced separately, then combined as 4*2 + 10.
sc.parallelize([1, 2, 3, 4], 2).reduce(lambda x, y: (x*2) + y)

18

In [18]:
# groupBy(): bucket the numbers by the remainder of division by 2.
rdd = sc.parallelize([1, 3, 5, 1, 2, 6, 8])
result = rdd.groupBy(lambda n: n % 2).collect()
# Sort the keys and each group's members so the displayed value is stable.
sorted((remainder, sorted(members)) for remainder, members in result)

[(0, [2, 6, 8]), (1, [1, 1, 3, 5])]

In [19]:
# groupByKey(): gather all values of each key, then count them with len.
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd.groupByKey().mapValues(len).collect())

[('a', 2), ('b', 1)]

In [20]:
# reduceByKey(): add up the values that share a key.
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd.reduceByKey(lambda total, value: total + value).collect())

[('a', 2), ('b', 1)]

In [21]:
# mapValues(): apply a function to the value of each pair, leaving keys unchanged.
x = sc.parallelize([("a", ["apple", "banana", "lemon"]), ("b", ["grapes"])])
def f(x):
    """Return the number of items in ``x``."""
    # The parameter `x` shadows the module-level RDD `x` inside this function.
    return len(x)
x.mapValues(f).collect()

[('a', 3), ('b', 1)]

In [22]:
# countByKey(): counts how many elements carry each key (one per key here),
# not the length of each value list.
rdd = sc.parallelize([("a", ["apple", "banana", "lemon"]), ("b", ["grapes"])])
sorted(rdd.countByKey().items())

[('a', 1), ('b', 1)]

In [23]:
# keys(): keep only the key of each (key, value) pair.
rdd = sc.parallelize([("a", ["apple", "banana", "lemon"]), ("b", ["grapes"])]).keys()
rdd.collect()

['a', 'b']

In [24]:
# Shut down the SparkContext now that the notebook is finished.
sc.stop()