## Apache Spark의 다양한 배포 모드 중 로컬 모드 (Local Mode)
```
setMaster("local[*]"):
local[*]: 사용 가능한 모든 CPU 코어를 사용.
local[2]: CPU 코어 2개 사용.
local: 단일 스레드 사용.
```

In [1]:
from pyspark import SparkConf, SparkContext
# Build the Spark configuration: local mode on a single thread, with this app name.
conf = SparkConf().setMaster("local").setAppName("restaurant-review-average")
# getOrCreate is a classmethod; calling it on the class reuses an already-running
# context instead of constructing a second one, so this cell can be re-run
# without hitting the "multiple SparkContexts" error.
spark = SparkContext.getOrCreate(conf=conf)

24/12/06 17:59:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# In-memory sample rows. Each tuple looks like (id, item, category, reviews);
# the column meaning is inferred from the values — confirm against the CSV header.
data = [
    (0, "짜장면", "중식", 125),
    (1, "짬뽕", "중식", 235),
    (2, "김밥", "분식", 32),
    (3, "떡볶이", "분식", 534),
    (4, "라멘", "일식", 223),
    (5, "돈가스", "일식", 52),
    (6, "우동", "일식", 12),
    (7, "쌀국수", "아시안", 312),
    (8, "햄버거", "패스트푸드", 12),
    (9, "치킨", "패스트푸드", 23),
]

In [3]:
# Turn the in-memory list into an RDD through the SparkContext.
rdd = spark.parallelize(data)
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [4]:
rdd.take(3)

                                                                                

[(0, '짜장면', '중식', 125), (1, '짬뽕', '중식', 235), (2, '김밥', '분식', 32)]

In [5]:
import os

# Locate the CSV under ./data relative to the current working directory.
directory = os.path.join(os.getcwd(), "data")
filename = "restaurant_reviews.csv"
filepath = os.path.join(directory, filename)
# Normalise Windows separators, then drop leading slashes before adding the
# prefix so the URI is always "file:///..." (an absolute POSIX path previously
# produced "file:////...").
lines = spark.textFile("file:///" + filepath.replace("\\", "/").lstrip("/"))
lines

file:////home/lab05/src/data/restaurant_reviews.csv MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:0

In [6]:
lines.take(5)

['id,item,cateogry,reviews,',
 '0,짜장면,중식,125,',
 '1,짬뽕,중식,235,',
 '2,김밥,분식,32,',
 '3,떡볶이,분식,534,']

In [7]:
# Take the first line as the CSV header, then keep only the lines that differ from it.
header = lines.first()
filtered_lines = lines.filter(lambda line: line != header)

In [8]:
filtered_lines.take(5)

['0,짜장면,중식,125,',
 '1,짬뽕,중식,235,',
 '2,김밥,분식,32,',
 '3,떡볶이,분식,534,',
 '4,라멘,일식,223,']

In [9]:
def parse(row):
    """Extract the category and reviews value from one comma-separated line.

    The third field is returned unchanged and the fourth is converted with
    int(), giving a (category, reviews) tuple.
    """
    columns = row.split(",")
    return columns[2], int(columns[3])

In [10]:
parse('0,짜장면,중식,125,')

('중식', 125)

# map() 으로 RDD 내 모든 row 에 parse() 를 적용해 (category, reviews) 쌍 추출

In [11]:
# Run parse() over every non-header line to get (category, reviews) pairs.
category_reviews = filtered_lines.map(parse)
category_reviews.take(5)  # peek at the first five pairs

[('중식', 125), ('중식', 235), ('분식', 32), ('분식', 534), ('일식', 223)]

In [12]:
# Pair each reviews value with a count of 1 so totals and counts can be reduced together.
category_review_count = category_reviews.mapValues(lambda reviews: (reviews, 1))
category_review_count.take(3)

[('중식', (125, 1)), ('중식', (235, 1)), ('분식', (32, 1))]

'중식' (125,1) <br> 
1. 첫번째 중식 (125,1) <br> 
2. 두번째 중식 (235,1)<br> 

x, y 는 같은 키를 가진 두 값 (예: x = (125, 1), y = (235, 1)) <br> 
리뷰 점수의 합 = x[0] + y[0] <br> 
리뷰 건수의 합 = x[1] + y[1]<br> 

In [13]:
# Per category, add the (reviews total, row count) pairs element-wise.
reduced_rdd = category_review_count.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
reduced_rdd.collect()

[Stage 6:>                                                          (0 + 1) / 1]                                                                                

[('중식', (360, 2)),
 ('분식', (566, 2)),
 ('일식', (287, 3)),
 ('아시안', (312, 1)),
 ('패스트푸드', (35, 2))]

In [14]:
# Average per category: summed reviews divided by the number of rows.
average = reduced_rdd.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)
average.collect()

[('중식', 180.0),
 ('분식', 283.0),
 ('일식', 95.66666666666667),
 ('아시안', 312.0),
 ('패스트푸드', 17.5)]

In [15]:
# 데이터 추가 > 메뉴별 합계와 평균

In [16]:
spark.stop()