# RDD & pandas 방식 차감 성능 테스트  
- rdd cache를 이용한 전체 차감 속도 측정(stream server 안에서)  
- pandas 이용한 로컬 차감 속도 측정  


In [8]:
# 필요 라이브러리 임포트  
import socket
import sys
import os
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
from os.path import abspath
# import findspark
import time 
import numpy as np 
import pandas as pd
import pyarrow 

In [3]:
# # !pip install jupyter-resource-usage jupyterlab-system-monitor jupyterlab-topbar-extension
# !pip install jupyterlab-topbar jupyterlab-topbar-text jupyterlab-topbar-extension

In [9]:
# 환경변수 정의  
scale = 1000 # 1000 만 건 수준
PRJ_ROOT = '/user/root'
APP_NAME = 'RDD-Pandas'
DB_NAME = 'inven'

# 데이터의 파일 포맷 및 파일명  
tbl_setop_name = 'inven/table-set-6m-20-1000'
file_format = 'parquet'

In [16]:
# 스파크 생성 
def spark_creation():
    spark = SparkSession.builder.master('yarn').appName(APP_NAME)\
    .config('spark.rpc.message.maxSize', '1024')\
    .config('spark.sql.execution.arrow.enabled', 'true')\
    .config('spark.driver.cores', '1').config('spark.driver.memory', '7g')\
    .config('spark.num.executors', '3')\
    .config('spark.executor.cores', '1').config('spark.executor.memory', '7g')\
    .config('spark.jars', '/hive-bin/lib/mysql-connector-java-5.1.49-bin.jar')\
    .config('spark.driver.extraClassPath', '/hive-bin/lib/mysql-connector-java-5.1.49-bin.jar').getOrCreate()
    #     .config('spark.sql.execution.arrow.enabled', 'true')\
    # spark.rpc.message.maxSize  240007497 
    sc = spark.sparkContext
    sc
    return spark

In [17]:
%%time
# Create (or reuse) the Spark session via spark_creation(); %%time reports how
# long the call takes. The trailing bare name echoes the session as cell output.
spark = spark_creation()
spark

CPU times: user 15.7 ms, sys: 8.45 ms, total: 24.1 ms
Wall time: 13.6 s




### 데이터 적재  

In [5]:
%%time
# Load the inventory reference data: read the data set at `tbl_setop_name` in
# `file_format` and register it as the temp view `setop_view`.
spark.read.format(file_format).load(tbl_setop_name).createOrReplaceTempView('setop_view')
# Mark the view for in-memory caching.
# NOTE(review): catalog.cacheTable is presumably lazy, so the data would only
# be materialized by the first action that scans the view -- confirm before
# comparing timings across cells.
spark.catalog.cacheTable("setop_view")
# Echo the cache flag as the cell's result.
spark.catalog.isCached('setop_view')

CPU times: user 4.72 ms, sys: 1.98 ms, total: 6.7 ms
Wall time: 4.57 s


True

In [6]:
%%time
# Pull up to 3,000,000 rows of `setop_view` into a local pandas DataFrame.
# Author's measurement: toPandas took about 80 s (2 cols).
# NOTE(review): the `.rdd.toDF()` round trip rebuilds the DataFrame from its
# RDD before toPandas(); it looks redundant, but it may be the RDD path this
# benchmark is meant to time -- confirm before removing.
df = spark.sql("select * from setop_view limit 3000000 ").rdd.toDF().toPandas()

CPU times: user 20.8 s, sys: 1.62 s, total: 22.4 s
Wall time: 1min 1s


In [7]:
# Show the (rows, columns) shape of the pandas DataFrame `df`.
df.shape

(3000000, 19)

In [8]:
%%time
# Write the pandas DataFrame to the parquet file "out-parq.par".
# Author's measurement: about 1 s for 5 million rows.
df.to_parquet("out-parq.par")

CPU times: user 867 ms, sys: 75.8 ms, total: 943 ms
Wall time: 872 ms


In [19]:
%%time
# pdf 를 파키 테이블로 쓰기 : par-out 경로 만들고 사용해야 함.  
# 500만 1초  
table = pyarrow.Table.from_pandas(df)
pyarrow.parquet.write_to_dataset(table , root_path="par-out")

CPU times: user 834 ms, sys: 47.7 ms, total: 882 ms
Wall time: 816 ms


In [18]:
%%time
# Read the parquet data under "par-out" back into a pandas DataFrame.
# Author's measurement: about 2.5 s for 12 million rows.
df2 = pd.read_parquet("par-out")
# Echo the (rows, columns) shape as the cell's result.
df2.shape

CPU times: user 761 ms, sys: 452 ms, total: 1.21 s
Wall time: 1 s


(3000000, 19)

In [6]:
# Preview the first rows of the DataFrame read back from parquet.
df2.head()

Unnamed: 0,setop,inv_rate_01,inv_val_01,inv_req_01,inv_rate_02,inv_val_02,inv_req_02,inv_rate_03,inv_val_03,inv_req_03,inv_rate_04,inv_val_04,inv_req_04,inv_rate_05,inv_val_05,inv_req_05,inv_rate_06,inv_val_06,inv_req_06
0,ST_A_0000000,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0
1,ST_A_0000001,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0
2,ST_A_0000002,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0
3,ST_A_0000003,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0
4,ST_A_0000004,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0,1e-05,1000,0


In [19]:
%%time
# Convert the pandas DataFrame `df2` into a Spark DataFrame and print 5 rows.
# Author's notes: rpc.maxsize has to be raised for the data to be serializable;
# about 4.5 s when called with 3 million rows.
# Even when used as a global variable...
sdf = spark.createDataFrame(df2)
sdf.show(5)

+------------+--------------------+----------+----------+--------------------+----------+----------+--------------------+----------+----------+--------------------+----------+----------+--------------------+----------+----------+--------------------+----------+----------+
|       setop|         inv_rate_01|inv_val_01|inv_req_01|         inv_rate_02|inv_val_02|inv_req_02|         inv_rate_03|inv_val_03|inv_req_03|         inv_rate_04|inv_val_04|inv_req_04|         inv_rate_05|inv_val_05|inv_req_05|         inv_rate_06|inv_val_06|inv_req_06|
+------------+--------------------+----------+----------+--------------------+----------+----------+--------------------+----------+----------+--------------------+----------+----------+--------------------+----------+----------+--------------------+----------+----------+
|ST_A_0000000|9.999999747378752E-6|      1000|         0|9.999999747378752E-6|      1000|         0|9.999999747378752E-6|      1000|         0|9.999999747378752E-6|      1000|      

In [13]:
# Stop the Spark session now that the measurements are finished.
spark.stop()