# RDD 샘플 코드  

In [None]:
# test용 hdfs 데이터 생성  


## Spark context 생성  

In [1]:
# 필요 라이브러리 임포트  
import socket
import sys
import os
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
from os.path import abspath
import time 

In [55]:
# Notebook-wide settings read by the cells below.
# Target number of sample rows; create_samples() reads this global.
scale = 50 
# Number of partitions used when the sample rows are parallelized into an RDD.
partition_num = 2
# Path (no leading slash) that the sample dataset is written to and read back from.
tbl_name = 'examples/rdd-data'
# Storage format handed to both the DataFrame writer and reader.
file_format = 'parquet'

# NOTE(review): PRJ_ROOT and DB_NAME are not referenced by any cell visible in this notebook.
PRJ_ROOT = '/user/root'
# Application name passed to SparkSession.builder.appName() in spark_creation().
APP_NAME = 'EXAMPLES-RDD'
DB_NAME = 'examples'

In [3]:
# Create the Spark session.
def spark_creation():
    """Build (or reuse) the SparkSession used by this notebook.

    The session targets the 'yarn' master, is named after the module-level
    ``APP_NAME``, and requests one core / 7g for the driver and three
    executors with one core / 7g each.

    Returns:
        The SparkSession produced by ``SparkSession.builder...getOrCreate()``.
    """
    spark = (
        SparkSession.builder.master('yarn').appName(APP_NAME)
        .config('spark.rpc.message.maxSize', '1024')
        .config('spark.hadoop.dfs.replication', '1')
        .config('spark.driver.cores', '1').config('spark.driver.memory', '7g')
        # The executor count is configured through 'spark.executor.instances';
        # 'spark.num.executors' is not a Spark property and was silently ignored.
        .config('spark.executor.instances', '3')
        .config('spark.executor.cores', '1').config('spark.executor.memory', '7g')
        .getOrCreate()
    )
    return spark

In [4]:
# Create the session and show it as the cell output.
spark = spark_creation()
spark



In [56]:
# Generate the sample data.
def create_samples(setop_count=None):
    """Generate synthetic set-top rows spread evenly over ten set-top types.

    Args:
        setop_count: Target total number of rows. When omitted, the
            module-level ``scale`` is used, which matches the original
            zero-argument behaviour.

    Returns:
        A list of ``[setop_id, stype, inv_rate, inv_val, inv_req]`` rows.
        Each of the ten types receives ``setop_count // 10`` rows, so the
        result is shorter than ``setop_count`` when it is not a multiple of
        ten. A non-positive ``setop_count`` yields an empty list instead of
        dividing by zero.
    """
    if setop_count is None:
        setop_count = scale
    if setop_count <= 0:
        # Nothing to generate; also avoids 100 / 0 below.
        return []

    inv_rate = 100 / setop_count
    # Assume at most 1000 subscriptions per month.
    inv_val = 1000
    inv_req = 0
    setop_name = ['ST_A', 'ST_B', 'ST_C', 'ST_D', 'ST_E', 'ST_F', 'ST_G', 'ST_H', 'ST_I', 'ST_J']

    # Integer floor division: rows per type, remainder dropped.
    per_type = int(setop_count) // len(setop_name)
    return [
        [f'{s}_{i:07d}', s, inv_rate, inv_val, inv_req]
        for s in setop_name
        for i in range(per_type)
    ]

# Schema of the sample data, defined once so reads and writes can share it.
def define_schema():
    """Return the StructType describing one sample row.

    Columns, in order: ``setop`` and ``stype`` (strings), ``inv_rate_01``
    (float), ``inv_val_01`` and ``inv_req_01`` (longs).
    """
    from pyspark.sql.types import StructType, StructField, StringType, LongType, FloatType

    # (column name, Spark type) pairs in column order.
    field_specs = (
        ("setop", StringType()),
        ("stype", StringType()),
        ("inv_rate_01", FloatType()),
        ("inv_val_01", LongType()),
        ("inv_req_01", LongType()),
    )
    return StructType([StructField(col_name, col_type) for col_name, col_type in field_specs])

In [57]:
%%time 
# Build the sample rows and their schema, then preview the resulting DataFrame.
sample_data = create_samples()
sample_schema = define_schema()

# Distribute the rows over partition_num partitions, then apply the schema.
rdd = spark.sparkContext.parallelize(sample_data, partition_num)
df = spark.createDataFrame(rdd, sample_schema)
# Print the first 10 rows.
df.show(10)

+------------+-----+-----------+----------+----------+
|       setop|stype|inv_rate_01|inv_val_01|inv_req_01|
+------------+-----+-----------+----------+----------+
|ST_A_0000000| ST_A|        2.0|      1000|         0|
|ST_A_0000001| ST_A|        2.0|      1000|         0|
|ST_A_0000002| ST_A|        2.0|      1000|         0|
|ST_A_0000003| ST_A|        2.0|      1000|         0|
|ST_A_0000004| ST_A|        2.0|      1000|         0|
|ST_B_0000000| ST_B|        2.0|      1000|         0|
|ST_B_0000001| ST_B|        2.0|      1000|         0|
|ST_B_0000002| ST_B|        2.0|      1000|         0|
|ST_B_0000003| ST_B|        2.0|      1000|         0|
|ST_B_0000004| ST_B|        2.0|      1000|         0|
+------------+-----+-----------+----------+----------+
only showing top 10 rows

CPU times: user 11.3 ms, sys: 2 ms, total: 13.3 ms
Wall time: 151 ms


In [58]:
%%time 
# Save the sample DataFrame to tbl_name in file_format, using the 'overwrite' save mode.
write_mode = 'overwrite'
df.write.save(path=tbl_name, format=file_format, mode=write_mode)

CPU times: user 3.1 ms, sys: 0 ns, total: 3.1 ms
Wall time: 668 ms


---  

In [59]:
# Read the saved dataset back into a DataFrame.
sdf = spark.read.format(file_format).load(tbl_name)
# Mark the DataFrame to be persisted for the steps that follow.
sdf.persist()
# NOTE: this rebinds `rdd`, which the sample-generation cell assigned to the
# parallelized sample rows; from here on it is the RDD view of the loaded DataFrame.
rdd = sdf.rdd
rdd

MapPartitionsRDD[105] at javaToPython at NativeMethodAccessorImpl.java:0

In [67]:
# Turn every row into a (stype, 1) pair, then add up the ones per key to count rows per set-top type.
pairs = rdd.map(lambda s: (s['stype'], 1))
counts = pairs.reduceByKey(lambda a, b: a + b)
# A plain reduce() with the same lambda is not equivalent: it would apply `+`
# to whole (key, value) tuples instead of summing the values per key.
counts

PythonRDD[122] at RDD at PythonRDD.scala:53

In [69]:
# Collect the (stype, count) pairs into a local list and display it.
result = counts.collect()
result

[('ST_A', 5),
 ('ST_C', 5),
 ('ST_H', 5),
 ('ST_I', 5),
 ('ST_J', 5),
 ('ST_F', 5),
 ('ST_G', 5),
 ('ST_B', 5),
 ('ST_D', 5),
 ('ST_E', 5)]

In [70]:
%%time 
# spark pandas 이용하기 
import pyspark.pandas as ps

# psdf = ps.from_pandas(pdf)  
psdf = sdf.to_pandas_on_spark()
psdf.to_spark_io('zoo.parq', format="parquet")
ps.read_spark_io('zoo.parq', format="parquet").head(10)



CPU times: user 226 ms, sys: 40.7 ms, total: 267 ms
Wall time: 2.12 s


Unnamed: 0,setop,stype,inv_rate_01,inv_val_01,inv_req_01
0,ST_A_0000000,ST_A,2.0,1000,0
1,ST_A_0000001,ST_A,2.0,1000,0
2,ST_A_0000002,ST_A,2.0,1000,0
3,ST_A_0000003,ST_A,2.0,1000,0
4,ST_A_0000004,ST_A,2.0,1000,0
5,ST_B_0000000,ST_B,2.0,1000,0
6,ST_B_0000001,ST_B,2.0,1000,0
7,ST_B_0000002,ST_B,2.0,1000,0
8,ST_B_0000003,ST_B,2.0,1000,0
9,ST_B_0000004,ST_B,2.0,1000,0


# 정지  

In [71]:
# Stop the Spark session created at the top of the notebook.
spark.stop()