In [1]:
## 설치
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [773 kB]
Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,898 kB]
Get:13 http://archive.ubuntu.com/ubuntu j

In [2]:
## 초기화
import findspark
findspark.init()
findspark.find()

'/usr/local/lib/python3.10/dist-packages/pyspark'

In [3]:
import pyspark
from pyspark.sql import DataFrame, SparkSession

In [4]:
## Session 생성
spark = (
    SparkSession
    .builder
    .appName("First Session")
    .getOrCreate()
)

spark

In [5]:
spark.sparkContext.getConf().getAll()

[('spark.app.startTime', '1711273036143'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),
 ('spark.executor.id', 'driver'),
 ('spark.sql.warehouse.dir

In [6]:
conf = spark.sparkContext._conf.setAll([
     ('spark.executor.memory', '2g'),
     ('spark.app.name', 'Spark Updated Conf'),
     ('spark.executor.cores', '4'),
     ('spark.cores.max', '4'),
     ('spark.driver.memory','2g')
    ])

## Session 생성
spark = (
    SparkSession
    .builder
    .config(conf=conf)
    .getOrCreate()
)

spark

In [7]:
spark.sparkContext.getConf().getAll()

[('spark.app.startTime', '1711273036143'),
 ('spark.executor.memory', '2g'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),
 ('spark.executor.id', 'dr

In [8]:
## df 생성
df = spark.range(500).toDF("number")

In [9]:
df.printSchema()

root
 |-- number: long (nullable = false)



In [10]:
df.show()
# Java Virtual Machine과 통신하는 거기 때문에 속도가 빠를 수 없다.

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows



In [11]:
df.toPandas()

Unnamed: 0,number
0,0
1,1
2,2
3,3
4,4
...,...
495,495
496,496
497,497
498,498


# Transformation

In [12]:
df.select("number")

DataFrame[number: bigint]

In [13]:
import pyspark.sql.functions as f

In [14]:
# 덧셈과 곱셈
df_ = df.select((f.col("number") * 2 + 10).alias("number"))
df_

DataFrame[number: bigint]

In [15]:
# 새로운 컬럼 추가
df = df.withColumn('number2', df["number"] * 2 + 10)
df.show()

+------+-------+
|number|number2|
+------+-------+
|     0|     10|
|     1|     12|
|     2|     14|
|     3|     16|
|     4|     18|
|     5|     20|
|     6|     22|
|     7|     24|
|     8|     26|
|     9|     28|
|    10|     30|
|    11|     32|
|    12|     34|
|    13|     36|
|    14|     38|
|    15|     40|
|    16|     42|
|    17|     44|
|    18|     46|
|    19|     48|
+------+-------+
only showing top 20 rows



In [16]:
df.select(df["number"].cast("short").alias('number_short'), "number2")

DataFrame[number_short: smallint, number2: bigint]

In [17]:
df.filter(df['number'] > 3)

DataFrame[number: bigint, number2: bigint]

# SparkSQL

In [18]:
# View 생성
df.createOrReplaceTempView("df_number")

In [19]:
spark.sql("""SELECT * FROM df_number;""").show()

+------+-------+
|number|number2|
+------+-------+
|     0|     10|
|     1|     12|
|     2|     14|
|     3|     16|
|     4|     18|
|     5|     20|
|     6|     22|
|     7|     24|
|     8|     26|
|     9|     28|
|    10|     30|
|    11|     32|
|    12|     34|
|    13|     36|
|    14|     38|
|    15|     40|
|    16|     42|
|    17|     44|
|    18|     46|
|    19|     48|
+------+-------+
only showing top 20 rows



In [21]:
spark.sql("""SELECT * FROM df_number where number <10;""").show()
# lazy computation이므로 show가 없으면 결과가 나오지 않음.

+------+-------+
|number|number2|
+------+-------+
|     0|     10|
|     1|     12|
|     2|     14|
|     3|     16|
|     4|     18|
|     5|     20|
|     6|     22|
|     7|     24|
|     8|     26|
|     9|     28|
+------+-------+



# Action
- 연산을 하도록 억지로 trigger하는 것

In [22]:
df.collect()
# 리스트 안에 row들이 들어가 있다.

[Row(number=0, number2=10),
 Row(number=1, number2=12),
 Row(number=2, number2=14),
 Row(number=3, number2=16),
 Row(number=4, number2=18),
 Row(number=5, number2=20),
 Row(number=6, number2=22),
 Row(number=7, number2=24),
 Row(number=8, number2=26),
 Row(number=9, number2=28),
 Row(number=10, number2=30),
 Row(number=11, number2=32),
 Row(number=12, number2=34),
 Row(number=13, number2=36),
 Row(number=14, number2=38),
 Row(number=15, number2=40),
 Row(number=16, number2=42),
 Row(number=17, number2=44),
 Row(number=18, number2=46),
 Row(number=19, number2=48),
 Row(number=20, number2=50),
 Row(number=21, number2=52),
 Row(number=22, number2=54),
 Row(number=23, number2=56),
 Row(number=24, number2=58),
 Row(number=25, number2=60),
 Row(number=26, number2=62),
 Row(number=27, number2=64),
 Row(number=28, number2=66),
 Row(number=29, number2=68),
 Row(number=30, number2=70),
 Row(number=31, number2=72),
 Row(number=32, number2=74),
 Row(number=33, number2=76),
 Row(number=34, number2=

In [23]:
df.collect()[0]

Row(number=0, number2=10)

In [24]:
df.collect()[0].number

0

In [25]:
df.collect()[0].number2

10

In [26]:
# take는 head
df.take(4)

[Row(number=0, number2=10),
 Row(number=1, number2=12),
 Row(number=2, number2=14),
 Row(number=3, number2=16)]

In [27]:
spark.range(2).collect()

[Row(id=0), Row(id=1)]

In [28]:
df.write.parquet("content/df_dummy.parquet",mode="overwrite")

In [30]:
import pandas as pd
pd.read_parquet('/content/content/df_dummy.parquet/part-00001-e2c42200-7bb9-427e-9997-e7da1a8ef2f7-c000.snappy.parquet')

Unnamed: 0,number,number2
0,250,510
1,251,512
2,252,514
3,253,516
4,254,518
...,...,...
245,495,1000
246,496,1002
247,497,1004
248,498,1006


In [31]:
df.rdd.getNumPartitons()

AttributeError: 'RDD' object has no attribute 'getNumPartitons'

In [29]:
df.toPandas()

Unnamed: 0,number,number2
0,0,10
1,1,12
2,2,14
3,3,16
4,4,18
...,...,...
495,495,1000
496,496,1002
497,497,1004
498,498,1006


In [32]:
spark.read.parquet("content/df_dummy.parquet")

DataFrame[number: bigint, number2: bigint]

# UDF
- 판다스 코드를 쉽게 인식하는 UDF

In [33]:
from pyspark.sql.functions import PandasUDFType
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import IntegerType
import pandas as pd


# @pandas_udf(IntegerType(), PandasUDFType.SCALAR)
@pandas_udf("short")
def double(number:pd.Series) -> pd.Series:

    return number*2

df.select(double("number")).show()

+--------------+
|double(number)|
+--------------+
|             0|
|             2|
|             4|
|             6|
|             8|
|            10|
|            12|
|            14|
|            16|
|            18|
|            20|
|            22|
|            24|
|            26|
|            28|
|            30|
|            32|
|            34|
|            36|
|            38|
+--------------+
only showing top 20 rows



In [34]:
@pandas_udf("bigint")
def double(number:pd.Series) -> pd.Series:

    return number*2

df.select(double("number")).show()
# short과 bigint 둘 다 numeric
# 여기에 string이 들어가면 오류!

+--------------+
|double(number)|
+--------------+
|             0|
|             2|
|             4|
|             6|
|             8|
|            10|
|            12|
|            14|
|            16|
|            18|
|            20|
|            22|
|            24|
|            26|
|            28|
|            30|
|            32|
|            34|
|            36|
|            38|
+--------------+
only showing top 20 rows

