In [1]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("241210_01_SparkSQL_basic1")
sc   = SparkContext(conf=conf)

24/12/10 14:50:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
movies_rdd = sc.parallelize([
    (1, ("어벤져스", "마블")),
    (2, ("슈퍼맨", "DC")),
    (3, ("배트맨", "DC")),
    (4, ("겨울왕국", "디즈니")),
    (5, ("아이언맨", "마블"))
])


attendances_rdd = sc.parallelize([
    (1, (13934592, "KR")),
    (2, (2182227,"KR")),
    (3, (4226242, "KR")),
    (4, (10303058, "KR")),
    (5, (4300365, "KR"))
])

In [3]:
# # 관객수가 500만 이상인 영화를 가져오기

# 1. 조인 > 필터 > 영화정보
# 2. 필터 > 조인 > 영화정보

In [4]:
# 1. 조인 먼저 
movies_att = movies_rdd.join(attendances_rdd)
movies_att.filter(
    lambda x:x[1][1][0] > 5000000
).collect()

                                                                                

[(4, (('겨울왕국', '디즈니'), (10303058, 'KR'))),
 (1, (('어벤져스', '마블'), (13934592, 'KR')))]

In [5]:
# 2. 필터 먼저
filtered_att = attendances_rdd.filter(lambda x:x[1][0] > 5000000)
movies_rdd.join(filtered_att).collect()

[(4, (('겨울왕국', '디즈니'), (10303058, 'KR'))),
 (1, (('어벤져스', '마블'), (13934592, 'KR')))]

In [6]:
sc.stop()

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark_sql_basic").getOrCreate()
spark

In [8]:
movies = [
    (1, "어벤져스", "마블", 2012, 4, 26),
    (2, "슈퍼맨", "DC", 2013, 6, 13),
    (3, "배트맨", "DC", 2008, 8, 6),
    (4, "겨울왕국", "디즈니", 2014, 1, 16),
    (5, "아이언맨", "마블", 2008, 4, 30)
]

In [9]:
movie_schema = ['id','name','company','year','month','day']

## 1. DataFrame 만들기

In [10]:
df= spark.createDataFrame(data= movies, schema = movie_schema)

In [11]:
df.dtypes

[('id', 'bigint'),
 ('name', 'string'),
 ('company', 'string'),
 ('year', 'bigint'),
 ('month', 'bigint'),
 ('day', 'bigint')]

In [12]:
df.show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  4|겨울왕국| 디즈니|2014|    1| 16|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [13]:
#projection
df.select("name").show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



In [14]:
df.filter(df.year >=2010).show() #column 객체

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  4|겨울왕국| 디즈니|2014|    1| 16|
+---+--------+-------+----+-----+---+



# SQL 을 사용하기 위해서 View 에 등록

In [15]:
df.createOrReplaceTempView("movies") #View Name, Table Name

In [16]:
# view 를 select하는 문장

query = '''
SELECT * FROM movies
'''
spark.sql(query).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  4|겨울왕국| 디즈니|2014|    1| 16|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [17]:
# 2010 년 이후 개봉한 영화 조회

query = '''
SELECT name 
FROM movies
WHERE year >= 2010
'''
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|겨울왕국|
+--------+



In [18]:
# 컴퍼니가 마블인 영화 목록

query = '''
SELECT name 
FROM movies
WHERE company = '마블'
'''
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|아이언맨|
+--------+



In [19]:
#-맨인 영화 추출

query = '''
SELECT name 
FROM movies
WHERE name like '%맨'
'''
spark.sql(query).show()

+--------+
|    name|
+--------+
|  슈퍼맨|
|  배트맨|
|아이언맨|
+--------+



In [20]:
#-이-인 영화 추출

query = '''
SELECT name 
FROM movies
WHERE name like '%이%'
'''
spark.sql(query).show()

+--------+
|    name|
+--------+
|아이언맨|
+--------+



In [21]:
#개봉 월이 4~8월 사이

query = '''
SELECT name 
FROM movies
WHERE month between 4 and 8
'''
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|아이언맨|
+--------+



In [22]:
# 2가지 조건 - 1. 영화 이름 -맨, 개봉 연도가 2010년 이하

query = '''
SELECT name 
FROM movies
WHERE name like '%맨' and year <= 2010
'''
spark.sql(query).show()

+--------+
|    name|
+--------+
|  배트맨|
|아이언맨|
+--------+



In [23]:
# 2가지 조건 - 1. 영화 이름 -맨, 개봉 연도가 2010년 이하

query = '''
SELECT name 
FROM movies
WHERE company in ('마블','DC') 
'''
spark.sql(query).show()

#OR 
query = '''
SELECT name 
FROM movies
WHERE company= '마블' or company='DC'
'''
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|아이언맨|
+--------+

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|아이언맨|
+--------+



In [24]:
# 회사가 "마"로 시작을 하거나, "나"로 끝나는 영화

query = '''
SELECT name 
FROM movies
WHERE company like '마%' or company like'%나'
'''
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|아이언맨|
+--------+



In [25]:
# 회사가 "마"로 시작을 하거나, "나"로 끝나는 영화 중, 2010년 이후로 개봉한 영화

query = '''
SELECT name 
FROM movies
WHERE company like '마%' or company like'%나' and year >= 2010
'''
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|아이언맨|
+--------+



In [26]:
# 개봉 연도 오름차순으로 확인

query = '''
SELECT name, year 
FROM movies
ORDER BY year ASC
'''
#ASC : ascending 오름차순, default <-> DECS: descending 내림차순

spark.sql(query).show()

+--------+----+
|    name|year|
+--------+----+
|  배트맨|2008|
|아이언맨|2008|
|어벤져스|2012|
|  슈퍼맨|2013|
|겨울왕국|2014|
+--------+----+



In [27]:
# count(*), count(name)
#mean, sum

query = '''
SELECT count(*) 
FROM movies
WHERE company like '마%' or company like'%나'
'''
spark.sql(query).show()

+--------+
|count(1)|
+--------+
|       2|
+--------+



In [31]:
attendances = [
    [1, 13934592., "KR"],
    (2, 2182227.,"KR"),
    (3, 4226242., "KR"),
    (4, 10303058., "KR"),
    (5, 4300365., "KR")
]

In [32]:
from pyspark.sql.types import StringType, IntegerType, FloatType, StructType, StructField

att_schema = StructType([ # 모든 컬럼의 타입을 통칭 - 컬럼 데이터의 집합
    StructField("id", IntegerType(), True), # StructField : 컬럼
    StructField("att", FloatType(), True),
    StructField("theater_country", StringType(), True)
])

In [33]:
# 카운트 
att_df = spark.createDataFrame( data=attendances, schema = att_schema )
att_df.dtypes

[('id', 'int'), ('att', 'float'), ('theater_country', 'string')]

In [34]:
att_df.show()

+---+-----------+---------------+
| id|        att|theater_country|
+---+-----------+---------------+
|  1|1.3934592E7|             KR|
|  2|  2182227.0|             KR|
|  3|  4226242.0|             KR|
|  4|1.0303058E7|             KR|
|  5|  4300365.0|             KR|
+---+-----------+---------------+



In [47]:
att_df.createOrReplaceTempView("att")

query = '''
SELECT movies.id, movies.name, movies.company, att.att, att.theater_country
FROM movies JOIN att ON  movies.id = att.id
'''
movies_views = spark.sql(query)
movies_views.show()



+---+--------+-------+-----------+---------------+
| id|    name|company|        att|theater_country|
+---+--------+-------+-----------+---------------+
|  5|아이언맨|   마블|  4300365.0|             KR|
|  1|어벤져스|   마블|1.3934592E7|             KR|
|  3|  배트맨|     DC|  4226242.0|             KR|
|  2|  슈퍼맨|     DC|  2182227.0|             KR|
|  4|겨울왕국| 디즈니|1.0303058E7|             KR|
+---+--------+-------+-----------+---------------+



In [49]:
# VIEW 만들기
movies_views.createOrReplaceTempView("movies_views")

query = '''
select *
from movies_views
'''

spark.sql(query).show()

+---+--------+-------+-----------+---------------+
| id|    name|company|        att|theater_country|
+---+--------+-------+-----------+---------------+
|  5|아이언맨|   마블|  4300365.0|             KR|
|  1|어벤져스|   마블|1.3934592E7|             KR|
|  3|  배트맨|     DC|  4226242.0|             KR|
|  2|  슈퍼맨|     DC|  2182227.0|             KR|
|  4|겨울왕국| 디즈니|1.0303058E7|             KR|
+---+--------+-------+-----------+---------------+



In [53]:
query = '''
select * 
from movies_views
where company = '마블'
order by id
'''
spark.sql(query).show()



+---+--------+-------+-----------+---------------+
| id|    name|company|        att|theater_country|
+---+--------+-------+-----------+---------------+
|  1|어벤져스|   마블|1.3934592E7|             KR|
|  5|아이언맨|   마블|  4300365.0|             KR|
+---+--------+-------+-----------+---------------+





In [55]:
spark.stop()
sc.stop()