In [1]:
# Show the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to display a DataFrame in the middle of a cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [27]:
# Spark SQL types and column functions used throughout the notebook.
# NOTE(review): the wildcard imports are kept because cells outside this view
# may rely on names they provide. Be aware that pyspark.sql.functions rebinds
# Python builtins such as `sum`, `min` and `max`; prefer
# `from pyspark.sql import functions as F` in new code.
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
# These two imports were previously listed twice; only the final occurrence is
# kept, after the wildcard imports above, so the resulting bindings are
# exactly the same as before.
from datetime import date, datetime
from pyspark.sql import *

In [4]:
# Optional one-time setup, left commented out on purpose.
# pandas is imported by this notebook; pyarrow is presumably wanted for the
# Spark <-> pandas conversions further down (TODO confirm).
# NOTE(review): if this is ever enabled, `%pip install` is preferable to
# `!pip install` because it targets the running kernel's environment.
# !pip install pandas
# !pip install pyarrow

# 1. DataFrame 생성

- SparkSession 객체를 사용해 DataFrame을 생성할 수 있다.
- SparkSession 객체는 pyspark shell을 실행하면 `spark`라는 이름으로 미리 생성된다. (pyspark shell이 아닌 환경에서는 `SparkSession.builder.getOrCreate()`로 직접 생성해야 한다.)



## Row 객체를 사용해 생성하기

- `Row` : DataFrame의 한 행(row)을 나타내는 객체

In [26]:
# One Row per person; the keyword names of each Row become the column names
# and Spark infers the column types from the values.
people_rows = [
    Row(name='이서정', age=23, birth=date(1999, 8, 3)),
    Row(name='하명도', age=21, birth=date(2001, 9, 6)),
    Row(name='서상윤', age=28, birth=date(1995, 6, 22)),
]
df = spark.createDataFrame(people_rows)

df.show()

+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|이서정| 23|1999-08-03|
|하명도| 21|2001-09-06|
|서상윤| 28|1995-06-22|
+------+---+----------+



In [10]:
# Inspect the schema of df (column names, types and nullability)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- birth: date (nullable = true)



## schema를 명시하여 DataFrame 생성

In [13]:
# The rows are plain tuples, which carry no field names, so the schema is
# supplied explicitly as a DDL-formatted string.
student_rows = [
    ('김경민', 17, date(2005, 10, 11)),
    ('김도은', 18, date(2004, 12, 25)),
    ('김민석', 11, date(2012, 1, 10)),
]
ddl_schema = "name string, age int, birth date"

df2 = spark.createDataFrame(student_rows, schema=ddl_schema)
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- birth: date (nullable = true)



## StructType 객체를 사용해 Schema 지정

In [25]:
# Build the schema field by field; the third argument (False) marks each
# column as non-nullable.
schema = (
    StructType()
    .add("name", StringType(), False)
    .add("age", IntegerType(), False)
    .add("birth", DateType(), False)
    .add("is_pass", BooleanType(), False)
)

# Tuple positions line up with the schema fields in order.
df3 = spark.createDataFrame(
    [
        ('손지수', 22, date(2001, 6, 8), True),
        ('유승종', 24, date(1999, 8, 9), True),
        ('윤병우', 23, date(2001, 3, 3), True),
    ],
    schema=schema,
)

df3.show()

[Stage 17:>                                                         (0 + 1) / 1]

+------+---+----------+-------+
|  name|age|     birth|is_pass|
+------+---+----------+-------+
|손지수| 22|2001-06-08|   true|
|유승종| 24|1999-08-09|   true|
|윤병우| 23|2001-03-03|   true|
+------+---+----------+-------+



                                                                                

## 중첩 스키마 적용

In [24]:
# The last element of every row is a 3-tuple that maps onto the nested
# "phone" struct defined below.
data = [
    ("이상엽", 21, date(2002, 11, 11), ('010', '1111', '2222')),
    ("이선희", 26, date(1997, 10, 3), ('010', '1111', '2222')),
    ("정주연", 23, date(2000, 4, 19), ('010', '1111', '2222')),
]

# Inner struct: three non-nullable string parts of a phone number.
phone_type = StructType([
    StructField(part, StringType(), False)
    for part in ("phone1", "phone2", "phone3")
])

# Outer struct: the "phone" column nests phone_type and carries a metadata
# description.
schema = StructType([
    StructField("name", StringType(), False),
    StructField("age", IntegerType(), False),
    StructField("birth", DateType(), False),
    StructField("phone", phone_type, False, metadata={'desc': 'user phone number'}),
])

df4 = spark.createDataFrame(data=data, schema=schema)
df4.show()
df4.printSchema()

# Serialize the schema (including field metadata) to a JSON string.
schema_json = df4.schema.json()
print(schema_json)

+------+---+----------+-----------------+
|  name|age|     birth|            phone|
+------+---+----------+-----------------+
|이상엽| 21|2002-11-11|{010, 1111, 2222}|
|이선희| 26|1997-10-03|{010, 1111, 2222}|
|정주연| 23|2000-04-19|{010, 1111, 2222}|
+------+---+----------+-----------------+

root
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)
 |-- birth: date (nullable = false)
 |-- phone: struct (nullable = false)
 |    |-- phone1: string (nullable = false)
 |    |-- phone2: string (nullable = false)
 |    |-- phone3: string (nullable = false)

{"fields":[{"metadata":{},"name":"name","nullable":false,"type":"string"},{"metadata":{},"name":"age","nullable":false,"type":"integer"},{"metadata":{},"name":"birth","nullable":false,"type":"date"},{"metadata":{"desc":"user phone number"},"name":"phone","nullable":false,"type":{"fields":[{"metadata":{},"name":"phone1","nullable":false,"type":"string"},{"metadata":{},"name":"phone2","nullable":false,"type":"string"},{"metada

## Pandas DataFrame으로 생성

In [29]:
# Column-oriented input for pandas: one list of values per column.
people_columns = {
    'name': ['정현진', '한병현', '홍효정'],
    'age': [20, 21, 22],
    'birth': [date(2000, 1, 1), date(2001, 2, 2), date(2002, 5, 5)],
}
pandas_df = pd.DataFrame(people_columns)
# Bare expression: rendered as cell output before the Spark table below.
pandas_df

# Create a Spark DataFrame straight from the pandas DataFrame.
df5 = spark.createDataFrame(pandas_df)
df5.show()

Unnamed: 0,name,age,birth
0,정현진,20,2000-01-01
1,한병현,21,2001-02-02
2,홍효정,22,2002-05-05


+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|정현진| 20|2000-01-01|
|한병현| 21|2001-02-02|
|홍효정| 22|2002-05-05|
+------+---+----------+



## DataFrame -> Pandas

In [30]:
# Convert the Spark DataFrame df5 into a pandas DataFrame.
# NOTE(review): per the PySpark docs, toPandas() loads the whole result into
# the driver's memory, so it is only appropriate for small data like this.
pandas_df2 = df5.toPandas()
pandas_df2

                                                                                

Unnamed: 0,name,age,birth
0,정현진,20,2000-01-01
1,한병현,21,2001-02-02
2,홍효정,22,2002-05-05


## DataFrame -> pyspark.pandas


In [31]:
# Convert the Spark DataFrame df5 into a pandas-on-Spark (pyspark.pandas)
# DataFrame.
# `DataFrame.to_pandas_on_spark()` is deprecated in favour of
# `DataFrame.pandas_api()` in newer PySpark releases, so prefer the new name
# and fall back to the old one only where the new one is not available.
if hasattr(df5, "pandas_api"):
    pandas_df3 = df5.pandas_api()
else:
    pandas_df3 = df5.to_pandas_on_spark()
pandas_df3

                                                                                

Unnamed: 0,name,age,birth
0,정현진,20,2000-01-01
1,한병현,21,2001-02-02
2,홍효정,22,2002-05-05


## 외부파일을 사용해 DataFrame 생성

In [34]:
# Read the CSV file, using its first line as the column names.
class_df = (
    spark.read
    .option("header", True)
    .csv('/dataframe/a_class_info.csv')
)
# Preview only the first three rows.
class_df.show(3)

[Stage 30:>                                                         (0 + 1) / 1]

+--------+------+-------------+--------+-----------+-------------+
|class_cd|school|class_std_cnt|     loc|school_type|teaching_type|
+--------+------+-------------+--------+-----------+-------------+
|     6OL| ANKYI|           20|   Urban| Non-public|     Standard|
|     ZNS| ANKYI|           21|   Urban| Non-public|     Standard|
|     2B1| CCAAW|           18|Suburban| Non-public| Experimental|
+--------+------+-------------+--------+-----------+-------------+
only showing top 3 rows



                                                                                

## DataFrame 컬럼

- withColumn

+------+---+----------+-----------------+--------+
|  name|age|     birth|            phone|우승여부|
+------+---+----------+-----------------+--------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|        |
|이제동| 20|2021-07-22|{010, 2222, 3333}|        |
|김명운| 25|2020-07-22|{010, 4444, 5555}|        |
|홍진호| 36|2018-07-22|{010, 3333, 4444}|        |
+------+---+----------+-----------------+--------+

+------+---+----------+-----------------+--------+
|  name|age|     birth|            phone|우승여부|
+------+---+----------+-----------------+--------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|    우승|
|이제동| 20|2021-07-22|{010, 2222, 3333}|    우승|
|김명운| 25|2020-07-22|{010, 4444, 5555}|    우승|
|홍진호| 36|2018-07-22|{010, 3333, 4444}|    우승|
+------+---+----------+-----------------+--------+

+------+---+----------+-----------------+---------+
|  name|age|     birth|            phone|   연령대|
+------+---+----------+-----------------+---------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|     10대|
|이제동| 20|2021-07-22|{

### column 내용 변경

+------+---+----------+-----------------+------+
|  name|age|     birth|            phone|연령대|
+------+---+----------+-----------------+------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|어린이|
|이제동| 20|2021-07-22|{010, 2222, 3333}|  청년|
|김명운| 25|2020-07-22|{010, 4444, 5555}|  청년|
|홍진호| 36|2018-07-22|{010, 3333, 4444}|  성인|
+------+---+----------+-----------------+------+



### column 이름 변경

+------+---+----------+-----------------+------+
|  name|age|     birth|            phone|  분류|
+------+---+----------+-----------------+------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|어린이|
|이제동| 20|2021-07-22|{010, 2222, 3333}|  청년|
|김명운| 25|2020-07-22|{010, 4444, 5555}|  청년|
|홍진호| 36|2018-07-22|{010, 3333, 4444}|  성인|
+------+---+----------+-----------------+------+



### column 삭제

+------+---+----------+-----------------+
|  name|age|     birth|            phone|
+------+---+----------+-----------------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|
|이제동| 20|2021-07-22|{010, 2222, 3333}|
|김명운| 25|2020-07-22|{010, 4444, 5555}|
|홍진호| 36|2018-07-22|{010, 3333, 4444}|
+------+---+----------+-----------------+

