In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("titanic_train").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/29 00:02:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
filepath = "/home/ubuntu/working/spark-examples/data/titanic_train.csv"
titanic_sdf = spark.read.csv(filepath, inferSchema=True, header=True)

titanic_sdf.show(5)

                                                                                

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [3]:
titanic_pdf = titanic_sdf.select("*").toPandas()
titanic_pdf.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 데이터 조작
- 데이터 프레임에 대한 삽입 및 수정, 삭제

## Pandas DataFrame에서 데이터 조작하기
- 컬럼에 대한 삽입과 수정을 쉽게 할 수 있다. `[ ]`을 활용해서...

In [4]:
import numpy as np

titanic_pdf_copy = titanic_pdf.copy()

# Fare에 10을 곱해서 Extra_Fare라는 컬럼에 집어넣기
titanic_pdf_copy["Extra_Fare"] = titanic_pdf_copy['Fare'] * 10
titanic_pdf_copy.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Extra_Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,72.5
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,712.833
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,79.25
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,531.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,80.5


In [5]:
# 데이터 수정. Fare에 20 더하기
titanic_pdf_copy["Fare"] = titanic_pdf_copy["Fare"] + 20
titanic_pdf_copy.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Extra_Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,27.25,,S,72.5
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,91.2833,C85,C,712.833
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,27.925,,S,79.25
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,73.1,C123,S,531.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,28.05,,S,80.5


## Spark Dataframed 데이터 조작⭐️
- `withColumn()` 메소드를 이용하여 기존 컬럼 값을 수정, 타입 변경, 신규 컬럼을 추가한다.
    - `withColumn('신규 또는 업데이트 되는 컬럼명', '신규 또는 업데이트 되는 값')`
- 신규 또는 업데이트 되는 값을 생성 시에 기존 컬럼을 기반으로 한다면,
    - 신규 컬럼은 **문자열로** 지정
    - 기존 컬럼은 **`col`** 을 사용한다.
- 신규 컬럼을 추가하는 것은 `select`로도 가능
- 컬럼명 변경은 `withColumnRename()` 메소드 사용

In [6]:
from pyspark.sql.functions import col

titanic_sdf_copy = titanic_sdf.select("*")

# Fare에 10을 곱해서 Extra_Fare 컬럼에 집어 넣기
titanic_sdf_copy = titanic_sdf_copy\
                    .withColumn("Extra_Fare", col("Fare") * 10)

titanic_sdf_copy.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Extra_Fare|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|      72.5|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|   712.833|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|     79.25|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|     531.0|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|    

In [7]:
# 기존 컬럼 Update
titanic_sdf_copy = titanic_sdf_copy.withColumn("Fare", col("Fare") + 20)
titanic_sdf_copy.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Extra_Fare|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|  27.25| null|       S|      72.5|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|91.2833|  C85|       C|   712.833|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282| 27.925| null|       S|     79.25|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   73.1| C123|       S|     531.0|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|  28.05| null|       S|    

In [9]:
# 컬럼 타입 변환
titanic_sdf_copy = titanic_sdf_copy\
                    .withColumn("Fare", col("Fare").cast("Integer"))
titanic_sdf_copy.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|Fare|Cabin|Embarked|Extra_Fare|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|  27| null|       S|      72.5|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  91|  C85|       C|   712.833|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  27| null|       S|     79.25|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|  73| C123|       S|     531.0|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|  28| null|       S|      80.5|
+-----------+---

## 리터럴
- 프로그래밍 언어에서 코드에 등장하는 직접적인 값들을 literal이라고 한다.
- 리터럴은 상수

In [10]:
# pandas에서 리터럴로 데이터를 삽입하거나 수정하기
titanic_pdf_copy["Extra_Fare"] = 10
titanic_pdf_copy.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Extra_Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,27.25,,S,10
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,91.2833,C85,C,10
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,27.925,,S,10
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,73.1,C123,S,10
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,28.05,,S,10


In [13]:
# spark에서 리터럴로 데이터를 삽입하거나 수정하기
titanic_sdf_copy = titanic_sdf_copy.withColumn("Extra_Fare", 10)
titanic_sdf_copy.show(5)

PySparkTypeError: [NOT_COLUMN] Argument `col` should be a Column, got int.

In [14]:
# 상숫값으로 업데이트 하려면 반드시 lit 함수를 사용
from pyspark.sql.functions import lit

titanic_sdf_copy = titanic_sdf_copy.withColumn("Extra_Fare", lit(10))
titanic_sdf_copy.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|Fare|Cabin|Embarked|Extra_Fare|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|  27| null|       S|        10|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  91|  C85|       C|        10|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  27| null|       S|        10|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|  73| C123|       S|        10|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|  28| null|       S|        10|
+-----------+---

In [15]:
titanic_sdf_copy = titanic_sdf_copy.withColumn("New_Column", lit("테스트 컬럼입니다."))
titanic_sdf_copy.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|Fare|Cabin|Embarked|Extra_Fare|        New_Column|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|  27| null|       S|        10|테스트 컬럼입니다.|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  91|  C85|       C|        10|테스트 컬럼입니다.|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  27| null|       S|        10|테스트 컬럼입니다.|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|  73| C123|       S|        10|테스트 컬럼입니다.|
|          5|       0|     3|Allen, Mr

In [17]:
# select를 활용해서 컬럼 업데이트 하기
from pyspark.sql.functions import substring

# 컬럼 추가
# SQL : select *, Embarked as E from titanic_sdf_copy;
titanic_sdf_copy = titanic_sdf_copy\
                    .select("*", col("Embarked").alias("E"))

titanic_sdf_copy = titanic_sdf_copy\
                    .select("*", substring("Cabin", 0, 1)\
                                    .alias("Cabin_Section")
                    )

titanic_sdf_copy.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+---+---+-------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|Fare|Cabin|Embarked|Extra_Fare|        New_Column|  E|  E|Cabin_Section|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+---+---+-------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|  27| null|       S|        10|테스트 컬럼입니다.|  S|  S|         null|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  91|  C85|       C|        10|테스트 컬럼입니다.|  C|  C|            C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  27| null|       S|        10|테스트 컬럼입니다.|  S|  S|         null|
|          4|       1|     1|Futrelle, M

In [18]:
from pyspark.sql.functions import split

# first_name 컬럼과 last_name 컬럼을 추가. withColumn 사용
# split 활용하기. split 하고 getItem(0), getItem(1)

titanic_sdf_copy = titanic_sdf_copy\
                    .withColumn("first_name", split(col("Name"), ",").getItem(0))\
                    .withColumn("last_name", split(col("Name"), ",").getItem(1))

titanic_sdf_copy.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+---+---+-------------+----------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|Fare|Cabin|Embarked|Extra_Fare|        New_Column|  E|  E|Cabin_Section|first_name|           last_name|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+---+---+-------------+----------+--------------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|  27| null|       S|        10|테스트 컬럼입니다.|  S|  S|         null|    Braund|     Mr. Owen Harris|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  91|  C85|       C|        10|테스트 컬럼입니다.|  C|  C|            C|   Cumings| Mrs. John Bradle...|
|          3|       1|     3|Heikkin

# 컬럼 이름 변경하기

In [19]:
titanic_sdf_copy = titanic_sdf_copy\
                    .withColumnRenamed("New_Column", "새로운 컬럼")
titanic_sdf_copy.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+---+---+-------------+----------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|Fare|Cabin|Embarked|Extra_Fare|       새로운 컬럼|  E|  E|Cabin_Section|first_name|           last_name|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+---+---+-------------+----------+--------------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|  27| null|       S|        10|테스트 컬럼입니다.|  S|  S|         null|    Braund|     Mr. Owen Harris|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  91|  C85|       C|        10|테스트 컬럼입니다.|  C|  C|            C|   Cumings| Mrs. John Bradle...|
|          3|       1|     3|Heikkinen, M

In [20]:
# 없는 컬럼의 이름을 집어 넣어도 에러가 안난다.
titanic_sdf_copy = titanic_sdf_copy\
                    .withColumnRenamed("없는 컬럼 이름", "오류 안남")

titanic_sdf_copy.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+---+---+-------------+----------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|Fare|Cabin|Embarked|Extra_Fare|       새로운 컬럼|  E|  E|Cabin_Section|first_name|           last_name|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+----------+------------------+---+---+-------------+----------+--------------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|  27| null|       S|        10|테스트 컬럼입니다.|  S|  S|         null|    Braund|     Mr. Owen Harris|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  91|  C85|       C|        10|테스트 컬럼입니다.|  C|  C|            C|   Cumings| Mrs. John Bradle...|
|          3|       1|     3|Heikkinen, M

# Spark Dataframe의 컬럼, 로우(레코드) 삭제
- pandas의 데이터프레임은 `drop` 메소드를 사용. 행과 열 모두 삭제
- spark 데이터프레임에도 `drop` 메소드를 사용. 컬럼만 삭제 가능
    - 여러 개의 컬럼을 삭제 할 때 리스트 사용 불가
- spark에서는 데이터(row)의 삭제가 원칙적으로는 불가능.
    - 데이터 삭제가 없는 대신에 `filter`를 이용해서 필요한 것만 추출

In [21]:
titanic_pdf_drop = titanic_pdf.drop("Name", axis=1)
titanic_pdf_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int32  
 1   Survived     891 non-null    int32  
 2   Pclass       891 non-null    int32  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int32  
 6   Parch        891 non-null    int32  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int32(5), object(4)
memory usage: 59.3+ KB


In [23]:
titanic_sdf_drop = titanic_sdf.drop(col("Name"))
titanic_sdf_drop.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [24]:
# Pclass=1인 row를 삭제. 하지만 실제로는 삭제가 아닌 Pclass != 1 인 데이터만 가져오는 것
titanic_removed_pclass_1 = titanic_sdf.filter(col("Pclass") != 1)
titanic_removed_pclass_1.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|  Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|  7.25| null|       S|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282| 7.925| null|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|  8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|          330877|8.4583| null|       Q|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1|          349909|21.075| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+--------------

# Spark Dataframe의 dropna

In [26]:
# 레코드에 하나라도 Null 또는 NaN값이 있으면 삭제한 결과 Dataframe이 반환된다.
print("Dropna 이전 : ", titanic_sdf.count())
titanic_sdf_dropna_1 = titanic_sdf.dropna()
print("Dropna 이후 : ", titanic_sdf_dropna_1.count())

Dropna 이전 :  891
Dropna 이후 :  183


In [27]:
type(titanic_sdf_dropna_1)

pyspark.sql.dataframe.DataFrame

In [30]:
print("dropna 이전 : ", titanic_sdf.count())
titanic_sdf_dropna_2 = titanic_sdf.na.drop()
print("dropna 이후 : ", titanic_sdf_dropna_2.count())
print(type(titanic_sdf.na))

dropna 이전 :  891
dropna 이후 :  183
<class 'pyspark.sql.dataframe.DataFrameNaFunctions'>


```SQL
select *
from titanic_sdf
where Name is not null
  and age is not null
  and Embarked is not null
  ...
```

In [31]:
# 특정 컬럼을 지정하여 거기에 Null이 있는 경우에만 삭제
titanic_sdf_dropna_3 = titanic_sdf.na.drop(subset=["Age", "Embarked"])
titanic_sdf_dropna_3.count()

712

```SQL
select *
from titanic_sdf
where age is not null
  and Embarked is not null
```

In [32]:
# filter 사용하기
# isNotNull

titanic_sdf.filter(
    col("Age").isNotNull() & col("Embarked").isNotNull()
).count()

712

In [33]:
# dropna() 메소드를 로직으로 구현. 
where_str = ''
column_count = len(titanic_sdf.columns)
for index, column_name in enumerate(titanic_sdf.columns):
    where_str += (column_name +' IS NOT NULL ') 
    if index < column_count - 1:
        where_str += 'and '
print(where_str)

PassengerId IS NOT NULL and Survived IS NOT NULL and Pclass IS NOT NULL and Name IS NOT NULL and Sex IS NOT NULL and Age IS NOT NULL and SibSp IS NOT NULL and Parch IS NOT NULL and Ticket IS NOT NULL and Fare IS NOT NULL and Cabin IS NOT NULL and Embarked IS NOT NULL 


In [34]:
titanic_sdf.filter(where_str).count()

183

# Pandas와 Spark에서의 None, Null, NaN

In [35]:
a = None
print(type(a), a)

<class 'NoneType'> None


In [37]:
titanic_sdf.show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [38]:
titanic_pdf.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [39]:
import pandas as pd
import numpy as np

# None은 Object형 array에만 사용이 가능
array = np.array([0, 1, 2, 'hello', None])
print(array, array.dtype)

[0 1 2 'hello' None] object


In [40]:
# 숫자 형식의 array에서는 None이 NaN으로 바뀐다.
array = np.array([0, 1, 2, None], dtype=np.float32)
print(array, array.dtype)

[ 0.  1.  2. nan] float32


In [43]:
# Pandas 1.x 버전에서만 사용 가능! pandas 2.x과 Spark가 호환이 안됨.
# spark 3.2 버전에서는 NaN과 None을 전부 null로 만들어 준다.
titanic_sdf_from_pdf = spark.createDataFrame(titanic_pdf)
titanic_sdf_from_pdf.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male| NaN|    0|    0|      

[Stage 56:>                                                         (0 + 1) / 1]                                                                                

In [48]:
print(titanic_pdf[["Age", "Cabin"]].head(10))

    Age Cabin
0  22.0  None
1  38.0   C85
2  26.0  None
3  35.0  C123
4  35.0  None
5   NaN  None
6  54.0   E46
7   2.0  None
8  27.0  None
9  14.0  None


In [49]:
print(titanic_pdf[["Age", "Cabin"]].isna().head(10))

     Age  Cabin
0  False   True
1  False  False
2  False   True
3  False  False
4  False   True
5   True   True
6  False  False
7  False   True
8  False   True
9  False   True


In [50]:
print(titanic_pdf[["Age", "Cabin"]].isnull().head(10))

     Age  Cabin
0  False   True
1  False  False
2  False   True
3  False  False
4  False   True
5   True   True
6  False  False
7  False   True
8  False   True
9  False   True


In [54]:
# Spark에서 null값 검사는 isNull, isnan
# isNull -> col()에서 사용
# isnan -> functions
from pyspark.sql.functions import isnan

titanic_sdf.filter(col("Age").isNull()).show(10)
titanic_sdf.filter(isnan(col("Age"))).show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|    Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----+--------+
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|  330877|  8.4583| null|       Q|
|         18|       1|     2|Williams, Mr. Cha...|  male|null|    0|    0|  244373|    13.0| null|       S|
|         20|       1|     3|Masselmani, Mrs. ...|female|null|    0|    0|    2649|   7.225| null|       C|
|         27|       0|     3|Emir, Mr. Farred ...|  male|null|    0|    0|    2631|   7.225| null|       C|
|         29|       1|     3|"O'Dwyer, Miss. E...|female|null|    0|    0|  330959|  7.8792| null|       Q|
|         30|       0|     3| Todoroff, Mr. Lalio|  male|null|    0|    0|  349216|  7.8958| null|       S|
|         32|       1|     1

In [55]:
# pandas -> spark. 숫자 형태의 데이터가 NaN 취급을 받음. isnan 사용 가능
# spark 에서는 숫자 형태의 데이터가 null 취급 받음. isnan 사용 불가

# Null이 있는 컬럼명과 Null 개수 찾기

In [56]:
titanic_sdf.select(
    [ col(column_name).isNull() for column_name in titanic_sdf.columns ]
).show(20)

+---------------------+------------------+----------------+--------------+-------------+-------------+---------------+---------------+----------------+--------------+---------------+------------------+
|(PassengerId IS NULL)|(Survived IS NULL)|(Pclass IS NULL)|(Name IS NULL)|(Sex IS NULL)|(Age IS NULL)|(SibSp IS NULL)|(Parch IS NULL)|(Ticket IS NULL)|(Fare IS NULL)|(Cabin IS NULL)|(Embarked IS NULL)|
+---------------------+------------------+----------------+--------------+-------------+-------------+---------------+---------------+----------------+--------------+---------------+------------------+
|                false|             false|           false|         false|        false|        false|          false|          false|           false|         false|           true|             false|
|                false|             false|           false|         false|        false|        false|          false|          false|           false|         false|          false|          

In [63]:
from pyspark.sql.functions import count, when

titanic_sdf.select(
    [ count(when(col(c).isNull(), c)).alias(c) for c in titanic_sdf.columns ]
).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



# null값 처리하기

In [65]:
titanic_pdf['Age']=titanic_pdf['Age'].fillna(titanic_pdf['Age'].mean())
titanic_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int32  
 1   Survived     891 non-null    int32  
 2   Pclass       891 non-null    int32  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int32  
 7   Parch        891 non-null    int32  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int32(5), object(5)
memory usage: 66.3+ KB


In [66]:
# 스파크 데이터프레임에서 결측치 채우기
titanic_sdf.fillna(value=999).show(10)

+-----------+--------+------+--------------------+------+-----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|  Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+-----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|999.0|    0|   

In [67]:
titanic_sdf.fillna("NA").show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25|   NA|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925|   NA|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05|   NA|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [69]:
# Age에 대한 결측치를 Age에 대한 평균으로 처리
from pyspark.sql.functions import avg

avg_age = titanic_sdf.select(avg(col("Age")))
avg_age.show()
type(avg_age)

+-----------------+
|         avg(Age)|
+-----------------+
|29.69911764705882|
+-----------------+



pyspark.sql.dataframe.DataFrame

In [70]:
# fillna 수행 시에 value에 데이터프레임이 들어가면 오류
titanic_sdf.fillna(value=avg_age, subset=["Age"])

PySparkTypeError: [NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_STR] Argument `value` should be a bool, dict, float, int or str, got DataFrame.

In [73]:
avg_age_row = avg_age.collect()[0]
avg_age_row, type(avg_age_row)

(Row(avg(Age)=29.69911764705882), pyspark.sql.types.Row)

In [74]:
avg_age_value = avg_age_row[0]
avg_age_value

29.69911764705882

In [76]:
titanic_sdf.fillna(value=avg_age_value, subset=["Age"]).show()

+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|              Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|             22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|             38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|             26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|             35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|             35.0|    0|    0|          373450|

In [77]:
spark.stop()