# 타이타닉 데이터 분석 

- 데이터 출처 : https://www.kaggle.com/competitions/titanic/data

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("241212_01_MLlib_classification").getOrCreate()
spark

24/12/12 15:36:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Data Load

In [2]:
df = spark.read.format("csv")\
    .option("header",'true')\
    .option('inferSchema', 'true')\
    .load("data/titanic.csv")

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [3]:
df.show()

+--------+------+------+----+-----+-----+-------+-----------+
|survived|pclass|   sex| age|sibsp|parch|   fare|embark_town|
+--------+------+------+----+-----+-----+-------+-----------+
|       0|     3|  male|22.0|    1|    0|   7.25|Southampton|
|       1|     1|female|38.0|    1|    0|71.2833|  Cherbourg|
|       1|     3|female|26.0|    0|    0|  7.925|Southampton|
|       1|     1|female|35.0|    1|    0|   53.1|Southampton|
|       0|     3|  male|35.0|    0|    0|   8.05|Southampton|
|       0|     3|  male|null|    0|    0| 8.4583| Queenstown|
|       0|     1|  male|54.0|    0|    0|51.8625|Southampton|
|       0|     3|  male| 2.0|    3|    1| 21.075|Southampton|
|       1|     3|female|27.0|    0|    2|11.1333|Southampton|
|       1|     2|female|14.0|    1|    0|30.0708|  Cherbourg|
|       1|     3|female| 4.0|    1|    1|   16.7|Southampton|
|       1|     1|female|58.0|    0|    0|  26.55|Southampton|
|       0|     3|  male|20.0|    0|    0|   8.05|Southampton|
|       

In [4]:
df.printSchema()

root
 |-- survived: integer (nullable = true)
 |-- pclass: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: double (nullable = true)
 |-- sibsp: integer (nullable = true)
 |-- parch: integer (nullable = true)
 |-- fare: double (nullable = true)
 |-- embark_town: string (nullable = true)



# Missing Value

In [5]:
from pyspark.sql.functions import col, sum, isnan, when

# 결측치 제거

null_counts = df.select(
    [
    sum(when(col(c).isNull() | isnan(c),1).otherwise(0)).alias(c) for c in df.columns
    ]
)

null_counts.show()

+--------+------+---+---+-----+-----+----+-----------+
|survived|pclass|sex|age|sibsp|parch|fare|embark_town|
+--------+------+---+---+-----+-----+----+-----------+
|       0|     0|  0|177|    0|    0|   0|          2|
+--------+------+---+---+-----+-----+----+-----------+



# Feature Selection

In [6]:
# feature - ?? , target - survived 
data = df.select("survived", "pclass","sex","age","sibsp","parch","fare")
data.show()

+--------+------+------+----+-----+-----+-------+
|survived|pclass|   sex| age|sibsp|parch|   fare|
+--------+------+------+----+-----+-----+-------+
|       0|     3|  male|22.0|    1|    0|   7.25|
|       1|     1|female|38.0|    1|    0|71.2833|
|       1|     3|female|26.0|    0|    0|  7.925|
|       1|     1|female|35.0|    1|    0|   53.1|
|       0|     3|  male|35.0|    0|    0|   8.05|
|       0|     3|  male|null|    0|    0| 8.4583|
|       0|     1|  male|54.0|    0|    0|51.8625|
|       0|     3|  male| 2.0|    3|    1| 21.075|
|       1|     3|female|27.0|    0|    2|11.1333|
|       1|     2|female|14.0|    1|    0|30.0708|
|       1|     3|female| 4.0|    1|    1|   16.7|
|       1|     1|female|58.0|    0|    0|  26.55|
|       0|     3|  male|20.0|    0|    0|   8.05|
|       0|     3|  male|39.0|    1|    5| 31.275|
|       0|     3|female|14.0|    0|    0| 7.8542|
|       1|     2|female|55.0|    0|    0|   16.0|
|       0|     3|  male| 2.0|    4|    1| 29.125|


In [12]:
# 평균값으로 대체 
from pyspark.sql.functions import avg  # avg 함수를 임포트

#mean_age = data.select("age").agg({"age":"mean"}).collect()[0][0]
mean_age = data.select(avg("age")).collect()[0][0] #- 같은 값 나옴
mean_age

29.699117647058763

In [13]:
data = data.fillna({"age":mean_age})
data.show()

+--------+------+------+-----------------+-----+-----+-------+
|survived|pclass|   sex|              age|sibsp|parch|   fare|
+--------+------+------+-----------------+-----+-----+-------+
|       0|     3|  male|             22.0|    1|    0|   7.25|
|       1|     1|female|             38.0|    1|    0|71.2833|
|       1|     3|female|             26.0|    0|    0|  7.925|
|       1|     1|female|             35.0|    1|    0|   53.1|
|       0|     3|  male|             35.0|    0|    0|   8.05|
|       0|     3|  male|29.69911764705882|    0|    0| 8.4583|
|       0|     1|  male|             54.0|    0|    0|51.8625|
|       0|     3|  male|              2.0|    3|    1| 21.075|
|       1|     3|female|             27.0|    0|    2|11.1333|
|       1|     2|female|             14.0|    1|    0|30.0708|
|       1|     3|female|              4.0|    1|    1|   16.7|
|       1|     1|female|             58.0|    0|    0|  26.55|
|       0|     3|  male|             20.0|    0|    0| 

# Encoding
1. category type -> numeric
- SpringIndexer()

In [9]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

indexer = StringIndexer(inputCol="sex", outputCol="SexIndex")
data = indexer.fit(data).transform(data)
data.show(5)

+--------+------+------+----+-----+-----+-------+--------+
|survived|pclass|   sex| age|sibsp|parch|   fare|SexIndex|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|     0.0|
|       1|     1|female|38.0|    1|    0|71.2833|     1.0|
|       1|     3|female|26.0|    0|    0|  7.925|     1.0|
|       1|     1|female|35.0|    1|    0|   53.1|     1.0|
|       0|     3|  male|35.0|    0|    0|   8.05|     0.0|
+--------+------+------+----+-----+-----+-------+--------+
only showing top 5 rows



In [10]:
data.printSchema()

root
 |-- survived: integer (nullable = true)
 |-- pclass: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: double (nullable = false)
 |-- sibsp: integer (nullable = true)
 |-- parch: integer (nullable = true)
 |-- fare: double (nullable = true)
 |-- SexIndex: double (nullable = false)



# Feature Vector

In [14]:
# target 을 제외한 학습을 위한 피처를 어셈블 한다. 
assembler = VectorAssembler(
    inputCols=["pclass","SexIndex","age","sibsp","parch","fare"],
    outputCol="features"
)
data = assembler.transform(data)
data

DataFrame[survived: int, pclass: int, sex: string, age: double, sibsp: int, parch: int, fare: double, SexIndex: double, feature: vector, features: vector]

In [28]:
#지도학습, 분류모델을 학습시키기 위한 데이터
data.select("survived", "features").show(5)

+--------+--------------------+
|survived|            features|
+--------+--------------------+
|       0|[3.0,0.0,22.0,1.0...|
|       1|[1.0,1.0,38.0,1.0...|
|       1|[3.0,1.0,26.0,0.0...|
|       1|[1.0,1.0,35.0,1.0...|
|       0|[3.0,0.0,35.0,0.0...|
+--------+--------------------+
only showing top 5 rows



# ML 모델 : 데이터 학습 > 평가 > 모델 완성

# dataset 분할

In [29]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42 ) # 리턴하는 값 목록1, 목록2 

In [33]:
train_data.show(10), test_data.show(10)

+--------+------+------+----+-----+-----+--------+--------+--------------------+--------------------+
|survived|pclass|   sex| age|sibsp|parch|    fare|SexIndex|             feature|            features|
+--------+------+------+----+-----+-----+--------+--------+--------------------+--------------------+
|       0|     1|female| 2.0|    1|    2|  151.55|     1.0|[1.0,1.0,2.0,1.0,...|[1.0,1.0,2.0,1.0,...|
|       0|     1|female|25.0|    1|    2|  151.55|     1.0|[1.0,1.0,25.0,1.0...|[1.0,1.0,25.0,1.0...|
|       0|     1|  male|18.0|    1|    0|   108.9|     0.0|[1.0,0.0,18.0,1.0...|[1.0,0.0,18.0,1.0...|
|       0|     1|  male|19.0|    1|    0|    53.1|     0.0|[1.0,0.0,19.0,1.0...|[1.0,0.0,19.0,1.0...|
|       0|     1|  male|19.0|    3|    2|   263.0|     0.0|[1.0,0.0,19.0,3.0...|[1.0,0.0,19.0,3.0...|
|       0|     1|  male|22.0|    0|    0|135.6333|     0.0|[1.0,0.0,22.0,0.0...|[1.0,0.0,22.0,0.0...|
|       0|     1|  male|24.0|    0|    1|247.5208|     0.0|[1.0,0.0,24.0,0.0...|[1

(None, None)

# 분류 예측 model 생성 - 로지스틱 회귀

In [86]:
from pyspark.ml.classification import LogisticRegression

In [87]:
# 로지스틱 회귀 모델 생성

lr = LogisticRegression(featuresCol="features", labelCol="survived")

In [88]:
# 모델 학습
lr_model = lr.fit(train_data) #80% data 를 이용해서 y값을 결정할 수 있도록 매개 변수를 확정

In [103]:
# 모델 시험 = 테스트 데이터 > 결과 확인 - 데이털에 답이 들어 있다 > 모델이 맞춘 y값과 실제 데이터에 들어간 값을 비교하면 알 수 있다
predictions = lr_model.transform(test_data)
predictions.show(5)

+--------+------+------+-----------------+-----+-----+-------+--------+--------------------+--------------------+--------------------+--------------------+----------+
|survived|pclass|   sex|              age|sibsp|parch|   fare|SexIndex|             feature|            features|       rawPrediction|         probability|prediction|
+--------+------+------+-----------------+-----+-----+-------+--------+--------------------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|             50.0|    0|    0|28.7125|     1.0|[1.0,1.0,50.0,0.0...|[1.0,1.0,50.0,0.0...|[-1.9520246347246...|[0.12433276014445...|       1.0|
|       0|     1|  male|             21.0|    0|    1|77.2875|     0.0|[1.0,0.0,21.0,0.0...|[1.0,0.0,21.0,0.0...|[-0.5063684917057...|[0.37604522093222...|       1.0|
|       0|     1|  male|             24.0|    0|    0|   79.2|     0.0|[1.0,0.0,24.0,0.0...|[1.0,0.0,24.0,0.0...|[-0.5000163743656...|[0.37753682076914...|       1.0

In [104]:
predictions.select("feature","features","rawPrediction","probability","prediction","survived").show(5)

+--------------------+--------------------+--------------------+--------------------+----------+--------+
|             feature|            features|       rawPrediction|         probability|prediction|survived|
+--------------------+--------------------+--------------------+--------------------+----------+--------+
|[1.0,1.0,50.0,0.0...|[1.0,1.0,50.0,0.0...|[-1.9520246347246...|[0.12433276014445...|       1.0|       0|
|[1.0,0.0,21.0,0.0...|[1.0,0.0,21.0,0.0...|[-0.5063684917057...|[0.37604522093222...|       1.0|       0|
|[1.0,0.0,24.0,0.0...|[1.0,0.0,24.0,0.0...|[-0.5000163743656...|[0.37753682076914...|       1.0|       0|
|[1.0,0.0,29.0,0.0...|[1.0,0.0,29.0,0.0...|[-0.1615623337462...|[0.45969704539401...|       1.0|       0|
|[1.0,0.0,29.69911...|[1.0,0.0,29.69911...|[-0.1231865060676...|[0.46924225915865...|       1.0|       0|
+--------------------+--------------------+--------------------+--------------------+----------+--------+
only showing top 5 rows



In [91]:
predictions.select("features","prediction","survived").tail(5)

[Row(features=DenseVector([3.0, 0.0, 29.6991, 0.0, 0.0, 56.4958]), prediction=0.0, survived=1),
 Row(features=DenseVector([3.0, 0.0, 29.6991, 2.0, 0.0, 23.25]), prediction=0.0, survived=1),
 Row(features=DenseVector([3.0, 0.0, 31.0, 0.0, 0.0, 7.925]), prediction=0.0, survived=1),
 Row(features=DenseVector([3.0, 0.0, 32.0, 0.0, 0.0, 56.4958]), prediction=0.0, survived=1),
 Row(features=DenseVector([3.0, 0.0, 39.0, 0.0, 0.0, 7.925]), prediction=0.0, survived=1)]

# 정답 개수 확인 
- prediction 과 survived 같은지 아닌지 체크 

In [92]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import expr

In [99]:
predictions = predictions.withColumn("survived", col("survived").cast(IntegerType()))
predictions = predictions.withColumn("prediction", col("survived").cast(IntegerType()))

In [106]:
# correct column 추가 

compare = predictions.withColumn("correct", expr("case when survived=prediction then 1 else 0 end"))
compare.where("correct=0").count() #틀린 개수 확인

28

In [107]:
predictions.select("features","prediction","survived").tail(5)

[Row(features=DenseVector([3.0, 0.0, 29.6991, 0.0, 0.0, 56.4958]), prediction=0.0, survived=1),
 Row(features=DenseVector([3.0, 0.0, 29.6991, 2.0, 0.0, 23.25]), prediction=0.0, survived=1),
 Row(features=DenseVector([3.0, 0.0, 31.0, 0.0, 0.0, 7.925]), prediction=0.0, survived=1),
 Row(features=DenseVector([3.0, 0.0, 32.0, 0.0, 0.0, 56.4958]), prediction=0.0, survived=1),
 Row(features=DenseVector([3.0, 0.0, 39.0, 0.0, 0.0, 7.925]), prediction=0.0, survived=1)]

# 정확도 계산 > 모델 점수 

In [108]:
accu = compare.selectExpr("avg(correct) as accuracy").collect()[0]['accuracy']
accu

0.8068965517241379

# 평가기 evaluator 

In [109]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [110]:
# 모델별로 평가지표, 매트릭
# 분류모델의 평가지표 중에서 AUC 계산한다 

evaluator = BinaryClassificationEvaluator(labelCol="survived",rawPredictionCol="rawPrediction", metricName="areaUnderROC")
evaluator

BinaryClassificationEvaluator_5c462c97ebc5

In [111]:
auc = evaluator.evaluate(predictions)
auc

0.8664129586260734

In [112]:
spark.stop()