# 사이킷런 모델 생성


In [6]:
from sklearn.datasets import load_iris

In [7]:
import pandas as pd
import numpy as np

In [8]:
# Load the iris dataset via scikit-learn's load_iris().
iris = load_iris()

iris_data = iris.data  # features
iris_label = iris.target  # labels

# Build a pandas DataFrame with one named column per feature and append the
# labels as an extra `target` column.
iris_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
iris_pdf = pd.DataFrame(iris_data, columns=iris_columns).assign(target=iris_label)
iris_pdf

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [9]:
# Save the pandas DataFrame as a CSV file.
from pathlib import Path

iris_csv_path = Path("./data/iris.csv")
# to_csv does not create missing parent directories, so make sure ./data exists first.
iris_csv_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the row index out of the file; only the feature/target columns are written.
iris_pdf.to_csv(iris_csv_path, index=False)

In [10]:
from sklearn.tree import DecisionTreeClassifier # estimator (the learning algorithm)
from sklearn.model_selection import train_test_split # random train/test splitter

# Hold out 20% of the rows as a test set; random_state fixes the split so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=42
)

In [11]:
# Fix the seed so the fitted tree (and therefore the predictions) is reproducible,
# consistent with the random_state=42 used for the train/test split.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)  # fit() trains the tree_clf object itself (in place)

# Predict class labels for the held-out test features.
pred = tree_clf.predict(X_test)
pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

# Spark ML 사용하기

In [12]:
from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession on the "local" master, named "tree-clf".
spark = (
    SparkSession.builder
    .master("local")
    .appName("tree-clf")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/01 12:06:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/01 12:06:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [13]:
from pathlib import Path

# Resolve the CSV that was written to ./data/iris.csv against the current working
# directory instead of hard-coding one machine's absolute path.
iris_filepath = str(Path("./data/iris.csv").resolve())

# The file:// scheme points Spark at the local filesystem. header=True takes the
# column names from the first line; inferSchema=True lets Spark infer column types.
iris_sdf = spark.read.csv(f"file://{iris_filepath}", inferSchema=True, header=True)
iris_sdf.show(5)

                                                                                

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width|target|
+------------+-----------+------------+-----------+------+
|         5.1|        3.5|         1.4|        0.2|     0|
|         4.9|        3.0|         1.4|        0.2|     0|
|         4.7|        3.2|         1.3|        0.2|     0|
|         4.6|        3.1|         1.5|        0.2|     0|
|         5.0|        3.6|         1.4|        0.2|     0|
+------------+-----------+------------+-----------+------+
only showing top 5 rows



In [17]:
# Split into train / test DataFrames with randomSplit (weights 0.8 / 0.2, fixed seed).
train_sdf, test_sdf = iris_sdf.randomSplit([0.8, 0.2], seed=42)

# For a stratified split, use sampleBy (per-label sampling fractions) instead of randomSplit:
# train_sdf = iris_sdf.sampleBy("target", fractions={0:0.8, 1:0.8, 2:0.8}, seed=42)
# test_sdf = iris_sdf.subtract(train_sdf) # iris_sdf - train_sdf (set difference)
# NOTE(review): subtract presumably has set semantics (duplicate rows collapse) — confirm before relying on it.

In [18]:
# The training data is rarely used by just one pipeline or one model; usually several
# preprocessing variants and several models are tried against it.
# Because it is reused that often, the source training DataFrame is cached.
train_sdf.cache()

DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, target: int]

# Vector Assemble
`VectorAssembler`를 이용하여 모든 `feature` 컬럼을 하나의 `feature vector`로 만드는 작업이 필요하다. (행 벡터)

In [20]:
from pyspark.ml.feature import VectorAssembler

# Columns to be merged into a single vector column
iris_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# Assembler that reads the columns above and writes them to a "features" column.
vec_assembler = VectorAssembler(inputCols=iris_columns, outputCol="features")
vec_assembler

VectorAssembler_f34155ba2083

In [21]:
# Transform - apply the VectorAssembler to the training data (adds the "features" column)

train_feature_vector_sdf = vec_assembler.transform(train_sdf)
train_feature_vector_sdf.show(5)

+------------+-----------+------------+-----------+------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|
+------------+-----------+------------+-----------+------+-----------------+
|         4.3|        3.0|         1.1|        0.1|     0|[4.3,3.0,1.1,0.1]|
|         4.4|        2.9|         1.4|        0.2|     0|[4.4,2.9,1.4,0.2]|
|         4.4|        3.2|         1.3|        0.2|     0|[4.4,3.2,1.3,0.2]|
|         4.5|        2.3|         1.3|        0.3|     0|[4.5,2.3,1.3,0.3]|
|         4.6|        3.1|         1.5|        0.2|     0|[4.6,3.1,1.5,0.2]|
+------------+-----------+------------+-----------+------+-----------------+
only showing top 5 rows



# Estimator
Spark ML의 학습 알고리즘(예: `DecisionTreeClassifier`)은 추정기(Estimator)이며, `fit()`으로 학습된 모델(예: `DecisionTreeClassificationModel`)은 feature를 받아서 prediction 컬럼을 추가하는 Transformer에 해당한다.

In [33]:
# NOTE: this import rebinds DecisionTreeClassifier, replacing the scikit-learn class imported earlier.
from pyspark.ml.classification import DecisionTreeClassifier

# Create the model. The columns used for training must be specified explicitly.

dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="target",
    maxDepth=5 # maximum depth of the decision tree
)

type(dt)

pyspark.ml.classification.DecisionTreeClassifier

In [34]:
# Train the model. fit() performs the training and returns the result as a fitted ML model.
dt_model = dt.fit(train_feature_vector_sdf)
type(dt_model)

pyspark.ml.classification.DecisionTreeClassificationModel

In [35]:
# Prediction on the test data: first inspect the raw test rows.
test_sdf.show(5)

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width|target|
+------------+-----------+------------+-----------+------+
|         4.4|        3.0|         1.3|        0.2|     0|
|         4.6|        3.2|         1.4|        0.2|     0|
|         4.6|        3.6|         1.0|        0.2|     0|
|         4.8|        3.1|         1.6|        0.2|     0|
|         4.9|        3.1|         1.5|        0.1|     0|
+------------+-----------+------------+-----------+------+
only showing top 5 rows



In [36]:
# Important: apply the very same Transformer that was used on the training data to the test set as well.
test_feature_vector_sdf = vec_assembler.transform(test_sdf)
test_feature_vector_sdf.show(5)

+------------+-----------+------------+-----------+------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|
+------------+-----------+------------+-----------+------+-----------------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|
|         4.6|        3.6|         1.0|        0.2|     0|[4.6,3.6,1.0,0.2]|
|         4.8|        3.1|         1.6|        0.2|     0|[4.8,3.1,1.6,0.2]|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|
+------------+-----------+------------+-----------+------+-----------------+
only showing top 5 rows



In [37]:
# Run the fitted model on the assembled test vectors; the result gains the
# rawPrediction, probability and prediction columns.
predictions = dt_model.transform(test_feature_vector_sdf)
predictions.show()

+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.6|        3.6|         1.0|        0.2|     0|[4.6,3.6,1.0,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.8|        3.1|         1.6|        0.2|     0|[4.8,3.1,1.6,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.0|        2.3|         3.3|        1.0|     1|[5.0,2

* `rawPrediction` : 머신러닝 모델 알고리즘 별로 다를 수 있다.
    * 머신러닝 알고리즘에 의해서 계산된 값
    * 알고리즘마다 계산 방식이 달라서 값 자체에 공통된 의미는 없다.
    * `LogisticRegression`의 경우 예측 label 별로 `sigmoid` 함수를 적용하기 전의 값
        * $ \hat{y} = \sigma(WX + b) $
        * $ WX + b $의 결과가 `rawPrediction`
* `probability` : 예측 label 별 예측 확률 값
* `prediction` : 최종 예측 label 값

# 모델 평가

In [38]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [39]:
# Evaluator that compares the `prediction` column against the `target` label column
# and reports the accuracy metric.
evaluator_accuracy = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="target",
)

# Score the predictions DataFrame; the bare expression displays the value in the notebook.
accuracy = evaluator_accuracy.evaluate(predictions)
accuracy

1.0

In [41]:
# Using LogisticRegression
from pyspark.ml.classification import LogisticRegression

# Create the ML algorithm object, capped at 10 iterations.
lr = LogisticRegression(
    maxIter=10,
    labelCol="target",
    featuresCol="features",
)

# Fit on the assembled training vectors, then score the assembled test vectors.
lr_model = lr.fit(train_feature_vector_sdf)
predictions = lr_model.transform(test_feature_vector_sdf)
predictions.show(5)

# Reuse the accuracy evaluator on the new predictions.
accuracy = evaluator_accuracy.evaluate(predictions)
print("정확도", accuracy)

23/08/01 13:14:06 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


+------------+-----------+------------+-----------+------+-----------------+--------------------+--------------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|       rawPrediction|         probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------------+--------------------+----------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|[18.6086266693530...|[0.99997762791224...|       0.0|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|[18.8180066107267...|[0.99997581287298...|       0.0|
|         4.6|        3.6|         1.0|        0.2|     0|[4.6,3.6,1.0,0.2]|[22.6963845270311...|[0.99999942608846...|       0.0|
|         4.8|        3.1|         1.6|        0.2|     0|[4.8,3.1,1.6,0.2]|[16.7506644665748...|[0.99971232954776...|       0.0|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|[17.3393987944

In [42]:
# Stop the SparkSession now that the examples are finished.
spark.stop()