# 24. 고급 분석과 머신러닝
## 24.4 MLlib 실제 사용하기
- 범주형 레이블 1, 범주형 변수 1, 수치형 변수 2 데이터로 실습

In [2]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("ML examples")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Set the number of partitions used for shuffles to 5.
# The option name is "spark.sql.shuffle.partitions" (plural); the singular
# spelling is not a recognized Spark SQL option, so setting it has no effect
# on shuffle behavior.
spark.conf.set("spark.sql.shuffle.partitions", 5)

In [3]:
# List the files in the simple-ml sample dataset directory.
dbutils.fs.ls("/databricks-datasets/definitive-guide/data/simple-ml/")

In [4]:
# Load every JSON file in the simple-ml directory into one DataFrame.
df = (
    spark.read
    .format("json")
    .load("/databricks-datasets/definitive-guide/data/simple-ml/")
)

# Preview the first 10 rows.
display(df.limit(10))

color,lab,value1,value2
green,good,1,14.386294994851127
blue,bad,8,14.386294994851127
blue,bad,12,14.386294994851127
green,good,15,38.97187133755819
green,good,12,14.386294994851127
green,bad,16,14.386294994851127
red,good,35,14.386294994851127
red,bad,1,38.97187133755819
red,bad,2,14.386294994851127
red,bad,16,14.386294994851127


In [5]:
# Show 20 rows after sorting by value2 (orderBy sorts ascending by default).
display(df.orderBy("value2").limit(20))

color,lab,value1,value2
red,bad,16,14.386294994851127
blue,bad,12,14.386294994851127
green,good,1,14.386294994851127
blue,bad,8,14.386294994851127
blue,bad,8,14.386294994851127
green,good,12,14.386294994851127
blue,bad,8,14.386294994851127
green,bad,16,14.386294994851127
red,good,35,14.386294994851127
red,good,35,14.386294994851127


## 24.4.1 변환자를 사용해 피처 엔지니어링
- 모든 입력변수는 Double, Vector[Double] 타입으로 구성 필요
- RFormula를 사용해 쉽게 변환 가능
- fit -> transform

In [7]:
from pyspark.ml.feature import RFormula

# RFormula transformer: "lab" is the label, "." pulls in all remaining columns,
# and the two "color:..." terms add interactions between color and each
# numeric column.
supervised = RFormula().setFormula("lab ~ . + color:value1 + color:value2")

# fit() produces the fitted transformer; transform() appends the "features"
# and "label" columns to the input rows.
fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)

# Preview the first 10 prepared rows.
display(preparedDF.limit(10))

color,lab,value1,value2,features,label
green,good,1,14.386294994851127,"List(0, 10, List(1, 2, 3, 5, 8), List(1.0, 1.0, 14.386294994851129, 1.0, 14.386294994851129))",1.0
blue,bad,8,14.386294994851127,"List(0, 10, List(2, 3, 6, 9), List(8.0, 14.386294994851129, 8.0, 14.386294994851129))",0.0
blue,bad,12,14.386294994851127,"List(0, 10, List(2, 3, 6, 9), List(12.0, 14.386294994851129, 12.0, 14.386294994851129))",0.0
green,good,15,38.97187133755819,"List(0, 10, List(1, 2, 3, 5, 8), List(1.0, 15.0, 38.97187133755819, 15.0, 38.97187133755819))",1.0
green,good,12,14.386294994851127,"List(0, 10, List(1, 2, 3, 5, 8), List(1.0, 12.0, 14.386294994851129, 12.0, 14.386294994851129))",1.0
green,bad,16,14.386294994851127,"List(0, 10, List(1, 2, 3, 5, 8), List(1.0, 16.0, 14.386294994851129, 16.0, 14.386294994851129))",0.0
red,good,35,14.386294994851127,"List(0, 10, List(0, 2, 3, 4, 7), List(1.0, 35.0, 14.386294994851129, 35.0, 14.386294994851129))",1.0
red,bad,1,38.97187133755819,"List(0, 10, List(0, 2, 3, 4, 7), List(1.0, 1.0, 38.97187133755819, 1.0, 38.97187133755819))",0.0
red,bad,2,14.386294994851127,"List(0, 10, List(0, 2, 3, 4, 7), List(1.0, 2.0, 14.386294994851129, 2.0, 14.386294994851129))",0.0
red,bad,16,14.386294994851127,"List(0, 10, List(0, 2, 3, 4, 7), List(1.0, 16.0, 14.386294994851129, 16.0, 14.386294994851129))",0.0


- 데이터 분할 : 0.7, 0.3

In [9]:
# Randomly split the prepared data into training and test sets using
# weights 0.7 and 0.3.
# NOTE(review): no seed is passed, so the split differs from run to run.
train, test = preparedDF.randomSplit([0.7, 0.3])

## 24.4.2 추정자(Estimator)

In [11]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression estimator reading the "label" and "features" columns.
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

# Print every parameter of the estimator with its description and value.
print(lr.explainParams())

In [12]:
# Train the logistic regression model on the training split.
fittedLR = lr.fit(train)

# Show true labels next to predictions. Note that the predictions are made on
# the same split the model was trained on, not on held-out data.
display(fittedLR.transform(train).select("label", "prediction"))

label,prediction
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0


## 24.4.3 워크플로 파이프라인
- transform 객체 혹은 모델 객체가 다른 파이프라인에서 재사용되지 않도록 함

In [14]:
from pyspark.ml import Pipeline


# Split the raw (untransformed) data; the RFormula stage now runs inside the
# pipeline, so it must not be applied beforehand.
train, test = df.randomSplit([0.7, 0.3])

# Fresh stage instances for this pipeline. The formula is left unset here and
# is supplied later through the parameter grid.
rForm = RFormula()
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Feature transformation first, then the classifier.
stages = [rForm, lr]
pipeline = Pipeline(stages=stages)

### 24.4.4 모델 학습 및 평가
- 다양한 하이퍼파라미터 테스트 사례
  - 두 개 버전의 RFormula
  - 세 개의 서로 다른 옵션의 ElasticNet 파라미터
  - 두 개의 서로 다른 옵션의 정규화(regularization) 파라미터

In [16]:
from pyspark.ml.tuning import ParamGridBuilder

# Hyperparameter grid: 2 formulas x 3 elastic-net values x 2 regularization
# strengths = 12 parameter combinations.
params = (
    ParamGridBuilder()
    .addGrid(
        rForm.formula,
        [
            "lab ~ . + color:value1",
            "lab ~ . + color:value1 + color:value2",
        ],
    )
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .addGrid(lr.regParam, [0.1, 0.2])
    .build()
)

In [17]:
# 평가
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve, computed from the model's raw scores.
# The "rawPrediction" column carries the continuous scores needed to trace the
# ROC curve across thresholds. Pointing the evaluator at the hard 0/1
# "prediction" column instead reduces the curve to a single operating point
# and discards the ranking information the metric is meant to capture.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="label",
)

In [18]:
from pyspark.ml.tuning import TrainValidationSplit

# Train/validation search over the parameter grid: the pipeline is fitted for
# each parameter map and scored with the evaluator, with trainRatio=0.75
# controlling how much of the input is used for training versus validation.
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    trainRatio=0.75,
)

In [19]:
# Run the train/validation search on the training split.
tvsFitted = tvs.fit(train)

In [20]:
# Apply the fitted search model to the test split and show the first 20 rows.
display(tvsFitted.transform(test).limit(20))

color,lab,value1,value2,features,label,rawPrediction,probability,prediction
blue,bad,8,14.386294994851127,"List(0, 7, List(2, 3, 6), List(8.0, 14.386294994851129, 8.0))",0.0,"List(1, 2, List(), List(0.9825105809506951, -0.9825105809506951))","List(1, 2, List(), List(0.7276060863443826, 0.27239391365561744))",0.0
blue,bad,8,14.386294994851127,"List(0, 7, List(2, 3, 6), List(8.0, 14.386294994851129, 8.0))",0.0,"List(1, 2, List(), List(0.9825105809506951, -0.9825105809506951))","List(1, 2, List(), List(0.7276060863443826, 0.27239391365561744))",0.0
blue,bad,8,14.386294994851127,"List(0, 7, List(2, 3, 6), List(8.0, 14.386294994851129, 8.0))",0.0,"List(1, 2, List(), List(0.9825105809506951, -0.9825105809506951))","List(1, 2, List(), List(0.7276060863443826, 0.27239391365561744))",0.0
blue,bad,8,14.386294994851127,"List(0, 7, List(2, 3, 6), List(8.0, 14.386294994851129, 8.0))",0.0,"List(1, 2, List(), List(0.9825105809506951, -0.9825105809506951))","List(1, 2, List(), List(0.7276060863443826, 0.27239391365561744))",0.0
blue,bad,12,14.386294994851127,"List(0, 7, List(2, 3, 6), List(12.0, 14.386294994851129, 12.0))",0.0,"List(1, 2, List(), List(1.0481760005149288, -1.0481760005149288))","List(1, 2, List(), List(0.7404244874216859, 0.2595755125783141))",0.0
blue,bad,12,14.386294994851127,"List(0, 7, List(2, 3, 6), List(12.0, 14.386294994851129, 12.0))",0.0,"List(1, 2, List(), List(1.0481760005149288, -1.0481760005149288))","List(1, 2, List(), List(0.7404244874216859, 0.2595755125783141))",0.0
blue,bad,12,14.386294994851127,"List(0, 7, List(2, 3, 6), List(12.0, 14.386294994851129, 12.0))",0.0,"List(1, 2, List(), List(1.0481760005149288, -1.0481760005149288))","List(1, 2, List(), List(0.7404244874216859, 0.2595755125783141))",0.0
green,bad,16,14.386294994851127,"List(1, 7, List(), List(0.0, 1.0, 16.0, 14.386294994851129, 0.0, 16.0, 0.0))",0.0,"List(1, 2, List(), List(-0.7218464587179891, 0.7218464587179891))","List(1, 2, List(), List(0.3269865097972553, 0.6730134902027447))",1.0
green,bad,16,14.386294994851127,"List(1, 7, List(), List(0.0, 1.0, 16.0, 14.386294994851129, 0.0, 16.0, 0.0))",0.0,"List(1, 2, List(), List(-0.7218464587179891, 0.7218464587179891))","List(1, 2, List(), List(0.3269865097972553, 0.6730134902027447))",1.0
green,bad,16,14.386294994851127,"List(1, 7, List(), List(0.0, 1.0, 16.0, 14.386294994851129, 0.0, 16.0, 0.0))",0.0,"List(1, 2, List(), List(-0.7218464587179891, 0.7218464587179891))","List(1, 2, List(), List(0.3269865097972553, 0.6730134902027447))",1.0


In [21]:
# Score the fitted search model on the test split with the evaluator's metric.
evaluator.evaluate(tvsFitted.transform(test))

### 24.4.5 모델 저장 및 적용
- 특정 알고리즘에 대한 '모델' 버전을 사용하여 디스크에 저장된 모델을 불러와야 함
  - CrossValidator는 CrossValidatorModel이 저장된 버전을 읽어야 하는 방식

In [23]:
# List the contents of /tmp before saving the model there.
dbutils.fs.ls("/tmp")

In [24]:
# Save the best model found by the search, overwriting anything already stored
# at /tmp/model.
tvsFitted.bestModel.write().overwrite().save("/tmp/model")

In [25]:
from pyspark.ml import PipelineModel

# Load the saved model back from disk. The estimator was a Pipeline, so the
# stored artifact is read with PipelineModel.
bestModel = PipelineModel.load("/tmp/model")

In [26]:
# Score the reloaded model on the test split with the evaluator's metric.
evaluator.evaluate(bestModel.transform(test))