# "[Spark] PySpark ML Classification 모델들"
> PySpark에서 머신러닝 분류 모델 학습

- toc: true 
- badges: true
- comments: true
- categories: [Spark]
- tags: [spark, pyspark, classification]

In [1]:
import os
# Credentials are read from the environment so they never appear in the
# notebook; a missing variable raises KeyError right here.
MINIO_ACCESS_KEY = os.environ['MINIO_ACCESS_KEY']
MINIO_SECRET_KEY = os.environ['MINIO_SECRET_KEY']

# S3A connector options, applied to the Hadoop configuration in listed order:
# static access/secret keys, the http://lab101:10170 endpoint, SSL disabled,
# and path-style bucket addressing.
_s3a_options = (
    ("fs.s3a.access.key", MINIO_ACCESS_KEY),
    ("fs.s3a.secret.key", MINIO_SECRET_KEY),
    ("fs.s3a.endpoint", "http://lab101:10170"),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.connection.ssl.enabled", "false"),
    ("fs.s3a.path.style.access", "true"),
    ("com.amazonaws.services.s3.enableV2", "true"),
    ("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"),
)
_hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
for _option, _setting in _s3a_options:
    _hadoop_conf.set(_option, _setting)

In [2]:
# Read the parquet dataset from the `data` bucket, keeping only the feature
# column and the label cast to double.
bInput = (
    spark.read.parquet("s3a://data/binary-classification")
    .selectExpr("features", "cast(label as double) as label")
)

# 로지스틱 회귀분석

In [3]:
from pyspark.ml.classification import LogisticRegression

# Create a logistic-regression estimator with no parameters overridden and
# print the description of every parameter it accepts.
lr = LogisticRegression()
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bounds vector size must beequal wi

In [4]:
# Fit the estimator on the whole input DataFrame, then print the fitted
# coefficients and intercept.
lrModel = lr.fit(bInput)
print("coefficients: ", lrModel.coefficients)
print("intercept: ", lrModel.intercept)

coefficients:  [18.722385741661295,-0.5693688557340798,9.361192870830632]
intercept:  -28.04329511868947


In [5]:
# Grab the summary object exposed by the fitted model and print the area
# under its ROC curve.
summary = lrModel.summary
print(summary.areaUnderROC)

1.0


In [6]:
# Display the ROC curve points (FPR / TPR columns) stored in the summary.
summary.roc.show()



+---+------------------+
|FPR|               TPR|
+---+------------------+
|0.0|               0.0|
|0.0|0.3333333333333333|
|0.0|               1.0|
|1.0|               1.0|
|1.0|               1.0|
+---+------------------+



In [7]:
# Display the precision-recall curve points stored in the summary.
summary.pr.show()

+------------------+---------+
|            recall|precision|
+------------------+---------+
|               0.0|      1.0|
|0.3333333333333333|      1.0|
|               1.0|      1.0|
|               1.0|      0.6|
+------------------+---------+



In [8]:
# Print the list of objective values recorded in the summary.
print(summary.objectiveHistory)

[0.6730116670092565, 0.3053347667866976, 0.19572951692227344, 0.08238560717506735, 0.039904390712412495, 0.0191876057299779, 0.009480513129879598, 0.004700793975398914, 0.002342824005088809, 0.0011692212872630925, 0.0005841333526453686, 0.0002919384368144603, 0.00014593757317782482, 7.295887614374265e-05, 3.6473098822232435e-05, 1.822801708342409e-05, 9.095755464927068e-06, 4.5053062928456136e-06, 2.17434840951629e-06, 1.0422594942126336e-06, 5.28080873894856e-07, 2.62853118644462e-07, 1.3166032239693672e-07, 6.578498712561186e-08, 3.2901213738010096e-08, 1.6448921648782767e-08, 8.224786126081538e-09]


# 의사결정트리

In [10]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create a decision-tree classifier with no parameters overridden and print
# the description of every parameter it accepts.
dt = DecisionTreeClassifier()
print(dt.explainParams())

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featuresCol: features column name. (default: features)
impurity: Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini (default: gini)
labelCol: label column name. (default: label)
leafCol: Leaf indices column name. Predicted leaf index of each instance in each tree by preorder. (default: )
maxBins: Max number of bins for discretizing continuous features.  Must be 

In [11]:
# Fit the decision tree on the whole input DataFrame.
dtModel = dt.fit(bInput)

# 랜덤 포레스트와 그래디언트 부스티드 트리

In [3]:
from pyspark.ml.classification import RandomForestClassifier

# Create a random-forest classifier with no parameters overridden and print
# the description of every parameter it accepts.
rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())

bootstrap: Whether bootstrap samples are used when building trees. (default: True)
cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the featur

In [4]:
# Fit the random forest on the whole input DataFrame.
trainedModel = rfClassifier.fit(bInput)

In [14]:
from pyspark.ml.classification import GBTClassifier

# Create a gradient-boosted-tree classifier with no parameters overridden and
# print the description of every parameter it accepts.
gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 

In [15]:
# Fit the gradient-boosted trees on the whole input DataFrame. This reuses the
# name `trainedModel`, so any model previously bound to it is replaced.
trainedModel = gbtClassifier.fit(bInput)

# 나이브 베이즈

In [16]:
from pyspark.ml.classification import NaiveBayes

# Create a Naive Bayes classifier with no parameters overridden and print the
# description of every parameter it accepts.
nb = NaiveBayes()
print(nb.explainParams())

featuresCol: features column name. (default: features)
labelCol: label column name. (default: label)
modelType: The model type which is a string (case-sensitive). Supported options: multinomial (default), bernoulli and gaussian. (default: multinomial)
predictionCol: prediction column name. (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)
rawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)
smoothing: The smoothing parameter, should be >= 0, default is 1.0 (default: 1.0)
thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. The class with largest value p/t is predicted, w

In [17]:
# Fit Naive Bayes only on the rows whose label is non-zero.
# NOTE(review): the reason for dropping label 0 is not visible here —
# presumably those rows carry feature values the default model type cannot
# accept; confirm against the dataset.
trainedModel = nb.fit(bInput.where("label != 0"))

# 평가지표

In [51]:
from pyspark.ml.classification import RandomForestClassifier

# Refit a random forest so that `trainedModel`, which the Naive Bayes cell
# rebound, refers to a random-forest model again before evaluation.
rfClassifier = RandomForestClassifier()
trainedModel = rfClassifier.fit(bInput)

In [52]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# BinaryClassificationMetrics consumes an RDD of (score, label) pairs and
# sweeps a threshold over the scores to build the ROC and PR curves.
# Feeding it the hard 0/1 `prediction` column leaves only one usable
# threshold, so the curves degenerate to a single operating point.
# Use the model's probability for the positive class (index 1 of the
# `probability` vector) as the score instead.
out = trainedModel.transform(bInput)\
    .select("probability", "label")\
    .rdd.map(lambda row: (float(row["probability"][1]), float(row["label"])))
metrics = BinaryClassificationMetrics(out)

In [55]:
# Area under the ROC curve as computed by BinaryClassificationMetrics.
metrics.areaUnderROC

1.0

In [56]:
# Area under the precision-recall curve as computed by
# BinaryClassificationMetrics.
metrics.areaUnderPR

1.0