In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("chapter-24-ML")
    .getOrCreate()
)

import os

# Root directory of the book's sample data, read from the environment.
# A missing variable raises KeyError here rather than failing later on read.
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

### Vector

In [2]:
from pyspark.ml.linalg import Vectors
# Dense vector: every element is stored explicitly.
denseVec = Vectors.dense(1.0, 2.0, 3.0)

In [3]:
# Display the dense vector's repr.
denseVec

DenseVector([1.0, 2.0, 3.0])

In [4]:
# Show the vector's contents as a NumPy array.
denseVec.array

array([1., 2., 3.])

In [5]:
# For a dense vector, `.values` shows the same contents as `.array`.
denseVec.values

array([1., 2., 3.])

In [6]:
# Build a sparse vector from its length plus parallel index/value lists;
# positions not listed in `idx` are implicitly zero.
size = 3
idx = [1, 2] # locations of non-zero elements in vector
values = [2.0, 3.0]  # the non-zero entries, aligned position-by-position with idx
sparseVec = Vectors.sparse(size, idx, values)

In [7]:
# Display the sparse vector's repr (size plus an index -> value mapping).
sparseVec

SparseVector(3, {1: 2.0, 2: 3.0})

In [8]:
# For a sparse vector, `.values` holds only the stored (non-zero) entries.
sparseVec.values

array([2., 3.])

In [9]:
# COMMAND ----------

# Load the simple-ml JSON dataset from the book's data root.
df = spark.read.json(f"{SPARK_BOOK_DATA_PATH}/data/simple-ml")

In [10]:
# Total number of rows in the dataset.
df.count()

110

In [11]:
# Print the inferred column names, types and nullability.
df.printSchema()

root
 |-- color: string (nullable = true)
 |-- lab: string (nullable = true)
 |-- value1: long (nullable = true)
 |-- value2: double (nullable = true)



In [12]:
# Preview the first three rows.
df.show(3)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|    12|14.386294994851129|
+-----+----+------+------------------+
only showing top 3 rows



In [13]:
# Show ten rows sorted by value1.
df.orderBy("value1").show(10)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
|green|good|     1|14.386294994851129|
|  red| bad|     1| 38.97187133755819|
|green|good|     1|14.386294994851129|
|  red| bad|     1| 38.97187133755819|
|  red| bad|     1| 38.97187133755819|
|  red| bad|     1| 38.97187133755819|
|green|good|     1|14.386294994851129|
|  red| bad|     1| 38.97187133755819|
|green|good|     1|14.386294994851129|
+-----+----+------+------------------+
only showing top 10 rows



In [14]:
# Row counts per (color, lab) combination, sorted so the table is stable to read.
(
    df.groupBy("color", "lab")
    .count()
    .orderBy("color", "lab")
    .show(10)
)

+-----+----+-----+
|color| lab|count|
+-----+----+-----+
| blue| bad|   20|
|green| bad|   10|
|green|good|   30|
|  red| bad|   30|
|  red|good|   20|
+-----+----+-----+



In [20]:
# COMMAND ----------

from pyspark.ml.feature import RFormula
# R-style model formula: `lab` is the response, `.` pulls in the remaining
# columns, and each `a:b` term adds an interaction between two columns
# (see the RFormula documentation for the supported operators).
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")

In [23]:
# COMMAND ----------

## prepare feature columns

# Fit the formula on the data, then apply it; the result gains the
# `features` vector column and the numeric `label` column.
fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)

# Show full column contents so the feature vectors are not cut off.
preparedDF.show(10, truncate=False)

+-----+----+------+------------------+----------------------------------------------------------------------+-----+
|color|lab |value1|value2            |features                                                              |label|
+-----+----+------+------------------+----------------------------------------------------------------------+-----+
|green|good|1     |14.386294994851129|(10,[1,2,3,5,8],[1.0,1.0,14.386294994851129,1.0,14.386294994851129])  |1.0  |
|blue |bad |8     |14.386294994851129|(10,[2,3,6,9],[8.0,14.386294994851129,8.0,14.386294994851129])        |0.0  |
|blue |bad |12    |14.386294994851129|(10,[2,3,6,9],[12.0,14.386294994851129,12.0,14.386294994851129])      |0.0  |
|green|good|15    |38.97187133755819 |(10,[1,2,3,5,8],[1.0,15.0,38.97187133755819,15.0,38.97187133755819])  |1.0  |
|green|good|12    |14.386294994851129|(10,[1,2,3,5,8],[1.0,12.0,14.386294994851129,12.0,14.386294994851129])|1.0  |
|green|bad |16    |14.386294994851129|(10,[1,2,3,5,8],[1.0,16.0,14.38629

In [24]:
# COMMAND ----------

## split train/test

# 70/30 random split of the prepared data. The seed is fixed so that a
# Restart & Run All yields the same rows in each split (and therefore the
# same fitted model) instead of a different sample on every run.
train, test = preparedDF.randomSplit([0.7, 0.3], seed=42)

In [25]:
# COMMAND ----------

## create model

from pyspark.ml.classification import LogisticRegression

# Logistic regression reading the columns produced by the RFormula step.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)

In [26]:
# COMMAND ----------

# List every tunable parameter of the estimator with its doc and current value.
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [27]:
# COMMAND ----------

## train model

# Fit on the training split only; `test` is held out.
fittedLR = lr.fit(train)

In [28]:
# Display the fitted model's repr (uid, class count, feature count).
fittedLR

LogisticRegressionModel: uid = LogisticRegression_5b6ada8feb48, numClasses = 2, numFeatures = 10

In [29]:
# COMMAND ----------

# Re-split the RAW DataFrame (not preparedDF): the pipeline built below runs
# its own RFormula stage, so it needs the untransformed columns. This
# deliberately rebinds `train`/`test` from the earlier prepared-data split.
# The seed is fixed so the split, and the tuning results that depend on it,
# are reproducible across runs.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [30]:
# Reminder of the raw columns the pipeline will start from.
df.show(3, truncate=False)

+-----+----+------+------------------+
|color|lab |value1|value2            |
+-----+----+------+------------------+
|green|good|1     |14.386294994851129|
|blue |bad |8     |14.386294994851129|
|blue |bad |12    |14.386294994851129|
+-----+----+------+------------------+
only showing top 3 rows



In [31]:
# COMMAND ----------

# Pipeline stages. The formula is intentionally left unset here: it is one of
# the hyperparameters supplied by the parameter grid.
rForm = RFormula()
lr = LogisticRegression(labelCol="label", featuresCol="features")

In [32]:
# COMMAND ----------

from pyspark.ml import Pipeline

# Feature preparation first, then the classifier, run as a single estimator.
stages = [rForm, lr]
pipeline = Pipeline(stages=stages)

In [33]:
# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder

# Cartesian grid: 2 formulas x 3 elastic-net mixes x 2 regularisation
# strengths = 12 candidate configurations.
params = (
    ParamGridBuilder()
    .addGrid(
        rForm.formula,
        [
            "lab ~ . + color:value1",
            "lab ~ . + color:value1 + color:value2",
        ],
    )
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .addGrid(lr.regParam, [0.1, 2.0])
    .build()
)

In [34]:
# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve is computed by sweeping a threshold over a score
# column. Scoring on the hard 0/1 `prediction` column collapses the curve to
# a single operating point and makes model selection far less sensitive, so
# the classifier's `rawPrediction` output (per-class scores) is used instead.
evaluator = BinaryClassificationEvaluator()\
  .setMetricName("areaUnderROC")\
  .setRawPredictionCol("rawPrediction")\
  .setLabelCol("label")

In [35]:
# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit

# Hyperparameter search: each grid entry is trained on 75% of the input and
# scored on the remaining 25% with `evaluator`. The seed pins that internal
# split so the selected configuration is reproducible across runs.
tvs = TrainValidationSplit()\
  .setTrainRatio(0.75)\
  .setEstimatorParamMaps(params)\
  .setEstimator(pipeline)\
  .setEvaluator(evaluator)\
  .setSeed(42)

In [36]:
# COMMAND ----------

# Run the train/validation search over the parameter grid on the raw `train` split.
tvsFitted = tvs.fit(train)

In [37]:
# Confirm what kind of object the search returned.
type(tvsFitted)

pyspark.ml.tuning.TrainValidationSplitModel

In [15]:
# Shut down the Spark session. Keep this as the final cell: the cells above
# read data and fit models through `spark`, so they need it alive.
spark.stop()