# Chapter 9: Spark MLlib and ML

In this notebook, we will see the main capabilities of Spark MLlib and ML.

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Spark-MLlib-ML").master("local[*]").getOrCreate()
sc = spark.sparkContext

## Working with MLlilb

In this section, we will focus on MLlib

In [2]:
from pyspark.mllib.linalg import DenseVector, SparseVector
from pyspark.mllib.feature import HashingTF, Word2Vec, IDF, StandardScaler, ChiSqSelector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import numpy as np
from random import randint

### MLlib Feature Encoding and Data Preparation

#### Working with Spark Vectors

We can create Dense Vectors, Sparse Vectors and Labeled Points

In [3]:
dense_vector = DenseVector(np.array([1,2,3]))

In [4]:
dense_vector

DenseVector([1.0, 2.0, 3.0])

In [5]:
sparse_vector = SparseVector(4, {0:1.5, 2:3})

In [6]:
sparse_vector

SparseVector(4, {0: 1.5, 2: 3.0})

In [7]:
labeled_point = LabeledPoint(1, dense_vector)

In [8]:
labeled_point

LabeledPoint(1.0, [1.0,2.0,3.0])

#### Preparing Textual Data

We can also prepare text data using some in-built data transformations capabilities already included in MLlib. We first prepare some text data about Spam and Non-Spam emails.

In [9]:
ini_data = spark.read.csv("../data/spam.csv", header=True)
ini_data_rdd = ini_data.select(["label", "text"]).rdd.filter(lambda row: (isinstance(row.label, str) and isinstance(row.text, str)))

In [10]:
ini_data_rdd.count()

5573

In [11]:
ini_data_rdd.take(1)

[Row(label='ham', text='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')]

In [12]:
text_rdd = ini_data_rdd.map(lambda x: x.text)

Now we use the `HashingTF` transformer.

In [13]:
def hasing_TF(text_rdd):
    """
    Transforms an input RDD of text using the Hashing TF transformer
    
    :input text_rdd: input RDD
    :return: transformed RDD
    """
    tokenizer = HashingTF()
    text_tokenized = text_rdd.map(lambda text: text.split(" "))
    return tokenizer.transform(text_tokenized)

In [14]:
text_rdd.take(1)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']

In [15]:
hash_text = hasing_TF(text_rdd)

In [16]:
hash_text.take(1)

[SparseVector(1048576, {79172: 1.0, 244892: 1.0, 296409: 1.0, 312753: 1.0, 384022: 1.0, 407924: 1.0, 414297: 1.0, 442668: 1.0, 627241: 1.0, 639697: 1.0, 799074: 1.0, 856522: 1.0, 897134: 1.0, 901549: 1.0, 968035: 1.0, 988036: 1.0, 997716: 1.0, 1015964: 1.0, 1033917: 1.0, 1044354: 1.0})]

In [17]:
def hasing_TF_with_text(text_rdd):
    """
    Transforms an input RDD of text using the Hashing TF transformer
    keeping also the original text
    
    :input text_rdd: input RDD
    :return: transformed RDD
    """
    tokenizer = HashingTF()
    return text_rdd.map(lambda text: (text, tokenizer.transform(text.split(" "))))

In [18]:
hash_text_perserving = hasing_TF_with_text(text_rdd)

In [19]:
hash_text_perserving.take(1)

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  SparseVector(1048576, {79172: 1.0, 244892: 1.0, 296409: 1.0, 312753: 1.0, 384022: 1.0, 407924: 1.0, 414297: 1.0, 442668: 1.0, 627241: 1.0, 639697: 1.0, 799074: 1.0, 856522: 1.0, 897134: 1.0, 901549: 1.0, 968035: 1.0, 988036: 1.0, 997716: 1.0, 1015964: 1.0, 1033917: 1.0, 1044354: 1.0}))]

We can now use the `Word2Vec` transformer.

In [20]:
text_tokenized = text_rdd.map(lambda text: text.split(" "))
word2vec_trformer = Word2Vec().fit(text_tokenized)

In [21]:
word2vec_trformer.transform("great")

DenseVector([0.0615, 0.0713, 0.007, 0.0284, -0.0985, 0.0336, -0.0257, -0.0536, -0.0093, -0.0124, 0.0267, 0.0514, -0.0384, -0.0138, -0.0179, 0.0149, 0.0856, 0.0339, 0.0404, 0.0404, -0.0049, 0.0279, 0.0037, -0.0256, -0.031, -0.0083, 0.015, 0.0343, -0.1048, 0.0421, -0.0407, 0.024, -0.0908, -0.0367, 0.0241, 0.0047, -0.0095, -0.0495, -0.008, -0.0256, 0.0476, 0.0263, 0.0257, -0.0465, -0.0003, -0.0809, -0.0349, 0.0294, 0.02, -0.0207, 0.0515, 0.1077, -0.0712, 0.0291, -0.0226, -0.0408, -0.0062, -0.0055, 0.0221, 0.0365, 0.0116, 0.0146, -0.0936, -0.0651, 0.051, -0.0737, -0.0549, 0.0181, 0.1016, 0.0163, -0.0028, -0.0397, -0.0022, 0.0374, -0.0237, -0.0181, 0.0115, 0.0491, 0.0739, -0.0107, -0.0571, -0.0524, 0.0479, -0.0372, 0.0198, -0.004, 0.0157, -0.126, 0.0621, -0.0406, 0.0106, -0.0118, 0.0182, 0.0248, 0.0692, 0.0696, -0.0893, -0.0955, 0.0262, -0.0346])

In [22]:
word2vec_trformer.transform("Free")

DenseVector([0.0557, 0.024, 0.0241, 0.0173, -0.0448, 0.0286, -0.0071, -0.0053, 0.0154, -0.0092, 0.0136, 0.0356, -0.0232, 0.0009, -0.0107, 0.0118, 0.0201, 0.0304, -0.0134, -0.0077, 0.0211, 0.0231, 0.0191, -0.0143, -0.0117, -0.0044, 0.0135, 0.0265, -0.0447, 0.0147, 0.0015, 0.017, -0.0651, 0.0009, 0.0071, -0.0066, 0.0074, -0.0402, -0.0048, -0.0111, 0.0163, 0.0359, -0.0114, -0.0438, 0.003, -0.0076, -0.014, 0.0112, 0.0141, -0.0088, 0.0001, 0.0567, -0.0418, 0.0409, 0.0102, -0.0033, -0.0002, 0.0052, -0.0323, -0.0246, -0.0198, 0.0169, -0.0493, -0.0073, 0.0369, -0.0333, -0.0033, 0.0423, 0.0146, -0.0001, -0.0288, 0.0172, -0.0203, -0.0104, 0.0234, 0.0379, 0.0221, -0.0023, 0.0242, -0.0177, -0.0407, 0.0052, 0.0065, 0.0045, 0.0062, -0.0159, 0.0089, -0.0404, -0.0142, -0.0189, 0.0145, 0.0105, 0.0269, -0.0075, 0.0249, 0.0253, 0.0016, -0.0526, 0.0041, 0.0149])

#### Preparing Data for Supervised Learning

In [23]:
tf = HashingTF(1000)
tf_vectors = tf.transform(text_rdd)
idf = IDF()
idf_model = idf.fit(tf_vectors)

In [24]:
spam_text = ini_data_rdd.filter(lambda row: row.label == "spam").map(lambda row: row.text.split(" "))
gen_text = ini_data_rdd.filter(lambda row: row.label != "spam").map(lambda row: row.text.split(" "))

In [25]:
spam_points = idf_model.transform(tf.transform(spam_text)).map(lambda x: LabeledPoint(1, x))
gen_points = idf_model.transform(tf.transform(gen_text)).map(lambda x: LabeledPoint(0, x))

In [26]:
ml_data_ini = spam_points.union(gen_points)

In [27]:
ml_data = ml_data_ini.map(lambda row: (randint(0,100), row)).sortByKey().map(lambda row: row[1])

In [28]:
ml_data_train, ml_data_test = ml_data.randomSplit(weights = [0.8, 0.2])

In [29]:
ml_data_train.cache()
ml_data_test.cache()

PythonRDD[54] at RDD at PythonRDD.scala:49

#### Feature Scaling and Selection

It is useful sometimes for the ML algorithms to scale that data.

`StandardScaler()` --> to scale numerical data

In [30]:
from pyspark.mllib.feature import StandardScaler
std_scaler = StandardScaler()
std_scaler_model = std_scaler.fit(ml_data.map(lambda lpoint: lpoint.features))

In [31]:
train_label = ml_data_train.map(lambda lpoint: lpoint.label)
test_label = ml_data_test.map(lambda lpoint: lpoint.label)

In [32]:
ml_data_train_scl = train_label.zip(std_scaler_model.transform(ml_data_train.map(lambda lpoint: lpoint.features)))\
.map(lambda x: LabeledPoint(x[0], x[1]))

ml_data_test_scl = test_label.zip(std_scaler_model.transform(ml_data_test.map(lambda lpoint: lpoint.features)))\
.map(lambda x: LabeledPoint(x[0], x[1]))

In [33]:
ml_data_train_scl.take(1)

[LabeledPoint(1.0, (1000,[18,133,184,278,390,403,411,421,574,581,630,637,706,783,789,807,808,821,822,824,937],[9.10145635512,17.4260213968,9.45472540734,8.56298964927,7.26691923942,7.04358538035,13.2336916489,6.52034068072,5.96542617781,7.18167686177,16.3198325861,5.32568093583,8.68433762659,2.87708423949,7.76209066016,6.2349925733,7.79493378756,5.37756847208,13.4442310562,9.65846657093,11.5576716992]))]

In [34]:
ml_data_test_scl.take(1)

[LabeledPoint(1.0, (1000,[7,69,75,120,222,274,289,304,317,323,365,369,407,431,442,619,631,658,807,882,903,939,974,979],[14.668435857,4.1679073928,11.8409552489,7.6794964018,3.14936680147,7.96797003614,5.58238396494,4.26629506314,3.69603215981,3.72785424961,1.4119082138,14.6735203427,15.9460499929,11.0516323062,8.15313602093,10.4969379283,8.00709362589,8.11210138222,6.2349925733,8.06317518116,3.47586227908,10.7070843704,8.50572058869,15.5969494943]))]

`ChiSqSelector` --> to select the most relevant features

In [35]:
selector = ChiSqSelector(100)

In [36]:
selector_model = selector.fit(ml_data)

In [37]:
selector_model

<pyspark.mllib.feature.ChiSqSelectorModel at 0x7f081c902b38>

In [38]:
ml_data_train_sel = train_label.zip(selector_model.transform(ml_data_train.map(lambda lpoint: lpoint.features)))\
.map(lambda x: LabeledPoint(x[0], x[1]))

ml_data_test_sel = test_label.zip(selector_model.transform(ml_data_test.map(lambda lpoint: lpoint.features)))\
.map(lambda x: LabeledPoint(x[0], x[1]))

In [39]:
ml_data_train_sel.take(1)

[LabeledPoint(1.0, (100,[38,82,85],[8.62586820804,0.993951694971,2.46045035381]))]

In [40]:
ml_data_test_sel.take(1)

[LabeledPoint(1.0, (100,[7,21,23,25,28,34,91,94],[8.62586820804,8.62586820804,8.62586820804,0.281950403648,2.13362837302,8.62586820804,2.04245898588,0.0771763495663]))]

### MLlib Model Training

Once we have prepared our data, we can train some models

In [41]:
lr = LogisticRegressionWithLBFGS()
lr_model_raw = lr.train(ml_data_train)
lr_model_scl = lr.train(ml_data_train_scl)

In [42]:
lr_model_raw.weights

DenseVector([-2.0286, -1.5938, -2.5886, -0.9297, 0.4722, 12.5765, -0.0328, -5.8304, 0.1055, -4.1027, -4.4149, -3.3053, -2.1944, -2.6413, 0.1863, -0.8475, -3.0515, -0.7292, 0.4193, 2.9482, -1.0132, -2.9988, -2.6662, -4.4446, -2.4757, -6.8648, 5.6551, -0.6961, -1.2052, -2.3798, -2.7594, 17.8123, -2.7697, -0.54, -2.4009, -0.9642, -1.6688, -2.4287, 1.0176, 0.7415, -3.9635, 3.7422, -2.4016, -2.3644, -0.6487, 6.4751, 1.8152, -5.9118, 0.3514, 1.3409, -3.8598, 3.9975, -1.7918, -3.5499, -0.0916, -0.4125, -2.7814, -1.1816, -1.9391, 0.0198, 3.8314, 9.2274, -5.6853, -0.219, -2.2424, -3.1851, -2.59, -1.2827, -1.8992, 0.5443, 3.3486, 0.7074, 5.4672, 0.2179, -0.3228, 0.0619, 1.5158, -0.5706, -3.1794, -1.3888, -0.1108, -0.7209, -0.2748, -3.7414, -4.6501, 1.5655, -5.4361, -0.144, 1.3585, -2.5042, -2.9415, -2.8706, 6.9286, 1.4492, 1.3242, -2.8814, -2.1288, 10.621, -0.0668, 0.7018, -2.5423, 1.3878, -1.3022, -2.3315, -143.0627, 7.6571, -2.3749, 0.5491, 1.537, -0.8569, -0.3188, 8.0032, 0.6858, 1.1312, -1.2

In [43]:
lr_model_scl.weights

DenseVector([-8.905, -1.0862, -2.3691, -2.4398, 0.4785, 1.3584, -0.0204, -3.4286, 0.0893, -5.0467, -2.7868, -1.943, -4.5616, -1.877, 0.1455, -0.683, -6.8402, -0.8811, 0.0863, 2.1469, -0.6907, -1.8613, -1.8928, -2.6137, -1.7581, -4.1871, 5.5578, -0.6783, -1.4994, -2.0294, -3.2583, 3.0772, -1.8332, -0.4847, -1.7484, -0.8587, -1.8655, -4.8651, 0.6835, 0.4524, -3.7302, 2.8247, -2.8471, -8.1269, -0.49, 4.0189, 0.9128, -5.351, 0.2942, 0.6564, -2.1262, 3.7597, -6.2351, -3.1916, -0.0423, -0.3187, -3.8299, -1.8902, -1.3944, 0.0147, 3.9087, 6.2884, -4.2914, -0.2979, -3.4832, -2.6943, -1.8861, -1.5141, -1.0729, 1.1266, 3.3515, 0.635, 5.0638, 0.2888, -0.2625, 0.0451, 1.033, -0.3722, -4.3399, -1.0839, -0.3435, -0.4474, -0.2621, -6.8436, -2.7884, 0.5715, -6.7303, -0.0909, 1.2687, -3.9399, -2.4905, -1.7824, 0.7307, 1.677, 1.1691, -5.1921, -2.4675, 2.2127, -0.0777, 0.4576, -1.941, 5.31, -0.78, -1.6557, -4.19, 3.8528, -0.9497, 0.3038, 0.7929, -0.5758, -0.1041, 3.7151, 0.6644, 0.974, -0.6626, -5.1711, 0

### Predict

Once the model is trained, we can perform predictions.

In [44]:
raw_preds = lr_model_raw.predict(ml_data_test.map(lambda lpoint: lpoint.features))
scl_preds = lr_model_scl.predict(ml_data_test_scl.map(lambda lpoint: lpoint.features))

In [45]:
raw_preds.take(1)

[0]

In [46]:
scl_preds.take(1)

[0]

### Serving and Persistence

Many times, once we train our model, we save it and the load it in oder programs to make predictions. We try first the internal format of Spark, which allows us to save and load a model.

In [47]:
!rm -rf ../data/lr_model_raw
lr_model_raw.save(sc, "../data/lr_model_raw")

In [48]:
lr_model_raw_loaded = LogisticRegressionModel.load(sc, "../data/lr_model_raw")

In [49]:
raw_preds_loaded = lr_model_raw_loaded.predict(ml_data_test.map(lambda lpoint: lpoint.features))

In [50]:
raw_preds_loaded.take(1)

[0]

### Model Evaluation

MLlib includes some functionalities to calculate automatically some metrics of trained ML models. While there are more, here we will evaluate the LR model of the spam classification section using the `BinaryClassificationMetrics` functionality.

In [51]:
ml_data_train.take(1)

[LabeledPoint(1.0, (1000,[18,133,184,278,390,403,411,421,574,581,630,637,706,783,789,807,808,821,822,824,937],[1.87243028944,17.2517364161,8.62586820804,8.62586820804,8.62586820804,17.2517364161,8.62586820804,8.62586820804,8.62586820804,4.24016829583,8.62586820804,8.62586820804,17.2517364161,0.993951694971,8.62586820804,8.62586820804,8.62586820804,2.46045035381,8.62586820804,17.2517364161,8.62586820804]))]

In [52]:
pred_label_lr = ml_data_test.map(lambda lpoint: (float(lr_model_raw.predict(lpoint.features)), lpoint.label))
metrics_lr = BinaryClassificationMetrics(pred_label_lr)

In [53]:
print("LR model")
print("Area Under PR: {0}".format(metrics_lr.areaUnderPR))
print("Area Under ROC: {0}".format(metrics_lr.areaUnderROC))

LR model
Area Under PR: 0.5086440719665847
Area Under ROC: 0.8401651325124242


## Working with Spark ML

Now, we are going to see some of the capabilities offered by the Spark ML package, which works with DataFrames instead that MLlib that works with RDDs. In particular, we are going to do again the spam classification problem using two Pipelines: one for the data preparation and the other one for the ML model.

### Data Preparation: Data Encoding & Data Cleaning

In [54]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer, IDF, SQLTransformer, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.pipeline import Pipeline, PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [55]:
ini_data.show()

+-----+--------------------+----+----+----+
|label|                text| _c2| _c3| _c4|
+-----+--------------------+----+----+----+
|  ham|Go until jurong p...|null|null|null|
|  ham|Ok lar... Joking ...|null|null|null|
| spam|Free entry in 2 a...|null|null|null|
|  ham|U dun say so earl...|null|null|null|
|  ham|Nah I don't think...|null|null|null|
| spam|FreeMsg Hey there...|null|null|null|
|  ham|Even my brother i...|null|null|null|
|  ham|As per your reque...|null|null|null|
| spam|WINNER!! As a val...|null|null|null|
| spam|Had your mobile 1...|null|null|null|
|  ham|I'm gonna be home...|null|null|null|
| spam|SIX chances to wi...|null|null|null|
| spam|URGENT! You have ...|null|null|null|
|  ham|I've been searchi...|null|null|null|
|  ham|I HAVE A DATE ON ...|null|null|null|
| spam|XXXMobileMovieClu...|null|null|null|
|  ham|Oh k...i'm watchi...|null|null|null|
|  ham|Eh u remember how...|null|null|null|
|  ham|Fine if that��s t...|null|null|null|
| spam|England v Macedon...|null

In [56]:
sql_select = SQLTransformer(statement = "SELECT label, text FROM __THIS__")

In [57]:
sql_filter = SQLTransformer(statement = "SELECT * from __THIS__ WHERE text is not null AND label is not null")

In [58]:
label_indexer = StringIndexer(inputCol="label", outputCol="label_num")

In [59]:
tokenizer = Tokenizer(inputCol = "text", outputCol = "text_token")

In [60]:
count_text = SQLTransformer(statement = "SELECT *, size(text_token) as count FROM __THIS__")

In [61]:
tf = HashingTF(numFeatures = 1000, inputCol = "text_token", outputCol = "text_tf")

In [62]:
idf = IDF(inputCol = "text_tf", outputCol = "text_features")

In [63]:
assember = VectorAssembler(inputCols = ["text_features", "count"],
                           outputCol = "features_raw")

In [64]:
scaler = StandardScaler(inputCol="features_raw", outputCol="features")

In [65]:
etl_pipeline_model = Pipeline(stages=[sql_select, sql_filter, label_indexer, 
                                      tokenizer, count_text, tf, idf, assember, scaler]).fit(ini_data)

In [66]:
ml_data_pipe = etl_pipeline_model.transform(ini_data)

In [67]:
ml_data_pipe.select("label", "features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  ham|(1001,[7,77,150,1...|
|  ham|(1001,[20,316,484...|
| spam|(1001,[30,35,73,1...|
|  ham|(1001,[57,368,372...|
|  ham|(1001,[135,163,32...|
| spam|(1001,[25,36,91,9...|
|  ham|(1001,[18,47,48,5...|
|  ham|(1001,[36,71,92,2...|
| spam|(1001,[39,43,61,7...|
| spam|(1001,[36,73,82,1...|
|  ham|(1001,[26,41,106,...|
| spam|(1001,[15,35,36,4...|
| spam|(1001,[68,73,122,...|
|  ham|(1001,[19,36,39,1...|
|  ham|(1001,[44,82,170,...|
| spam|(1001,[41,43,49,6...|
|  ham|(1001,[275,426,44...|
|  ham|(1001,[80,147,236...|
|  ham|(1001,[159,170,29...|
| spam|(1001,[9,19,45,71...|
+-----+--------------------+
only showing top 20 rows



In [68]:
ml_data_pipe_train, ml_data_pipe_test = ml_data_pipe.randomSplit(weights=[0.8, 0.2])

In [69]:
ml_data_pipe_train.count()

4477

In [70]:
ml_data_pipe_test.count()

1096

#### Spark ML Models

Once we have our data cleaned and encoded, we can now train a ML models.

In [71]:
lr = LogisticRegression(featuresCol="features", labelCol="label_num", regParam=0.1)

In [72]:
lr_pipeline = Pipeline(stages=[lr])

In [73]:
lr_pipeline_model = lr_pipeline.fit(ml_data_pipe_train)

As in any other pipeline, we can access to one of the steps, and its metadata. Here we get the coefficientMatrix of the trained LR model.

In [74]:
lr_pipeline_model.stages[0].coefficientMatrix

DenseMatrix(3, 1001, [0.0199, -0.0057, -0.003, 0.019, 0.0014, -0.0079, 0.021, 0.0274, ..., -0.0005, -0.0001, -0.0002, -0.0003, -0.0003, -0.0001, -0.0002, -0.0009], 1)

#### Data Persistence and Spark ML

We can save and load our pipelines (including both data transformers and ML algorithms).

In [75]:
!rm -rf ../data/etl_pipeline_model
!rm -rf ../data/lr_pipeline_model
etl_pipeline_model.save("../data/etl_pipeline_model")
lr_pipeline_model.save("../data/lr_pipeline_model")

In [76]:
etl_pipeline_load = PipelineModel.load("../data/etl_pipeline_model")
lr_pipeline_load = PipelineModel.load("../data/lr_pipeline_model")

In [77]:
predictions = lr_pipeline_load.transform(etl_pipeline_load.transform(ini_data))

In [78]:
predictions.select("label_num", "prediction").show()

+---------+----------+
|label_num|prediction|
+---------+----------+
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
+---------+----------+
only showing top 20 rows



#### Automated Model Selection: Parameter Search

Spark ML offers some functionalities to perform hiperparameter tunning on ML models. Let's check our previous problem testing different regularization parameters.

In [79]:
lr = LogisticRegression(featuresCol="features", labelCol="label_num", regParam=0.1)

In [80]:
estimator_pipeline = Pipeline(stages=[lr])

In [81]:
param_grid = ParamGridBuilder().addGrid(LogisticRegression.regParam, [0.1, 0.01, 0.05]).build()

In [82]:
evaluator = BinaryClassificationEvaluator(labelCol="label_num", rawPredictionCol="prediction")

In [83]:
crossval = CrossValidator(estimator=estimator_pipeline,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=3)

In [84]:
cv_model = crossval.fit(dataset=ml_data_pipe)

In [85]:
cv_model.transform(ml_data_pipe).select("label_num", "prediction").show()

+---------+----------+
|label_num|prediction|
+---------+----------+
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
+---------+----------+
only showing top 20 rows

