# Chapter 9: Spark MLlib and ML

In this notebook, we will see the main capabilities of Spark MLlib and ML.

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Spark-MLlib-ML").master("local[*]").getOrCreate()
sc = spark.sparkContext

## Working with MLlilb

In this section, we will focus on MLlib

In [2]:
from pyspark.mllib.linalg import DenseVector, SparseVector
from pyspark.mllib.feature import HashingTF, Word2Vec, IDF, StandardScaler, ChiSqSelector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import numpy as np
from random import randint

### MLlib Feature Encoding and Data Preparation

#### Working with Spark Vectors

We can create Dense Vectors, Sparse Vectors and Labeled Points

In [3]:
dense_vector = DenseVector(np.array([1,2,3]))

In [4]:
dense_vector

DenseVector([1.0, 2.0, 3.0])

In [5]:
sparse_vector = SparseVector(4, {0:1.5, 2:3})

In [6]:
sparse_vector

SparseVector(4, {0: 1.5, 2: 3.0})

In [7]:
labeled_point = LabeledPoint(1, dense_vector)

In [8]:
labeled_point

LabeledPoint(1.0, [1.0,2.0,3.0])

#### Preparing Textual Data

We can also prepare text data using some in-built data transformations capabilities already included in MLlib. We first prepare some text data about Spam and Non-Spam emails.

In [9]:
ini_data = spark.read.csv("../data/spam.csv", header=True)
ini_data_rdd = ini_data.select(["label", "text"]).rdd.filter(lambda row: (isinstance(row.label, str) and isinstance(row.text, str)))

In [10]:
ini_data_rdd.count()

5573

In [11]:
ini_data_rdd.take(1)

[Row(label='ham', text='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')]

In [12]:
text_rdd = ini_data_rdd.map(lambda x: x.text)

Now we use the `HashingTF` transformer.

In [13]:
def hasing_TF(text_rdd):
    """
    Transforms an input RDD of text using the Hashing TF transformer
    
    :input text_rdd: input RDD
    :return: transformed RDD
    """
    tokenizer = HashingTF()
    text_tokenized = text_rdd.map(lambda text: text.split(" "))
    return tokenizer.transform(text_tokenized)

In [14]:
text_rdd.take(1)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']

In [15]:
hash_text = hasing_TF(text_rdd)

In [16]:
hash_text.take(1)

[SparseVector(1048576, {79172: 1.0, 244892: 1.0, 296409: 1.0, 312753: 1.0, 384022: 1.0, 407924: 1.0, 414297: 1.0, 442668: 1.0, 627241: 1.0, 639697: 1.0, 799074: 1.0, 856522: 1.0, 897134: 1.0, 901549: 1.0, 968035: 1.0, 988036: 1.0, 997716: 1.0, 1015964: 1.0, 1033917: 1.0, 1044354: 1.0})]

In [17]:
def hasing_TF_with_text(text_rdd):
    """
    Transforms an input RDD of text using the Hashing TF transformer
    keeping also the original text
    
    :input text_rdd: input RDD
    :return: transformed RDD
    """
    tokenizer = HashingTF()
    return text_rdd.map(lambda text: (text, tokenizer.transform(text.split(" "))))

In [18]:
hash_text_perserving = hasing_TF_with_text(text_rdd)

In [19]:
hash_text_perserving.take(1)

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  SparseVector(1048576, {79172: 1.0, 244892: 1.0, 296409: 1.0, 312753: 1.0, 384022: 1.0, 407924: 1.0, 414297: 1.0, 442668: 1.0, 627241: 1.0, 639697: 1.0, 799074: 1.0, 856522: 1.0, 897134: 1.0, 901549: 1.0, 968035: 1.0, 988036: 1.0, 997716: 1.0, 1015964: 1.0, 1033917: 1.0, 1044354: 1.0}))]

We can now use the `Word2Vec` transformer.

In [20]:
text_tokenized = text_rdd.map(lambda text: text.split(" "))
word2vec_trformer = Word2Vec().fit(text_tokenized)

In [21]:
word2vec_trformer.transform("great")

DenseVector([-0.0222, -0.004, 0.0507, -0.019, -0.0542, -0.0246, -0.0323, -0.0121, 0.0243, -0.032, 0.0457, 0.0128, 0.0131, 0.0493, -0.0092, -0.0428, 0.0212, -0.0189, -0.0174, -0.0058, 0.0433, -0.0109, 0.0427, -0.0408, -0.0133, -0.0025, 0.0695, -0.0366, 0.0426, -0.0134, -0.0358, -0.023, 0.0397, -0.0394, 0.0227, 0.0193, 0.0633, 0.0817, 0.0049, -0.0593, 0.02, -0.0286, 0.0041, 0.0782, -0.0041, 0.0282, -0.059, -0.0303, -0.0037, 0.0536, -0.0201, 0.0277, -0.0814, -0.0039, -0.0408, -0.0377, -0.0498, -0.011, -0.0811, -0.0062, 0.0452, 0.0137, 0.0732, 0.058, -0.0503, 0.1328, -0.0513, 0.0283, 0.072, 0.0248, -0.0583, 0.0237, -0.0077, -0.0393, 0.0518, 0.0384, -0.0365, -0.0185, -0.0642, -0.0187, 0.0401, 0.0066, -0.0481, 0.0099, 0.0663, -0.0221, -0.0621, -0.0067, -0.1254, 0.0442, 0.0699, -0.0168, -0.0043, 0.018, 0.0314, 0.0414, 0.0143, 0.0309, 0.0767, 0.0346])

In [22]:
word2vec_trformer.transform("Free")

DenseVector([-0.0012, -0.0069, 0.0451, -0.019, 0.0113, -0.0161, -0.031, -0.0115, 0.0074, -0.0253, 0.0227, 0.0062, 0.0023, 0.0275, -0.0088, 0.012, 0.0383, 0.0084, 0.0547, 0.0096, 0.0477, -0.0309, -0.0034, -0.0159, -0.0136, -0.0069, 0.0255, 0.006, -0.0123, -0.0202, -0.0004, 0.0161, 0.0058, -0.0089, 0.0104, -0.0086, -0.0024, 0.0089, -0.0013, 0.0127, 0.0038, -0.0176, -0.0229, 0.0123, 0.0175, 0.0352, -0.0346, -0.0239, -0.0125, 0.0126, 0.038, 0.0167, -0.0497, -0.0183, -0.0321, -0.0239, 0.0038, 0.0037, -0.0211, -0.0095, 0.0328, 0.0064, -0.0146, 0.0024, 0.0125, 0.0065, -0.0544, 0.0047, 0.0561, 0.0046, 0.0083, -0.0018, -0.0259, 0.0232, 0.0123, 0.0021, -0.0434, -0.0162, -0.0582, -0.0282, 0.0065, 0.0094, -0.0182, 0.0136, 0.0149, -0.0163, -0.0231, -0.0161, -0.048, -0.0051, 0.0386, 0.0169, 0.0113, 0.0152, 0.0124, 0.0481, 0.0064, -0.0004, -0.0051, -0.0103])

#### Preparing Data for Supervised Learning

In [23]:
tf = HashingTF(1000)
tf_vectors = tf.transform(text_rdd)
idf = IDF()
idf_model = idf.fit(tf_vectors)

In [24]:
spam_text = ini_data_rdd.filter(lambda row: row.label == "spam").map(lambda row: row.text.split(" "))
gen_text = ini_data_rdd.filter(lambda row: row.label != "spam").map(lambda row: row.text.split(" "))

In [25]:
spam_points = idf_model.transform(tf.transform(spam_text)).map(lambda x: LabeledPoint(1, x))
gen_points = idf_model.transform(tf.transform(gen_text)).map(lambda x: LabeledPoint(0, x))

In [26]:
ml_data_ini = spam_points.union(gen_points)

In [27]:
ml_data = ml_data_ini.map(lambda row: (randint(0,100), row)).sortByKey().map(lambda row: row[1])

In [28]:
ml_data_train, ml_data_test = ml_data.randomSplit(weights = [0.8, 0.2])

In [29]:
ml_data_train.cache()
ml_data_test.cache()

PythonRDD[54] at RDD at PythonRDD.scala:49

#### Feature Scaling and Selection

It is useful sometimes for the ML algorithms to scale that data.

`StandardScaler()` --> to scale numerical data

In [30]:
from pyspark.mllib.feature import StandardScaler
std_scaler = StandardScaler()
std_scaler_model = std_scaler.fit(ml_data.map(lambda lpoint: lpoint.features))

In [31]:
train_label = ml_data_train.map(lambda lpoint: lpoint.label)
test_label = ml_data_test.map(lambda lpoint: lpoint.label)

In [32]:
ml_data_train_scl = train_label.zip(std_scaler_model.transform(ml_data_train.map(lambda lpoint: lpoint.features)))\
.map(lambda x: LabeledPoint(x[0], x[1]))

ml_data_test_scl = test_label.zip(std_scaler_model.transform(ml_data_test.map(lambda lpoint: lpoint.features)))\
.map(lambda x: LabeledPoint(x[0], x[1]))

In [33]:
ml_data_train_scl.take(1)

[LabeledPoint(1.0, (1000,[51,98,119,174,278,287,289,300,403,477,483,495,529,581,670,783,809,853,859,870,872,895,976],[9.17147586322,7.42399720804,3.45425747689,5.51614801053,8.56298964927,6.23128041373,1.86079465498,3.1673723595,3.52179269018,15.9460499929,7.4596187541,2.12420580213,14.1328441425,3.59083843089,11.0516323062,2.87708423949,17.2367161587,11.7008405545,6.85324302967,2.34265780804,4.46649798562,10.8133983969,7.2839340914]))]

In [34]:
ml_data_test_scl.take(1)

[LabeledPoint(1.0, (1000,[170,260,363,394,562,597,598,622,629,636,677,682,699,737,772,780,803,827,866,903],[11.0516323062,5.05119022394,1.62660099221,6.10844968699,11.047722177,8.55999869183,5.51614801053,6.74081779345,6.79881631632,8.01796068006,9.3131464278,11.7008405545,5.42244927422,9.3814194486,7.24821974818,10.108220851,6.87217595581,2.66911570937,11.2979706501,3.47586227908]))]

`ChiSqSelector` --> to select the most relevant features

In [35]:
selector = ChiSqSelector(100)

In [36]:
selector_model = selector.fit(ml_data)

In [37]:
selector_model

<pyspark.mllib.feature.ChiSqSelectorModel at 0x7fe1ad4b6c88>

In [38]:
ml_data_train_sel = train_label.zip(selector_model.transform(ml_data_train.map(lambda lpoint: lpoint.features)))\
.map(lambda x: LabeledPoint(x[0], x[1]))

ml_data_test_sel = test_label.zip(selector_model.transform(ml_data_test.map(lambda lpoint: lpoint.features)))\
.map(lambda x: LabeledPoint(x[0], x[1]))

In [39]:
ml_data_train_sel.take(1)

[LabeledPoint(1.0, (100,[3,12,25,27,46,71,82,84,93],[8.62586820804,8.62586820804,0.0939834678827,8.62586820804,8.62586820804,8.62586820804,0.993951694971,25.8776046241,8.62586820804]))]

In [40]:
ml_data_test_sel.take(1)

[LabeledPoint(1.0, (100,[16,33,61,73,76,80,86,94],[8.62586820804,0.0621731829742,8.62586820804,8.62586820804,8.62586820804,8.62586820804,8.62586820804,0.0771763495663]))]

### MLlib Model Training

Once we have prepared our data, we can train some models

In [41]:
lr = LogisticRegressionWithLBFGS()
lr_model_raw = lr.train(ml_data_train)
lr_model_scl = lr.train(ml_data_train_scl)

In [42]:
lr_model_raw.weights

DenseVector([-2.6086, -4.8151, -3.9921, -1.1688, -0.0367, 18.8321, 0.6745, -4.3258, -3.2294, -2.2899, -0.4485, -3.0772, -3.3741, -2.6127, 0.574, -2.4362, -3.6972, -1.0446, 9.9425, 2.7657, 3.5532, -3.7996, 2.0916, -5.0304, -3.9207, -9.0825, 2.3378, 3.9537, -1.0363, -11.3515, -2.6287, 30.6823, -4.0409, 0.7543, -2.8899, -3.0697, -0.6833, -2.2942, -2.1847, 4.4067, -2.4156, 4.2431, -0.0712, -3.7738, 1.3988, 7.6198, 6.2064, -5.9746, 0.8384, 8.5136, -8.6159, 3.8864, -2.0624, -4.6018, 2.801, 1.5305, -3.4478, 0.555, -1.443, 2.1376, 3.0497, 10.2418, -0.6784, -0.4287, -1.8162, -3.5604, -5.7638, -1.6088, -3.4644, 2.6326, 4.8454, -0.0129, 5.2212, 1.2538, 1.1603, 3.4528, 2.1786, -2.4778, -2.629, -1.4906, -0.1491, 2.6019, 0.6794, -2.7756, -5.0355, -0.7124, -6.4041, 0.2431, 1.1633, -0.2067, -3.8111, -3.0373, 43.8725, 0.2153, 0.1234, -4.7272, -3.6907, 43.9328, 0.9895, 3.7785, 2.8709, 1.6141, -1.5827, 0.9475, -219.3317, 4.6197, -3.9421, 1.4826, -2.4069, -2.7477, 4.1786, 11.4708, -0.347, 5.4015, -2.0308,

In [43]:
lr_model_scl.weights

DenseVector([-11.4511, -3.2815, -3.6537, -3.0673, -0.0372, 2.034, 0.4188, -2.5438, -2.7318, -2.8168, -0.2831, -1.8089, -7.0138, -1.8566, 0.4482, -1.9633, -8.2874, -1.2621, 2.0454, 2.0141, 2.4223, -2.3583, 1.4849, -2.9582, -2.7843, -5.5397, 2.2975, 3.853, -1.2892, -9.68, -3.1039, 5.3005, -2.6745, 0.677, -2.1045, -2.7339, -0.7638, -4.5957, -1.4675, 2.6887, -2.2734, 3.2028, -0.0844, -12.9716, 1.0567, 4.7294, 3.1208, -5.4078, 0.702, 4.1672, -4.7462, 3.6552, -7.1768, -4.1374, 1.2928, 1.1824, -4.7474, 0.8878, -1.0377, 1.5948, 3.1112, 6.9797, -0.5121, -0.5832, -2.8212, -3.0118, -4.1973, -1.8991, -1.957, 5.4485, 4.8496, -0.0116, 4.8359, 1.6619, 0.9438, 2.5153, 1.4847, -1.6162, -3.5886, -1.1634, -0.4621, 1.6149, 0.6481, -5.0769, -3.0195, -0.2601, -7.9288, 0.1535, 1.0864, -0.3252, -3.2268, -1.8858, 4.6268, 0.2491, 0.1089, -8.5181, -4.2778, 9.1526, 1.1497, 2.4637, 2.1919, 6.176, -0.948, 0.6729, -6.4237, 2.3245, -1.5763, 0.8202, -1.2416, -1.8464, 1.3648, 5.3247, -0.3361, 4.6507, -1.048, -4.5687, 2

### Predict

Once the model is trained, we can perform predictions.

In [44]:
raw_preds = lr_model_raw.predict(ml_data_test.map(lambda lpoint: lpoint.features))
scl_preds = lr_model_scl.predict(ml_data_test_scl.map(lambda lpoint: lpoint.features))

In [45]:
raw_preds.take(1)

[1]

In [46]:
scl_preds.take(1)

[1]

### Serving and Persistence

Many times, once we train our model, we save it and the load it in oder programs to make predictions. We try first the internal format of Spark, which allows us to save and load a model.

In [47]:
!rm -rf ../data/lr_model_raw
lr_model_raw.save(sc, "../data/lr_model_raw")

In [48]:
lr_model_raw_loaded = LogisticRegressionModel.load(sc, "../data/lr_model_raw")

In [49]:
raw_preds_loaded = lr_model_raw_loaded.predict(ml_data_test.map(lambda lpoint: lpoint.features))

In [50]:
raw_preds_loaded.take(1)

[1]

### Model Evaluation

MLlib includes some functionalities to calculate automatically some metrics of trained ML models. While there are more, here we will evaluate the LR model of the spam classification section using the `BinaryClassificationMetrics` functionality.

In [51]:
ml_data_train.take(1)

[LabeledPoint(1.0, (1000,[51,98,119,174,278,287,289,300,403,477,483,495,529,581,670,783,809,853,859,870,872,895,976],[8.62586820804,8.62586820804,8.62586820804,8.62586820804,8.62586820804,8.62586820804,0.0939834678827,8.62586820804,8.62586820804,8.62586820804,8.62586820804,3.62865593428,8.62586820804,2.12008414791,8.62586820804,0.993951694971,25.8776046241,8.62586820804,8.62586820804,8.62586820804,8.62586820804,8.62586820804,8.62586820804]))]

In [52]:
pred_label_lr = ml_data_test.map(lambda lpoint: (float(lr_model_raw.predict(lpoint.features)), lpoint.label))
metrics_lr = BinaryClassificationMetrics(pred_label_lr)

In [53]:
print("LR model")
print("Area Under PR: {0}".format(metrics_lr.areaUnderPR))
print("Area Under ROC: {0}".format(metrics_lr.areaUnderROC))

LR model
Area Under PR: 0.5788418237362521
Area Under ROC: 0.8663442556970273


## Working with Spark ML

In [54]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, IDF, SQLTransformer, StringIndexer
from pyspark.ml.pipeline import Pipeline

In [55]:
ini_data.show()

+-----+--------------------+----+----+----+
|label|                text| _c2| _c3| _c4|
+-----+--------------------+----+----+----+
|  ham|Go until jurong p...|null|null|null|
|  ham|Ok lar... Joking ...|null|null|null|
| spam|Free entry in 2 a...|null|null|null|
|  ham|U dun say so earl...|null|null|null|
|  ham|Nah I don't think...|null|null|null|
| spam|FreeMsg Hey there...|null|null|null|
|  ham|Even my brother i...|null|null|null|
|  ham|As per your reque...|null|null|null|
| spam|WINNER!! As a val...|null|null|null|
| spam|Had your mobile 1...|null|null|null|
|  ham|I'm gonna be home...|null|null|null|
| spam|SIX chances to wi...|null|null|null|
| spam|URGENT! You have ...|null|null|null|
|  ham|I've been searchi...|null|null|null|
|  ham|I HAVE A DATE ON ...|null|null|null|
| spam|XXXMobileMovieClu...|null|null|null|
|  ham|Oh k...i'm watchi...|null|null|null|
|  ham|Eh u remember how...|null|null|null|
|  ham|Fine if that��s t...|null|null|null|
| spam|England v Macedon...|null

In [56]:
sql_select = SQLTransformer(statement = "SELECT label, text FROM __THIS__")

In [57]:
sql_filter = SQLTransformer(statement = "SELECT * from __THIS__ WHERE text is not null AND label is not null")

In [58]:
label_indexer = StringIndexer(inputCol="label", outputCol="label_num")

In [59]:
tokenizer = Tokenizer(inputCol = "text", outputCol = "text_token")

In [60]:
tf = HashingTF(numFeatures = 1000, inputCol = "text_token", outputCol = "text_tf")

In [61]:
idf = IDF(inputCol="text_tf", outputCol="features")

In [62]:
lr = LogisticRegression(featuresCol="features", labelCol="label_num")

In [63]:
ml_pipeline = Pipeline(stages=[sql_select, sql_filter, label_indexer, tokenizer, tf, idf, lr])

In [64]:
ml_pipeline_model = ml_pipeline.fit(ini_data)

In [65]:
ml_pipeline_model.transform(ini_data).show(5)

+-----+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|                text|label_num|          text_token|             text_tf|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  ham|Go until jurong p...|      0.0|[go, until, juron...|(1000,[7,77,150,1...|(1000,[7,77,150,1...|[46.1925496142281...|[1.0,9.5478469531...|       0.0|
|  ham|Ok lar... Joking ...|      0.0|[ok, lar..., joki...|(1000,[20,316,484...|(1000,[20,316,484...|[22.7239392220272...|[0.99999999999972...|       0.0|
| spam|Free entry in 2 a...|      1.0|[free, entry, in,...|(1000,[30,35,73,1...|(1000,[30,35,73,1...|[-49.707099745267...|[5.78975520257700...|       1.0|
|  ham|U dun say so earl...|      0.0|[u, dun, say, so,...|(1000,[57,3