### Normalization vs Standardization

In [3]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

from pyspark.sql import SparkSession, SQLContext

In [4]:
# Reuse the active SparkSession if one exists; otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [7]:
# Three sample rows; the vector components span very different magnitudes
# (tens, tens of thousands, single digits).
features_df = spark.createDataFrame(
    data=[
        (1, Vectors.dense([10.0, 10000.0, 1.0])),
        (2, Vectors.dense([20.0, 30000.0, 2.0])),
        (3, Vectors.dense([30.0, 60000.0, 3.0])),
    ],
    schema=["id", "features"],
)

In [8]:
# Display the raw, unscaled feature vectors.
features_df.show()

+---+------------------+
| id|          features|
+---+------------------+
|  1|[10.0,10000.0,1.0]|
|  2|[20.0,30000.0,2.0]|
|  3|[30.0,60000.0,3.0]|
+---+------------------+



In [9]:
# Min-max scaler: reads the "features" vector column and writes the rescaled
# vectors to a new "sfeatures" column.
feature_scaler = MinMaxScaler(inputCol="features", outputCol="sfeatures")

In [10]:
# fit() learns the scaling statistics from features_df; transform() then
# appends the scaled "sfeatures" column to the same rows.
smodel = feature_scaler.fit(features_df)
sfeatures_df = smodel.transform(features_df)

In [11]:
# Every component is mapped into [0, 1]. Row 1 holds the minimum of each
# component, so it scales to all zeros and is printed in sparse form as
# (3,[],[]) -- a length-3 vector with no non-zero entries.
sfeatures_df.show()

+---+------------------+-------------+
| id|          features|    sfeatures|
+---+------------------+-------------+
|  1|[10.0,10000.0,1.0]|    (3,[],[])|
|  2|[20.0,30000.0,2.0]|[0.5,0.4,0.5]|
|  3|[30.0,60000.0,3.0]|[1.0,1.0,1.0]|
+---+------------------+-------------+



In [13]:
from pyspark.ml.feature import StandardScaler

In [14]:
# Standardizing scaler: withMean=True centers each vector component and
# withStd=True scales it to unit standard deviation.
feature_stand_scaler = StandardScaler(
    inputCol="features",
    outputCol="stdfeatures",
    withMean=True,
    withStd=True,
)

In [15]:
# Fit the standardizer on the unscaled features_df (not on the min-max
# output), then append the standardized "stdfeatures" column.
stdmodel = feature_stand_scaler.fit(features_df)
stdfeatures_df = stdmodel.transform(features_df)

In [16]:
# show() truncates the long vectors; the visible first component comes out
# as -1.0, 0.0, 1.0 across the three rows, i.e. centered on zero.
stdfeatures_df.show()

+---+------------------+--------------------+
| id|          features|         stdfeatures|
+---+------------------+--------------------+
|  1|[10.0,10000.0,1.0]|[-1.0,-0.92717264...|
|  2|[20.0,30000.0,2.0]|[0.0,-0.132453235...|
|  3|[30.0,60000.0,3.0]|[1.0,1.0596258856...|
+---+------------------+--------------------+



### Bucketizer

In [17]:
from pyspark.ml.feature import Bucketizer

In [18]:
# Bucket boundaries. The infinite end points leave the first and last buckets
# open-ended, so every finite value falls into some bucket.
splits = [float("-inf"), -10.0, 0.0, 10.0, float("inf")]

In [21]:
# One 1-tuple per row, so the list can be loaded as a single-column DataFrame.
b_data = [(value,) for value in (-800.0, -10.5, -1.7, 0.0, 8.2, 90.1)]
b_data

[(-800.0,), (-10.5,), (-1.7,), (0.0,), (8.2,), (90.1,)]

In [22]:
# Load the sample values into a DataFrame with a single "features" column.
b_df = spark.createDataFrame(data=b_data, schema=["features"])
b_df.show()

+--------+
|features|
+--------+
|  -800.0|
|   -10.5|
|    -1.7|
|     0.0|
|     8.2|
|    90.1|
+--------+



In [24]:
# Map each value to the index of the bucket it falls in. Buckets include their
# lower bound and exclude their upper bound: in the output, 0.0 lands in
# bucket 2.0 (the 0.0..10.0 range) and -10.5 lands in bucket 0.0.
bucketizer = Bucketizer(
    inputCol="features",
    outputCol="bfeatures",
    splits=splits,
)
bucketed_df = bucketizer.transform(b_df)
bucketed_df.show()

+--------+---------+
|features|bfeatures|
+--------+---------+
|  -800.0|      0.0|
|   -10.5|      0.0|
|    -1.7|      1.0|
|     0.0|      2.0|
|     8.2|      2.0|
|    90.1|      3.0|
+--------+---------+



### Tokenizer

In [25]:
from pyspark.ml.feature import Tokenizer

In [27]:
# Small text corpus: one sentence per row, keyed by an integer id.
sentences_df = spark.createDataFrame(
    data=[
        (1, "This is an introduction to Spark MLlib"),
        (2, "MLlib includes libraries for classification and Regression"),
        (3, "It also contains supporting tools for pipelines"),
    ],
    schema=["id", "sentences"],
)

In [28]:
# Display the corpus; show() truncates the long sentence strings.
sentences_df.show()

+---+--------------------+
| id|           sentences|
+---+--------------------+
|  1|This is an introd...|
|  2|MLlib includes li...|
|  3|It also contains ...|
+---+--------------------+



In [32]:
# Split each sentence into a list of words, written to a new "words" column.
# The output shows the tokens are lowercased ("This" -> "this").
sent_token = Tokenizer(
    inputCol="sentences",
    outputCol="words",
)
sent_tokenized_df = sent_token.transform(sentences_df)

In [33]:
# Display the sentences next to their lowercased token lists.
sent_tokenized_df.show()

+---+--------------------+--------------------+
| id|           sentences|               words|
+---+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|
|  2|MLlib includes li...|[mllib, includes,...|
|  3|It also contains ...|[it, also, contai...|
+---+--------------------+--------------------+



### TF-IDF (Term Frequency–Inverse Document Frequency)

In [59]:
from pyspark.ml.feature import HashingTF, IDF

In [50]:
# Re-display the corpus for reference; this is the same sentences_df built in
# the Tokenizer section.
sentences_df.show()

+---+--------------------+
| id|           sentences|
+---+--------------------+
|  1|This is an introd...|
|  2|MLlib includes li...|
|  3|It also contains ...|
+---+--------------------+



In [51]:
# Hash every word into one of 20 slots and count occurrences per slot.
# NOTE(review): with so few slots, distinct words collide -- the first sentence
# has seven distinct words but yields only six non-zero indices (index 6
# counts 2.0). Spark's docs advise a power-of-two numFeatures; confirm before
# reusing this value outside the demo.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)

In [52]:
# Turn each token list in "words" into a length-20 term-count vector, stored
# in the new "rawFeatures" column.
sent_hfTF_df = hashingTF.transform(sent_tokenized_df)

In [53]:
# rawFeatures prints as a sparse vector (size, [indices], [values]); show()
# truncates it, so only the leading indices are visible here.
sent_hfTF_df.show()

+---+--------------------+--------------------+--------------------+
| id|           sentences|               words|         rawFeatures|
+---+--------------------+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|(20,[6,8,9,10,13,...|
|  2|MLlib includes li...|[mllib, includes,...|(20,[2,4,11,12,15...|
|  3|It also contains ...|[it, also, contai...|(20,[1,4,6,8,11,1...|
+---+--------------------+--------------------+--------------------+



In [54]:
# Fetch the first row as a one-element list to see the untruncated
# term-count vector.
sent_hfTF_df.head(1)

[Row(id=1, sentences='This is an introduction to Spark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'], rawFeatures=SparseVector(20, {6: 2.0, 8: 1.0, 9: 1.0, 10: 1.0, 13: 1.0, 15: 1.0}))]

In [63]:
# IDF estimator: reads the raw term counts and writes the re-weighted vectors
# to a new "idf_features" column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="idf_features",
)

In [64]:
# Fit the IDF weights on the hashed term counts of the whole corpus.
idfModel = idf.fit(sent_hfTF_df)

# Apply the fitted weights, adding the "idf_features" column, and display it.
tfidf_df = idfModel.transform(sent_hfTF_df)
tfidf_df.show()

+---+--------------------+--------------------+--------------------+--------------------+
| id|           sentences|               words|         rawFeatures|        idf_features|
+---+--------------------+--------------------+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|(20,[6,8,9,10,13,...|(20,[6,8,9,10,13,...|
|  2|MLlib includes li...|[mllib, includes,...|(20,[2,4,11,12,15...|(20,[2,4,11,12,15...|
|  3|It also contains ...|[it, also, contai...|(20,[1,4,6,8,11,1...|(20,[1,4,6,8,11,1...|
+---+--------------------+--------------------+--------------------+--------------------+



In [65]:
# Fetch the first row as a one-element list to compare raw counts with their
# TF-IDF weights. Indices 8 and 15 (count 1.0) weigh 0.2877, indices 9, 10 and
# 13 (count 1.0) weigh 0.6931, and index 6 (count 2.0) weighs 0.5754. These
# values are consistent with idf = ln((m + 1) / (df + 1)) for m = 3 documents.
tfidf_df.head(1)

[Row(id=1, sentences='This is an introduction to Spark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'], rawFeatures=SparseVector(20, {6: 2.0, 8: 1.0, 9: 1.0, 10: 1.0, 13: 1.0, 15: 1.0}), idf_features=SparseVector(20, {6: 0.5754, 8: 0.2877, 9: 0.6931, 10: 0.6931, 13: 0.6931, 15: 0.2877}))]