# Working with Text data

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Into")
    .getOrCreate()
)

24/01/13 20:12:41 WARN Utils: Your hostname, quangtn933.local resolves to a loopback address: 127.0.0.1; using 192.168.1.90 instead (on interface en0)
24/01/13 20:12:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/13 20:12:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.ml.feature import Word2Vec

In [5]:
# One row per document; the single "text" column holds the sentence
# already split into a list of words.
documentDF = spark.createDataFrame(
    [
        (sentence.split(" "),)
        for sentence in (
            "Hi I heard about Spark",
            "I wish Java could use case classes",
            "Logistic regression models are neat",
        )
    ],
    ["text"],
)

In [8]:
# Learning a mapping from words to Vectors.
# Word2Vec training is stochastic, so the seed is pinned explicitly to keep the
# learned vectors stable across "Restart Kernel -> Run All".
# NOTE: vectors captured in earlier outputs were produced without this seed and
# may differ after a re-run.
word2Vec = Word2Vec(
    vectorSize=3,       # dimensionality of every word/document vector
    minCount=0,         # keep every word, even those that occur once
    seed=42,
    inputCol="text",
    outputCol="result",
)
word2Vec

Word2Vec_b8bd6d710576

In [9]:
# Train the Word2Vec model on the three documents.
model = word2Vec.fit(documentDF)

# Append the "result" vector column to the input frame, then display the
# resulting schema.
result = model.transform(documentDF)
result

24/01/12 13:25:03 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/01/12 13:25:03 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


DataFrame[text: array<string>, result: vector]

In [10]:
# Pull every row to the driver and print each document next to its vector.
for text, vector in result.collect():
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

[Stage 5:>                                                          (0 + 8) / 8]

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.046886457502841955,-0.07654499709606172,-0.008032251521945]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [0.017943344394942478,-0.0315708705623235,-0.009133852552622557]

Text: [Logistic, regression, models, are, neat] => 
Vector: [0.016518119536340237,-0.009977189078927041,-0.005242400616407395]



                                                                                

In [12]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import IntegerType

In [13]:
# Three sentences with deliberately awkward separators (pipes, runs of
# spaces, commas) to contrast the two tokenizers.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, "Hi|I|heard|about|Spark"),
        (1, "I     wish Java     could  use case   classes"),
        (2, "Logistic, regression, model,are,neat"),
    ],
    schema=["id", "sentence"],
)

In [15]:
# Plain tokenizer: reads "sentence" and writes the token array to "words".
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Regex tokenizer: the pattern "\\W" makes every non-word character a separator.
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+" with gaps=False (match the tokens, not the gaps)


def countTokens(words):
    """Return a Column holding the number of elements in the array column ``words``.

    Uses Spark's built-in ``size`` instead of a Python UDF wrapping ``len``:
    the work stays inside the JVM, and a null array no longer raises a
    ``TypeError`` in the Python worker.

    Parameters
    ----------
    words : Column or str
        The array column whose elements are counted.

    Returns
    -------
    Column
        Integer column with the element count of each row's array.
    """
    return size(words)

In [16]:
# Run the plain tokenizer and show each sentence with its tokens and token count.
tokenized = tokenizer.transform(sentenceDataFrame)
(
    tokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)

                                                                                

+---------------------------------------------+----------------------------------------------------------------+------+
|sentence                                     |words                                                           |tokens|
+---------------------------------------------+----------------------------------------------------------------+------+
|Hi|I|heard|about|Spark                       |[hi|i|heard|about|spark]                                        |1     |
|I     wish Java     could  use case   classes|[i, , , , , wish, java, , , , , could, , use, case, , , classes]|18    |
|Logistic, regression, model,are,neat         |[logistic,, regression,, model,are,neat]                        |3     |
+---------------------------------------------+----------------------------------------------------------------+------+



In [17]:
# Same display as the previous cell, this time with the regex tokenizer.
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
(
    regexTokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)

+---------------------------------------------+------------------------------------------+------+
|sentence                                     |words                                     |tokens|
+---------------------------------------------+------------------------------------------+------+
|Hi|I|heard|about|Spark                       |[hi, i, heard, about, spark]              |5     |
|I     wish Java     could  use case   classes|[i, wish, java, could, use, case, classes]|7     |
|Logistic, regression, model,are,neat         |[logistic, regression, model, are, neat]  |5     |
+---------------------------------------------+------------------------------------------+------+



In [18]:
from pyspark.ml.feature import StopWordsRemover

In [20]:
# Two pre-tokenized sentences; "raw" is the token-array column to be filtered.
sentenceData = spark.createDataFrame(
    data=[
        (0, ["I", "saw", "the", "red", "ballon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "raw"],
)

In [22]:
# Drop stop words from "raw" and write the remaining tokens to "filtered".
# The locale is pinned to en_US: when the machine's default locale is not
# known to the JVM, Spark logs a warning and falls back to en_US anyway, so
# setting it explicitly gives the same result without depending on the host.
remover = StopWordsRemover(inputCol="raw", outputCol="filtered", locale="en_US")
remover.transform(sentenceData).show(truncate=False)

24/01/12 14:29:33 WARN StopWordsRemover: Default locale set was [en_VN]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
                                                                                

+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, ballon]  |[saw, red, ballon]  |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



## Code Example >> Tokenizer >> N-Gram

In [24]:
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [25]:
# Input sentences for the n-gram example.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, "Hi I heard about Spark"),
        (1, "I wish, wish Java, Java could"),
        (2, "Logistic regression, regression models"),
    ],
    schema=["id", "sentence"],
)

In [26]:
# Tokenizer that reads "sentence" and writes the token array to "words".
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

# UDF returning the length of a token array as an integer.
countTokens = udf(lambda tokens: len(tokens), IntegerType())

# Tokenized sentences; this frame feeds the NGram transformer below.
wordDataFrame = tokenizer.transform(sentenceDataFrame)

In [27]:
# Display the DataFrame repr (column names and types) to confirm the "words" column.
wordDataFrame

DataFrame[id: bigint, sentence: string, words: array<string>]

In [28]:
# Bigram transformer: builds 2-token n-grams from "words" into "ngrams".
ngram = NGram(
    n=2,
    inputCol="words",
    outputCol="ngrams",
)

In [29]:
# Generate the bigrams and print them in full (no column truncation).
ngramDataFrame = ngram.transform(wordDataFrame)
(
    ngramDataFrame
    .select("ngrams")
    .show(truncate=False)
)

+-----------------------------------------------------------------+
|ngrams                                                           |
+-----------------------------------------------------------------+
|[hi i, i heard, heard about, about spark]                        |
|[i wish,, wish, wish, wish java,, java, java, java could]        |
|[logistic regression,, regression, regression, regression models]|
+-----------------------------------------------------------------+



In [32]:
from pyspark.ml.feature import Binarizer

# Small continuous-valued frame used to demonstrate thresholding.
continuousDataFrame = spark.createDataFrame(
    data=[
        (0, 5.1),
        (1, 5.8),
        (2, 0.2),
    ],
    schema=["id", "feature"],
)

In [33]:
# Values of "feature" above the 0.5 threshold become 1.0, the rest 0.0.
binarizer = Binarizer(
    threshold=0.5,
    inputCol="feature",
    outputCol="binarized_feature",
)
binarizedDataFrame = binarizer.transform(continuousDataFrame)

In [34]:
# Report the configured threshold, then show the binarized column.
threshold_message = "Binarizer output with Threshold = %f" % binarizer.getThreshold()
print(threshold_message)
binarizedDataFrame.show()

Binarizer output with Threshold = 0.500000


                                                                                

+---+-------+-----------------+
| id|feature|binarized_feature|
+---+-------+-----------------+
|  0|    5.1|              1.0|
|  1|    5.8|              1.0|
|  2|    0.2|              0.0|
+---+-------+-----------------+



In [35]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

In [36]:
# Three 5-dimensional feature vectors: one sparse (non-zeros at indices 1 and 3)
# and two dense. Each is wrapped in a 1-tuple so it becomes a single-column row.
data = [
    (Vectors.sparse(5, [1, 3], [1.0, 7.0]),),
    (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),),
]

df = spark.createDataFrame(data, schema=["features"])

In [37]:
# Display the DataFrame repr to confirm the single vector-typed "features" column.
df

DataFrame[features: vector]

In [38]:
# Configure PCA to project "features" onto 3 principal components, then fit it.
pca = PCA(
    k=3,
    inputCol="features",
    outputCol="pcaFeatures",
)
model = pca.fit(df)

24/01/12 15:05:43 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [39]:
# Project the data and keep only the 3-dimensional PCA output column.
projected = model.transform(df)
result = projected.select("pcaFeatures")
result.show(truncate=False)

+------------------------------------------------------------+
|pcaFeatures                                                 |
+------------------------------------------------------------+
|[1.6485728230883814,-4.0132827005162985,-1.0091435193998504]|
|[-4.645104331781533,-1.1167972663619048,-1.0091435193998504]|
|[-6.428880535676488,-5.337951427775359,-1.0091435193998508] |
+------------------------------------------------------------+



In [40]:
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

In [42]:
# Two 2-dimensional vectors to expand into polynomial features.
df = spark.createDataFrame(
    data=[
        (Vectors.dense([2.0, 1.0]),),
        (Vectors.dense([0.0, 0.0]),),
    ],
    schema=["features"],
)

In [43]:
# Expand "features" into a degree-5 polynomial feature space ("polyFeatures").
polyExpansion = PolynomialExpansion(
    degree=5,
    inputCol="features",
    outputCol="polyFeatures",
)
polyDF = polyExpansion.transform(df)

In [44]:
# Print the original and expanded vectors in full (no column truncation).
polyDF.show(truncate=False)

+---------+------------------------------------------------------------------------------------+
|features |polyFeatures                                                                        |
+---------+------------------------------------------------------------------------------------+
|[2.0,1.0]|[2.0,4.0,8.0,16.0,32.0,1.0,2.0,4.0,8.0,16.0,1.0,2.0,4.0,8.0,1.0,2.0,4.0,1.0,2.0,1.0]|
|[0.0,0.0]|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]   |
+---------+------------------------------------------------------------------------------------+



In [45]:
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors

In [46]:
# Three length-4 real-valued signals.
df = spark.createDataFrame(
    data=[
        (Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
        (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
        (Vectors.dense([14.0, -2.0, -5.0, 1.0]),),
    ],
    schema=["features"],
)

# Forward (inverse=False) discrete cosine transform of every feature vector.
dct = DCT(
    inverse=False,
    inputCol="features",
    outputCol="featuresDCT",
)
dctDf = dct.transform(df)
dctDf.select("featuresDCT").show(truncate=False)

                                                                                

+----------------------------------------------------------------+
|featuresDCT                                                     |
+----------------------------------------------------------------+
|[1.0,-1.1480502970952693,2.0000000000000004,-2.7716385975338604]|
|[-1.0,3.378492794482933,-7.000000000000001,2.9301512653149677]  |
|[4.0,9.304453421915744,11.000000000000002,1.5579302036357163]   |
+----------------------------------------------------------------+



In [47]:
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors

In [48]:
# Feature vectors whose components live on very different scales.
dataFrame = spark.createDataFrame(
    data=[
        (0, Vectors.dense([1.0, 0.1, -8.0, 200]),),
        (1, Vectors.dense([2.0, 1.0, -4.0, 2]),),
        (2, Vectors.dense([4.0, 10.0, 8.0, 0]),),
    ],
    schema=["id", "features"],
)

In [49]:
# MaxAbsScaler estimator: reads "features" and writes "scaledFeatures".
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

In [51]:
# Fitting gathers the per-feature statistics and yields a MaxAbsScalerModel.
scalerModel = scaler.fit(dataFrame)

# Applying the model rescales every feature into the range [-1, 1].
scaledData = scalerModel.transform(dataFrame)

                                                                                

In [52]:
# Compare the raw and rescaled vectors side by side (default truncation applies).
scaledData.select("features", "scaledFeatures").show()

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[1.0,0.1,-8.0,200.0]|[0.25,0.010000000...|
|  [2.0,1.0,-4.0,2.0]| [0.5,0.1,-0.5,0.01]|
|  [4.0,10.0,8.0,0.0]|   [1.0,1.0,1.0,0.0]|
+--------------------+--------------------+



In [53]:
from pyspark.ml.feature import Bucketizer

In [59]:
# Bucket boundaries: 5 split points define 4 buckets, open-ended on both sides.
splits = [float("-inf"), -0.5, 0.0, 0.5, float("inf")]

# One value per row, chosen to land in different buckets (including boundaries).
data = [
    (-999.9,),
    (-0.5,),
    (-0.3,),
    (0.0,),
    (0.2,),
    (999.9,),
]
dataFrame = spark.createDataFrame(data, schema=["features"])

In [60]:
# Bucketizer mapping "features" to a bucket index in "bucketedFeatures" using `splits`.
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

In [61]:
# Display the transformer's repr (its auto-generated uid).
bucketizer

Bucketizer_6d5e399063e1

In [62]:
# Apply the bucketizer; the result carries the extra "bucketedFeatures" column.
bucktedData = bucketizer.transform(dataFrame)

In [63]:
# N split points delimit N-1 buckets, hence the "- 1".
print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))

Bucketizer output with 4 buckets


In [65]:
# Show each value next to the index of the bucket it was assigned to.
bucktedData.show()

+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+



In [67]:
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

In [69]:
# Two dense 3-dimensional vectors (sparse vectors would work as well),
# each wrapped in a 1-tuple so it forms a single-column row.
data = [
    (Vectors.dense([1.0, 2.0, 3.0]),),
    (Vectors.dense([4.0, 5.0, 6.0]),),
]

In [70]:
df = spark.createDataFrame(data, schema=["vector"])

# Multiply every input vector component-wise by the fixed scaling vector.
transformer = ElementwiseProduct(
    scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
    inputCol="vector",
    outputCol="transformedVector",
)

# Batch-transform all rows, adding the "transformedVector" column.
transformer.transform(df).show()

+-------------+-----------------+
|       vector|transformedVector|
+-------------+-----------------+
|[1.0,2.0,3.0]|    [0.0,2.0,6.0]|
|[4.0,5.0,6.0]|   [0.0,5.0,12.0]|
+-------------+-----------------+



                                                                                

In [4]:
from pyspark.ml.feature import Imputer

# Two numeric columns with NaN standing in for missing values.
df = spark.createDataFrame(
    data=[
        (1.0, float("nan")),
        (2.0, float("nan")),
        (float("nan"), 3.0),
        (4.0, 4.0),
        (5.0, 5.0),
    ],
    schema=["a", "b"],
)

# Imputer that fills the gaps in "a"/"b" and writes the results to "out_a"/"out_b".
imputer = Imputer(
    inputCols=["a", "b"],
    outputCols=["out_a", "out_b"],
)

In [5]:
# Display the estimator's repr (its auto-generated uid).
imputer

Imputer_57304a316422

24/01/13 15:30:39 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# Fit the imputer on the observed values, then show the original columns
# next to their imputed counterparts.
model = imputer.fit(df)
imputed = model.transform(df)
imputed.show()

                                                                                

+---+---+-----+-----+
|  a|  b|out_a|out_b|
+---+---+-----+-----+
|1.0|NaN|  1.0|  4.0|
|2.0|NaN|  2.0|  4.0|
|NaN|3.0|  3.0|  3.0|
|4.0|4.0|  4.0|  4.0|
|5.0|5.0|  5.0|  5.0|
+---+---+-----+-----+



In [7]:
# Labelled sentences: (id, raw sentence, sentiment label).
sentence_data_frame = spark.createDataFrame(
    data=[
        (0, "Hi i think pyspark is cool", "happy"),
        (1, "All I want is a pyspark cluster", "indifferent"),
        (2, "I finally understand how ML works", "Fulfilled"),
        (3, "Yet another sentence about pyspark and ML", "indifferent"),
        (4, "Why didn't I know aboyt mllib before", "sad"),
        (5, "Yes, I can", "happy"),
    ],
    schema=["id", "sentence", "sentiment"],
)

In [8]:
# Display the DataFrame repr (column names and types).
sentence_data_frame

DataFrame[id: bigint, sentence: string, sentiment: string]

In [9]:
from pyspark.ml.feature import Tokenizer

In [11]:
# Tokenizer that reads "sentence" and writes the token array to "words".
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

In [12]:
# Tokenize every sentence; the result carries the extra "words" column.
tokenized = tokenizer.transform(sentence_data_frame)

In [13]:
# Print all columns in full (no truncation) to inspect the tokens.
tokenized.show(truncate=False)



+---+-----------------------------------------+-----------+-------------------------------------------------+
|id |sentence                                 |sentiment  |words                                            |
+---+-----------------------------------------+-----------+-------------------------------------------------+
|0  |Hi i think pyspark is cool               |happy      |[hi, i, think, pyspark, is, cool]                |
|1  |All I want is a pyspark cluster          |indifferent|[all, i, want, is, a, pyspark, cluster]          |
|2  |I finally understand how ML works        |Fulfilled  |[i, finally, understand, how, ml, works]         |
|3  |Yet another sentence about pyspark and ML|indifferent|[yet, another, sentence, about, pyspark, and, ml]|
|4  |Why didn't I know aboyt mllib before     |sad        |[why, didn't, i, know, aboyt, mllib, before]     |
|5  |Yes, I can                               |happy      |[yes,, i, can]                                   |
+---+-----------------------------------------+-----------+-------------------------------------------------+

                                                                                

In [15]:
from pyspark.ml.feature import StopWordsRemover

In [18]:
# Drop stop words from "words" and write the remaining tokens to "meaningful_words".
# The locale is pinned to en_US: when the machine's default locale is not
# known to the JVM, Spark logs a warning and falls back to en_US anyway, so
# setting it explicitly gives the same result without depending on the host.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="meaningful_words",
    locale="en_US",
)
meaningful_data_frame = remover.transform(tokenized)

# Compare the token lists before and after stop-word removal.
meaningful_data_frame.select("words", "meaningful_words").show(truncate=False)

24/01/13 15:47:31 WARN StopWordsRemover: Default locale set was [en_VN]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


+-------------------------------------------------+-------------------------------------+
|words                                            |meaningful_words                     |
+-------------------------------------------------+-------------------------------------+
|[hi, i, think, pyspark, is, cool]                |[hi, think, pyspark, cool]           |
|[all, i, want, is, a, pyspark, cluster]          |[want, pyspark, cluster]             |
|[i, finally, understand, how, ml, works]         |[finally, understand, ml, works]     |
|[yet, another, sentence, about, pyspark, and, ml]|[yet, another, sentence, pyspark, ml]|
|[why, didn't, i, know, aboyt, mllib, before]     |[know, aboyt, mllib]                 |
|[yes,, i, can]                                   |[yes,]                               |
+-------------------------------------------------+-------------------------------------+



In [20]:
from pyspark.ml.feature import StringIndexer

# Encode the string "sentiment" label as a numeric index in "categoryIndex".
indexer = StringIndexer(
    inputCol="sentiment",
    outputCol="categoryIndex",
)
indexer_model = indexer.fit(meaningful_data_frame)
indexed = indexer_model.transform(meaningful_data_frame)
indexed.show()

                                                                                

+---+--------------------+-----------+--------------------+--------------------+-------------+
| id|            sentence|  sentiment|               words|    meaningful_words|categoryIndex|
+---+--------------------+-----------+--------------------+--------------------+-------------+
|  0|Hi i think pyspar...|      happy|[hi, i, think, py...|[hi, think, pyspa...|          0.0|
|  1|All I want is a p...|indifferent|[all, i, want, is...|[want, pyspark, c...|          1.0|
|  2|I finally underst...|  Fulfilled|[i, finally, unde...|[finally, underst...|          2.0|
|  3|Yet another sente...|indifferent|[yet, another, se...|[yet, another, se...|          1.0|
|  4|Why didn't I know...|        sad|[why, didn't, i, ...|[know, aboyt, mllib]|          3.0|
|  5|          Yes, I can|      happy|      [yes,, i, can]|              [yes,]|          0.0|
+---+--------------------+-----------+--------------------+--------------------+-------------+



In [21]:
# Display the DataFrame repr; "categoryIndex" is a double column.
indexed

DataFrame[id: bigint, sentence: string, sentiment: string, words: array<string>, meaningful_words: array<string>, categoryIndex: double]

In [22]:
# One row per sentence id with a numeric value for each sentiment label.
sentiment_data_frame = spark.createDataFrame(
    data=[
        (0, 0.01, 0.43, 0.3, 0.5),
        (1, 0.097, 0.21, 0.2, 0.9),
        (2, 0.4, 0.329, 0.97, 0.4),
        (3, 0.7, 0.4, 0.3, 0.87),
        (4, 0.34, 0.4, 0.3, 0.78),
        (5, 0.1, 0.3, 0.31, 0.29),
    ],
    schema=["sentence_id", "happy", "indifferent", "Fulfilled", "sad"],
)

sentiment_data_frame.show()

+-----------+-----+-----------+---------+----+
|sentence_id|happy|indifferent|Fulfilled| sad|
+-----------+-----+-----------+---------+----+
|          0| 0.01|       0.43|      0.3| 0.5|
|          1|0.097|       0.21|      0.2| 0.9|
|          2|  0.4|      0.329|     0.97| 0.4|
|          3|  0.7|        0.4|      0.3|0.87|
|          4| 0.34|        0.4|      0.3|0.78|
|          5|  0.1|        0.3|     0.31|0.29|
+-----------+-----+-----------+---------+----+



In [23]:
# Project only the "happy" column, cast to double through a SQL expression.
casted_data_frame = sentiment_data_frame.selectExpr(
    "cast(happy as double)"
)
casted_data_frame.show()

+-----+
|happy|
+-----+
| 0.01|
|0.097|
|  0.4|
|  0.7|
| 0.34|
|  0.1|
+-----+



In [24]:
# Print the column types and nullability of the sentiment frame.
sentiment_data_frame.printSchema()

root
 |-- sentence_id: long (nullable = true)
 |-- happy: double (nullable = true)
 |-- indifferent: double (nullable = true)
 |-- Fulfilled: double (nullable = true)
 |-- sad: double (nullable = true)



In [4]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Labelled sentences used for the TF-IDF example.
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "Hi, I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

# Split each sentence into the "words" token array.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
wordsData = tokenizer.transform(sentenceData)

wordsData

DataFrame[label: double, sentence: string, words: array<string>]

24/01/13 20:12:56 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [5]:
# Preview the tokenized rows (long cells are truncated by default).
wordsData.show()

                                                                                

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi, I heard about...|[hi,, i, heard, a...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [8]:
# Hash every token into one of 20 buckets to get a fixed-length
# term-frequency vector in "rawFeatures".
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)
featurizedData = hashingTF.transform(wordsData)

In [9]:
# Display the DataFrame repr to confirm the vector-typed "rawFeatures" column.
featurizedData

DataFrame[label: double, sentence: string, words: array<string>, rawFeatures: vector]

In [10]:
# Preview the hashed term-frequency vectors (truncated by default).
featurizedData.show()

                                                                                

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi, I heard about...|[hi,, i, heard, a...|(20,[6,11,13,16],...|
|  0.0|I wish Java could...|[i, wish, java, c...|(20,[0,2,7,13,15,...|
|  1.0|Logistic regressi...|[logistic, regres...|(20,[3,4,6,11,19]...|
+-----+--------------------+--------------------+--------------------+



In [12]:
# (CountVectorizer could be used instead of HashingTF to produce the
# term-frequency vectors consumed here.)

# Fit IDF on the raw term frequencies, then rescale them into "features".
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.show()

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  0.0|Hi, I heard about...|[hi,, i, heard, a...|(20,[6,11,13,16],...|(20,[6,11,13,16],...|
|  0.0|I wish Java could...|[i, wish, java, c...|(20,[0,2,7,13,15,...|(20,[0,2,7,13,15,...|
|  1.0|Logistic regressi...|[logistic, regres...|(20,[3,4,6,11,19]...|(20,[3,4,6,11,19]...|
+-----+--------------------+--------------------+--------------------+--------------------+

