## PySpark MLlib - Word2Vec

In [None]:
from pyspark.mllib.feature import Word2Vec

# Tokenize each line on any run of whitespace. split() with no argument
# avoids the empty-string tokens that split(" ") produces for repeated,
# leading, or trailing spaces; those would otherwise be treated as words.
inp = sc.textFile("./sample_ida_data.txt").map(lambda row: row.split())

word2vec = Word2Vec()
model = word2vec.fit(inp)

# Five nearest neighbours of the token '1'.
synonyms = model.findSynonyms('1', 5)

# findSynonyms yields (word, cosine similarity) pairs: higher means closer,
# so the score is a similarity, not a distance.
for word, similarity in synonyms:
    print("{}: {}".format(word, similarity))

#### PySpark 3 예제

In [25]:
# 100 "a b" pairs followed by 10 "a c" pairs. The final "" is the empty
# token that splitting a space-terminated string on " " leaves at the end.
sent = ["a", "b"] * 100 + ["a", "c"] * 10 + [""]
# Two identical rows, each holding the token list in a single column
# named "sentence".
doc = spark.createDataFrame([(sent,)] * 2, schema=["sentence"])
doc

sentence
"[a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, ..."
"[a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, ..."


In [44]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import Word2VecModel
# Configure the estimator in a single call: 5-dimensional vectors, a fixed
# seed, 10 iterations, reading the "sentence" column and writing the result
# to a column named "model".
w2v = Word2Vec(
    vectorSize=5,
    seed=42,
    inputCol="sentence",
    outputCol="model",
    maxIter=10,
)
w2v

Word2Vec_544aacb2bd56

In [28]:
# Read back the estimator's current maxIter setting.
w2v.getMaxIter()

10

In [29]:
# Remove the maxIter value from the estimator before training
# (presumably so fit() uses the library default; confirm against the
# Params.clear documentation).
w2v.clear(w2v.maxIter)
# Train on the `doc` DataFrame; `model` is what the following cells inspect.
model = w2v.fit(doc)
# Display the fitted model's minCount. No minCount was passed when the
# estimator was constructed in this notebook.
model.getMinCount()

5

In [30]:
# Set the fitted model's input column to "sentence"; the call's return value
# is displayed as the cell output.
model.setInputCol("sentence")

Word2VecModel: uid=Word2Vec_bfa0f6b2ee9e, numWords=3, vectorSize=5

In [31]:
# Print the word -> vector table held by the fitted model.
model.getVectors().show()

+----+--------------------+
|word|              vector|
+----+--------------------+
|   a|[0.09511678665876...|
|   b|[-1.2028766870498...|
|   c|[0.30153277516365...|
+----+--------------------+



In [32]:
# Ask for the 2 closest words to "a"; the result is displayed as a list of
# (word, score) tuples.
model.findSynonymsArray("a", 2)

[('b', 0.015859870240092278), ('c', -0.5680795907974243)]

In [33]:
from pyspark.sql.functions import format_number as fmt

# Same lookup as a DataFrame, with the similarity column formatted to
# 5 decimal places. The formatted column is left unaliased here, so its
# header is the generated expression name.
similarity_5dp = fmt("similarity", 5)
model.findSynonyms("a", 2).select("word", similarity_5dp)

word,"format_number(similarity, 5)"
b,0.01586
c,-0.56808


In [34]:
# Format the similarity to 5 decimal places and alias it back to
# "similarity" so the displayed column keeps a readable header.
similarity_col = fmt("similarity", 5).alias("similarity")
model.findSynonyms("a", 2).select("word", similarity_col)

word,similarity
b,0.01586
c,-0.56808


In [35]:
# Apply the model to `doc` and show the "model" field of the first row.
# "model" is the outputCol name given to the estimator, so this is the
# vector produced for the sentence, not the model object itself.
model.transform(doc).head().model

DenseVector([-0.4833, 0.1855, -0.273, -0.0509, -0.4769])

#### save model

In [37]:
temp_path = "./"
# Strip any trailing slash before joining so the result is "./word2vec"
# rather than ".//word2vec".
w2vPath = temp_path.rstrip("/") + "/word2vec"
# A plain save() raises when the target already exists, which breaks
# re-running the notebook; replace the previously saved estimator instead.
w2v.write().overwrite().save(w2vPath)

In [39]:
# Strip any trailing slash from temp_path so the result has a single
# separator (e.g. "./word2vec-model", not ".//word2vec-model").
modelPath = temp_path.rstrip("/") + "/word2vec-model"
# A plain save() raises when the target already exists, which breaks
# re-running the notebook; replace the previously saved model instead.
model.write().overwrite().save(modelPath)

                                                                                

#### load model

In [42]:
# Reload the saved estimator and check, one setting per printed line, that
# it reports the same values as the in-memory estimator.
loadedw2v = Word2Vec.load(w2vPath)
for getter_name in ("getVectorSize", "getNumPartitions", "getMinCount"):
    print(getattr(loadedw2v, getter_name)() == getattr(w2v, getter_name)())

True
True
True


In [46]:
# Reload the saved model, then print the first vocabulary word of the
# reloaded model followed by that of the in-memory model for comparison.
loadedModel = Word2VecModel.load(modelPath)
for candidate in (loadedModel, model):
    print(candidate.getVectors().first().word)

a
a


In [47]:
# Print the first word's vector from the reloaded model and then from the
# in-memory model so the two can be compared by eye.
for candidate in (loadedModel, model):
    print(candidate.getVectors().first().vector)

[0.09511678665876389,0.3910670876502991,-0.4300164580345154,-0.14111702144145966,-0.06563225388526917]
[0.09511678665876389,0.3910670876502991,-0.4300164580345154,-0.14111702144145966,-0.06563225388526917]


In [48]:
# Transform `doc` with the reloaded model and then with the in-memory model,
# printing the first resulting row of each (the row includes the full
# input sentence as well as the output column).
for candidate in (loadedModel, model):
    print(candidate.transform(doc).take(1))

[Row(sentence=['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 