### Pyspark訓練

In [1]:
# Initial setup
import findspark
# Must run before the pyspark import below: findspark locates the local Spark
# installation and makes the pyspark package importable.
findspark.init()

from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("test pyspark")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

In [2]:
# Import the CSV; the first row is used as the header (column names).
# No schema is supplied or inferred, so every column is read as a string.
df = spark.read.option("header", True).csv("wordsegs_data.csv")
df.show(n=5)

+-------------+-------------+-----------------+--------------------------+-------------------------------------+--------------------------------+
|lication_nums|release_dates|application_dates|                    titles|                            summaries|                           words|
+-------------+-------------+-----------------+--------------------------+-------------------------------------+--------------------------------+
|      M620346|   2021/11/21|       2021/09/29|        川流式水力發電系統|本創作係揭露一種川流式水力發電系統...|一種 川流 水力 發電系統 包括 ...|
|      M620327|   2021/11/21|       2021/09/06|                  撐開螺絲|本創作係一種撐開螺絲，撐開螺絲包含...|一種 開螺絲 用以 將一連 接件 ...|
|      M620322|   2021/11/21|       2021/09/03|具食物保鮮致冷裝置之烹煮台|一種具食物保鮮致冷裝置之烹煮台，主...|一種 食物 保鮮 致冷 裝置 烹煮...|
|      M620321|   2021/11/21|       2021/09/03|              餐飲製作設備|一種餐飲製作設備，主要包括有一點餐...|一種 設有 一容置 空間 容置 空...|
|      M620318|   2021/11/21|       2021/09/02|      應用程式異常偵測系統|本創作為一種應用程式異常偵測系統，...|一種 應用 程式 異常 偵測 系統...|
+-------------+----

In [3]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Convert the space-delimited `words` string into an array column.
words_as_array = F.split(F.col("words"), " ")
df = df.withColumn("words_split", words_as_array)

In [4]:
from pyspark.ml.feature import Word2Vec

# Configure the Word2Vec estimator over the tokenised `words_split` column.
#   vectorSize = 5 -> each output vector has 5 dimensions
#   minCount   = 0 -> no token is dropped for being too rare
#   seed       = 42 -> explicit seed so that re-running the notebook on a fresh
#                      kernel starts training from the same initialisation;
#                      without it PySpark falls back to a default seed that is
#                      not guaranteed to be stable between Python processes.
word2Vec = Word2Vec(
        vectorSize = 5,
        minCount = 0,
        seed = 42,
        inputCol = "words_split",
        outputCol = "word2Vec")

# Learn the embeddings from every row of `df`.
model = word2Vec.fit(df)
# Append a `word2Vec` vector column (one vector per row) to the input frame.
df_word2vec = model.transform(df)
df_word2vec.printSchema()

root
 |-- lication_nums: string (nullable = true)
 |-- release_dates: string (nullable = true)
 |-- application_dates: string (nullable = true)
 |-- titles: string (nullable = true)
 |-- summaries: string (nullable = true)
 |-- words: string (nullable = true)
 |-- words_split: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- word2Vec: vector (nullable = true)



In [5]:
# Preview the trained vectors: first three rows, printed without truncation.
# The lower-case name `word2vec` still resolves to the `word2Vec` column because
# Spark SQL column resolution is case-insensitive by default.
vector_preview = df_word2vec.select("word2vec")
vector_preview.show(n=3, truncate=False)

+-----------------------------------------------------------------------------------------------------------+
|word2vec                                                                                                   |
+-----------------------------------------------------------------------------------------------------------+
|[-0.09264205933562714,-0.010777258267144273,-0.07921142776091919,0.0019313330530779426,0.04196841834561417]|
|[-0.07528771340774039,0.08003500958821222,-0.11259922331612585,-0.07165798314138135,-0.04151494064978185]  |
|[-0.011081156076455665,0.04892607512061405,-0.0727791852614291,-0.05979215509874459,0.10881402688703282]   |
+-----------------------------------------------------------------------------------------------------------+
only showing top 3 rows



In [6]:
# Save the trained data as CSV.
# Collect the needed columns to the driver ONCE and write both files from the
# same pandas frame. This avoids running the Spark job (and the driver-side
# collect) twice, and guarantees both files contain the same rows in the same
# order.
export_columns = ["lication_nums", "release_dates", "application_dates",
                  "titles", "summaries", "word2vec"]
export_pdf = df_word2vec.select(*export_columns).toPandas()

# Full record: metadata columns plus the vector column.
export_pdf.to_csv("word2vec_data.csv", index = False)

# Save the embedding separately: identifier plus vector only.
export_pdf[["lication_nums", "word2vec"]] \
            .to_csv("only_word2vec_data.csv", index = False)