In [29]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running in local mode for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("logistic-regression")
    .getOrCreate()
)

In [30]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer

# A Pipeline is useful when the data has to pass through several processing stages.
# A typical example: tokenizing text data and then one-hot encoding it can be built as a single pipeline.

In [31]:
# Toy labelled corpus with columns (id, text, label):
# the rows whose text contains "spark" are labelled 1.0, the others 0.0.
training = spark.createDataFrame(
    data=[
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [32]:
# Print the training DataFrame to check its rows and column names.
training.show()

+---+----------------+-----+
| id|            text|label|
+---+----------------+-----+
|  0| a b c d e spark|  1.0|
|  1|             b d|  0.0|
|  2|     spark f g h|  1.0|
|  3|hadoop mapreduce|  0.0|
+---+----------------+-----+



In [33]:
# Stage 1: split the raw "text" column into a "words" column of tokens.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# Stage 2: hash the tokens into a term-frequency vector stored in "features".
# The input column is taken from the tokenizer so the two stages stay in sync.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

In [34]:
# Final stage: logistic-regression classifier (not yet fitted).
lr = LogisticRegression(
    maxIter=30,     # upper bound on optimizer iterations
    regParam=0.01,  # regularization strength
)

In [35]:
# Chain the three stages in the order they are applied to the data.
pipeline = Pipeline(
    stages=[
        tokenizer,  # text -> words
        hashingTF,  # words -> features
        lr,         # classifier
    ]
)

In [36]:
# Fit the whole pipeline on the labelled training DataFrame.
model = pipeline.fit(dataset=training)

In [37]:
# Unlabelled documents to score: only (id, text), no label column.
test = spark.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)


In [38]:
# Run the fitted pipeline over the unlabelled test documents.
prediction = model.transform(dataset=test)

In [39]:
# Display each test document next to its probability vector and predicted label.
(
    prediction
    .select("id", "text", "probability", "prediction")
    .show()
)

+---+------------------+--------------------+----------+
| id|              text|         probability|prediction|
+---+------------------+--------------------+----------+
|  4|       spark i j k|[0.59253766108921...|       0.0|
|  5|             l m n|[0.94444041965042...|       0.0|
|  6|spark hadoop spark|[0.23826602233961...|       1.0|
|  7|     apache hadoop|[0.97713755512847...|       0.0|
+---+------------------+--------------------+----------+



23/02/07 00:59:04 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 221077 ms exceeds timeout 120000 ms
23/02/07 00:59:04 WARN SparkContext: Killing executors is not supported by current scheduler.
