In [1]:
from pyspark.sql import SparkSession 

# Create (or reuse) a Spark session that runs against a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("logistic-regresseion")
    .getOrCreate()
)

22/10/06 18:44:45 WARN Utils: Your hostname, Moon-2.local resolves to a loopback address: 127.0.0.1; using 192.168.0.4 instead (on interface en0)
22/10/06 18:44:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/06 18:44:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/06 18:44:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
from pyspark.ml.linalg import Vectors 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
# Pipeline is useful when several processing steps have to be applied to the data in sequence.

In [6]:
# Build the labelled training DataFrame; each row is (id, text, label).
# The plan is to train a model that predicts 1 whenever the word "spark"
# appears in the text.
training = spark.createDataFrame(
    [
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [7]:
# Stage 1: read the "text" column and write the tokens to a "words" column.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Stage 2: consume whatever column the tokenizer writes to (looked up from the
# tokenizer itself rather than hard-coded) and emit a "features" column.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

In [10]:
# Instantiate the (not yet fitted) logistic-regression estimator.
lr = LogisticRegression(
    maxIter=30,
    regParam=0.001,
)

## Pipeline

In [11]:
# Chain the previously defined stages into a single pipeline.
# General workflow: data loading -> preprocessing -> training -> model evaluation.
pipeline = Pipeline(
    stages=[
        tokenizer,   # text -> words
        hashingTF,   # words -> features
        lr,          # classifier
    ]
)

In [12]:
model = pipeline.fit(training)
# Fitting the pipeline on the training data returns a model; that model is
# what is used below to transform (predict on) new data.

                                                                                

22/10/06 18:51:08 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/10/06 18:51:08 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/10/06 18:51:08 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/10/06 18:51:08 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [13]:
# Unlabelled DataFrame to run predictions on; each row is (id, text).
test = spark.createDataFrame(
    [
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [14]:
pred = model.transform(test)
# Run the fitted pipeline model on the test DataFrame to obtain predictions.

In [15]:
# Print the prediction DataFrame with every column the pipeline added
# (long cell values are truncated in the printed table).
pred.show()

+---+------------------+--------------------+--------------------+--------------------+--------------------+----------+
| id|              text|               words|            features|       rawPrediction|         probability|prediction|
+---+------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  4|       spark i j k|    [spark, i, j, k]|(262144,[19036,68...|[0.53662516439882...|[0.63102699631690...|       0.0|
|  5|             l m n|           [l, m, n]|(262144,[1303,526...|[4.17742695597525...|[0.98489377609773...|       0.0|
|  6|spark hadoop spark|[spark, hadoop, s...|(262144,[173558,1...|[-1.8520577251150...|[0.13563147748816...|       1.0|
|  7|     apache hadoop|    [apache, hadoop]|(262144,[68303,19...|[5.42954585803784...|[0.99563405823116...|       0.0|
+---+------------------+--------------------+--------------------+--------------------+--------------------+----------+



In [16]:
# Keep only the columns needed to read the result, then print them.
pred_summary = pred.select("id", "text", "probability", "prediction")
pred_summary.show()

+---+------------------+--------------------+----------+
| id|              text|         probability|prediction|
+---+------------------+--------------------+----------+
|  4|       spark i j k|[0.63102699631690...|       0.0|
|  5|             l m n|[0.98489377609773...|       0.0|
|  6|spark hadoop spark|[0.13563147748816...|       1.0|
|  7|     apache hadoop|[0.99563405823116...|       0.0|
+---+------------------+--------------------+----------+

