In [1]:
# findspark.init() has to run before the first pyspark import: it is what
# makes the local Spark installation importable from this kernel.
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a local
# session; "local[*]" is the master URL for running Spark on this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# Load the iris data set; the first line of the file holds the column names
# and column types are inferred from the data instead of defaulting to string.
df = spark.read.csv(
    path='iris.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the loaded rows (20 is show()'s default row count).
df.show(n=20)

+------------+-----------+------------+-----------+-------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|
+------------+-----------+------------+-----------+-------+-----------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0|
|         5.4|        3.9|         1.7|        0.4| Setosa|          0|
|         4.6|        3.4|         1.4|        0.3| Setosa|          0|
|         5.0|        3.4|         1.5|        0.2| Setosa|          0|
|         4.4|        2.9|         1.4|        0.2| Setosa|          0|
|         4.9|        3.1|         1.5|        0.1| Setosa|          0|
|         5.4|        3.7|         1.5|        0.2| Setosa|     

In [19]:
### Собираем признаки в единый вектор для алгоритмов машинного обучения

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [5]:
# Packs the four numeric measurement columns into one vector column named
# "features", which the classifiers below consume via featuresCol.
assembler = VectorAssembler(
    inputCols=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
    ],
    outputCol="features",
)

In [6]:
# Append the assembled feature vector to the data as the assembler's output
# column.
#
# This cell rebinds `df` to its own result, so a plain
# `assembler.transform(df)` fails on the second run: VectorAssembler refuses
# to write into an output column that already exists. Dropping that column
# first keeps the cell re-runnable; DataFrame.drop ignores column names that
# are not present, so the first run behaves exactly as before.
df = assembler.transform(df.drop(assembler.getOutputCol()))
df.show()

+------------+-----------+------------+-----------+-------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|         features|
+------------+-----------+------------+-----------+-------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4| Setosa|          0|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3| Setosa|          0|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2| Setosa|          0|[5.0,3.4,1.5,0.2]|
|         

In [20]:
### Разобьём данные на обучающую и тестовую выборки

In [7]:
# 80/20 train/test split; the fixed seed keeps the split repeatable.
train, test = df.randomSplit(
    weights=[0.8, 0.2],
    seed=12345,
)

In [8]:
# Preview the training partition (20 is show()'s default row count).
train.show(n=20)

+------------+-----------+------------+-----------+----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|variety_num|         features|
+------------+-----------+------------+-----------+----------+-----------+-----------------+
|         4.3|        3.0|         1.1|        0.1|    Setosa|          0|[4.3,3.0,1.1,0.1]|
|         4.4|        2.9|         1.4|        0.2|    Setosa|          0|[4.4,2.9,1.4,0.2]|
|         4.4|        3.0|         1.3|        0.2|    Setosa|          0|[4.4,3.0,1.3,0.2]|
|         4.4|        3.2|         1.3|        0.2|    Setosa|          0|[4.4,3.2,1.3,0.2]|
|         4.5|        2.3|         1.3|        0.3|    Setosa|          0|[4.5,2.3,1.3,0.3]|
|         4.6|        3.1|         1.5|        0.2|    Setosa|          0|[4.6,3.1,1.5,0.2]|
|         4.6|        3.4|         1.4|        0.3|    Setosa|          0|[4.6,3.4,1.4,0.3]|
|         4.6|        3.6|         1.0|        0.2|    Setosa|        

In [21]:
### Создадим и обучим модель логистической регрессии

In [9]:
from pyspark.ml.classification import LogisticRegression

In [10]:
# Logistic regression on the assembled feature vector, predicting the numeric
# class label; all other hyper-parameters are left at their defaults.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='variety_num',
)
# Fit on the training partition only; the test partition stays unseen.
lrModel = lr.fit(dataset=train)

In [11]:
# Score both partitions with the fitted logistic-regression model, in the
# order (train, test).
train_res, test_res = [lrModel.transform(part) for part in (train, test)]

In [12]:
# Inspect the scored training rows (20 is show()'s default row count).
train_res.show(n=20)

+------------+-----------+------------+-----------+----------+-----------+-----------------+--------------------+--------------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|variety_num|         features|       rawPrediction|         probability|prediction|
+------------+-----------+------------+-----------+----------+-----------+-----------------+--------------------+--------------------+----------+
|         4.3|        3.0|         1.1|        0.1|    Setosa|          0|[4.3,3.0,1.1,0.1]|[80.1628043651858...|[1.0,1.2550034261...|       0.0|
|         4.4|        2.9|         1.4|        0.2|    Setosa|          0|[4.4,2.9,1.4,0.2]|[64.6334644328493...|[1.0,1.5146031885...|       0.0|
|         4.4|        3.0|         1.3|        0.2|    Setosa|          0|[4.4,3.0,1.3,0.2]|[71.698654399708,...|[1.0,8.1567961584...|       0.0|
|         4.4|        3.2|         1.3|        0.2|    Setosa|          0|[4.4,3.2,1.3,0.2]|[82.5540433395515...|[1.0,1.3787

In [None]:
### Оценим качество предсказания

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [14]:
# Evaluator shared by every model in this notebook. predictionCol and
# metricName are spelled out with their default values so it is visible that
# the scores reported below are F1 scores, not accuracy.
ev = MulticlassClassificationEvaluator(
    labelCol='variety_num',
    predictionCol='prediction',
    metricName='f1',
)

In [15]:
# Logistic regression: score on the training partition.
ev.evaluate(dataset=train_res)

0.9844961240310077

In [16]:
# Logistic regression: score on the held-out test partition.
ev.evaluate(dataset=test_res)

1.0

In [None]:
### Обучаем модель дерева решений и оцениваем её качество

In [17]:
from pyspark.ml.classification import DecisionTreeClassifier

In [18]:
# Decision-tree classifier on the same feature vector and label column as the
# logistic regression above; hyper-parameters are left at their defaults.
tr = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='variety_num',
)

In [19]:
# Fit the tree on the training partition only.
trFitted = tr.fit(dataset=train)

In [20]:
# Score both partitions with the fitted tree, in the order (train, test).
train_tr_res, test_tr_res = map(trFitted.transform, (train, test))

In [22]:
# Peek at the first five scored training rows.
train_tr_res.show(n=5)

+------------+-----------+------------+-----------+-------+-----------+-----------------+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|         features| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+-------+-----------+-----------------+--------------+-------------+----------+
|         4.3|        3.0|         1.1|        0.1| Setosa|          0|[4.3,3.0,1.1,0.1]|[43.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.4|        2.9|         1.4|        0.2| Setosa|          0|[4.4,2.9,1.4,0.2]|[43.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.4|        3.0|         1.3|        0.2| Setosa|          0|[4.4,3.0,1.3,0.2]|[43.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.4|        3.2|         1.3|        0.2| Setosa|          0|[4.4,3.2,1.3,0.2]|[43.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.5|        2.3|         1.3|        0.3| Setosa|          0|[4.5,2.3,1.3,0.3]|[

In [23]:
# Decision tree: score on the held-out test partition.
ev.evaluate(dataset=test_tr_res)

1.0

In [24]:
# Decision tree: score on the training partition.
ev.evaluate(dataset=train_tr_res)

0.9922428036123125