<a href="https://colab.research.google.com/github/solobala/ABD26/blob/main/ABD26_DZ7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Домашнее задание по теме «Практика PySpark (часть 2)»
Обучите модель классификации для цветков Iris.
Примерная последовательность действий:

*   Взять данные.
*   Загрузить в PySpark
*  При помощи VectorAssembler преобразовать все колонки с признаками в одну (использовать Pipeline — опционально).
*   Разбить данные на train и test.
*   Создать модель логистической регрессии или модель дерева и обучить её.
*  Воспользоваться MulticlassClassificationEvaluator для оценки качества на train и test множестве.


## Установка PySpark

In [None]:
# Refresh the apt package index; stdout is discarded to keep the cell quiet.
!apt-get update > /dev/null

In [None]:
# Install a headless OpenJDK 8 package; -qq and the redirect suppress the output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
!wget -q https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz

In [None]:
# Unpack the downloaded Spark archive into the current working directory.
!tar -xf spark-3.3.1-bin-hadoop3.tgz

In [None]:
# Install findspark quietly; a later cell imports it and calls findspark.init().
!pip install -q findspark

In [None]:
import os

# Tell the Spark launcher where to find the JVM and the Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.3.1-bin-hadoop3",
})

In [None]:
import findspark
# Must run before importing pyspark: findspark locates the Spark installation
# (SPARK_HOME is set in an earlier cell) and makes the pyspark package importable.
findspark.init()
from pyspark.sql import SparkSession

# Start (or reuse) a session on a local master that uses all available cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

## Загрузить файл iris.csv

In [None]:
# Load the Iris CSV: the first row is the header and column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/iris.CSV')
)
df.show()

+------------+-----------+------------+-----------+-------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|
+------------+-----------+------------+-----------+-------+-----------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0|
|         5.4|        3.9|         1.7|        0.4| Setosa|          0|
|         4.6|        3.4|         1.4|        0.3| Setosa|          0|
|         5.0|        3.4|         1.5|        0.2| Setosa|          0|
|         4.4|        2.9|         1.4|        0.2| Setosa|          0|
|         4.9|        3.1|         1.5|        0.1| Setosa|          0|
|         5.4|        3.7|         1.5|        0.2| Setosa|     

## При помощи VectorAssembler преобразовать все колонки с признаками в одну (использовать Pipeline — опционально).

In [None]:
# Keep the four measurement columns plus the string class label.
selected_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'variety']
df_feature = df.select(*selected_columns)

In [None]:
# Confirm which columns remain after the selection.
df_feature.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'variety']

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
# Encode the string label 'variety' as a numeric index column 'varietyInd'.
indexer = StringIndexer().setInputCol('variety').setOutputCol('varietyInd')
# Learn the label -> index mapping from the data, then apply it.
indexerTrained = indexer.fit(df_feature)
df_features = indexerTrained.transform(df_feature)

In [None]:
# Labels learned by the indexer; the position in this list is the numeric index.
indexerTrained.labels

['Setosa', 'Versicolor', 'Virginica']

In [None]:
# Inspect the frame with the added 'varietyInd' label column.
df_features.show()

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# Combine the four numeric measurements into one vector column named 'Features'.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')

In [None]:
# Append the 'Features' vector column to the indexed frame.
# NOTE(review): the result overwrites df_features, so re-running this cell is
# expected to fail because the output column already exists — confirm.
df_features = assembler.transform(df_features)

In [None]:
# Inspect the frame with the assembled 'Features' column.
df_features.show()

## С использованием Pipeline

In [None]:
from pyspark.ml import Pipeline

In [None]:
# The same preprocessing expressed as a two-stage pipeline:
# label indexing followed by feature-vector assembly.
label_indexer = StringIndexer(inputCol='variety', outputCol='varietyInd')
feature_assembler = VectorAssembler(
    inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    outputCol='Features',
)
pipeline = Pipeline(stages=[label_indexer, feature_assembler])

In [None]:
# Fit every pipeline stage on the selected columns (features + string label).
pipelineTrained = pipeline.fit(df_feature)

In [None]:
# Preview the output of the fitted pipeline.
pipelineTrained.transform(df_feature).show()

In [None]:
# Rebind df_features to the pipeline output; the train/test split below uses it.
df_features = pipelineTrained.transform(df_feature)

## Разбить данные на train и test.

In [None]:
# Random 80/20 split; the fixed seed keeps the split repeatable between runs.
train, test = df_features.randomSplit(weights=[0.8, 0.2], seed=12345)

## Создать модель логистической регрессии и обучить ее

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic regression over the assembled vector, predicting the indexed label.
lr = LogisticRegression(
    featuresCol='Features',
    labelCol='varietyInd',
)
# Fit on the training split only.
lrModel = lr.fit(train)

In [None]:
# Score both splits with the fitted logistic-regression model.
train_res, test_res = (lrModel.transform(split) for split in (train, test))

In [None]:
# Preview the logistic-regression predictions on the training split.
train_res.show()

+------------+-----------+------------+-----------+----------+----------+-----------------+--------------------+--------------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|varietyInd|         Features|       rawPrediction|         probability|prediction|
+------------+-----------+------------+-----------+----------+----------+-----------------+--------------------+--------------------+----------+
|         4.3|        3.0|         1.1|        0.1|    Setosa|       0.0|[4.3,3.0,1.1,0.1]|[67.7027919587531...|[1.0,1.6424718354...|       0.0|
|         4.4|        2.9|         1.4|        0.2|    Setosa|       0.0|[4.4,2.9,1.4,0.2]|[57.0087434717148...|[1.0,1.4036227809...|       0.0|
|         4.4|        3.0|         1.3|        0.2|    Setosa|       0.0|[4.4,3.0,1.3,0.2]|[61.7475610236625...|[1.0,2.4772179810...|       0.0|
|         4.4|        3.2|         1.3|        0.2|    Setosa|       0.0|[4.4,3.2,1.3,0.2]|[68.8060989728479...|[1.0,1.2455403381.

## Воспользоваться MulticlassClassificationEvaluator для оценки качества на train и test множестве.

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Evaluator with its defaults spelled out: F1 computed from the 'prediction' column.
ev = MulticlassClassificationEvaluator(
    labelCol='varietyInd',
    predictionCol='prediction',
    metricName='f1',
)

In [None]:
# Logistic regression: evaluator score on the training split.
ev.evaluate(train_res)

0.9844961240310077

In [None]:
# Logistic regression: evaluator score on the held-out test split.
ev.evaluate(test_res)

1.0

##  Для сравнения обучим модель дерева решений и оценим ее качество

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
# Decision tree configured on the same feature vector and indexed label.
tr = DecisionTreeClassifier().setFeaturesCol('Features').setLabelCol('varietyInd')

In [None]:
# Train the decision tree on the training split.
trFitted = tr.fit(train)

In [None]:
# Score both splits with the fitted decision tree.
train_tr_res, test_tr_res = (trFitted.transform(split) for split in (train, test))

In [None]:
# Preview the decision-tree predictions on the training split.
train_tr_res.show()

+------------+-----------+------------+-----------+----------+----------+-----------------+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|varietyInd|         Features| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+----------+----------+-----------------+--------------+-------------+----------+
|         4.3|        3.0|         1.1|        0.1|    Setosa|       0.0|[4.3,3.0,1.1,0.1]|[43.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.4|        2.9|         1.4|        0.2|    Setosa|       0.0|[4.4,2.9,1.4,0.2]|[43.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.4|        3.0|         1.3|        0.2|    Setosa|       0.0|[4.4,3.0,1.3,0.2]|[43.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.4|        3.2|         1.3|        0.2|    Setosa|       0.0|[4.4,3.2,1.3,0.2]|[43.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.5|        2.3|         1.3|        0.3|    Setosa|       0.0|[4.

In [None]:
# Decision tree: evaluator score on the training split.
ev.evaluate(train_tr_res)

0.9922428036123125

In [None]:
# Decision tree: evaluator score on the held-out test split.
ev.evaluate(test_tr_res)

1.0

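## Вывод

На тестовой выборке обе модели показали одинаковое качество (метрика F1 = 1.0), а на обучающей выборке дерево решений оказалось немного лучше логистической регрессии (≈0.992 против ≈0.984). Таким образом, для данного датасета модель дерева решений подошла как минимум не хуже, чем модель логистической регрессии, однако по небольшой тестовой выборке нельзя уверенно утверждать, что она обобщает лучше.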