# Multinomial Logistic Regression - Iris Dataset

In [29]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [30]:
# Obtain the Spark session for this notebook (an existing one is reused
# if present, otherwise a new one is created).
spark = (
    SparkSession.builder
    .appName('Logistic_Regression')
    .getOrCreate()
)

In [31]:
# Load the Iris CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
# NOTE: this is an absolute path on the author's machine; adjust it before
# running the notebook elsewhere.
df = spark.read.csv(
    r'F:\backup\practice\Apache_spark\Python-and-Spark-for-Big-Data-master\Spark_for_Machine_Learning\Logistic_Regression\iris.csv',
    header=True,
    inferSchema=True,
)

In [32]:
# Print the inferred schema to confirm the column names and types.
df.printSchema()

root
 |-- sepal_len: double (nullable = true)
 |-- sepal_wid: double (nullable = true)
 |-- petal_len: double (nullable = true)
 |-- petal_wid: double (nullable = true)
 |-- class: string (nullable = true)



In [33]:
# Column names of the loaded DataFrame (the notebook echoes the value of
# this bare expression).
df.columns

['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']

In [34]:
# Encode the string `class` column as a numeric index in a new `class_ind`
# column; the classifier further down uses `class_ind` as its label.
class_val = StringIndexer(inputCol='class', outputCol='class_ind')

# Fit the indexer on the data, then apply it to that same DataFrame.
indexer_model = class_val.fit(df)
ind = indexer_model.transform(df)
ind.show()

+---------+---------+---------+---------+-----------+---------+
|sepal_len|sepal_wid|petal_len|petal_wid|      class|class_ind|
+---------+---------+---------+---------+-----------+---------+
|      5.1|      3.5|      1.4|      0.2|Iris-setosa|      0.0|
|      4.9|      3.0|      1.4|      0.2|Iris-setosa|      0.0|
|      4.7|      3.2|      1.3|      0.2|Iris-setosa|      0.0|
|      4.6|      3.1|      1.5|      0.2|Iris-setosa|      0.0|
|      5.0|      3.6|      1.4|      0.2|Iris-setosa|      0.0|
|      5.4|      3.9|      1.7|      0.4|Iris-setosa|      0.0|
|      4.6|      3.4|      1.4|      0.3|Iris-setosa|      0.0|
|      5.0|      3.4|      1.5|      0.2|Iris-setosa|      0.0|
|      4.4|      2.9|      1.4|      0.2|Iris-setosa|      0.0|
|      4.9|      3.1|      1.5|      0.1|Iris-setosa|      0.0|
|      5.4|      3.7|      1.5|      0.2|Iris-setosa|      0.0|
|      4.8|      3.4|      1.6|      0.2|Iris-setosa|      0.0|
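|      4.8|      3.0|      1.4|      0.1|Iris-setosa|      0.0|
|      4.3|      3.0|      1.1|      0.1|Iris-setosa|      0.0|
|      5.8|      4.0|      1.2|      0.2|Iris-setosa|      0.0|
|      5.7|      4.4|      1.5|      0.4|Iris-setosa|      0.0|
|      5.4|      3.9|      1.3|      0.4|Iris-setosa|      0.0|
|      5.1|      3.5|      1.4|      0.3|Iris-setosa|      0.0|
|      5.7|      3.8|      1.7|      0.3|Iris-setosa|      0.0|
|      5.1|      3.8|      1.5|      0.3|Iris-setosa|      0.0|
+---------+---------+---------+---------+-----------+---------+
only showing top 20 rows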

In [35]:
# Pack the four numeric measurement columns into a single vector column
# named `features`.
assembler = VectorAssembler(
    inputCols=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid'],
    outputCol='features',
)

In [36]:
# Apply the assembler to the indexed DataFrame, adding the `features` column.
output = assembler.transform(ind)

In [51]:
# Keep only the two columns the model needs: the feature vector and the
# numeric label.
final_data = output.select('features','class_ind')
final_data.show()

+-----------------+---------+
|         features|class_ind|
+-----------------+---------+
|[5.1,3.5,1.4,0.2]|      0.0|
|[4.9,3.0,1.4,0.2]|      0.0|
|[4.7,3.2,1.3,0.2]|      0.0|
|[4.6,3.1,1.5,0.2]|      0.0|
|[5.0,3.6,1.4,0.2]|      0.0|
|[5.4,3.9,1.7,0.4]|      0.0|
|[4.6,3.4,1.4,0.3]|      0.0|
|[5.0,3.4,1.5,0.2]|      0.0|
|[4.4,2.9,1.4,0.2]|      0.0|
|[4.9,3.1,1.5,0.1]|      0.0|
|[5.4,3.7,1.5,0.2]|      0.0|
|[4.8,3.4,1.6,0.2]|      0.0|
|[4.8,3.0,1.4,0.1]|      0.0|
|[4.3,3.0,1.1,0.1]|      0.0|
|[5.8,4.0,1.2,0.2]|      0.0|
|[5.7,4.4,1.5,0.4]|      0.0|
|[5.4,3.9,1.3,0.4]|      0.0|
|[5.1,3.5,1.4,0.3]|      0.0|
|[5.7,3.8,1.7,0.3]|      0.0|
|[5.1,3.8,1.5,0.3]|      0.0|
+-----------------+---------+
only showing top 20 rows



In [52]:
# Randomly split the rows into training and test sets with weights 0.7 / 0.3.
# NOTE(review): no seed is passed, so the split (and every result computed
# from it below) can differ from run to run.
train, test = final_data.randomSplit([0.7,0.3])

In [59]:
# Summary statistics for the training split; the output below shows them
# for the label column `class_ind` only.
train.describe().show()

+-------+------------------+
|summary|         class_ind|
+-------+------------------+
|  count|               104|
|   mean|1.0288461538461537|
| stddev|0.8297470753857933|
|    min|               0.0|
|    max|               2.0|
+-------+------------------+



In [56]:
# Logistic-regression estimator reading `features` as input and `class_ind`
# as the label; the multinomial family is requested because the label takes
# more than two values (0.0 to 2.0).
lr = LogisticRegression(
    featuresCol='features',
    labelCol='class_ind',
    family='multinomial',
)

In [57]:
# Fit the estimator on the training split.
lr_model = lr.fit(train)

In [60]:
# Score the held-out test split; the result carries a `prediction` column,
# which the cells below read.
res = lr_model.transform(test)

In [65]:
# Show the true label index next to the predicted one for a visual check.
res.select('class_ind','prediction').show()

+---------+----------+
|class_ind|prediction|
+---------+----------+
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      2.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      0.0|       0.0|
|      1.0|       1.0|
|      0.0|       0.0|
+---------+----------+
only showing top 20 rows



In [68]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [70]:
# Evaluator that compares the `prediction` column with the true `class_ind`
# label. The metric is spelled out for readability; 'f1' is the value the
# evaluator uses when metricName is omitted.
evaltn = MulticlassClassificationEvaluator(
    labelCol='class_ind',
    predictionCol='prediction',
    metricName='f1',
)

In [71]:
# MulticlassClassificationEvaluator scores with F1 unless a different
# metricName is set, so this value is an F1 score, not an area under the
# ROC curve.
f1_score = evaltn.evaluate(res)
# `AUC` is kept as an alias so any later cell using the old name still works.
AUC = f1_score
f1_score

0.9331732488632299