In [1]:
import pyspark
from pyspark.sql import SparkSession
# Bare last expression: the notebook displays the installed PySpark version.
pyspark.__version__

'2.3.2'

In [2]:
# Reuse the already-running SparkSession if there is one; otherwise start a
# new session using the builder's default options.
spark = SparkSession.builder.getOrCreate()
# Bare last expression: the notebook displays the session object's repr.
spark

## Loading data in Spark

In [3]:
# Load the JSON data under simple-ml into a DataFrame via the generic
# reader API (format + load).
df = spark.read.format('json').load('/home/lab07/data/simple-ml')

# Printing a DataFrame shows only its column names and types, not its rows.
print(type(df), '\n')
print(df, '\n')

<class 'pyspark.sql.dataframe.DataFrame'> 

DataFrame[color: string, lab: string, value1: bigint, value2: double] 



#### show method

In [4]:
# Print the first five rows as a text table.
df.show(5)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|green|good|    15| 38.97187133755819|
|green|good|    12|14.386294994851129|
+-----+----+------+------------------+
only showing top 5 rows



#### orderBy method

In [5]:
# sort() is an alias of orderBy(); the default direction is ascending.
# NOTE(review): the five rows displayed all share the same value2, so which
# tied rows appear is presumably not guaranteed between runs -- add a
# secondary sort key if a stable listing matters.
df.sort("value2").show(5)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|  red|good|    35|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|  red| bad|     2|14.386294994851129|
| blue| bad|     8|14.386294994851129|
|  red| bad|    16|14.386294994851129|
+-----+----+------+------------------+
only showing top 5 rows



#### select and filter methods

In [6]:
# Project down to the `lab` column, keep only the "good" rows, and print
# the first five. where() is an alias of filter().
(
    df.select('lab')
      .where('lab == "good"')
      .show(5)
)

+----+
| lab|
+----+
|good|
|good|
|good|
|good|
|good|
+----+
only showing top 5 rows



## Classification model in Spark

#### 1. load data

In [7]:
# Read the parquet dataset and keep just the two columns the model needs,
# casting `label` to double along the way.
bInput = (
    spark.read.parquet('/home/lab07/data/binary-classification')
    .selectExpr('features', 'cast(label as double) as label')
)

# Schema repr and Python type on separate lines, then the rows themselves.
print(bInput, type(bInput), sep='\n')
bInput.show()

DataFrame[features: vector, label: double]
<class 'pyspark.sql.dataframe.DataFrame'>
+--------------+-----+
|      features|label|
+--------------+-----+
|[3.0,10.1,3.0]|  1.0|
|[1.0,0.1,-1.0]|  0.0|
|[1.0,0.1,-1.0]|  0.0|
| [2.0,1.1,1.0]|  1.0|
| [2.0,1.1,1.0]|  1.0|
+--------------+-----+



#### 2. train model - Logistic Regression

In [8]:
from pyspark.ml.classification import LogisticRegression
# Build the estimator with no arguments, i.e. every hyperparameter is left
# at its default value.
lr = LogisticRegression()

In [9]:
# explainParams() returns the documentation of every parameter as a single
# string, one parameter per line.
param_doc = lr.explainParams()

# Prefix each parameter line with '>>> ' and separate entries with a blank
# line so the listing is easier to scan.
param_lines = ['>>> ' + line for line in param_doc.split('\n')]
param_doc_pretty = '\n\n'.join(param_lines)

# Preview only the first six parameters. Selecting whole lines (instead of
# slicing the string at a fixed character offset) keeps the preview from
# being cut mid-sentence when a description's wording or length changes
# between Spark versions.
print('\n\n'.join(param_lines[:6]))

>>> aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)

>>> elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)

>>> family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)

>>> featuresCol: features column name. (default: features)

>>> fitIntercept: whether to fit an intercept term. (default: True)

>>> labelCol: label column name. (default: label)


In [10]:
# Train the logistic regression on the prepared features/label DataFrame.
lrModel = lr.fit(bInput)

# Report the fitted model's identifier followed by its learned weights.
print(f'model instance : {lrModel}\n\n')
print('===== Weights of final model =====\n')
print(
    f'>>> coefficients : {lrModel.coefficients}\n'
    f'>>>    intercept : {lrModel.intercept}'
)

model instance : LogisticRegression_4edd9254861fbeb74a59


===== Weights of final model =====

>>> coefficients : [6.848741325749679,0.35356589008242356,14.814900276155138]
>>>    intercept : -10.225695864286841


#### 3. evaluate model

In [11]:
# `lrModel.summary` is the training summary: these metrics are computed on
# the same data the model was fitted on, so they describe fit quality, not
# held-out performance.
print(lrModel.summary.areaUnderROC, end='\n\n')

# ROC curve points (FPR/TPR) followed by precision-recall curve points.
lrModel.summary.roc.show()
lrModel.summary.pr.show()

1.0

+---+------------------+
|FPR|               TPR|
+---+------------------+
|0.0|               0.0|
|0.0|0.3333333333333333|
|0.0|               1.0|
|1.0|               1.0|
|1.0|               1.0|
+---+------------------+

+------------------+---------+
|            recall|precision|
+------------------+---------+
|               0.0|      1.0|
|0.3333333333333333|      1.0|
|               1.0|      1.0|
|               1.0|      0.6|
+------------------+---------+

