In [1]:
from pyspark.sql import SparkSession

In [2]:
# Notebook-wide Spark configuration.
# "local[*]" is the Spark master URL for an in-process cluster using all local cores.
SPARK_URL = "local[*]"
# Application name passed to the SparkSession builder.
APP_NAME = "DataFramesAndMLLib"

In [3]:
# Create (or reuse) the Spark session for this notebook.
# SPARK_URL was previously defined but never passed to the builder; wiring it in
# makes the intended master ("local[*]") explicit instead of relying on the
# environment's default.
spark = (
    SparkSession.builder
    .master(SPARK_URL)
    .appName(APP_NAME)
    .getOrCreate()
)

In [6]:
# Read the iris CSV: the first row is the header, and column types are inferred
# from the data rather than left as strings.
iris = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv('iris.csv')
)

In [8]:
# Preview the first five rows of the loaded iris DataFrame.
iris.show(5)

+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [19]:
# Replace the dots in the measurement column names with underscores.
# Spark interprets '.' in a column reference as nested-field access, so dotted
# names are awkward to use in later stages without back-tick quoting.
renames = {
    'sepal.length': 'sepal_length',
    'sepal.width': 'sepal_width',
    'petal.length': 'petal_length',
    'petal.width': 'petal_width',
}
for old_name, new_name in renames.items():
    iris = iris.withColumnRenamed(old_name, new_name)

In [20]:
# Show the first five rows again to check the renamed column headers.
iris.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [24]:
# List the DataFrame's column names (four measurements plus the 'variety' label).
iris.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'variety']

In [21]:
from pyspark.ml.feature import VectorAssembler

In [22]:
# Every column except the class label 'variety' is used as a model feature.
feature_cols = [name for name in iris.columns if name != 'variety']

# Pack the individual feature columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=feature_cols)
iris_features = assembler.transform(iris)

In [23]:
# Preview the first five rows, now including the assembled 'features' vector column.
iris_features.show(5)

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|         features|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| Setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| Setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| Setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| Setosa|[5.0,3.6,1.4,0.2]|
+------------+-----------+------------+-----------+-------+-----------------+
only showing top 5 rows



In [25]:
from pyspark.ml.feature import StringIndexer

In [27]:
# This step is the same as label encoder in sklearn
indexer = StringIndexer(inputCol='variety', outputCol='label')
index_model = indexer.fit(iris_features)
iris_input = index_model.transform(iris_features).select("features", "label")
iris_input.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
+-----------------+-----+
only showing top 5 rows



In [28]:
# Split roughly 70% / 30% into train and test; the fixed seed keeps the split repeatable.
splits = iris_input.randomSplit([0.7, 0.3], seed=1)
train, test = splits

# Report how many rows landed in each split.
train_count, test_count = train.count(), test.count()
print(train_count, test_count)

110 40


In [29]:
from pyspark.ml.classification import RandomForestClassifier

In [30]:
# Train a random forest on the training split.
# A random forest is a stochastic learner (bootstrap sampling and feature
# sub-sampling), so pin the seed explicitly: without it the trained model, and
# therefore the reported accuracy, can change between kernel sessions.
# The seed value matches the one used for the train/test split.
rf = RandomForestClassifier(featuresCol='features', labelCol='label', seed=1)
rfModel = rf.fit(train)

In [31]:
# Score the held-out test split with the fitted forest; the result keeps the
# input columns and adds rawPrediction, probability and prediction columns.
prediction = rfModel.transform(test)

In [32]:
# Preview the first five scored test rows.
prediction.show(5)

+-----------------+-----+--------------+-------------+----------+
|         features|label| rawPrediction|  probability|prediction|
+-----------------+-----+--------------+-------------+----------+
|[4.5,2.3,1.3,0.3]|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.1,1.5,0.2]|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.1,1.6,0.2]|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.6,0.2]|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.9,0.2]|  0.0|[16.0,4.0,0.0]|[0.8,0.2,0.0]|       0.0|
+-----------------+-----+--------------+-------------+----------+
only showing top 5 rows



In [33]:
# Build an RDD of (prediction, label) tuples, the input shape expected by the
# RDD-based MulticlassMetrics evaluator.
pred_and_label = prediction.select("prediction", "label")
prediction_rdd = pred_and_label.rdd.map(tuple)

# Peek at the first five pairs.
prediction_rdd.take(5)

[(0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0)]

In [34]:
from pyspark.mllib.evaluation import MulticlassMetrics

# Compute multiclass evaluation metrics from the (prediction, label) pairs.
metrics = MulticlassMetrics(prediction_rdd)
# Overall accuracy on the test split, displayed as the cell output.
metrics.accuracy

0.925

## Another Dataset

- Apply linear regression to the following dataset

In [37]:
# Read the insurance CSV: the first row is the header, and column types are
# inferred from the data.
insurance = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv('insurance.csv')
)

In [39]:
# Preview the first five rows of the insurance DataFrame.
insurance.show(5)

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
+---+------+------+--------+------+---------+-----------+
only showing top 5 rows



## How many unique regions do we have?

In [41]:
# Display the distinct values of the 'region' column.
distinct_regions = insurance.select('region').distinct()
distinct_regions.show()

+---------+
|   region|
+---------+
|northwest|
|southeast|
|northeast|
|southwest|
+---------+



In [40]:
from pyspark.ml.feature import StringIndexer

# Map each region string to a numeric index stored in 'region_index'.
region_indexer = StringIndexer(inputCol='region', outputCol='region_index')
string_index = region_indexer.fit(insurance)

# Apply the fitted indexer and preview the result.
insurance_with_label_encoder = string_index.transform(insurance)
insurance_with_label_encoder.show(5)

+---+------+------+--------+------+---------+-----------+------------+
|age|   sex|   bmi|children|smoker|   region|    charges|region_index|
+---+------+------+--------+------+---------+-----------+------------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|         2.0|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|         0.0|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|         0.0|
| 33|  male|22.705|       0|    no|northwest|21984.47061|         1.0|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|         1.0|
+---+------+------+--------+------+---------+-----------+------------+
only showing top 5 rows



In [51]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the numeric region index into a sparse vector column 'region_num'.
# With the encoder's default settings the last category is dropped, which is why
# the four regions produce vectors of length 3.
region_encoder = OneHotEncoder(inputCols=["region_index"], outputCols=["region_num"])
one_hot_index = region_encoder.fit(insurance_with_label_encoder)

insurance_with_label_encoder_onehot = one_hot_index.transform(insurance_with_label_encoder)
insurance_with_label_encoder_onehot.show(5)

+---+------+------+--------+------+---------+-----------+------------+-------------+
|age|   sex|   bmi|children|smoker|   region|    charges|region_index|   region_num|
+---+------+------+--------+------+---------+-----------+------------+-------------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|         2.0|(3,[2],[1.0])|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|         0.0|(3,[0],[1.0])|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|         0.0|(3,[0],[1.0])|
| 33|  male|22.705|       0|    no|northwest|21984.47061|         1.0|(3,[1],[1.0])|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|         1.0|(3,[1],[1.0])|
+---+------+------+--------+------+---------+-----------+------------+-------------+
only showing top 5 rows



In [52]:
# Show the sparse 'region_num' vectors in dense form to make their meaning clear.
# Limit to five rows *before* converting: toPandas() collects every row it is
# given onto the driver, so converting the full DataFrame just to call .head()
# does unnecessary work and does not scale to large data.
insurance_with_label_encoder_onehot.limit(5).toPandas()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,region_index,region_num
0,19,female,27.9,0,yes,southwest,16884.924,2.0,"(0.0, 0.0, 1.0)"
1,18,male,33.77,1,no,southeast,1725.5523,0.0,"(1.0, 0.0, 0.0)"
2,28,male,33.0,3,no,southeast,4449.462,0.0,"(1.0, 0.0, 0.0)"
3,33,male,22.705,0,no,northwest,21984.47061,1.0,"(0.0, 1.0, 0.0)"
4,32,male,28.88,0,no,northwest,3866.8552,1.0,"(0.0, 1.0, 0.0)"


## Instead of running each machine-learning pre-processing step one by one

- use a `Pipeline`

In [56]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

# Categorical string columns that need a numeric index before modelling.
cols = ["smoker", "sex", "region"]

# Add the indexers as *unfitted* estimators. Pipeline.fit() fits every estimator
# stage itself, so fitting them here first would do the work outside the pipeline
# and leave it holding pre-fitted models that cannot be refit on other data
# (for example, a training split).
stages = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in cols]

# Only 'region' has more than two categories, so only it is one-hot encoded;
# the binary 'smoker' and 'sex' indices are used as-is.
stages.append(OneHotEncoder(inputCols=["region_index"], outputCols=["region_num"]))

# Numeric columns plus the encoded categoricals, assembled into one 'features' vector.
feature_cols = ['age', 'bmi', 'children', 'smoker_index', 'sex_index', 'region_num']
stages.append(VectorAssembler(inputCols=feature_cols, outputCol='features'))

# Fit all stages in order on the insurance data, then apply them to the same data.
pipeline = Pipeline(stages=stages)
insurance_transformed = pipeline.fit(insurance).transform(insurance)

insurance_transformed.show(5)

+---+------+------+--------+------+---------+-----------+------------+---------+------------+-------------+--------------------+
|age|   sex|   bmi|children|smoker|   region|    charges|smoker_index|sex_index|region_index|   region_num|            features|
+---+------+------+--------+------+---------+-----------+------------+---------+------------+-------------+--------------------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|         1.0|      1.0|         2.0|(3,[2],[1.0])|[19.0,27.9,0.0,1....|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|         0.0|      0.0|         0.0|(3,[0],[1.0])|(8,[0,1,2,5],[18....|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|         0.0|      0.0|         0.0|(3,[0],[1.0])|(8,[0,1,2,5],[28....|
| 33|  male|22.705|       0|    no|northwest|21984.47061|         0.0|      0.0|         1.0|(3,[1],[1.0])|(8,[0,1,6],[33.0,...|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|         0.0|      0.0|         1.0|(3,[

In [57]:
from pyspark.ml.regression import LinearRegression

# Fit a linear regression that predicts 'charges' from the assembled feature
# vector, capping the optimiser at 10 iterations.
lr = LinearRegression(labelCol='charges', featuresCol='features', maxIter=10)
lr_model = lr.fit(insurance_transformed)

# Report the learned weights (one per entry of the feature vector) and the intercept.
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

Coefficients: [256.85635253734966,339.1934536108323,475.50054514913035,23848.534541912828,131.3143593950504,-1035.022049387815,-352.96389942466794,-960.050991300835]
Intercept: -12069.852935562118


## Why do we have 8 coefficients here?
## What have we missed for the insurance dataset?