## Page 25 onwards

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
sc = pyspark.SparkContext(appName="MyAppName")

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession(sc)

In [4]:
!wget https://raw.githubusercontent.com/tidyverse/ggplot2/main/data-raw/diamonds.csv

--2023-07-08 18:25:54--  https://raw.githubusercontent.com/tidyverse/ggplot2/main/data-raw/diamonds.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2772143 (2.6M) [text/plain]
Saving to: ‘diamonds.csv.1’


2023-07-08 18:25:54 (158 MB/s) - ‘diamonds.csv.1’ saved [2772143/2772143]



In [5]:
df = spark.read.csv("diamonds.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- carat: double (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: double (nullable = true)
 |-- table: double (nullable = true)
 |-- price: integer (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



In [6]:
df.show()

+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|3.78|2.49|
| 0.23|Very Good|    H|    VS1| 59.4| 61.0|  338| 4.0|4.05|2.39|
|  0.3|     Good|    J|    SI1| 64.0| 55.0|  339|4.25|4.28|2.73|
| 0.23|    Ideal|    J|    VS1| 62.8| 56.0|  340|3.93| 3.9|2.46|
| 0.22|  Premium|    F|  

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
assembler = VectorAssembler(inputCols=["carat", "depth"],
outputCol="numeric_features")
df2 = assembler.transform(df)
df2.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+----------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|numeric_features|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+----------------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|     [0.23,61.5]|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|     [0.21,59.8]|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|     [0.23,56.9]|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|     [0.29,62.4]|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|     [0.31,63.3]|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+----------------+
only showing top 5 rows



In [9]:
# Categorical variables (stored as strings) will first have to be converted to an integer index and, when used as a feature (“covariate”), then encoded using one-hot encoding (“dummy coding”)

In [10]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Convert categorical column (strings) to indexes
indexer = StringIndexer(inputCol="cut", outputCol="cut_index")
indexer_model = indexer.fit(df)

df2 = indexer_model.transform(df)
df2.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|cut_index|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|      0.0|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|      1.0|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|      3.0|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|      1.0|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|      3.0|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+
only showing top 5 rows



In [11]:
# Encode indices in one hot encoding ("dummy coding")
encoder = OneHotEncoder (inputCols=["cut_index"],
outputCols=["cut_index_encoded"])

encoder_model = encoder.fit(df2)

df3 = encoder_model.transform(df2)
df3.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+-----------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|cut_index|cut_index_encoded|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+-----------------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|      0.0|    (4,[0],[1.0])|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|      1.0|    (4,[1],[1.0])|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|      3.0|    (4,[3],[1.0])|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|      1.0|    (4,[1],[1.0])|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|      3.0|    (4,[3],[1.0])|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+-----------------+
only showing top 5 rows



In [None]:
# Using pipeline

In [12]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# Convert categorical column (strings) to indexes
indexer = StringIndexer (inputCol="cut", outputCol="cut_index")

# Encode indices in one hot encoding ("dummy coding")
encoder = OneHotEncoder (inputCols=["cut_index"],
outputCols=["cut_index_encoded"])
pipeline = Pipeline(stages=[indexer, encoder] )

pipeline_model = pipeline.fit(df)

df3 = pipeline_model.transform(df)
df3.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+-----------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|cut_index|cut_index_encoded|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+-----------------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|      0.0|    (4,[0],[1.0])|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|      1.0|    (4,[1],[1.0])|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|      3.0|    (4,[3],[1.0])|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|      1.0|    (4,[1],[1.0])|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|      3.0|    (4,[3],[1.0])|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+-----------------+
only showing top 5 rows



In [None]:
# Using the class RFormula

In [14]:
from pyspark.ml.feature import RFormula

formula = RFormula(formula="~carat+cut")
formula_model = formula.fit(df)

df3 = formula_model.transform(df)
df3.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|            features|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|(5,[0,1],[0.23,1.0])|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|(5,[0,2],[0.21,1.0])|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|(5,[0,4],[0.23,1.0])|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|(5,[0,2],[0.29,1.0])|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|(5,[0,4],[0.31,1.0])|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
only showing top 5 rows



In [15]:
# same as 

pipeline = Pipeline(stages=[
StringIndexer(inputCol="cut", outputCol="cut_index"),
OneHotEncoder(inputCols=["cut_index"],
outputCols=["cut_index_encoded"]),
VectorAssembler(inputCols=["carat", "cut_index_encoded"])
])
pipeline_model = pipeline.fit(df)
df3 = pipeline_model.transform(df)
df3.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+-----------------+------------------------------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|cut_index|cut_index_encoded|VectorAssembler_2de076945847__output|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+---------+-----------------+------------------------------------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|      0.0|    (4,[0],[1.0])|                (5,[0,1],[0.23,1.0])|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|      1.0|    (4,[1],[1.0])|                (5,[0,2],[0.21,1.0])|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|      3.0|    (4,[3],[1.0])|                (5,[0,4],[0.23,1.0])|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|      1.0|    (4,[1],[1.0])|                (5,[0,2],[0.29,1.0])|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|      3.0|    (4,[3],[1.0])|              

### Feature transformation

In [16]:
from pyspark.ml.feature import MinMaxScaler
print(MinMaxScaler().explainParams())

inputCol: input column name. (undefined)
max: Upper bound of the output feature range (default: 1.0)
min: Lower bound of the output feature range (default: 0.0)
outputCol: output column name. (default: MinMaxScaler_10ee5e3e0466__output)


In [17]:
# To illustrate the use of the above transformations we will rescale the features we have extracted from the diamonds data set to [0, 1] using MinMaxScaler
pipeline = Pipeline(stages=[
                    RFormula(formula="~carat+cut"),
                    MinMaxScaler(inputCol="features", outputCol="scaled_features")
                    ])
pipeline_model = pipeline.fit(df)

df4 = pipeline_model.transform(df)
df4.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+--------------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|            features|     scaled_features|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+--------------------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|(5,[0,1],[0.23,1.0])|(5,[0,1],[0.00623...|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|(5,[0,2],[0.21,1.0])|(5,[0,2],[0.00207...|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|(5,[0,4],[0.23,1.0])|(5,[0,4],[0.00623...|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|(5,[0,2],[0.29,1.0])|(5,[0,2],[0.01871...|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|(5,[0,4],[0.31,1.0])|(5,[0,4],[0.02286...|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+--------------------+
only showing top 5 rows



### Regression model

In [18]:
from pyspark.ml.regression import LinearRegression
pipeline = Pipeline(stages=[
            RFormula(formula="price~carat+cut"),
            LinearRegression()
            ])

pipeline_model = pipeline.fit(df)

pipeline_model.transform(df).select("label", "features", "prediction").show(5)

+-----+--------------------+-------------------+
|label|            features|         prediction|
+-----+--------------------+-------------------+
|326.0|(5,[0,1],[0.23,1.0])| -264.1968244521395|
|326.0|(5,[0,2],[0.21,1.0])|  -783.465310453415|
|327.0|(5,[0,4],[0.23,1.0])| -944.7889563555395|
|334.0|(5,[0,2],[0.29,1.0])|-153.77873973906617|
|335.0|(5,[0,4],[0.31,1.0])|-315.10238564119027|
+-----+--------------------+-------------------+
only showing top 5 rows



In [20]:
# To print the coefficients we first need to extract the linear model, which is the last stage of the pipeline.
regression_model = pipeline_model.stages[-1]

In [21]:
#We can then print the intercept (which Spark stores outside the coefficients)
print(regression_model.intercept)

-3875.4696997101078


In [22]:
#as well as the remaining coefficients.
print(regression_model.coefficients)

[7871.082133929367,1800.9239844542137,1439.0771411315254,1510.1354085136404,1120.331852550814]


### Classification

In [23]:
#no example

### Clustering

In [None]:
# see example 4 - end of pyinterface notebooks