# 10. Unsupervised Learning

In [1]:
# -> Define SparkSession

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("unsupervised learning").getOrCreate()

In [2]:
# Load modules

import utils
from pyspark.ml.feature import StandardScaler, PCA
from pyspark.ml.clustering import KMeans

import pandas as pd
import numpy as np

## 10.4 Lab 1: Principal Component Analysis

In [3]:
# -> USArrests data:

USArrests = spark.read.csv('data/USArrests.csv',inferSchema=True,header=True).drop('_c0')
states = USArrests.columns

print('\nUSArrests data:'); USArrests.show(5)
print('\nSummary table:'); USArrests.describe().show()
print('\nData types:'); USArrests.printSchema()

# -> Prepare data:

data = utils.prepare_data(df = USArrests,
                    labelCol = None,
                    label_is_categorical = None,
                    categoricalCols = [],
                    continuousCols = ['Murder', 'UrbanPop', 'Rape', 'Assault']
                   )

# Standardize features by removing the mean and scaling to unit variance:
scaler = StandardScaler(inputCol="features", 
                        outputCol="scaledFeatures",
                        withStd=True, 
                        withMean=True)

data = scaler.fit(data).transform(data)

# -> Describe and fit the model

model = PCA(k=2, inputCol="scaledFeatures", outputCol="pcaFeatures")
model_fit = model.fit(data)

# -> Print results:

result = model_fit.transform(data).select("pcaFeatures")
print('\nPCA feature values:'); result.show(5, truncate=False)

print('\nExplained variance by principal components:'); print(model_fit.explainedVariance)
print('\nPrincipal components loadings:'); print(model_fit.pc)


USArrests data:
+------+-------+--------+----+
|Murder|Assault|UrbanPop|Rape|
+------+-------+--------+----+
|  13.2|    236|      58|21.2|
|  10.0|    263|      48|44.5|
|   8.1|    294|      80|31.0|
|   8.8|    190|      50|19.5|
|   9.0|    276|      91|40.6|
+------+-------+--------+----+
only showing top 5 rows


Summary table:
+-------+-----------------+----------------+------------------+------------------+
|summary|           Murder|         Assault|          UrbanPop|              Rape|
+-------+-----------------+----------------+------------------+------------------+
|  count|               50|              50|                50|                50|
|   mean|7.787999999999999|          170.76|             65.54|21.231999999999992|
| stddev|4.355509764209288|83.3376608400171|14.474763400836784|  9.36638453105965|
|    min|              0.8|              45|                32|               7.3|
|    max|             17.4|             337|                91|              46.0|

## 10.5. Lab 2: Clustering

### *10.5.1 K-Means Clustering*

In [4]:
# -> Generate data:

n = 50
x = np.random.normal(0, 1, (n,2))

x[:25, 0] = x[:25,0] + 3
x[:25, 1] = x[:25,1] - 4

pd_data = pd.DataFrame(x,columns=['x1','x2'])
data = spark.createDataFrame(pd_data)
print('Data:'); data.show(5)

# -> Prepare data:

data = utils.prepare_data(df = data,
                    labelCol = None,
                    label_is_categorical = None,
                    categoricalCols = [],
                    continuousCols = ['x1', 'x2']
                   )

# Train K-means model

K = 3
model = KMeans(k=5).setK(K).setSeed(1)
model_fit = model.fit(data)

# -> Evaluate clustering by computing Within Set Sum of Squared Errors:
wssse = model_fit.computeCost(data)
print("Within Set Sum of Squared Errors = {:.3f}".format(wssse))

# -> print the result:
centers = model_fit.clusterCenters()
print("\nCluster Centers: ")
for center in centers:
    print(center)
    
# -> Predictions
print('\n Predictions:'); model_fit.transform(data).show(5)

Data:
+------------------+------------------+
|                x1|                x2|
+------------------+------------------+
| 2.746122098640372|-4.220500456852838|
| 3.000839055656425|-4.670859651663419|
|2.9134504593955683|-4.085528175217294|
| 4.407107624532321|-5.397542877723977|
|3.6810251804138696|-3.059545273554204|
+------------------+------------------+
only showing top 5 rows

Within Set Sum of Squared Errors = 60.794

Cluster Centers: 
[0.07989059 0.62020926]
[ 3.09812141 -3.8719231 ]
[ 0.07953395 -1.1674171 ]

 Predictions:
+------------------+------------------+--------------------+----------+
|                x1|                x2|            features|prediction|
+------------------+------------------+--------------------+----------+
| 2.746122098640372|-4.220500456852838|[2.74612209864037...|         1|
| 3.000839055656425|-4.670859651663419|[3.00083905565642...|         1|
|2.9134504593955683|-4.085528175217294|[2.91345045939556...|         1|
| 4.407107624532321|-5.39

### *10.5.2 Hierachical Clustering*